Mercurial > emacs
comparison src/coding.c @ 89483:2f877ed80fa6
*** empty log message ***
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Mon, 08 Sep 2003 12:53:41 +0000 |
| parents | 375f2633d815 4896b8834fb6 |
| children | 040a08a2a879 |
comparison
equal
deleted
inserted
replaced
| 88123:375f2633d815 | 89483:2f877ed80fa6 |
|---|---|
| 1 /* Coding system handler (conversion, detection, and etc). | 1 /* Coding system handler (conversion, detection, etc). |
| 2 Copyright (C) 1995,97,1998,2002,2003 Electrotechnical Laboratory, JAPAN. | 2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN. |
| 3 Licensed to the Free Software Foundation. | 3 Licensed to the Free Software Foundation. |
| 4 Copyright (C) 2001,2002,2003 Free Software Foundation, Inc. | 4 Copyright (C) 2001, 2002 Free Software Foundation, Inc. |
| 5 Copyright (C) 2003 | |
| 6 National Institute of Advanced Industrial Science and Technology (AIST) | |
| 7 Registration Number H13PRO009 | |
| 5 | 8 |
| 6 This file is part of GNU Emacs. | 9 This file is part of GNU Emacs. |
| 7 | 10 |
| 8 GNU Emacs is free software; you can redistribute it and/or modify | 11 GNU Emacs is free software; you can redistribute it and/or modify |
| 9 it under the terms of the GNU General Public License as published by | 12 it under the terms of the GNU General Public License as published by |
| 22 | 25 |
| 23 /*** TABLE OF CONTENTS *** | 26 /*** TABLE OF CONTENTS *** |
| 24 | 27 |
| 25 0. General comments | 28 0. General comments |
| 26 1. Preamble | 29 1. Preamble |
| 27 2. Emacs' internal format (emacs-mule) handlers | 30 2. Emacs' internal format (emacs-utf-8) handlers |
| 28 3. ISO2022 handlers | 31 3. UTF-8 handlers |
| 29 4. Shift-JIS and BIG5 handlers | 32 4. UTF-16 handlers |
| 30 5. CCL handlers | 33 5. Charset-base coding systems handlers |
| 31 6. End-of-line handlers | 34 6. emacs-mule (old Emacs' internal format) handlers |
| 32 7. C library functions | 35 7. ISO2022 handlers |
| 33 8. Emacs Lisp library functions | 36 8. Shift-JIS and BIG5 handlers |
| 34 9. Post-amble | 37 9. CCL handlers |
| 38 10. C library functions | |
| 39 11. Emacs Lisp library functions | |
| 40 12. Postamble | |
| 35 | 41 |
| 36 */ | 42 */ |
| 37 | 43 |
| 38 /*** 0. General comments ***/ | 44 /*** 0. General comments *** |
| 39 | 45 |
| 40 | 46 |
| 41 /*** GENERAL NOTE on CODING SYSTEMS *** | 47 CODING SYSTEM |
| 42 | 48 |
| 43 A coding system is an encoding mechanism for one or more character | 49 A coding system is an object for an encoding mechanism that contains |
| 44 sets. Here's a list of coding systems which Emacs can handle. When | 50 information about how to convert byte sequences to character |
| 45 we say "decode", it means converting some other coding system to | 51 sequences and vice versa. When we say "decode", it means converting |
| 46 Emacs' internal format (emacs-mule), and when we say "encode", | 52 a byte sequence of a specific coding system into a character |
| 47 it means converting the coding system emacs-mule to some other | 53 sequence that is represented by Emacs' internal coding system |
| 54 `emacs-utf-8', and when we say "encode", it means converting a | |
| 55 character sequence of emacs-utf-8 to a byte sequence of a specific | |
| 48 coding system. | 56 coding system. |
| 49 | 57 |
| 50 0. Emacs' internal format (emacs-mule) | 58 In Emacs Lisp, a coding system is represented by a Lisp symbol. At |
| 51 | 59 the C level, a coding system is represented by a vector of attributes |
| 52 Emacs itself holds a multi-lingual character in buffers and strings | 60 stored in the hash table Vcoding_system_hash_table. The conversion from |
| 53 in a special format. Details are described in section 2. | 61 coding system symbol to attributes vector is done by looking up |
| 54 | 62 Vcoding_system_hash_table by the symbol. |
| 55 1. ISO2022 | 63 |
| 64 Coding systems are classified into the following types depending on | |
| 65 the encoding mechanism. Here's a brief description of the types. | |
| 66 | |
| 67 o UTF-8 | |
| 68 | |
| 69 o UTF-16 | |
| 70 | |
| 71 o Charset-base coding system | |
| 72 | |
| 73 A coding system defined by one or more (coded) character sets. | |
| 74 Decoding and encoding are done by a code converter defined for each | |
| 75 character set. | |
| 76 | |
| 77 o Old Emacs internal format (emacs-mule) | |
| 78 | |
| 79 The coding system adopted by old versions of Emacs (20 and 21). | |
| 80 | |
| 81 o ISO2022-base coding system | |
| 56 | 82 |
| 57 The most famous coding system for multiple character sets. X's | 83 The most famous coding system for multiple character sets. X's |
| 58 Compound Text, various EUCs (Extended Unix Code), and coding | 84 Compound Text, various EUCs (Extended Unix Code), and coding systems |
| 59 systems used in Internet communication such as ISO-2022-JP are | 85 used in Internet communication such as ISO-2022-JP are all |
| 60 all variants of ISO2022. Details are described in section 3. | 86 variants of ISO2022. |
| 61 | 87 |
| 62 2. SJIS (or Shift-JIS or MS-Kanji-Code) | 88 o SJIS (or Shift-JIS or MS-Kanji-Code) |
| 63 | 89 |
| 64 A coding system to encode character sets: ASCII, JISX0201, and | 90 A coding system to encode character sets: ASCII, JISX0201, and |
| 65 JISX0208. Widely used for PC's in Japan. Details are described in | 91 JISX0208. Widely used for PC's in Japan. Details are described in |
| 66 section 4. | 92 section 8. |
| 67 | 93 |
| 68 3. BIG5 | 94 o BIG5 |
| 69 | 95 |
| 70 A coding system to encode the character sets ASCII and Big5. Widely | 96 A coding system to encode character sets: ASCII and Big5. Widely |
| 71 used for Chinese (mainly in Taiwan and Hong Kong). Details are | 97 used for Chinese (mainly in Taiwan and Hong Kong). Details are |
| 72 described in section 4. In this file, when we write "BIG5" | 98 described in section 8. In this file, when we write "big5" (all |
| 73 (all uppercase), we mean the coding system, and when we write | 99 lowercase), we mean the coding system, and when we write "Big5" |
| 74 "Big5" (capitalized), we mean the character set. | 100 (capitalized), we mean the character set. |
| 75 | 101 |
| 76 4. Raw text | 102 o CCL |
| 77 | 103 |
| 78 A coding system for text containing random 8-bit code. Emacs does | 104 If a user wants to decode/encode text encoded in a coding system |
| 79 no code conversion on such text except for end-of-line format. | 105 not listed above, he can supply a decoder and an encoder for it in |
| 80 | 106 CCL (Code Conversion Language) programs. Emacs executes the CCL |
| 81 5. Other | 107 program while decoding/encoding. |
| 82 | 108 |
| 83 If a user wants to read/write text encoded in a coding system not | 109 o Raw-text |
| 84 listed above, he can supply a decoder and an encoder for it as CCL | 110 |
| 85 (Code Conversion Language) programs. Emacs executes the CCL program | 111 A coding system for text containing raw eight-bit data. Emacs |
| 86 while reading/writing. | 112 treats each byte of source text as a character (except for |
| 87 | 113 end-of-line conversion). |
| 88 Emacs represents a coding system by a Lisp symbol that has a property | 114 |
| 89 `coding-system'. But, before actually using the coding system, the | 115 o No-conversion |
| 90 information about it is set in a structure of type `struct | 116 |
| 91 coding_system' for rapid processing. See section 6 for more details. | 117 Like raw text, but don't do end-of-line conversion. |
| 92 | 118 |
| 93 */ | 119 |
| 94 | 120 END-OF-LINE FORMAT |
| 95 /*** GENERAL NOTES on END-OF-LINE FORMAT *** | 121 |
| 96 | 122 How the end-of-line of text is encoded depends on the operating system. For |
| 97 How end-of-line of text is encoded depends on the operating system. | 123 instance, Unix's format is just one byte of LF (line-feed) code, |
| 98 For instance, Unix's format is just one byte of `line-feed' code, | |
| 99 whereas DOS's format is two-byte sequence of `carriage-return' and | 124 whereas DOS's format is two-byte sequence of `carriage-return' and |
| 100 `line-feed' codes. MacOS's format is usually one byte of | 125 `line-feed' codes. MacOS's format is usually one byte of |
| 101 `carriage-return'. | 126 `carriage-return'. |
| 102 | 127 |
| 103 Since text character encoding and end-of-line encoding are | 128 Since text character encoding and end-of-line encoding are |
| 104 independent, any coding system described above can have any | 129 independent, any coding system described above can take any format |
| 105 end-of-line format. So Emacs has information about end-of-line | 130 of end-of-line (except for no-conversion). |
| 106 format in each coding-system. See section 6 for more details. | 131 |
| 132 STRUCT CODING_SYSTEM | |
| 133 | |
| 134 Before using a coding system for code conversion (i.e. decoding and | |
| 135 encoding), we setup a structure of type `struct coding_system'. | |
| 136 This structure keeps various information about a specific code | |
| 137 conversion (e.g. the location of source and destination data). | |
| 107 | 138 |
| 108 */ | 139 */ |
| 109 | 140 |
| 141 /* COMMON MACROS */ | |
| 142 | |
| 143 | |
| 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions *** | 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions *** |
| 111 | 145 |
| 112 These functions check if a text between SRC and SRC_END is encoded | 146 These functions check if a byte sequence specified as a source in |
| 113 in the coding system category XXX. Each returns an integer value in | 147 CODING conforms to the format of XXX, and update the members of |
| 114 which appropriate flag bits for the category XXX are set. The flag | 148 DETECT_INFO. |
| 115 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the | 149 |
| 116 template for these functions. If MULTIBYTEP is nonzero, 8-bit codes | 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0. |
| 117 of the range 0x80..0x9F are in multibyte form. */ | 151 |
| 152 Below is the template of these functions. */ | |
| 153 | |
| 118 #if 0 | 154 #if 0 |
| 119 int | 155 static int |
| 120 detect_coding_emacs_mule (src, src_end, multibytep) | 156 detect_coding_XXX (coding, detect_info) |
| 121 unsigned char *src, *src_end; | 157 struct coding_system *coding; |
| 122 int multibytep; | 158 struct coding_detection_info *detect_info; |
| 123 { | 159 { |
| 124 ... | 160 unsigned char *src = coding->source; |
| 161 unsigned char *src_end = coding->source + coding->src_bytes; | |
| 162 int multibytep = coding->src_multibyte; | |
| 163 int consumed_chars = 0; | |
| 164 int found = 0; | |
| 165 ...; | |
| 166 | |
| 167 while (1) | |
| 168 { | |
| 169 /* Get one byte from the source. If the source is exhausted, jump | |
| 170 to no_more_source:. */ | |
| 171 ONE_MORE_BYTE (c); | |
| 172 | |
| 173 if (! __C_conforms_to_XXX___ (c)) | |
| 174 break; | |
| 175 if (! __C_strongly_suggests_XXX__ (c)) | |
| 176 found = CATEGORY_MASK_XXX; | |
| 177 } | |
| 178 /* The byte sequence is invalid for XXX. */ | |
| 179 detect_info->rejected |= CATEGORY_MASK_XXX; | |
| 180 return 0; | |
| 181 | |
| 182 no_more_source: | |
| 183 /* The source was exhausted successfully. */ | |
| 184 detect_info->found |= found; | |
| 185 return 1; | |
| 125 } | 186 } |
| 126 #endif | 187 #endif |
| 127 | 188 |
| 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions *** | 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions *** |
| 129 | 190 |
| 130 These functions decode SRC_BYTES length of unibyte text at SOURCE | 191 These functions decode a byte sequence specified as a source by |
| 131 encoded in CODING to Emacs' internal format. The resulting | 192 CODING. The resulting multibyte text goes to a place pointed to by |
| 132 multibyte text goes to a place pointed to by DESTINATION, the length | 193 CODING->charbuf, the length of which should not exceed |
| 133 of which should not exceed DST_BYTES. | 194 CODING->charbuf_size. |
| 134 | 195 |
| 135 These functions set the information about original and decoded texts | 196 These functions set the information of original and decoded texts in |
| 136 in the members `produced', `produced_char', `consumed', and | 197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used. |
| 137 `consumed_char' of the structure *CODING. They also set the member | 198 They also set CODING->result to one of CODING_RESULT_XXX indicating |
| 138 `result' to one of CODING_FINISH_XXX indicating how the decoding | 199 how the decoding is finished. |
| 139 finished. | 200 |
| 140 | 201 Below is the template of these functions. */ |
| 141 DST_BYTES zero means that the source area and destination area are | 202 |
| 142 overlapped, which means that we can produce a decoded text until it | |
| 143 reaches the head of the not-yet-decoded source text. | |
| 144 | |
| 145 Below is a template for these functions. */ | |
| 146 #if 0 | 203 #if 0 |
| 147 static void | 204 static void |
| 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes) | 205 decode_coding_XXXX (coding) |
| 149 struct coding_system *coding; | 206 struct coding_system *coding; |
| 150 unsigned char *source, *destination; | 207 { |
| 151 int src_bytes, dst_bytes; | 208 unsigned char *src = coding->source + coding->consumed; |
| 152 { | 209 unsigned char *src_end = coding->source + coding->src_bytes; |
| 153 ... | 210 /* SRC_BASE remembers the start position in source in each loop. |
| 211 The loop will be exited when there's not enough source code, or | |
| 212 when there's no room in CHARBUF for a decoded character. */ | |
| 213 unsigned char *src_base; | |
| 214 /* A buffer to produce decoded characters. */ | |
| 215 int *charbuf = coding->charbuf; | |
| 216 int *charbuf_end = charbuf + coding->charbuf_size; | |
| 217 int multibytep = coding->src_multibyte; | |
| 218 | |
| 219 while (1) | |
| 220 { | |
| 221 src_base = src; | |
| 222 if (charbuf < charbuf_end) | |
| 223 /* No more room to produce a decoded character. */ | |
| 224 break; | |
| 225 ONE_MORE_BYTE (c); | |
| 226 /* Decode it. */ | |
| 227 } | |
| 228 | |
| 229 no_more_source: | |
| 230 if (src_base < src_end | |
| 231 && coding->mode & CODING_MODE_LAST_BLOCK) | |
| 232 /* If the source ends by partial bytes to construct a character, | |
| 233 treat them as eight-bit raw data. */ | |
| 234 while (src_base < src_end && charbuf < charbuf_end) | |
| 235 *charbuf++ = *src_base++; | |
| 236 /* Remember how many bytes and characters we consumed. If the | |
| 237 source is multibyte, the bytes and chars are not identical. */ | |
| 238 coding->consumed = coding->consumed_char = src_base - coding->source; | |
| 239 /* Remember how many characters we produced. */ | |
| 240 coding->charbuf_used = charbuf - coding->charbuf; | |
| 154 } | 241 } |
| 155 #endif | 242 #endif |
| 156 | 243 |
| 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions *** | 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions *** |
| 158 | 245 |
| 159 These functions encode SRC_BYTES length text at SOURCE from Emacs' | 246 These functions encode SRC_BYTES length text at SOURCE of Emacs' |
| 160 internal multibyte format to CODING. The resulting unibyte text | 247 internal multibyte format by CODING. The resulting byte sequence |
| 161 goes to a place pointed to by DESTINATION, the length of which | 248 goes to a place pointed to by DESTINATION, the length of which |
| 162 should not exceed DST_BYTES. | 249 should not exceed DST_BYTES. |
| 163 | 250 |
| 164 These functions set the information about original and encoded texts | 251 These functions set the information of original and encoded texts in |
| 165 in the members `produced', `produced_char', `consumed', and | 252 the members produced, produced_char, consumed, and consumed_char of |
| 166 `consumed_char' of the structure *CODING. They also set the member | 253 the structure *CODING. They also set the member result to one of |
| 167 `result' to one of CODING_FINISH_XXX indicating how the encoding | 254 CODING_RESULT_XXX indicating how the encoding finished. |
| 168 finished. | 255 |
| 169 | 256 DST_BYTES zero means that source area and destination area are |
| 170 DST_BYTES zero means that the source area and destination area are | 257 overlapped, which means that we can produce encoded text until it |
| 171 overlapped, which means that we can produce encoded text until it | 258 reaches the head of the not-yet-encoded source text. |
| 172 reaches at the head of the not-yet-encoded source text. | 259 |
| 173 | 260 Below is a template of these functions. */ |
| 174 Below is a template for these functions. */ | |
| 175 #if 0 | 261 #if 0 |
| 176 static void | 262 static void |
| 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes) | 263 encode_coding_XXX (coding) |
| 178 struct coding_system *coding; | 264 struct coding_system *coding; |
| 179 unsigned char *source, *destination; | 265 { |
| 180 int src_bytes, dst_bytes; | 266 int multibytep = coding->dst_multibyte; |
| 181 { | 267 int *charbuf = coding->charbuf; |
| 182 ... | 268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used; |
| 269 unsigned char *dst = coding->destination + coding->produced; | |
| 270 unsigned char *dst_end = coding->destination + coding->dst_bytes; | |
| 271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_; | |
| 272 int produced_chars = 0; | |
| 273 | |
| 274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++) | |
| 275 { | |
| 276 int c = *charbuf; | |
| 277 /* Encode C into DST, and increment DST. */ | |
| 278 } | |
| 279 label_no_more_destination: | |
| 280 /* How many chars and bytes we produced. */ | |
| 281 coding->produced_char += produced_chars; | |
| 282 coding->produced = dst - coding->destination; | |
| 183 } | 283 } |
| 184 #endif | 284 #endif |
| 185 | |
| 186 /*** COMMONLY USED MACROS ***/ | |
| 187 | |
| 188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely | |
| 189 get one, two, and three bytes from the source text respectively. | |
| 190 If there are not enough bytes in the source, they jump to | |
| 191 `label_end_of_loop'. The caller should set variables `coding', | |
| 192 `src' and `src_end' to appropriate pointer in advance. These | |
| 193 macros are called from decoding routines `decode_coding_XXX', thus | |
| 194 it is assumed that the source text is unibyte. */ | |
| 195 | |
| 196 #define ONE_MORE_BYTE(c1) \ | |
| 197 do { \ | |
| 198 if (src >= src_end) \ | |
| 199 { \ | |
| 200 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ | |
| 201 goto label_end_of_loop; \ | |
| 202 } \ | |
| 203 c1 = *src++; \ | |
| 204 } while (0) | |
| 205 | |
| 206 #define TWO_MORE_BYTES(c1, c2) \ | |
| 207 do { \ | |
| 208 if (src + 1 >= src_end) \ | |
| 209 { \ | |
| 210 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ | |
| 211 goto label_end_of_loop; \ | |
| 212 } \ | |
| 213 c1 = *src++; \ | |
| 214 c2 = *src++; \ | |
| 215 } while (0) | |
| 216 | |
| 217 | |
| 218 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte | |
| 219 form if MULTIBYTEP is nonzero. */ | |
| 220 | |
| 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep) \ | |
| 222 do { \ | |
| 223 if (src >= src_end) \ | |
| 224 { \ | |
| 225 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ | |
| 226 goto label_end_of_loop; \ | |
| 227 } \ | |
| 228 c1 = *src++; \ | |
| 229 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \ | |
| 230 c1 = *src++ - 0x20; \ | |
| 231 } while (0) | |
| 232 | |
| 233 /* Set C to the next character at the source text pointed by `src'. | |
| 234 If there are not enough characters in the source, jump to | |
| 235 `label_end_of_loop'. The caller should set variables `coding' | |
| 236 `src', `src_end', and `translation_table' to appropriate pointers | |
| 237 in advance. This macro is used in encoding routines | |
| 238 `encode_coding_XXX', thus it assumes that the source text is in | |
| 239 multibyte form except for 8-bit characters. 8-bit characters are | |
| 240 in multibyte form if coding->src_multibyte is nonzero, else they | |
| 241 are represented by a single byte. */ | |
| 242 | |
| 243 #define ONE_MORE_CHAR(c) \ | |
| 244 do { \ | |
| 245 int len = src_end - src; \ | |
| 246 int bytes; \ | |
| 247 if (len <= 0) \ | |
| 248 { \ | |
| 249 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ | |
| 250 goto label_end_of_loop; \ | |
| 251 } \ | |
| 252 if (coding->src_multibyte \ | |
| 253 || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes)) \ | |
| 254 c = STRING_CHAR_AND_LENGTH (src, len, bytes); \ | |
| 255 else \ | |
| 256 c = *src, bytes = 1; \ | |
| 257 if (!NILP (translation_table)) \ | |
| 258 c = translate_char (translation_table, c, -1, 0, 0); \ | |
| 259 src += bytes; \ | |
| 260 } while (0) | |
| 261 | |
| 262 | |
| 263 /* Produce a multibyte form of character C to `dst'. Jump to | |
| 264 `label_end_of_loop' if there's not enough space at `dst'. | |
| 265 | |
| 266 If we are now in the middle of a composition sequence, the decoded | |
| 267 character may be ALTCHAR (for the current composition). In that | |
| 268 case, the character goes to coding->cmp_data->data instead of | |
| 269 `dst'. | |
| 270 | |
| 271 This macro is used in decoding routines. */ | |
| 272 | |
| 273 #define EMIT_CHAR(c) \ | |
| 274 do { \ | |
| 275 if (! COMPOSING_P (coding) \ | |
| 276 || coding->composing == COMPOSITION_RELATIVE \ | |
| 277 || coding->composing == COMPOSITION_WITH_RULE) \ | |
| 278 { \ | |
| 279 int bytes = CHAR_BYTES (c); \ | |
| 280 if ((dst + bytes) > (dst_bytes ? dst_end : src)) \ | |
| 281 { \ | |
| 282 coding->result = CODING_FINISH_INSUFFICIENT_DST; \ | |
| 283 goto label_end_of_loop; \ | |
| 284 } \ | |
| 285 dst += CHAR_STRING (c, dst); \ | |
| 286 coding->produced_char++; \ | |
| 287 } \ | |
| 288 \ | |
| 289 if (COMPOSING_P (coding) \ | |
| 290 && coding->composing != COMPOSITION_RELATIVE) \ | |
| 291 { \ | |
| 292 CODING_ADD_COMPOSITION_COMPONENT (coding, c); \ | |
| 293 coding->composition_rule_follows \ | |
| 294 = coding->composing != COMPOSITION_WITH_ALTCHARS; \ | |
| 295 } \ | |
| 296 } while (0) | |
| 297 | |
| 298 | |
| 299 #define EMIT_ONE_BYTE(c) \ | |
| 300 do { \ | |
| 301 if (dst >= (dst_bytes ? dst_end : src)) \ | |
| 302 { \ | |
| 303 coding->result = CODING_FINISH_INSUFFICIENT_DST; \ | |
| 304 goto label_end_of_loop; \ | |
| 305 } \ | |
| 306 *dst++ = c; \ | |
| 307 } while (0) | |
| 308 | |
| 309 #define EMIT_TWO_BYTES(c1, c2) \ | |
| 310 do { \ | |
| 311 if (dst + 2 > (dst_bytes ? dst_end : src)) \ | |
| 312 { \ | |
| 313 coding->result = CODING_FINISH_INSUFFICIENT_DST; \ | |
| 314 goto label_end_of_loop; \ | |
| 315 } \ | |
| 316 *dst++ = c1, *dst++ = c2; \ | |
| 317 } while (0) | |
| 318 | |
| 319 #define EMIT_BYTES(from, to) \ | |
| 320 do { \ | |
| 321 if (dst + (to - from) > (dst_bytes ? dst_end : src)) \ | |
| 322 { \ | |
| 323 coding->result = CODING_FINISH_INSUFFICIENT_DST; \ | |
| 324 goto label_end_of_loop; \ | |
| 325 } \ | |
| 326 while (from < to) \ | |
| 327 *dst++ = *from++; \ | |
| 328 } while (0) | |
| 329 | 285 |
| 330 | 286 |
| 331 /*** 1. Preamble ***/ | 287 /*** 1. Preamble ***/ |
| 332 | 288 |
| 333 #ifdef emacs | |
| 334 #include <config.h> | 289 #include <config.h> |
| 335 #endif | |
| 336 | |
| 337 #include <stdio.h> | 290 #include <stdio.h> |
| 338 | |
| 339 #ifdef emacs | |
| 340 | 291 |
| 341 #include "lisp.h" | 292 #include "lisp.h" |
| 342 #include "buffer.h" | 293 #include "buffer.h" |
| 294 #include "character.h" | |
| 343 #include "charset.h" | 295 #include "charset.h" |
| 296 #include "ccl.h" | |
| 344 #include "composite.h" | 297 #include "composite.h" |
| 345 #include "ccl.h" | |
| 346 #include "coding.h" | 298 #include "coding.h" |
| 347 #include "window.h" | 299 #include "window.h" |
| 348 #include "intervals.h" | 300 |
| 349 | 301 Lisp_Object Vcoding_system_hash_table; |
| 350 #else /* not emacs */ | 302 |
| 351 | 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type; |
| 352 #include "mulelib.h" | 304 Lisp_Object Qunix, Qdos; |
| 353 | 305 extern Lisp_Object Qmac; /* frame.c */ |
| 354 #endif /* not emacs */ | |
| 355 | |
| 356 Lisp_Object Qcoding_system, Qeol_type; | |
| 357 Lisp_Object Qbuffer_file_coding_system; | 306 Lisp_Object Qbuffer_file_coding_system; |
| 358 Lisp_Object Qpost_read_conversion, Qpre_write_conversion; | 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion; |
| 308 Lisp_Object Qdefault_char; | |
| 359 Lisp_Object Qno_conversion, Qundecided; | 309 Lisp_Object Qno_conversion, Qundecided; |
| 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5; | |
| 311 Lisp_Object Qbig, Qlittle; | |
| 360 Lisp_Object Qcoding_system_history; | 312 Lisp_Object Qcoding_system_history; |
| 361 Lisp_Object Qsafe_chars; | |
| 362 Lisp_Object Qvalid_codes; | 313 Lisp_Object Qvalid_codes; |
| 314 Lisp_Object QCcategory; | |
| 363 | 315 |
| 364 extern Lisp_Object Qinsert_file_contents, Qwrite_region; | 316 extern Lisp_Object Qinsert_file_contents, Qwrite_region; |
| 365 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; | 317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; |
| 366 Lisp_Object Qstart_process, Qopen_network_stream; | 318 Lisp_Object Qstart_process, Qopen_network_stream; |
| 367 Lisp_Object Qtarget_idx; | 319 Lisp_Object Qtarget_idx; |
| 368 | 320 |
| 321 int coding_system_require_warning; | |
| 322 | |
| 369 Lisp_Object Vselect_safe_coding_system_function; | 323 Lisp_Object Vselect_safe_coding_system_function; |
| 370 | |
| 371 int coding_system_require_warning; | |
| 372 | 324 |
| 373 /* Mnemonic string for each format of end-of-line. */ | 325 /* Mnemonic string for each format of end-of-line. */ |
| 374 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac; | 326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac; |
| 375 /* Mnemonic string to indicate format of end-of-line is not yet | 327 /* Mnemonic string to indicate format of end-of-line is not yet |
| 376 decided. */ | 328 decided. */ |
| 377 Lisp_Object eol_mnemonic_undecided; | 329 Lisp_Object eol_mnemonic_undecided; |
| 378 | 330 |
| 379 /* Format of end-of-line decided by system. This is CODING_EOL_LF on | |
| 380 Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac. */ | |
| 381 int system_eol_type; | |
| 382 | |
| 383 #ifdef emacs | 331 #ifdef emacs |
| 384 | |
| 385 /* Information about which coding system is safe for which chars. | |
| 386 The value has the form (GENERIC-LIST . NON-GENERIC-ALIST). | |
| 387 | |
| 388 GENERIC-LIST is a list of generic coding systems which can encode | |
| 389 any characters. | |
| 390 | |
| 391 NON-GENERIC-ALIST is an alist of non generic coding systems vs the | |
| 392 corresponding char table that contains safe chars. */ | |
| 393 Lisp_Object Vcoding_system_safe_chars; | |
| 394 | 332 |
| 395 Lisp_Object Vcoding_system_list, Vcoding_system_alist; | 333 Lisp_Object Vcoding_system_list, Vcoding_system_alist; |
| 396 | 334 |
| 397 Lisp_Object Qcoding_system_p, Qcoding_system_error; | 335 Lisp_Object Qcoding_system_p, Qcoding_system_error; |
| 398 | 336 |
| 399 /* Coding system emacs-mule and raw-text are for converting only | 337 /* Coding system emacs-mule and raw-text are for converting only |
| 400 end-of-line format. */ | 338 end-of-line format. */ |
| 401 Lisp_Object Qemacs_mule, Qraw_text; | 339 Lisp_Object Qemacs_mule, Qraw_text; |
| 402 | 340 Lisp_Object Qutf_8_emacs; |
| 403 Lisp_Object Qutf_8; | |
| 404 | 341 |
| 405 /* Coding-systems are handed between Emacs Lisp programs and C internal | 342 /* Coding-systems are handed between Emacs Lisp programs and C internal |
| 406 routines by the following three variables. */ | 343 routines by the following three variables. */ |
| 407 /* Coding-system for reading files and receiving data from process. */ | 344 /* Coding-system for reading files and receiving data from process. */ |
| 408 Lisp_Object Vcoding_system_for_read; | 345 Lisp_Object Vcoding_system_for_read; |
| 432 struct coding_system safe_terminal_coding; | 369 struct coding_system safe_terminal_coding; |
| 433 | 370 |
| 434 /* Coding system of what is sent from terminal keyboard. */ | 371 /* Coding system of what is sent from terminal keyboard. */ |
| 435 struct coding_system keyboard_coding; | 372 struct coding_system keyboard_coding; |
| 436 | 373 |
| 437 /* Default coding system to be used to write a file. */ | |
| 438 struct coding_system default_buffer_file_coding; | |
| 439 | |
| 440 Lisp_Object Vfile_coding_system_alist; | 374 Lisp_Object Vfile_coding_system_alist; |
| 441 Lisp_Object Vprocess_coding_system_alist; | 375 Lisp_Object Vprocess_coding_system_alist; |
| 442 Lisp_Object Vnetwork_coding_system_alist; | 376 Lisp_Object Vnetwork_coding_system_alist; |
| 443 | 377 |
| 444 Lisp_Object Vlocale_coding_system; | 378 Lisp_Object Vlocale_coding_system; |
| 445 | 379 |
| 446 #endif /* emacs */ | 380 #endif /* emacs */ |
| 447 | |
| 448 Lisp_Object Qcoding_category, Qcoding_category_index; | |
| 449 | |
| 450 /* List of symbols `coding-category-xxx' ordered by priority. */ | |
| 451 Lisp_Object Vcoding_category_list; | |
| 452 | |
| 453 /* Table of coding categories (Lisp symbols). */ | |
| 454 Lisp_Object Vcoding_category_table; | |
| 455 | |
| 456 /* Table of names of symbol for each coding-category. */ | |
| 457 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = { | |
| 458 "coding-category-emacs-mule", | |
| 459 "coding-category-sjis", | |
| 460 "coding-category-iso-7", | |
| 461 "coding-category-iso-7-tight", | |
| 462 "coding-category-iso-8-1", | |
| 463 "coding-category-iso-8-2", | |
| 464 "coding-category-iso-7-else", | |
| 465 "coding-category-iso-8-else", | |
| 466 "coding-category-ccl", | |
| 467 "coding-category-big5", | |
| 468 "coding-category-utf-8", | |
| 469 "coding-category-utf-16-be", | |
| 470 "coding-category-utf-16-le", | |
| 471 "coding-category-raw-text", | |
| 472 "coding-category-binary" | |
| 473 }; | |
| 474 | |
| 475 /* Table of pointers to coding systems corresponding to each coding | |
| 476 categories. */ | |
| 477 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX]; | |
| 478 | |
| 479 /* Table of coding category masks. Nth element is a mask for a coding | |
| 480 category of which priority is Nth. */ | |
| 481 static | |
| 482 int coding_priorities[CODING_CATEGORY_IDX_MAX]; | |
| 483 | 381 |
| 484 /* Flag to tell if we look up translation table on character code | 382 /* Flag to tell if we look up translation table on character code |
| 485 conversion. */ | 383 conversion. */ |
| 486 Lisp_Object Venable_character_translation; | 384 Lisp_Object Venable_character_translation; |
| 487 /* Standard translation table to look up on decoding (reading). */ | 385 /* Standard translation table to look up on decoding (reading). */ |
| 493 Lisp_Object Qtranslation_table_id; | 391 Lisp_Object Qtranslation_table_id; |
| 494 Lisp_Object Qtranslation_table_for_decode; | 392 Lisp_Object Qtranslation_table_for_decode; |
| 495 Lisp_Object Qtranslation_table_for_encode; | 393 Lisp_Object Qtranslation_table_for_encode; |
| 496 | 394 |
| 497 /* Alist of charsets vs revision number. */ | 395 /* Alist of charsets vs revision number. */ |
| 498 Lisp_Object Vcharset_revision_alist; | 396 static Lisp_Object Vcharset_revision_table; |
| 499 | 397 |
| 500 /* Default coding systems used for process I/O. */ | 398 /* Default coding systems used for process I/O. */ |
| 501 Lisp_Object Vdefault_process_coding_system; | 399 Lisp_Object Vdefault_process_coding_system; |
| 502 | 400 |
| 503 /* Char table for translating Quail and self-inserting input. */ | 401 /* Char table for translating Quail and self-inserting input. */ |
| 507 pre-write-conversion functions. Usually the value is zero, but it | 405 pre-write-conversion functions. Usually the value is zero, but it |
| 508 is set to 1 temporarily while such functions are running. This is | 406 is set to 1 temporarily while such functions are running. This is |
| 509 to avoid infinite recursive call. */ | 407 to avoid infinite recursive call. */ |
| 510 static int inhibit_pre_post_conversion; | 408 static int inhibit_pre_post_conversion; |
| 511 | 409 |
| 512 Lisp_Object Qchar_coding_system; | 410 /* Two special coding systems. */ |
| 513 | 411 Lisp_Object Vsjis_coding_system; |
| 514 /* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check | 412 Lisp_Object Vbig5_coding_system; |
| 515 its validity. */ | 413 |
| 516 | 414 |
| 517 Lisp_Object | 415 static int detect_coding_utf_8 P_ ((struct coding_system *, |
| 518 coding_safe_chars (coding_system) | 416 struct coding_detection_info *info)); |
| 519 Lisp_Object coding_system; | 417 static void decode_coding_utf_8 P_ ((struct coding_system *)); |
| 520 { | 418 static int encode_coding_utf_8 P_ ((struct coding_system *)); |
| 521 Lisp_Object coding_spec, plist, safe_chars; | 419 |
| 522 | 420 static int detect_coding_utf_16 P_ ((struct coding_system *, |
| 523 coding_spec = Fget (coding_system, Qcoding_system); | 421 struct coding_detection_info *info)); |
| 524 plist = XVECTOR (coding_spec)->contents[3]; | 422 static void decode_coding_utf_16 P_ ((struct coding_system *)); |
| 525 safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars); | 423 static int encode_coding_utf_16 P_ ((struct coding_system *)); |
| 526 return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt); | 424 |
| 527 } | 425 static int detect_coding_iso_2022 P_ ((struct coding_system *, |
| 528 | 426 struct coding_detection_info *info)); |
| 529 #define CODING_SAFE_CHAR_P(safe_chars, c) \ | 427 static void decode_coding_iso_2022 P_ ((struct coding_system *)); |
| 530 (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c))) | 428 static int encode_coding_iso_2022 P_ ((struct coding_system *)); |
| 429 | |
| 430 static int detect_coding_emacs_mule P_ ((struct coding_system *, | |
| 431 struct coding_detection_info *info)); | |
| 432 static void decode_coding_emacs_mule P_ ((struct coding_system *)); | |
| 433 static int encode_coding_emacs_mule P_ ((struct coding_system *)); | |
| 434 | |
| 435 static int detect_coding_sjis P_ ((struct coding_system *, | |
| 436 struct coding_detection_info *info)); | |
| 437 static void decode_coding_sjis P_ ((struct coding_system *)); | |
| 438 static int encode_coding_sjis P_ ((struct coding_system *)); | |
| 439 | |
| 440 static int detect_coding_big5 P_ ((struct coding_system *, | |
| 441 struct coding_detection_info *info)); | |
| 442 static void decode_coding_big5 P_ ((struct coding_system *)); | |
| 443 static int encode_coding_big5 P_ ((struct coding_system *)); | |
| 444 | |
| 445 static int detect_coding_ccl P_ ((struct coding_system *, | |
| 446 struct coding_detection_info *info)); | |
| 447 static void decode_coding_ccl P_ ((struct coding_system *)); | |
| 448 static int encode_coding_ccl P_ ((struct coding_system *)); | |
| 449 | |
| 450 static void decode_coding_raw_text P_ ((struct coding_system *)); | |
| 451 static int encode_coding_raw_text P_ ((struct coding_system *)); | |
| 452 | |
| 453 | |
| 454 /* ISO2022 section */ | |
| 455 | |
| 456 #define CODING_ISO_INITIAL(coding, reg) \ | |
| 457 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \ | |
| 458 coding_attr_iso_initial), \ | |
| 459 reg))) | |
| 460 | |
| 461 | |
| 462 #define CODING_ISO_REQUEST(coding, charset_id) \ | |
| 463 ((charset_id <= (coding)->max_charset_id \ | |
| 464 ? (coding)->safe_charsets[charset_id] \ | |
| 465 : -1)) | |
| 466 | |
| 467 | |
| 468 #define CODING_ISO_FLAGS(coding) \ | |
| 469 ((coding)->spec.iso_2022.flags) | |
| 470 #define CODING_ISO_DESIGNATION(coding, reg) \ | |
| 471 ((coding)->spec.iso_2022.current_designation[reg]) | |
| 472 #define CODING_ISO_INVOCATION(coding, plane) \ | |
| 473 ((coding)->spec.iso_2022.current_invocation[plane]) | |
| 474 #define CODING_ISO_SINGLE_SHIFTING(coding) \ | |
| 475 ((coding)->spec.iso_2022.single_shifting) | |
| 476 #define CODING_ISO_BOL(coding) \ | |
| 477 ((coding)->spec.iso_2022.bol) | |
| 478 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \ | |
| 479 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane))) | |
| 480 | |
| 481 /* Control characters of ISO2022. */ | |
| 482 /* code */ /* function */ | |
| 483 #define ISO_CODE_LF 0x0A /* line-feed */ | |
| 484 #define ISO_CODE_CR 0x0D /* carriage-return */ | |
| 485 #define ISO_CODE_SO 0x0E /* shift-out */ | |
| 486 #define ISO_CODE_SI 0x0F /* shift-in */ | |
| 487 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */ | |
| 488 #define ISO_CODE_ESC 0x1B /* escape */ | |
| 489 #define ISO_CODE_SS2 0x8E /* single-shift-2 */ | |
| 490 #define ISO_CODE_SS3 0x8F /* single-shift-3 */ | |
| 491 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */ | |
| 492 | |
| 493 /* Every 1-byte code of ISO2022 is classified into one of the | |
| 494 following classes. */ | |
| 495 enum iso_code_class_type | |
| 496 { | |
| 497 ISO_control_0, /* Control codes in the range | |
| 498 0x00..0x1F and 0x7F, except for the | |
| 499 following 5 codes. */ | |
| 500 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */ | |
| 501 ISO_shift_out, /* ISO_CODE_SO (0x0E) */ | |
| 502 ISO_shift_in, /* ISO_CODE_SI (0x0F) */ | |
| 503 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */ | |
| 504 ISO_escape, /* ISO_CODE_ESC (0x1B) */ | |
| 505 ISO_control_1, /* Control codes in the range | |
| 506 0x80..0x9F, except for the | |
| 507 following 3 codes. */ | |
| 508 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */ | |
| 509 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */ | |
| 510 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */ | |
| 511 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */ | |
| 512 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */ | |
| 513 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */ | |
| 514 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */ | |
| 515 }; | |
| 516 | |
| 517 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the | |
| 518 `iso-flags' attribute of an iso2022 coding system. */ | |
| 519 | |
| 520 /* If set, produce long-form designation sequence (e.g. ESC $ ( A) | |
| 521 instead of the correct short-form sequence (e.g. ESC $ A). */ | |
| 522 #define CODING_ISO_FLAG_LONG_FORM 0x0001 | |
| 523 | |
| 524 /* If set, reset graphic planes and registers at end-of-line to the | |
| 525 initial state. */ | |
| 526 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002 | |
| 527 | |
| 528 /* If set, reset graphic planes and registers before any control | |
| 529 characters to the initial state. */ | |
| 530 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004 | |
| 531 | |
| 532 /* If set, encode by 7-bit environment. */ | |
| 533 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008 | |
| 534 | |
| 535 /* If set, use locking-shift function. */ | |
| 536 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010 | |
| 537 | |
| 538 /* If set, use single-shift function. Overrides | |
| 539 CODING_ISO_FLAG_LOCKING_SHIFT. */ | |
| 540 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020 | |
| 541 | |
| 542 /* If set, use designation escape sequence. */ | |
| 543 #define CODING_ISO_FLAG_DESIGNATION 0x0040 | |
| 544 | |
| 545 /* If set, produce revision number sequence. */ | |
| 546 #define CODING_ISO_FLAG_REVISION 0x0080 | |
| 547 | |
| 548 /* If set, produce ISO6429's direction specifying sequence. */ | |
| 549 #define CODING_ISO_FLAG_DIRECTION 0x0100 | |
| 550 | |
| 551 /* If set, assume designation states are reset at beginning of line on | |
| 552 output. */ | |
| 553 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200 | |
| 554 | |
| 555 /* If set, designation sequence should be placed at beginning of line | |
| 556 on output. */ | |
| 557 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400 | |
| 558 | |
| 559 /* If set, do not encode unsafe characters on output. */ | |
| 560 #define CODING_ISO_FLAG_SAFE 0x0800 | |
| 561 | |
| 562 /* If set, extra latin codes (128..159) are accepted as a valid code | |
| 563 on input. */ | |
| 564 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000 | |
| 565 | |
| 566 #define CODING_ISO_FLAG_COMPOSITION 0x2000 | |
| 567 | |
| 568 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 | |
| 569 | |
| 570 #define CODING_ISO_FLAG_USE_ROMAN 0x8000 | |
| 571 | |
| 572 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000 | |
| 573 | |
| 574 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000 | |
| 575 | |
| 576 /* A character to be produced on output if encoding of the original | |
| 577 character is prohibited by CODING_ISO_FLAG_SAFE. */ | |
| 578 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?' | |
| 579 | |
| 580 | |
| 581 /* UTF-16 section */ | |
| 582 #define CODING_UTF_16_BOM(coding) \ | |
| 583 ((coding)->spec.utf_16.bom) | |
| 584 | |
| 585 #define CODING_UTF_16_ENDIAN(coding) \ | |
| 586 ((coding)->spec.utf_16.endian) | |
| 587 | |
| 588 #define CODING_UTF_16_SURROGATE(coding) \ | |
| 589 ((coding)->spec.utf_16.surrogate) | |
| 590 | |
| 591 | |
| 592 /* CCL section */ | |
| 593 #define CODING_CCL_DECODER(coding) \ | |
| 594 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder) | |
| 595 #define CODING_CCL_ENCODER(coding) \ | |
| 596 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder) | |
| 597 #define CODING_CCL_VALIDS(coding) \ | |
| 598 (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids))) | |
| 599 | |
| 600 /* Index for each coding category in `coding_categories' */ | |
| 601 | |
| 602 enum coding_category | |
| 603 { | |
| 604 coding_category_iso_7, | |
| 605 coding_category_iso_7_tight, | |
| 606 coding_category_iso_8_1, | |
| 607 coding_category_iso_8_2, | |
| 608 coding_category_iso_7_else, | |
| 609 coding_category_iso_8_else, | |
| 610 coding_category_utf_8, | |
| 611 coding_category_utf_16_auto, | |
| 612 coding_category_utf_16_be, | |
| 613 coding_category_utf_16_le, | |
| 614 coding_category_utf_16_be_nosig, | |
| 615 coding_category_utf_16_le_nosig, | |
| 616 coding_category_charset, | |
| 617 coding_category_sjis, | |
| 618 coding_category_big5, | |
| 619 coding_category_ccl, | |
| 620 coding_category_emacs_mule, | |
| 621 /* All above are targets of code detection. */ | |
| 622 coding_category_raw_text, | |
| 623 coding_category_undecided, | |
| 624 coding_category_max | |
| 625 }; | |
| 626 | |
| 627 /* Definitions of flag bits used in detect_coding_XXXX. */ | |
| 628 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7) | |
| 629 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight) | |
| 630 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1) | |
| 631 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) | |
| 632 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) | |
| 633 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) | |
| 634 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8) | |
| 635 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto) | |
| 636 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) | |
| 637 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le) | |
| 638 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig) | |
| 639 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig) | |
| 640 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset) | |
| 641 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis) | |
| 642 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5) | |
| 643 #define CATEGORY_MASK_CCL (1 << coding_category_ccl) | |
| 644 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule) | |
| 645 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text) | |
| 646 | |
| 647 /* This value is returned if detect_coding_mask () finds nothing other | |
| 648 than ASCII characters. */ | |
| 649 #define CATEGORY_MASK_ANY \ | |
| 650 (CATEGORY_MASK_ISO_7 \ | |
| 651 | CATEGORY_MASK_ISO_7_TIGHT \ | |
| 652 | CATEGORY_MASK_ISO_8_1 \ | |
| 653 | CATEGORY_MASK_ISO_8_2 \ | |
| 654 | CATEGORY_MASK_ISO_7_ELSE \ | |
| 655 | CATEGORY_MASK_ISO_8_ELSE \ | |
| 656 | CATEGORY_MASK_UTF_8 \ | |
| 657 | CATEGORY_MASK_UTF_16_BE \ | |
| 658 | CATEGORY_MASK_UTF_16_LE \ | |
| 659 | CATEGORY_MASK_UTF_16_BE_NOSIG \ | |
| 660 | CATEGORY_MASK_UTF_16_LE_NOSIG \ | |
| 661 | CATEGORY_MASK_CHARSET \ | |
| 662 | CATEGORY_MASK_SJIS \ | |
| 663 | CATEGORY_MASK_BIG5 \ | |
| 664 | CATEGORY_MASK_CCL \ | |
| 665 | CATEGORY_MASK_EMACS_MULE) | |
| 666 | |
| 667 | |
| 668 #define CATEGORY_MASK_ISO_7BIT \ | |
| 669 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT) | |
| 670 | |
| 671 #define CATEGORY_MASK_ISO_8BIT \ | |
| 672 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2) | |
| 673 | |
| 674 #define CATEGORY_MASK_ISO_ELSE \ | |
| 675 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE) | |
| 676 | |
| 677 #define CATEGORY_MASK_ISO_ESCAPE \ | |
| 678 (CATEGORY_MASK_ISO_7 \ | |
| 679 | CATEGORY_MASK_ISO_7_TIGHT \ | |
| 680 | CATEGORY_MASK_ISO_7_ELSE \ | |
| 681 | CATEGORY_MASK_ISO_8_ELSE) | |
| 682 | |
| 683 #define CATEGORY_MASK_ISO \ | |
| 684 ( CATEGORY_MASK_ISO_7BIT \ | |
| 685 | CATEGORY_MASK_ISO_8BIT \ | |
| 686 | CATEGORY_MASK_ISO_ELSE) | |
| 687 | |
| 688 #define CATEGORY_MASK_UTF_16 \ | |
| 689 (CATEGORY_MASK_UTF_16_BE \ | |
| 690 | CATEGORY_MASK_UTF_16_LE \ | |
| 691 | CATEGORY_MASK_UTF_16_BE_NOSIG \ | |
| 692 | CATEGORY_MASK_UTF_16_LE_NOSIG) | |
| 693 | |
| 694 | |
| 695 /* List of symbols `coding-category-xxx' ordered by priority. This | |
| 696 variable is exposed to Emacs Lisp. */ | |
| 697 static Lisp_Object Vcoding_category_list; | |
| 698 | |
| 699 /* Table of coding categories (Lisp symbols). This variable is for | |
| 700 internal use only. */ | |
| 701 static Lisp_Object Vcoding_category_table; | |
| 702 | |
| 703 /* Table of coding-categories ordered by priority. */ | |
| 704 static enum coding_category coding_priorities[coding_category_max]; | |
| 705 | |
| 706 /* Nth element is a coding context for the coding system bound to the | |
| 707 Nth coding category. */ | |
| 708 static struct coding_system coding_categories[coding_category_max]; | |
| 709 | |
| 710 /*** Commonly used macros and functions ***/ | |
| 711 | |
| 712 #ifndef min | |
| 713 #define min(a, b) ((a) < (b) ? (a) : (b)) | |
| 714 #endif | |
| 715 #ifndef max | |
| 716 #define max(a, b) ((a) > (b) ? (a) : (b)) | |
| 717 #endif | |
| 718 | |
| 719 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \ | |
| 720 do { \ | |
| 721 attrs = CODING_ID_ATTRS (coding->id); \ | |
| 722 eol_type = CODING_ID_EOL_TYPE (coding->id); \ | |
| 723 if (VECTORP (eol_type)) \ | |
| 724 eol_type = Qunix; \ | |
| 725 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \ | |
| 726 } while (0) | |
| 727 | |
| 728 | |
| 729 /* Safely get one byte from the source text pointed to by SRC, which ends | |
| 730 at SRC_END, and set C to that byte. If there are not enough bytes | |
| 731 in the source, it jumps to `no_more_source'. The caller | |
| 732 should declare and set these variables appropriately in advance: | |
| 733 src, src_base, src_end, multibytep, consumed_chars, coding | |
| 734 */ | |
| 735 | |
| 736 #define ONE_MORE_BYTE(c) \ | |
| 737 do { \ | |
| 738 if (src == src_end) \ | |
| 739 { \ | |
| 740 if (src_base < src) \ | |
| 741 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \ | |
| 742 goto no_more_source; \ | |
| 743 } \ | |
| 744 c = *src++; \ | |
| 745 if (multibytep && (c & 0x80)) \ | |
| 746 { \ | |
| 747 if ((c & 0xFE) != 0xC0) \ | |
| 748 error ("Undecodable char found"); \ | |
| 749 c = ((c & 1) << 6) | *src++; \ | |
| 750 } \ | |
| 751 consumed_chars++; \ | |
| 752 } while (0) | |
| 753 | |
| 754 | |
| 755 #define ONE_MORE_BYTE_NO_CHECK(c) \ | |
| 756 do { \ | |
| 757 c = *src++; \ | |
| 758 if (multibytep && (c & 0x80)) \ | |
| 759 { \ | |
| 760 if ((c & 0xFE) != 0xC0) \ | |
| 761 error ("Undecodable char found"); \ | |
| 762 c = ((c & 1) << 6) | *src++; \ | |
| 763 } \ | |
| 764 consumed_chars++; \ | |
| 765 } while (0) | |
| 766 | |
| 767 | |
| 768 /* Store a byte C in the place pointed by DST and increment DST to the | |
| 769 next free point, and increment PRODUCED_CHARS. The caller should | |
| 770 assure that C is 0..127, and declare and set the variable `dst' | |
| 771 appropriately in advance. | |
| 772 */ | |
| 773 | |
| 774 | |
| 775 #define EMIT_ONE_ASCII_BYTE(c) \ | |
| 776 do { \ | |
| 777 produced_chars++; \ | |
| 778 *dst++ = (c); \ | |
| 779 } while (0) | |
| 780 | |
| 781 | |
| 782 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */ | |
| 783 | |
| 784 #define EMIT_TWO_ASCII_BYTES(c1, c2) \ | |
| 785 do { \ | |
| 786 produced_chars += 2; \ | |
| 787 *dst++ = (c1), *dst++ = (c2); \ | |
| 788 } while (0) | |
| 789 | |
| 790 | |
| 791 /* Store a byte C in the place pointed by DST and increment DST to the | |
| 792 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is | |
| 793 nonzero, store in an appropriate multibyte form. The caller should | |
| 794 declare and set the variables `dst' and `multibytep' appropriately | |
| 795 in advance. */ | |
| 796 | |
| 797 #define EMIT_ONE_BYTE(c) \ | |
| 798 do { \ | |
| 799 produced_chars++; \ | |
| 800 if (multibytep) \ | |
| 801 { \ | |
| 802 int ch = (c); \ | |
| 803 if (ch >= 0x80) \ | |
| 804 ch = BYTE8_TO_CHAR (ch); \ | |
| 805 CHAR_STRING_ADVANCE (ch, dst); \ | |
| 806 } \ | |
| 807 else \ | |
| 808 *dst++ = (c); \ | |
| 809 } while (0) | |
| 810 | |
| 811 | |
| 812 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ | |
| 813 | |
| 814 #define EMIT_TWO_BYTES(c1, c2) \ | |
| 815 do { \ | |
| 816 produced_chars += 2; \ | |
| 817 if (multibytep) \ | |
| 818 { \ | |
| 819 int ch; \ | |
| 820 \ | |
| 821 ch = (c1); \ | |
| 822 if (ch >= 0x80) \ | |
| 823 ch = BYTE8_TO_CHAR (ch); \ | |
| 824 CHAR_STRING_ADVANCE (ch, dst); \ | |
| 825 ch = (c2); \ | |
| 826 if (ch >= 0x80) \ | |
| 827 ch = BYTE8_TO_CHAR (ch); \ | |
| 828 CHAR_STRING_ADVANCE (ch, dst); \ | |
| 829 } \ | |
| 830 else \ | |
| 831 { \ | |
| 832 *dst++ = (c1); \ | |
| 833 *dst++ = (c2); \ | |
| 834 } \ | |
| 835 } while (0) | |
| 836 | |
| 837 | |
| 838 #define EMIT_THREE_BYTES(c1, c2, c3) \ | |
| 839 do { \ | |
| 840 EMIT_ONE_BYTE (c1); \ | |
| 841 EMIT_TWO_BYTES (c2, c3); \ | |
| 842 } while (0) | |
| 843 | |
| 844 | |
| 845 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \ | |
| 846 do { \ | |
| 847 EMIT_TWO_BYTES (c1, c2); \ | |
| 848 EMIT_TWO_BYTES (c3, c4); \ | |
| 849 } while (0) | |
| 850 | |
| 851 | |
| 852 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \ | |
| 853 do { \ | |
| 854 charset_map_loaded = 0; \ | |
| 855 c = DECODE_CHAR (charset, code); \ | |
| 856 if (charset_map_loaded) \ | |
| 857 { \ | |
| 858 const unsigned char *orig = coding->source; \ | |
| 859 EMACS_INT offset; \ | |
| 860 \ | |
| 861 coding_set_source (coding); \ | |
| 862 offset = coding->source - orig; \ | |
| 863 src += offset; \ | |
| 864 src_base += offset; \ | |
| 865 src_end += offset; \ | |
| 866 } \ | |
| 867 } while (0) | |
| 868 | |
| 869 | |
| 870 #define ASSURE_DESTINATION(bytes) \ | |
| 871 do { \ | |
| 872 if (dst + (bytes) >= dst_end) \ | |
| 873 { \ | |
| 874 int more_bytes = charbuf_end - charbuf + (bytes); \ | |
| 875 \ | |
| 876 dst = alloc_destination (coding, more_bytes, dst); \ | |
| 877 dst_end = coding->destination + coding->dst_bytes; \ | |
| 878 } \ | |
| 879 } while (0) | |
| 880 | |
| 881 | |
| 882 | |
| 883 static void | |
| 884 coding_set_source (coding) | |
| 885 struct coding_system *coding; | |
| 886 { | |
| 887 if (BUFFERP (coding->src_object)) | |
| 888 { | |
| 889 struct buffer *buf = XBUFFER (coding->src_object); | |
| 890 | |
| 891 if (coding->src_pos < 0) | |
| 892 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte; | |
| 893 else | |
| 894 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte); | |
| 895 } | |
| 896 else if (STRINGP (coding->src_object)) | |
| 897 { | |
| 898 coding->source = SDATA (coding->src_object) + coding->src_pos_byte; | |
| 899 } | |
| 900 else | |
| 901 /* Otherwise, the source is a C string and is never relocated | |
| 902 automatically. Thus we don't have to update anything. */ | |
| 903 ; | |
| 904 } | |
| 905 | |
| 906 static void | |
| 907 coding_set_destination (coding) | |
| 908 struct coding_system *coding; | |
| 909 { | |
| 910 if (BUFFERP (coding->dst_object)) | |
| 911 { | |
| 912 if (coding->src_pos < 0) | |
| 913 { | |
| 914 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1; | |
| 915 coding->dst_bytes = (GAP_END_ADDR | |
| 916 - (coding->src_bytes - coding->consumed) | |
| 917 - coding->destination); | |
| 918 } | |
| 919 else | |
| 920 { | |
| 921 /* We are sure that coding->dst_pos_byte is before the gap | |
| 922 of the buffer. */ | |
| 923 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object)) | |
| 924 + coding->dst_pos_byte - 1); | |
| 925 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object)) | |
| 926 - coding->destination); | |
| 927 } | |
| 928 } | |
| 929 else | |
| 930 /* Otherwise, the destination is a C string and is never relocated | |
| 931 automatically. Thus we don't have to update anything. */ | |
| 932 ; | |
| 933 } | |
| 934 | |
| 935 | |
| 936 static void | |
| 937 coding_alloc_by_realloc (coding, bytes) | |
| 938 struct coding_system *coding; | |
| 939 EMACS_INT bytes; | |
| 940 { | |
| 941 coding->destination = (unsigned char *) xrealloc (coding->destination, | |
| 942 coding->dst_bytes + bytes); | |
| 943 coding->dst_bytes += bytes; | |
| 944 } | |
| 945 | |
| 946 static void | |
| 947 coding_alloc_by_making_gap (coding, bytes) | |
| 948 struct coding_system *coding; | |
| 949 EMACS_INT bytes; | |
| 950 { | |
| 951 if (BUFFERP (coding->dst_object) | |
| 952 && EQ (coding->src_object, coding->dst_object)) | |
| 953 { | |
| 954 EMACS_INT add = coding->src_bytes - coding->consumed; | |
| 955 | |
| 956 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add; | |
| 957 make_gap (bytes); | |
| 958 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add; | |
| 959 } | |
| 960 else | |
| 961 { | |
| 962 Lisp_Object this_buffer; | |
| 963 | |
| 964 this_buffer = Fcurrent_buffer (); | |
| 965 set_buffer_internal (XBUFFER (coding->dst_object)); | |
| 966 make_gap (bytes); | |
| 967 set_buffer_internal (XBUFFER (this_buffer)); | |
| 968 } | |
| 969 } | |
| 970 | |
| 971 | |
| 972 static unsigned char * | |
| 973 alloc_destination (coding, nbytes, dst) | |
| 974 struct coding_system *coding; | |
| 975 int nbytes; | |
| 976 unsigned char *dst; | |
| 977 { | |
| 978 EMACS_INT offset = dst - coding->destination; | |
| 979 | |
| 980 if (BUFFERP (coding->dst_object)) | |
| 981 coding_alloc_by_making_gap (coding, nbytes); | |
| 982 else | |
| 983 coding_alloc_by_realloc (coding, nbytes); | |
| 984 coding->result = CODING_RESULT_SUCCESS; | |
| 985 coding_set_destination (coding); | |
| 986 dst = coding->destination + offset; | |
| 987 return dst; | |
| 988 } | |
| 989 | |
| 990 /** Macros for annotations. */ | |
| 991 | |
| 992 /* Maximum length of annotation data (sum of annotations for | |
| 993 composition and charset). */ | |
| 994 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5) | |
| 995 | |
| 996 /* An annotation data is stored in the array coding->charbuf in this | |
| 997 format: | |
| 998 [ -LENGTH ANNOTATION_MASK FROM TO ... ] | |
| 999 LENGTH is the number of elements in the annotation. | |
| 1000 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK. | |
| 1001 FROM and TO specify the range of text annotated. They are relative | |
| 1002 to coding->src_pos (on encoding) or coding->dst_pos (on decoding). | |
| 1003 | |
| 1004 The format of the following elements depends on ANNOTATION_MASK. | |
| 1005 | |
| 1006 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements | |
| 1007 follow: | |
| 1008 ... METHOD [ COMPOSITION-COMPONENTS ... ] | |
| 1009 METHOD is one of enum composition_method. | |
| 1010 Optional COMPOSITION-COMPONENTS are characters and composition | |
| 1011 rules. | |
| 1012 | |
| 1013 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID | |
| 1014 follows. */ | |
| 1015 | |
| 1016 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \ | |
| 1017 do { \ | |
| 1018 *(buf)++ = -(len); \ | |
| 1019 *(buf)++ = (mask); \ | |
| 1020 *(buf)++ = (from); \ | |
| 1021 *(buf)++ = (to); \ | |
| 1022 coding->annotated = 1; \ | |
| 1023 } while (0); | |
| 1024 | |
| 1025 #define ADD_COMPOSITION_DATA(buf, from, to, method) \ | |
| 1026 do { \ | |
| 1027 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \ | |
| 1028 *buf++ = method; \ | |
| 1029 } while (0) | |
| 1030 | |
| 1031 | |
| 1032 #define ADD_CHARSET_DATA(buf, from, to, id) \ | |
| 1033 do { \ | |
| 1034 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \ | |
| 1035 *buf++ = id; \ | |
| 1036 } while (0) | |
| 531 | 1037 |
| 532 | 1038 |
| 533 /*** 2. Emacs internal format (emacs-mule) handlers ***/ | 1039 /*** 2. Emacs' internal format (emacs-utf-8) ***/ |
| 1040 | |
| 1041 | |
| 1042 | |
| 1043 | |
| 1044 /*** 3. UTF-8 ***/ | |
| 1045 | |
| 1046 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| 1047 Check if a text is encoded in UTF-8. If it is, return 1, else | |
| 1048 return 0. */ | |
| 1049 | |
| 1050 #define UTF_8_1_OCTET_P(c) ((c) < 0x80) | |
| 1051 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) | |
| 1052 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0) | |
| 1053 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) | |
| 1054 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) | |
| 1055 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) | |
| 1056 | |
| 1057 static int | |
| 1058 detect_coding_utf_8 (coding, detect_info) | |
| 1059 struct coding_system *coding; | |
| 1060 struct coding_detection_info *detect_info; | |
| 1061 { | |
| 1062 const unsigned char *src = coding->source, *src_base = src; | |
| 1063 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 1064 int multibytep = coding->src_multibyte; | |
| 1065 int consumed_chars = 0; | |
| 1066 int found = 0; | |
| 1067 int incomplete; | |
| 1068 | |
| 1069 detect_info->checked |= CATEGORY_MASK_UTF_8; | |
| 1070 /* A coding system of this category is always ASCII compatible. */ | |
| 1071 src += coding->head_ascii; | |
| 1072 | |
| 1073 while (1) | |
| 1074 { | |
| 1075 int c, c1, c2, c3, c4; | |
| 1076 | |
| 1077 incomplete = 0; | |
| 1078 ONE_MORE_BYTE (c); | |
| 1079 if (UTF_8_1_OCTET_P (c)) | |
| 1080 continue; | |
| 1081 incomplete = 1; | |
| 1082 ONE_MORE_BYTE (c1); | |
| 1083 if (! UTF_8_EXTRA_OCTET_P (c1)) | |
| 1084 break; | |
| 1085 if (UTF_8_2_OCTET_LEADING_P (c)) | |
| 1086 { | |
| 1087 found = CATEGORY_MASK_UTF_8; | |
| 1088 continue; | |
| 1089 } | |
| 1090 ONE_MORE_BYTE (c2); | |
| 1091 if (! UTF_8_EXTRA_OCTET_P (c2)) | |
| 1092 break; | |
| 1093 if (UTF_8_3_OCTET_LEADING_P (c)) | |
| 1094 { | |
| 1095 found = CATEGORY_MASK_UTF_8; | |
| 1096 continue; | |
| 1097 } | |
| 1098 ONE_MORE_BYTE (c3); | |
| 1099 if (! UTF_8_EXTRA_OCTET_P (c3)) | |
| 1100 break; | |
| 1101 if (UTF_8_4_OCTET_LEADING_P (c)) | |
| 1102 { | |
| 1103 found = CATEGORY_MASK_UTF_8; | |
| 1104 continue; | |
| 1105 } | |
| 1106 ONE_MORE_BYTE (c4); | |
| 1107 if (! UTF_8_EXTRA_OCTET_P (c4)) | |
| 1108 break; | |
| 1109 if (UTF_8_5_OCTET_LEADING_P (c)) | |
| 1110 { | |
| 1111 found = CATEGORY_MASK_UTF_8; | |
| 1112 continue; | |
| 1113 } | |
| 1114 break; | |
| 1115 } | |
| 1116 detect_info->rejected |= CATEGORY_MASK_UTF_8; | |
| 1117 return 0; | |
| 1118 | |
| 1119 no_more_source: | |
| 1120 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) | |
| 1121 { | |
| 1122 detect_info->rejected |= CATEGORY_MASK_UTF_8; | |
| 1123 return 0; | |
| 1124 } | |
| 1125 detect_info->found |= found; | |
| 1126 return 1; | |
| 1127 } | |
| 1128 | |
| 1129 | |
| 1130 static void | |
| 1131 decode_coding_utf_8 (coding) | |
| 1132 struct coding_system *coding; | |
| 1133 { | |
| 1134 const unsigned char *src = coding->source + coding->consumed; | |
| 1135 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 1136 const unsigned char *src_base; | |
| 1137 int *charbuf = coding->charbuf; | |
| 1138 int *charbuf_end = charbuf + coding->charbuf_size; | |
| 1139 int consumed_chars = 0, consumed_chars_base; | |
| 1140 int multibytep = coding->src_multibyte; | |
| 1141 Lisp_Object attr, eol_type, charset_list; | |
| 1142 | |
| 1143 CODING_GET_INFO (coding, attr, eol_type, charset_list); | |
| 1144 | |
| 1145 while (1) | |
| 1146 { | |
| 1147 int c, c1, c2, c3, c4, c5; | |
| 1148 | |
| 1149 src_base = src; | |
| 1150 consumed_chars_base = consumed_chars; | |
| 1151 | |
| 1152 if (charbuf >= charbuf_end) | |
| 1153 break; | |
| 1154 | |
| 1155 ONE_MORE_BYTE (c1); | |
| 1156 if (UTF_8_1_OCTET_P(c1)) | |
| 1157 { | |
| 1158 c = c1; | |
| 1159 if (c == '\r') | |
| 1160 { | |
| 1161 if (EQ (eol_type, Qdos)) | |
| 1162 { | |
| 1163 if (src == src_end) | |
| 1164 { | |
| 1165 coding->result = CODING_RESULT_INSUFFICIENT_SRC; | |
| 1166 goto no_more_source; | |
| 1167 } | |
| 1168 if (*src == '\n') | |
| 1169 ONE_MORE_BYTE (c); | |
| 1170 } | |
| 1171 else if (EQ (eol_type, Qmac)) | |
| 1172 c = '\n'; | |
| 1173 } | |
| 1174 } | |
| 1175 else | |
| 1176 { | |
| 1177 ONE_MORE_BYTE (c2); | |
| 1178 if (! UTF_8_EXTRA_OCTET_P (c2)) | |
| 1179 goto invalid_code; | |
| 1180 if (UTF_8_2_OCTET_LEADING_P (c1)) | |
| 1181 { | |
| 1182 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F); | |
| 1183 /* Reject overlong sequences here and below. Encoders | |
| 1184 producing them are incorrect, they can be misleading, | |
| 1185 and they mess up read/write invariance. */ | |
| 1186 if (c < 128) | |
| 1187 goto invalid_code; | |
| 1188 } | |
| 1189 else | |
| 1190 { | |
| 1191 ONE_MORE_BYTE (c3); | |
| 1192 if (! UTF_8_EXTRA_OCTET_P (c3)) | |
| 1193 goto invalid_code; | |
| 1194 if (UTF_8_3_OCTET_LEADING_P (c1)) | |
| 1195 { | |
| 1196 c = (((c1 & 0xF) << 12) | |
| 1197 | ((c2 & 0x3F) << 6) | (c3 & 0x3F)); | |
| 1198 if (c < 0x800 | |
| 1199 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */ | |
| 1200 goto invalid_code; | |
| 1201 } | |
| 1202 else | |
| 1203 { | |
| 1204 ONE_MORE_BYTE (c4); | |
| 1205 if (! UTF_8_EXTRA_OCTET_P (c4)) | |
| 1206 goto invalid_code; | |
| 1207 if (UTF_8_4_OCTET_LEADING_P (c1)) | |
| 1208 { | |
| 1209 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12) | |
| 1210 | ((c3 & 0x3F) << 6) | (c4 & 0x3F)); | |
| 1211 if (c < 0x10000) | |
| 1212 goto invalid_code; | |
| 1213 } | |
| 1214 else | |
| 1215 { | |
| 1216 ONE_MORE_BYTE (c5); | |
| 1217 if (! UTF_8_EXTRA_OCTET_P (c5)) | |
| 1218 goto invalid_code; | |
| 1219 if (UTF_8_5_OCTET_LEADING_P (c1)) | |
| 1220 { | |
| 1221 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18) | |
| 1222 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6) | |
| 1223 | (c5 & 0x3F)); | |
| 1224 if ((c > MAX_CHAR) || (c < 0x200000)) | |
| 1225 goto invalid_code; | |
| 1226 } | |
| 1227 else | |
| 1228 goto invalid_code; | |
| 1229 } | |
| 1230 } | |
| 1231 } | |
| 1232 } | |
| 1233 | |
| 1234 *charbuf++ = c; | |
| 1235 continue; | |
| 1236 | |
| 1237 invalid_code: | |
| 1238 src = src_base; | |
| 1239 consumed_chars = consumed_chars_base; | |
| 1240 ONE_MORE_BYTE (c); | |
| 1241 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | |
| 1242 coding->errors++; | |
| 1243 } | |
| 1244 | |
| 1245 no_more_source: | |
| 1246 coding->consumed_char += consumed_chars_base; | |
| 1247 coding->consumed = src_base - coding->source; | |
| 1248 coding->charbuf_used = charbuf - coding->charbuf; | |
| 1249 } | |
| 1250 | |
| 1251 | |
| 1252 static int | |
| 1253 encode_coding_utf_8 (coding) | |
| 1254 struct coding_system *coding; | |
| 1255 { | |
| 1256 int multibytep = coding->dst_multibyte; | |
| 1257 int *charbuf = coding->charbuf; | |
| 1258 int *charbuf_end = charbuf + coding->charbuf_used; | |
| 1259 unsigned char *dst = coding->destination + coding->produced; | |
| 1260 unsigned char *dst_end = coding->destination + coding->dst_bytes; | |
| 1261 int produced_chars = 0; | |
| 1262 int c; | |
| 1263 | |
| 1264 if (multibytep) | |
| 1265 { | |
| 1266 int safe_room = MAX_MULTIBYTE_LENGTH * 2; | |
| 1267 | |
| 1268 while (charbuf < charbuf_end) | |
| 1269 { | |
| 1270 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str; | |
| 1271 | |
| 1272 ASSURE_DESTINATION (safe_room); | |
| 1273 c = *charbuf++; | |
| 1274 if (CHAR_BYTE8_P (c)) | |
| 1275 { | |
| 1276 c = CHAR_TO_BYTE8 (c); | |
| 1277 EMIT_ONE_BYTE (c); | |
| 1278 } | |
| 1279 else | |
| 1280 { | |
| 1281 CHAR_STRING_ADVANCE (c, pend); | |
| 1282 for (p = str; p < pend; p++) | |
| 1283 EMIT_ONE_BYTE (*p); | |
| 1284 } | |
| 1285 } | |
| 1286 } | |
| 1287 else | |
| 1288 { | |
| 1289 int safe_room = MAX_MULTIBYTE_LENGTH; | |
| 1290 | |
| 1291 while (charbuf < charbuf_end) | |
| 1292 { | |
| 1293 ASSURE_DESTINATION (safe_room); | |
| 1294 c = *charbuf++; | |
| 1295 dst += CHAR_STRING (c, dst); | |
| 1296 produced_chars++; | |
| 1297 } | |
| 1298 } | |
| 1299 coding->result = CODING_RESULT_SUCCESS; | |
| 1300 coding->produced_char += produced_chars; | |
| 1301 coding->produced = dst - coding->destination; | |
| 1302 return 0; | |
| 1303 } | |
| 1304 | |
| 1305 | |
| 1306 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| 1307 Check if a text is encoded in one of UTF-16 based coding systems. | |
| 1308 If it is, return 1, else return 0. */ | |
| 1309 | |
/* Nonzero if VAL, masked with 0xFC00, equals 0xD800; for a 16-bit
   code unit that is the range 0xD800..0xDBFF (leading half of a
   surrogate pair).  */
#define UTF_16_HIGH_SURROGATE_P(val) (0xD800 == ((val) & 0xFC00))

/* Nonzero if VAL, masked with 0xFC00, equals 0xDC00; for a 16-bit
   code unit that is the range 0xDC00..0xDFFF (trailing half of a
   surrogate pair).  */
#define UTF_16_LOW_SURROGATE_P(val) (0xDC00 == ((val) & 0xFC00))

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL is
   evaluated more than once, so it must be free of side effects.  */
#define UTF_16_INVALID_P(val)           \
  (0xFFFE == (val)                      \
   || 0xFFFF == (val)                   \
   || UTF_16_LOW_SURROGATE_P (val))
| 1320 | |
| 1321 | |
| 1322 static int | |
| 1323 detect_coding_utf_16 (coding, detect_info) | |
| 1324 struct coding_system *coding; | |
| 1325 struct coding_detection_info *detect_info; | |
| 1326 { | |
| 1327 const unsigned char *src = coding->source, *src_base = src; | |
| 1328 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 1329 int multibytep = coding->src_multibyte; | |
| 1330 int consumed_chars = 0; | |
| 1331 int c1, c2; | |
| 1332 | |
| 1333 detect_info->checked |= CATEGORY_MASK_UTF_16; | |
| 1334 | |
| 1335 if (coding->mode & CODING_MODE_LAST_BLOCK | |
| 1336 && (coding->src_bytes & 1)) | |
| 1337 { | |
| 1338 detect_info->rejected |= CATEGORY_MASK_UTF_16; | |
| 1339 return 0; | |
| 1340 } | |
| 1341 ONE_MORE_BYTE (c1); | |
| 1342 ONE_MORE_BYTE (c2); | |
| 1343 | |
| 1344 if ((c1 == 0xFF) && (c2 == 0xFE)) | |
| 1345 { | |
| 1346 detect_info->found |= (CATEGORY_MASK_UTF_16_LE | |
| 1347 | CATEGORY_MASK_UTF_16_AUTO); | |
| 1348 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE; | |
| 1349 } | |
| 1350 else if ((c1 == 0xFE) && (c2 == 0xFF)) | |
| 1351 { | |
| 1352 detect_info->found |= (CATEGORY_MASK_UTF_16_BE | |
| 1353 | CATEGORY_MASK_UTF_16_AUTO); | |
| 1354 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE; | |
| 1355 } | |
| 1356 no_more_source: | |
| 1357 return 1; | |
| 1358 } | |
| 1359 | |
| 1360 static void | |
| 1361 decode_coding_utf_16 (coding) | |
| 1362 struct coding_system *coding; | |
| 1363 { | |
| 1364 const unsigned char *src = coding->source + coding->consumed; | |
| 1365 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 1366 const unsigned char *src_base; | |
| 1367 int *charbuf = coding->charbuf; | |
| 1368 int *charbuf_end = charbuf + coding->charbuf_size; | |
| 1369 int consumed_chars = 0, consumed_chars_base; | |
| 1370 int multibytep = coding->src_multibyte; | |
| 1371 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); | |
| 1372 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding); | |
| 1373 int surrogate = CODING_UTF_16_SURROGATE (coding); | |
| 1374 Lisp_Object attr, eol_type, charset_list; | |
| 1375 | |
| 1376 CODING_GET_INFO (coding, attr, eol_type, charset_list); | |
| 1377 | |
| 1378 if (bom == utf_16_with_bom) | |
| 1379 { | |
| 1380 int c, c1, c2; | |
| 1381 | |
| 1382 src_base = src; | |
| 1383 ONE_MORE_BYTE (c1); | |
| 1384 ONE_MORE_BYTE (c2); | |
| 1385 c = (c1 << 8) | c2; | |
| 1386 | |
| 1387 if (endian == utf_16_big_endian | |
| 1388 ? c != 0xFEFF : c != 0xFFFE) | |
| 1389 { | |
| 1390 /* The first two bytes are not BOM. Treat them as bytes | |
| 1391 for a normal character. */ | |
| 1392 src = src_base; | |
| 1393 coding->errors++; | |
| 1394 } | |
| 1395 CODING_UTF_16_BOM (coding) = utf_16_without_bom; | |
| 1396 } | |
| 1397 else if (bom == utf_16_detect_bom) | |
| 1398 { | |
| 1399 /* We have already tried to detect BOM and failed in | |
| 1400 detect_coding. */ | |
| 1401 CODING_UTF_16_BOM (coding) = utf_16_without_bom; | |
| 1402 } | |
| 1403 | |
| 1404 while (1) | |
| 1405 { | |
| 1406 int c, c1, c2; | |
| 1407 | |
| 1408 src_base = src; | |
| 1409 consumed_chars_base = consumed_chars; | |
| 1410 | |
| 1411 if (charbuf + 2 >= charbuf_end) | |
| 1412 break; | |
| 1413 | |
| 1414 ONE_MORE_BYTE (c1); | |
| 1415 ONE_MORE_BYTE (c2); | |
| 1416 c = (endian == utf_16_big_endian | |
| 1417 ? ((c1 << 8) | c2) : ((c2 << 8) | c1)); | |
| 1418 if (surrogate) | |
| 1419 { | |
| 1420 if (! UTF_16_LOW_SURROGATE_P (c)) | |
| 1421 { | |
| 1422 if (endian == utf_16_big_endian) | |
| 1423 c1 = surrogate >> 8, c2 = surrogate & 0xFF; | |
| 1424 else | |
| 1425 c1 = surrogate & 0xFF, c2 = surrogate >> 8; | |
| 1426 *charbuf++ = c1; | |
| 1427 *charbuf++ = c2; | |
| 1428 coding->errors++; | |
| 1429 if (UTF_16_HIGH_SURROGATE_P (c)) | |
| 1430 CODING_UTF_16_SURROGATE (coding) = surrogate = c; | |
| 1431 else | |
| 1432 *charbuf++ = c; | |
| 1433 } | |
| 1434 else | |
| 1435 { | |
| 1436 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00); | |
| 1437 CODING_UTF_16_SURROGATE (coding) = surrogate = 0; | |
| 1438 *charbuf++ = c; | |
| 1439 } | |
| 1440 } | |
| 1441 else | |
| 1442 { | |
| 1443 if (UTF_16_HIGH_SURROGATE_P (c)) | |
| 1444 CODING_UTF_16_SURROGATE (coding) = surrogate = c; | |
| 1445 else | |
| 1446 *charbuf++ = c; | |
| 1447 } | |
| 1448 } | |
| 1449 | |
| 1450 no_more_source: | |
| 1451 coding->consumed_char += consumed_chars_base; | |
| 1452 coding->consumed = src_base - coding->source; | |
| 1453 coding->charbuf_used = charbuf - coding->charbuf; | |
| 1454 } | |
| 1455 | |
| 1456 static int | |
| 1457 encode_coding_utf_16 (coding) | |
| 1458 struct coding_system *coding; | |
| 1459 { | |
| 1460 int multibytep = coding->dst_multibyte; | |
| 1461 int *charbuf = coding->charbuf; | |
| 1462 int *charbuf_end = charbuf + coding->charbuf_used; | |
| 1463 unsigned char *dst = coding->destination + coding->produced; | |
| 1464 unsigned char *dst_end = coding->destination + coding->dst_bytes; | |
| 1465 int safe_room = 8; | |
| 1466 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding); | |
| 1467 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian; | |
| 1468 int produced_chars = 0; | |
| 1469 Lisp_Object attrs, eol_type, charset_list; | |
| 1470 int c; | |
| 1471 | |
| 1472 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 1473 | |
| 1474 if (bom != utf_16_without_bom) | |
| 1475 { | |
| 1476 ASSURE_DESTINATION (safe_room); | |
| 1477 if (big_endian) | |
| 1478 EMIT_TWO_BYTES (0xFE, 0xFF); | |
| 1479 else | |
| 1480 EMIT_TWO_BYTES (0xFF, 0xFE); | |
| 1481 CODING_UTF_16_BOM (coding) = utf_16_without_bom; | |
| 1482 } | |
| 1483 | |
| 1484 while (charbuf < charbuf_end) | |
| 1485 { | |
| 1486 ASSURE_DESTINATION (safe_room); | |
| 1487 c = *charbuf++; | |
| 1488 if (c >= MAX_UNICODE_CHAR) | |
| 1489 c = coding->default_char; | |
| 1490 | |
| 1491 if (c < 0x10000) | |
| 1492 { | |
| 1493 if (big_endian) | |
| 1494 EMIT_TWO_BYTES (c >> 8, c & 0xFF); | |
| 1495 else | |
| 1496 EMIT_TWO_BYTES (c & 0xFF, c >> 8); | |
| 1497 } | |
| 1498 else | |
| 1499 { | |
| 1500 int c1, c2; | |
| 1501 | |
| 1502 c -= 0x10000; | |
| 1503 c1 = (c >> 10) + 0xD800; | |
| 1504 c2 = (c & 0x3FF) + 0xDC00; | |
| 1505 if (big_endian) | |
| 1506 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF); | |
| 1507 else | |
| 1508 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8); | |
| 1509 } | |
| 1510 } | |
| 1511 coding->result = CODING_RESULT_SUCCESS; | |
| 1512 coding->produced = dst - coding->destination; | |
| 1513 coding->produced_char += produced_chars; | |
| 1514 return 0; | |
| 1515 } | |
| 1516 | |
| 1517 | |
| 1518 /*** 6. Old Emacs' internal format (emacs-mule) ***/ | |
| 534 | 1519 |
| 535 /* Emacs' internal format for representation of multiple character | 1520 /* Emacs' internal format for representation of multiple character |
| 536 sets is a kind of multi-byte encoding, i.e. characters are | 1521 sets is a kind of multi-byte encoding, i.e. characters are |
| 537 represented by variable-length sequences of one-byte codes. | 1522 represented by variable-length sequences of one-byte codes. |
| 538 | 1523 |
| 570 format (i.e. by encoding by the coding system `emacs-mule'). | 1555 format (i.e. by encoding by the coding system `emacs-mule'). |
| 571 | 1556 |
| 572 In that case, a sequence of one-byte codes has a slightly different | 1557 In that case, a sequence of one-byte codes has a slightly different |
| 573 form. | 1558 form. |
| 574 | 1559 |
| 575 Firstly, all characters in eight-bit-control are represented by | 1560 At first, all characters in eight-bit-control are represented by |
| 576 one-byte sequences which are their 8-bit code. | 1561 one-byte sequences which are their 8-bit code. |
| 577 | 1562 |
| 578 Next, character composition data are represented by the byte | 1563 Next, character composition data are represented by the byte |
| 579 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ..., | 1564 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ..., |
| 580 where, | 1565 where, |
| 581 METHOD is 0xF0 plus one of composition method (enum | 1566 METHOD is 0xF2 plus one of composition method (enum |
| 582 composition_method), | 1567 composition_method), |
| 583 | 1568 |
| 584 BYTES is 0xA0 plus the byte length of these composition data, | 1569 BYTES is 0xA0 plus a byte length of this composition data, |
| 585 | 1570 |
| 586 CHARS is 0xA0 plus the number of characters composed by these | 1571 CHARS is 0xA0 plus the number of characters composed by this |
| 587 data, | 1572 data, |
| 588 | 1573 |
| 589 COMPONENTs are characters of multibyte form or composition | 1574 COMPONENTs are characters of multibyte form or composition |
| 590 rules encoded by two-byte of ASCII codes. | 1575 rules encoded by two-byte of ASCII codes. |
| 591 | 1576 |
| 592 In addition, for backward compatibility, the following formats are | 1577 In addition, for backward compatibility, the following formats are |
| 593 also recognized as composition data on decoding. | 1578 also recognized as composition data on decoding. |
| 594 | 1579 |
| 601 other: LEADING_CODE+0x20 FOLLOWING-BYTE ..., | 1586 other: LEADING_CODE+0x20 FOLLOWING-BYTE ..., |
| 602 RULE is a one byte code of the range 0xA0..0xF0 that | 1587 RULE is a one byte code of the range 0xA0..0xF0 that |
| 603 represents a composition rule. | 1588 represents a composition rule. |
| 604 */ | 1589 */ |
| 605 | 1590 |
| 606 enum emacs_code_class_type emacs_code_class[256]; | 1591 char emacs_mule_bytes[256]; |
| 1592 | |
| 1593 int | |
| 1594 emacs_mule_char (coding, src, nbytes, nchars, id) | |
| 1595 struct coding_system *coding; | |
| 1596 unsigned char *src; | |
| 1597 int *nbytes, *nchars, *id; | |
| 1598 { | |
| 1599 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 1600 const unsigned char *src_base = src; | |
| 1601 int multibytep = coding->src_multibyte; | |
| 1602 struct charset *charset; | |
| 1603 unsigned code; | |
| 1604 int c; | |
| 1605 int consumed_chars = 0; | |
| 1606 | |
| 1607 ONE_MORE_BYTE (c); | |
| 1608 switch (emacs_mule_bytes[c]) | |
| 1609 { | |
| 1610 case 2: | |
| 1611 if (! (charset = emacs_mule_charset[c])) | |
| 1612 goto invalid_code; | |
| 1613 ONE_MORE_BYTE (c); | |
| 1614 code = c & 0x7F; | |
| 1615 break; | |
| 1616 | |
| 1617 case 3: | |
| 1618 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11 | |
| 1619 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12) | |
| 1620 { | |
| 1621 ONE_MORE_BYTE (c); | |
| 1622 if (! (charset = emacs_mule_charset[c])) | |
| 1623 goto invalid_code; | |
| 1624 ONE_MORE_BYTE (c); | |
| 1625 code = c & 0x7F; | |
| 1626 } | |
| 1627 else | |
| 1628 { | |
| 1629 if (! (charset = emacs_mule_charset[c])) | |
| 1630 goto invalid_code; | |
| 1631 ONE_MORE_BYTE (c); | |
| 1632 code = (c & 0x7F) << 8; | |
| 1633 ONE_MORE_BYTE (c); | |
| 1634 code |= c & 0x7F; | |
| 1635 } | |
| 1636 break; | |
| 1637 | |
| 1638 case 4: | |
| 1639 ONE_MORE_BYTE (c); | |
| 1640 if (! (charset = emacs_mule_charset[c])) | |
| 1641 goto invalid_code; | |
| 1642 ONE_MORE_BYTE (c); | |
| 1643 code = (c & 0x7F) << 8; | |
| 1644 ONE_MORE_BYTE (c); | |
| 1645 code |= c & 0x7F; | |
| 1646 break; | |
| 1647 | |
| 1648 case 1: | |
| 1649 code = c; | |
| 1650 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) | |
| 1651 ? charset_ascii : charset_eight_bit); | |
| 1652 break; | |
| 1653 | |
| 1654 default: | |
| 1655 abort (); | |
| 1656 } | |
| 1657 c = DECODE_CHAR (charset, code); | |
| 1658 if (c < 0) | |
| 1659 goto invalid_code; | |
| 1660 *nbytes = src - src_base; | |
| 1661 *nchars = consumed_chars; | |
| 1662 if (id) | |
| 1663 *id = charset->id; | |
| 1664 return c; | |
| 1665 | |
| 1666 no_more_source: | |
| 1667 return -2; | |
| 1668 | |
| 1669 invalid_code: | |
| 1670 return -1; | |
| 1671 } | |
| 1672 | |
| 607 | 1673 |
| 608 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 1674 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 609 Check if a text is encoded in Emacs' internal format. If it is, | 1675 Check if a text is encoded in `emacs-mule'. If it is, return 1, |
| 610 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */ | 1676 else return 0. */ |
| 611 | 1677 |
| 612 static int | 1678 static int |
| 613 detect_coding_emacs_mule (src, src_end, multibytep) | 1679 detect_coding_emacs_mule (coding, detect_info) |
| 614 unsigned char *src, *src_end; | 1680 struct coding_system *coding; |
| 615 int multibytep; | 1681 struct coding_detection_info *detect_info; |
| 616 { | 1682 { |
| 617 unsigned char c; | 1683 const unsigned char *src = coding->source, *src_base = src; |
| 618 int composing = 0; | 1684 const unsigned char *src_end = coding->source + coding->src_bytes; |
| 619 /* Dummy for ONE_MORE_BYTE. */ | 1685 int multibytep = coding->src_multibyte; |
| 620 struct coding_system dummy_coding; | 1686 int consumed_chars = 0; |
| 621 struct coding_system *coding = &dummy_coding; | 1687 int c; |
| 1688 int found = 0; | |
| 1689 int incomplete; | |
| 1690 | |
| 1691 detect_info->checked |= CATEGORY_MASK_EMACS_MULE; | |
| 1692 /* A coding system of this category is always ASCII compatible. */ | |
| 1693 src += coding->head_ascii; | |
| 622 | 1694 |
| 623 while (1) | 1695 while (1) |
| 624 { | 1696 { |
| 625 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1697 incomplete = 0; /* about to read the first byte of a sequence */ |
| 626 | 1698 ONE_MORE_BYTE (c); |
| 627 if (composing) | 1699 incomplete = 1; /* running out of source from here on is mid-sequence */ |
| 628 { | 1700 |
| 629 if (c < 0xA0) | 1701 if (c == 0x80) |
| 630 composing = 0; | 1702 { |
| 631 else if (c == 0xA0) | 1703 /* Perhaps the start of composite character. We simply skip |
| 1704 it because analyzing it is too heavy for detecting. But, | |
| 1705 at least, we check that the composite character | |
| 1706 consists of more than 4 bytes. */ | |
| 1707 const unsigned char *src_base; | |
| 1708 | |
| 1709 repeat: | |
| 1710 src_base = src; | |
| 1711 do | |
| 632 { | 1712 { |
| 633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1713 ONE_MORE_BYTE (c); |
| 634 c &= 0x7F; | 1714 } |
| 1715 while (c >= 0xA0); | |
| 1716 | |
| 1717 if (src - src_base <= 4) | |
| 1718 break; | |
| 1719 found = CATEGORY_MASK_EMACS_MULE; | |
| 1720 if (c == 0x80) | |
| 1721 goto repeat; | |
| 1722 } | |
| 1723 | |
| 1724 if (c < 0x80) | |
| 1725 { | |
| 1726 if (c < 0x20 | |
| 1727 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)) | |
| 1728 break; | |
| 1729 } | |
| 1730 else | |
| 1731 { | |
| 1732 const unsigned char *src_base = src - 1; | |
| 1733 | |
| 1734 do | |
| 1735 { | |
| 1736 ONE_MORE_BYTE (c); | |
| 1737 } | |
| 1738 while (c >= 0xA0); /* NOTE(review): the loop ends only after reading a byte < 0xA0, so SRC is already one past the sequence when its length is compared below -- confirm this is not off by one. */ | |
| 1739 if (src - src_base != emacs_mule_bytes[*src_base]) | |
| 1740 break; | |
| 1741 found = CATEGORY_MASK_EMACS_MULE; | |
| 1742 } | |
| 1743 } | |
| 1744 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; | |
| 1745 return 0; | |
| 1746 | |
| 1747 no_more_source: | |
| 1748 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) /* final block ended inside a sequence */ | |
| 1749 { | |
| 1750 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE; | |
| 1751 return 0; | |
| 1752 } | |
| 1753 detect_info->found |= found; | |
| 1754 return 1; | |
| 1755 } | |
| 1756 | |
| 1757 | |
| 1758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ | |
| 1759 | |
/* Decode one character that is a component of an Emacs 20/21 style
   composition sequence at SRC, using emacs_mule_char.  On success
   store the character in *BUF, increment BUF, and advance SRC and
   CONSUMED_CHARS past it.

   If SRC has already reached SRC_END, or emacs_mule_char returns -2,
   execute `break', which leaves the innermost loop (or switch)
   enclosing the macro call.  If emacs_mule_char returns any other
   negative value, jump to the label `invalid_code'.

   The macro is written as `if (1) { ... } else' rather than
   `do { ... } while (0)' so that the `break' statements are not
   captured by the macro itself.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                         \
  if (1)                                                                \
    {                                                                   \
      int ch;                                                           \
      int len, count;                                                   \
                                                                        \
      if (src == src_end)                                               \
        break;                                                          \
      ch = emacs_mule_char (coding, src, &len, &count, NULL);           \
      if (ch == -2)                                                     \
        break;                                                          \
      if (ch < 0)                                                       \
        goto invalid_code;                                              \
      *buf++ = ch;                                                      \
      src += len;                                                       \
      consumed_chars += count;                                          \
    }                                                                   \
  else
| 1787 | |
| 1788 | |
/* Decode a composition rule that is a component of an Emacs 20 style
   composition sequence at SRC.  The rule is a single byte; after
   subtracting 0x20 it must be in the range 0..80, and is split into a
   global reference point (value / 9) and a new reference point
   (value % 9).  Store the rule encoded by COMPOSITION_ENCODE_RULE in
   *BUF and increment BUF.  If no byte is left or the byte is out of
   range, jump to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)                      \
  do {                                                                  \
    int rule, gref, nref;                                               \
                                                                        \
    if (src >= src_end)                                                 \
      goto invalid_code;                                                \
    ONE_MORE_BYTE_NO_CHECK (rule);                                      \
    rule -= 0x20;                                                       \
    if (! (rule >= 0 && rule < 81))                                     \
      goto invalid_code;                                                \
                                                                        \
    gref = rule / 9;                                                    \
    nref = rule % 9;                                                    \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);                      \
  } while (0)
| 1808 | |
| 1809 | |
/* Decode a composition rule that is a component of an Emacs 21 style
   composition sequence at SRC.  The rule occupies two bytes, the
   global reference point followed by the new reference point, each
   offset by 0x20; after removing the offset each must be in the range
   0..80.  Store the rule encoded by COMPOSITION_ENCODE_RULE in *BUF
   and increment BUF.  If fewer than two bytes are left or either
   value is out of range, jump to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)                      \
  do {                                                                  \
    int gref, nref;                                                     \
                                                                        \
    if (src + 1 >= src_end)                                             \
      goto invalid_code;                                                \
    ONE_MORE_BYTE_NO_CHECK (gref);                                      \
    ONE_MORE_BYTE_NO_CHECK (nref);                                      \
    gref -= 0x20;                                                       \
    nref -= 0x20;                                                       \
    if (! (gref >= 0 && gref < 81                                       \
           && nref >= 0 && nref < 81))                                  \
      goto invalid_code;                                                \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);                      \
  } while (0)
| 1830 | |
| 1831 | |
/* Decode an Emacs 21 style composition.  On entry C holds the METHOD
   byte (0xF2 plus the composition method).  The next two source bytes
   are BYTES and CHARS, each offset by 0xA0: BYTES is the byte length
   of the composition information and must be at least 3, CHARS is the
   number of characters the composition covers.

   A composition annotation covering the character positions FROM..TO
   is added to CHARBUF.  Unless the method is COMPOSITION_RELATIVE the
   components that follow are decoded as well: characters only for
   COMPOSITION_WITH_ALTCHARS, otherwise characters alternating with
   rules.  The first element of the annotation is then decreased by
   the number of decoded components (presumably the annotation length
   is kept as a negative count -- confirm with ADD_COMPOSITION_DATA).

   Jump to `invalid_code' on malformed input.  C is overwritten.  */

#define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int from, to;                                                       \
    int consumed_chars_limit;                                           \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    /* FROM and TO are character positions, so they must be based on    \
       the produced character count, not on the produced byte count;    \
       this matches the Emacs 20 style macros.  */                      \
    from = coding->produced_char + char_offset;                         \
    to = from + nchars;                                                 \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
    consumed_chars_limit = consumed_chars_base + nbytes;                \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        int i = 0;                                                      \
        while (consumed_chars < consumed_chars_limit)                   \
          {                                                             \
            if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
              DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
            else                                                        \
              DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
            i++;                                                        \
          }                                                             \
        /* Still true only if the loop above was left by `break',       \
           i.e. the source ended before all components were read.  */   \
        if (consumed_chars < consumed_chars_limit)                      \
          goto invalid_code;                                            \
        charbuf_base[0] -= i;                                           \
      }                                                                 \
  } while (0)
| 1870 | |
| 1871 | |
/* Decode an Emacs 20 style relative composition.  The caller has
   already read the leading 0x80 and the byte after it (passed in C);
   since that second byte is the start of the first component, the
   source is rewound to SRC_BASE and only the 0x80 is skipped again.

   Up to MAX_COMPOSITION_COMPONENTS characters are collected with
   DECODE_EMACS_MULE_COMPOSITION_CHAR.  If fewer than two are found,
   jump to `invalid_code'.  Otherwise add a COMPOSITION_RELATIVE
   annotation for the character positions FROM..TO to CHARBUF and
   append the collected characters.  C is overwritten.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)                    \
  do {                                                                  \
    enum composition_method method = COMPOSITION_RELATIVE;              \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
    int *buf = components;                                              \
    int i, j;                                                           \
    int from, to;                                                       \
                                                                        \
    /* Rewind the character count together with the source pointer,    \
       as every other rewind in this file does; otherwise the bytes    \
       read before the rewind would be counted twice.  */               \
    src = src_base;                                                     \
    consumed_chars = consumed_chars_base;                               \
    ONE_MORE_BYTE (c);          /* skip 0x80 */                         \
    /* The `break' inside the macro below ends this loop.  */           \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)                    \
      DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                         \
    if (i < 2)                                                          \
      goto invalid_code;                                                \
    from = coding->produced_char + char_offset;                         \
    to = from + i;                                                      \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
    for (j = 0; j < i; j++)                                             \
      *charbuf++ = components[j];                                       \
  } while (0)
| 1894 | |
| 1895 | |
/* Decode an Emacs 20 style rule-base composition.  The caller has
   already read the leading 0x80 and the following 0xFF byte (passed
   in C, which this macro does not use).

   One character is read first, then up to MAX_COMPOSITION_COMPONENTS
   pairs of a composition rule and a character.  Jump to `invalid_code'
   if no pair was read or an even number of elements was collected, and
   to `no_more_source' if CHARBUF lacks room for the result.  Otherwise
   add a COMPOSITION_WITH_RULE annotation to CHARBUF and append the
   collected elements.

   NOTE(review): the first DECODE_EMACS_MULE_COMPOSITION_CHAR call is
   not inside a loop of this macro, so its `break' leaves the macro's
   own `do ... while (0)'.  The two copy loops at the end use the pair
   count I, not the number of collected elements, as their bound.
   Both look unintended but are left unchanged here.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)                    \
  do {                                                                  \
    enum composition_method method = COMPOSITION_WITH_RULE;             \
    /* One leading character plus up to MAX_COMPOSITION_COMPONENTS     \
       (rule, character) pairs can be stored below.  */                 \
    int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];                 \
    int *buf = components;                                              \
    int i, j;                                                           \
    int from, to;                                                       \
                                                                        \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                           \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)                    \
      {                                                                 \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);                    \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                       \
      }                                                                 \
    if (i < 1 || (buf - components) % 2 == 0)                           \
      goto invalid_code;                                                \
    /* Give up for now only when the output would NOT fit.  */          \
    if (charbuf + i + (i / 2) + 1 > charbuf_end)                        \
      goto no_more_source;                                              \
    from = coding->produced_char + char_offset;                         \
    to = from + i;                                                      \
    /* The annotation belongs in the output buffer, not in the local   \
       component array.  */                                             \
    ADD_COMPOSITION_DATA (charbuf, from, to, method);                   \
    for (j = 0; j < i; j++)                                             \
      *charbuf++ = components[j];                                       \
    for (j = 0; j < i; j += 2)                                          \
      *charbuf++ = components[j];                                       \
  } while (0)
| 1924 | |
| 1925 | |
| 1926 static void | |
| 1927 decode_coding_emacs_mule (coding) | |
| 1928 struct coding_system *coding; | |
| 1929 { | |
| 1930 const unsigned char *src = coding->source + coding->consumed; | |
| 1931 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 1932 const unsigned char *src_base; | |
| 1933 int *charbuf = coding->charbuf; | |
| 1934 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; | |
| 1935 int consumed_chars = 0, consumed_chars_base; | |
| 1936 int multibytep = coding->src_multibyte; | |
| 1937 Lisp_Object attrs, eol_type, charset_list; | |
| 1938 int char_offset = coding->produced_char; | |
| 1939 int last_offset = char_offset; | |
| 1940 int last_id = charset_ascii; | |
| 1941 | |
| 1942 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 1943 | |
| 1944 while (1) | |
| 1945 { | |
| 1946 int c; | |
| 1947 | |
| 1948 src_base = src; | |
| 1949 consumed_chars_base = consumed_chars; | |
| 1950 | |
| 1951 if (charbuf >= charbuf_end) | |
| 1952 break; | |
| 1953 | |
| 1954 ONE_MORE_BYTE (c); | |
| 1955 | |
| 1956 if (c < 0x80) | |
| 1957 { | |
| 1958 if (c == '\r') | |
| 1959 { | |
| 1960 if (EQ (eol_type, Qdos)) | |
| 1961 { | |
| 1962 if (src == src_end) | |
| 1963 { | |
| 1964 coding->result = CODING_RESULT_INSUFFICIENT_SRC; | |
| 1965 goto no_more_source; | |
| 1966 } | |
| 1967 if (*src == '\n') | |
| 1968 ONE_MORE_BYTE (c); | |
| 1969 } | |
| 1970 else if (EQ (eol_type, Qmac)) | |
| 1971 c = '\n'; | |
| 1972 } | |
| 1973 *charbuf++ = c; | |
| 1974 char_offset++; | |
| 1975 } | |
| 1976 else if (c == 0x80) | |
| 1977 { | |
| 1978 ONE_MORE_BYTE (c); | |
| 1979 if (c - 0xF2 >= COMPOSITION_RELATIVE | |
| 1980 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) | |
| 1981 DECODE_EMACS_MULE_21_COMPOSITION (c); | |
| 1982 else if (c < 0xC0) | |
| 1983 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c); | |
| 1984 else if (c == 0xFF) | |
| 1985 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c); | |
| 1986 else | |
| 1987 goto invalid_code; | |
| 1988 } | |
| 1989 else if (c < 0xA0 && emacs_mule_bytes[c] > 1) | |
| 1990 { | |
| 1991 int nbytes, nchars; | |
| 1992 int id; | |
| 1993 | |
| 1994 src = src_base; | |
| 1995 consumed_chars = consumed_chars_base; | |
| 1996 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id); | |
| 1997 if (c < 0) | |
| 1998 { | |
| 1999 if (c == -2) | |
| 2000 break; | |
| 2001 goto invalid_code; | |
| 2002 } | |
| 2003 if (last_id != id) | |
| 2004 { | |
| 2005 if (last_id != charset_ascii) | |
| 2006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | |
| 2007 last_id = id; | |
| 2008 last_offset = char_offset; | |
| 2009 } | |
| 2010 *charbuf++ = c; | |
| 2011 src += nbytes; | |
| 2012 consumed_chars += nchars; | |
| 2013 char_offset++; | |
| 2014 } | |
| 2015 continue; | |
| 2016 | |
| 2017 invalid_code: | |
| 2018 src = src_base; | |
| 2019 consumed_chars = consumed_chars_base; | |
| 2020 ONE_MORE_BYTE (c); | |
| 2021 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | |
| 2022 char_offset++; | |
| 2023 coding->errors++; | |
| 2024 } | |
| 2025 | |
| 2026 no_more_source: | |
| 2027 if (last_id != charset_ascii) | |
| 2028 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | |
| 2029 coding->consumed_char += consumed_chars_base; | |
| 2030 coding->consumed = src_base - coding->source; | |
| 2031 coding->charbuf_used = charbuf - coding->charbuf; | |
| 2032 } | |
| 2033 | |
| 2034 | |
/* Store in CODES[0] and CODES[1] the leading code(s) of the
   emacs-mule charset whose id is ID.  An id below 0xA0 is itself the
   single leading code and CODES[1] is set to 0.  Otherwise CODES[0]
   is the prefix 0x9A (id < 0xE0), 0x9B (id < 0xF0), 0x9C (id < 0xF5)
   or 0x9D, and CODES[1] is ID.

   ID is evaluated more than once.  The macro expands to a single
   statement without a trailing semicolon, so it can be used safely
   as the body of an `if' that has an `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
| 2048 | |
| 2049 | |
| 2050 static int | |
| 2051 encode_coding_emacs_mule (coding) | |
| 2052 struct coding_system *coding; | |
| 2053 { | |
| 2054 int multibytep = coding->dst_multibyte; | |
| 2055 int *charbuf = coding->charbuf; | |
| 2056 int *charbuf_end = charbuf + coding->charbuf_used; | |
| 2057 unsigned char *dst = coding->destination + coding->produced; | |
| 2058 unsigned char *dst_end = coding->destination + coding->dst_bytes; | |
| 2059 int safe_room = 8; | |
| 2060 int produced_chars = 0; | |
| 2061 Lisp_Object attrs, eol_type, charset_list; | |
| 2062 int c; | |
| 2063 int preferred_charset_id = -1; | |
| 2064 | |
| 2065 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 2066 | |
| 2067 while (charbuf < charbuf_end) | |
| 2068 { | |
| 2069 ASSURE_DESTINATION (safe_room); | |
| 2070 c = *charbuf++; | |
| 2071 | |
| 2072 if (c < 0) | |
| 2073 { | |
| 2074 /* Handle an annotation. */ | |
| 2075 switch (*charbuf) | |
| 2076 { | |
| 2077 case CODING_ANNOTATE_COMPOSITION_MASK: | |
| 2078 /* Not yet implemented. */ | |
| 2079 break; | |
| 2080 case CODING_ANNOTATE_CHARSET_MASK: | |
| 2081 preferred_charset_id = charbuf[3]; | |
| 2082 if (preferred_charset_id >= 0 | |
| 2083 && NILP (Fmemq (make_number (preferred_charset_id), | |
| 2084 charset_list))) | |
| 2085 preferred_charset_id = -1; | |
| 2086 break; | |
| 2087 default: | |
| 2088 abort (); | |
| 2089 } | |
| 2090 charbuf += -c - 1; | |
| 2091 continue; | |
| 2092 } | |
| 2093 | |
| 2094 if (ASCII_CHAR_P (c)) | |
| 2095 EMIT_ONE_ASCII_BYTE (c); | |
| 2096 else if (CHAR_BYTE8_P (c)) | |
| 2097 { | |
| 2098 c = CHAR_TO_BYTE8 (c); | |
| 2099 EMIT_ONE_BYTE (c); | |
| 2100 } | |
| 2101 else | |
| 2102 { | |
| 2103 struct charset *charset; | |
| 2104 unsigned code; | |
| 2105 int dimension; | |
| 2106 int emacs_mule_id; | |
| 2107 unsigned char leading_codes[2]; | |
| 2108 | |
| 2109 if (preferred_charset_id >= 0) | |
| 2110 { | |
| 2111 charset = CHARSET_FROM_ID (preferred_charset_id); | |
| 2112 if (! CHAR_CHARSET_P (c, charset)) | |
| 2113 charset = char_charset (c, charset_list, NULL); | |
| 635 } | 2114 } |
| 636 else | 2115 else |
| 637 c -= 0x20; | 2116 charset = char_charset (c, charset_list, &code); |
| 638 } | 2117 if (! charset) |
| 639 | 2118 { |
| 640 if (c < 0x20) | 2119 c = coding->default_char; |
| 641 { | 2120 if (ASCII_CHAR_P (c)) |
| 642 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | 2121 { |
| 643 return 0; | 2122 EMIT_ONE_ASCII_BYTE (c); |
| 644 } | 2123 continue; |
| 645 else if (c >= 0x80 && c < 0xA0) | 2124 } |
| 646 { | 2125 charset = char_charset (c, charset_list, &code); |
| 647 if (c == 0x80) | 2126 } |
| 648 /* Old leading code for a composite character. */ | 2127 dimension = CHARSET_DIMENSION (charset); |
| 649 composing = 1; | 2128 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset); |
| 2129 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes); | |
| 2130 EMIT_ONE_BYTE (leading_codes[0]); | |
| 2131 if (leading_codes[1]) | |
| 2132 EMIT_ONE_BYTE (leading_codes[1]); | |
| 2133 if (dimension == 1) | |
| 2134 EMIT_ONE_BYTE (code); | |
| 650 else | 2135 else |
| 651 { | 2136 { |
| 652 unsigned char *src_base = src - 1; | 2137 EMIT_ONE_BYTE (code >> 8); |
| 653 int bytes; | 2138 EMIT_ONE_BYTE (code & 0xFF); |
| 654 | |
| 655 if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base, | |
| 656 bytes)) | |
| 657 return 0; | |
| 658 src = src_base + bytes; | |
| 659 } | 2139 } |
| 660 } | 2140 } |
| 661 } | 2141 } |
| 662 label_end_of_loop: | 2142 coding->result = CODING_RESULT_SUCCESS; |
| 663 return CODING_CATEGORY_MASK_EMACS_MULE; | 2143 coding->produced_char += produced_chars; |
| 664 } | 2144 coding->produced = dst - coding->destination; |
| 665 | 2145 return 0; |
| 666 | |
| 667 /* Record the starting position START and METHOD of one composition. */ | |
| 668 | |
| 669 #define CODING_ADD_COMPOSITION_START(coding, start, method) \ | |
| 670 do { \ | |
| 671 struct composition_data *cmp_data = coding->cmp_data; \ | |
| 672 int *data = cmp_data->data + cmp_data->used; \ | |
| 673 coding->cmp_data_start = cmp_data->used; \ | |
| 674 data[0] = -1; \ | |
| 675 data[1] = cmp_data->char_offset + start; \ | |
| 676 data[3] = (int) method; \ | |
| 677 cmp_data->used += 4; \ | |
| 678 } while (0) | |
| 679 | |
| 680 /* Record the ending position END of the current composition. */ | |
| 681 | |
| 682 #define CODING_ADD_COMPOSITION_END(coding, end) \ | |
| 683 do { \ | |
| 684 struct composition_data *cmp_data = coding->cmp_data; \ | |
| 685 int *data = cmp_data->data + coding->cmp_data_start; \ | |
| 686 data[0] = cmp_data->used - coding->cmp_data_start; \ | |
| 687 data[2] = cmp_data->char_offset + end; \ | |
| 688 } while (0) | |
| 689 | |
| 690 /* Record one COMPONENT (alternate character or composition rule). */ | |
| 691 | |
| 692 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \ | |
| 693 do { \ | |
| 694 coding->cmp_data->data[coding->cmp_data->used++] = component; \ | |
| 695 if (coding->cmp_data->used - coding->cmp_data_start \ | |
| 696 == COMPOSITION_DATA_MAX_BUNCH_LENGTH) \ | |
| 697 { \ | |
| 698 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \ | |
| 699 coding->composing = COMPOSITION_NO; \ | |
| 700 } \ | |
| 701 } while (0) | |
| 702 | |
| 703 | |
| 704 /* Get one byte from the data pointed to by SRC and increment SRC. If SRC | |
| 705 is not less than SRC_END, return -1 without incrementing SRC. */ | |
| 706 | |
| 707 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++) | |
| 708 | |
| 709 | |
| 710 /* Decode a character represented as a component of composition | |
| 711 sequence of Emacs 20 style at SRC. Set C to that character, store | |
| 712 its multibyte form sequence at P, and set P to the end of that | |
| 713 sequence. If no valid character is found, set C to -1. */ | |
| 714 | |
| 715 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p) \ | |
| 716 do { \ | |
| 717 int bytes; \ | |
| 718 \ | |
| 719 c = SAFE_ONE_MORE_BYTE (); \ | |
| 720 if (c < 0) \ | |
| 721 break; \ | |
| 722 if (CHAR_HEAD_P (c)) \ | |
| 723 c = -1; \ | |
| 724 else if (c == 0xA0) \ | |
| 725 { \ | |
| 726 c = SAFE_ONE_MORE_BYTE (); \ | |
| 727 if (c < 0xA0) \ | |
| 728 c = -1; \ | |
| 729 else \ | |
| 730 { \ | |
| 731 c -= 0xA0; \ | |
| 732 *p++ = c; \ | |
| 733 } \ | |
| 734 } \ | |
| 735 else if (BASE_LEADING_CODE_P (c - 0x20)) \ | |
| 736 { \ | |
| 737 unsigned char *p0 = p; \ | |
| 738 \ | |
| 739 c -= 0x20; \ | |
| 740 *p++ = c; \ | |
| 741 bytes = BYTES_BY_CHAR_HEAD (c); \ | |
| 742 while (--bytes) \ | |
| 743 { \ | |
| 744 c = SAFE_ONE_MORE_BYTE (); \ | |
| 745 if (c < 0) \ | |
| 746 break; \ | |
| 747 *p++ = c; \ | |
| 748 } \ | |
| 749 if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes) \ | |
| 750 || (coding->flags /* We are recovering a file. */ \ | |
| 751 && p0[0] == LEADING_CODE_8_BIT_CONTROL \ | |
| 752 && ! CHAR_HEAD_P (p0[1]))) \ | |
| 753 c = STRING_CHAR (p0, bytes); \ | |
| 754 else \ | |
| 755 c = -1; \ | |
| 756 } \ | |
| 757 else \ | |
| 758 c = -1; \ | |
| 759 } while (0) | |
| 760 | |
| 761 | |
| 762 /* Decode a composition rule represented as a component of composition | |
| 763 sequence of Emacs 20 style at SRC. Set C to the rule. If no | |
| 764 valid rule is found, set C to -1. */ | |
| 765 | |
| 766 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c) \ | |
| 767 do { \ | |
| 768 c = SAFE_ONE_MORE_BYTE (); \ | |
| 769 c -= 0xA0; \ | |
| 770 if (c < 0 || c >= 81) \ | |
| 771 c = -1; \ | |
| 772 else \ | |
| 773 { \ | |
| 774 gref = c / 9, nref = c % 9; \ | |
| 775 c = COMPOSITION_ENCODE_RULE (gref, nref); \ | |
| 776 } \ | |
| 777 } while (0) | |
| 778 | |
| 779 | |
| 780 /* Decode composition sequence encoded by `emacs-mule' at the source | |
| 781 pointed by SRC. SRC_END is the end of source. Store information | |
| 782 of the composition in CODING->cmp_data. | |
| 783 | |
| 784 For backward compatibility, decode also a composition sequence of | |
| 785 Emacs 20 style. In that case, the composition sequence contains | |
| 786 characters that should be extracted into a buffer or string. Store | |
| 787 those characters at *DESTINATION in multibyte form. | |
| 788 | |
| 789 If we encounter an invalid byte sequence, return 0. | |
| 790 If we encounter an insufficient source or destination, or | |
| 791 insufficient space in CODING->cmp_data, return -1. | |
| 792 Otherwise, return consumed bytes in the source. | |
| 793 | |
| 794 */ | |
| 795 static INLINE int | |
| 796 decode_composition_emacs_mule (coding, src, src_end, | |
| 797 destination, dst_end, dst_bytes) | |
| 798 struct coding_system *coding; | |
| 799 unsigned char *src, *src_end, **destination, *dst_end; | |
| 800 int dst_bytes; | |
| 801 { | |
| 802 unsigned char *dst = *destination; | |
| 803 int method, data_len, nchars; | |
| 804 unsigned char *src_base = src++; | |
| 805 /* Store components of composition. */ | |
| 806 int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH]; | |
| 807 int ncomponent; | |
| 808 /* Store multibyte form of characters to be composed. This is for | |
| 809 Emacs 20 style composition sequence. */ | |
| 810 unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH]; | |
| 811 unsigned char *bufp = buf; | |
| 812 int c, i, gref, nref; | |
| 813 | |
| 814 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH | |
| 815 >= COMPOSITION_DATA_SIZE) | |
| 816 { | |
| 817 coding->result = CODING_FINISH_INSUFFICIENT_CMP; | |
| 818 return -1; | |
| 819 } | |
| 820 | |
| 821 ONE_MORE_BYTE (c); | |
| 822 if (c - 0xF0 >= COMPOSITION_RELATIVE | |
| 823 && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS) | |
| 824 { | |
| 825 int with_rule; | |
| 826 | |
| 827 method = c - 0xF0; | |
| 828 with_rule = (method == COMPOSITION_WITH_RULE | |
| 829 || method == COMPOSITION_WITH_RULE_ALTCHARS); | |
| 830 ONE_MORE_BYTE (c); | |
| 831 data_len = c - 0xA0; | |
| 832 if (data_len < 4 | |
| 833 || src_base + data_len > src_end) | |
| 834 return 0; | |
| 835 ONE_MORE_BYTE (c); | |
| 836 nchars = c - 0xA0; | |
| 837 if (c < 1) | |
| 838 return 0; | |
| 839 for (ncomponent = 0; src < src_base + data_len; ncomponent++) | |
| 840 { | |
| 841 /* If it is longer than this, it can't be valid. */ | |
| 842 if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH) | |
| 843 return 0; | |
| 844 | |
| 845 if (ncomponent % 2 && with_rule) | |
| 846 { | |
| 847 ONE_MORE_BYTE (gref); | |
| 848 gref -= 32; | |
| 849 ONE_MORE_BYTE (nref); | |
| 850 nref -= 32; | |
| 851 c = COMPOSITION_ENCODE_RULE (gref, nref); | |
| 852 } | |
| 853 else | |
| 854 { | |
| 855 int bytes; | |
| 856 if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes) | |
| 857 || (coding->flags /* We are recovering a file. */ | |
| 858 && src[0] == LEADING_CODE_8_BIT_CONTROL | |
| 859 && ! CHAR_HEAD_P (src[1]))) | |
| 860 c = STRING_CHAR (src, bytes); | |
| 861 else | |
| 862 c = *src, bytes = 1; | |
| 863 src += bytes; | |
| 864 } | |
| 865 component[ncomponent] = c; | |
| 866 } | |
| 867 } | |
| 868 else | |
| 869 { | |
| 870 /* This may be an old Emacs 20 style format. See the comment at | |
| 871 the section 2 of this file. */ | |
| 872 while (src < src_end && !CHAR_HEAD_P (*src)) src++; | |
| 873 if (src == src_end | |
| 874 && !(coding->mode & CODING_MODE_LAST_BLOCK)) | |
| 875 goto label_end_of_loop; | |
| 876 | |
| 877 src_end = src; | |
| 878 src = src_base + 1; | |
| 879 if (c < 0xC0) | |
| 880 { | |
| 881 method = COMPOSITION_RELATIVE; | |
| 882 for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;) | |
| 883 { | |
| 884 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp); | |
| 885 if (c < 0) | |
| 886 break; | |
| 887 component[ncomponent++] = c; | |
| 888 } | |
| 889 if (ncomponent < 2) | |
| 890 return 0; | |
| 891 nchars = ncomponent; | |
| 892 } | |
| 893 else if (c == 0xFF) | |
| 894 { | |
| 895 method = COMPOSITION_WITH_RULE; | |
| 896 src++; | |
| 897 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp); | |
| 898 if (c < 0) | |
| 899 return 0; | |
| 900 component[0] = c; | |
| 901 for (ncomponent = 1; | |
| 902 ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;) | |
| 903 { | |
| 904 DECODE_EMACS_MULE_COMPOSITION_RULE (c); | |
| 905 if (c < 0) | |
| 906 break; | |
| 907 component[ncomponent++] = c; | |
| 908 DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp); | |
| 909 if (c < 0) | |
| 910 break; | |
| 911 component[ncomponent++] = c; | |
| 912 } | |
| 913 if (ncomponent < 3) | |
| 914 return 0; | |
| 915 nchars = (ncomponent + 1) / 2; | |
| 916 } | |
| 917 else | |
| 918 return 0; | |
| 919 } | |
| 920 | |
| 921 if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src)) | |
| 922 { | |
| 923 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method); | |
| 924 for (i = 0; i < ncomponent; i++) | |
| 925 CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]); | |
| 926 CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars); | |
| 927 if (buf < bufp) | |
| 928 { | |
| 929 unsigned char *p = buf; | |
| 930 EMIT_BYTES (p, bufp); | |
| 931 *destination += bufp - buf; | |
| 932 coding->produced_char += nchars; | |
| 933 } | |
| 934 return (src - src_base); | |
| 935 } | |
| 936 label_end_of_loop: | |
| 937 return -1; | |
| 938 } | |
| 939 | |
| 940 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ | |
| 941 | |
| 942 static void | |
| 943 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes) | |
| 944 struct coding_system *coding; | |
| 945 unsigned char *source, *destination; | |
| 946 int src_bytes, dst_bytes; | |
| 947 { | |
| 948 unsigned char *src = source; | |
| 949 unsigned char *src_end = source + src_bytes; | |
| 950 unsigned char *dst = destination; | |
| 951 unsigned char *dst_end = destination + dst_bytes; | |
| 952 /* SRC_BASE remembers the start position in source in each loop. | |
| 953 The loop will be exited when there's not enough source code, or | |
| 954 when there's not enough destination area to produce a | |
| 955 character. */ | |
| 956 unsigned char *src_base; | |
| 957 | |
| 958 coding->produced_char = 0; | |
| 959 while ((src_base = src) < src_end) | |
| 960 { | |
| 961 unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p; | |
| 962 int bytes; | |
| 963 | |
| 964 if (*src == '\r') | |
| 965 { | |
| 966 int c = *src++; | |
| 967 | |
| 968 if (coding->eol_type == CODING_EOL_CR) | |
| 969 c = '\n'; | |
| 970 else if (coding->eol_type == CODING_EOL_CRLF) | |
| 971 { | |
| 972 ONE_MORE_BYTE (c); | |
| 973 if (c != '\n') | |
| 974 { | |
| 975 src--; | |
| 976 c = '\r'; | |
| 977 } | |
| 978 } | |
| 979 *dst++ = c; | |
| 980 coding->produced_char++; | |
| 981 continue; | |
| 982 } | |
| 983 else if (*src == '\n') | |
| 984 { | |
| 985 if ((coding->eol_type == CODING_EOL_CR | |
| 986 || coding->eol_type == CODING_EOL_CRLF) | |
| 987 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) | |
| 988 { | |
| 989 coding->result = CODING_FINISH_INCONSISTENT_EOL; | |
| 990 goto label_end_of_loop; | |
| 991 } | |
| 992 *dst++ = *src++; | |
| 993 coding->produced_char++; | |
| 994 continue; | |
| 995 } | |
| 996 else if (*src == 0x80 && coding->cmp_data) | |
| 997 { | |
| 998 /* Start of composition data. */ | |
| 999 int consumed = decode_composition_emacs_mule (coding, src, src_end, | |
| 1000 &dst, dst_end, | |
| 1001 dst_bytes); | |
| 1002 if (consumed < 0) | |
| 1003 goto label_end_of_loop; | |
| 1004 else if (consumed > 0) | |
| 1005 { | |
| 1006 src += consumed; | |
| 1007 continue; | |
| 1008 } | |
| 1009 bytes = CHAR_STRING (*src, tmp); | |
| 1010 p = tmp; | |
| 1011 src++; | |
| 1012 } | |
| 1013 else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes) | |
| 1014 || (coding->flags /* We are recovering a file. */ | |
| 1015 && src[0] == LEADING_CODE_8_BIT_CONTROL | |
| 1016 && ! CHAR_HEAD_P (src[1]))) | |
| 1017 { | |
| 1018 p = src; | |
| 1019 src += bytes; | |
| 1020 } | |
| 1021 else | |
| 1022 { | |
| 1023 bytes = CHAR_STRING (*src, tmp); | |
| 1024 p = tmp; | |
| 1025 src++; | |
| 1026 } | |
| 1027 if (dst + bytes >= (dst_bytes ? dst_end : src)) | |
| 1028 { | |
| 1029 coding->result = CODING_FINISH_INSUFFICIENT_DST; | |
| 1030 break; | |
| 1031 } | |
| 1032 while (bytes--) *dst++ = *p++; | |
| 1033 coding->produced_char++; | |
| 1034 } | |
| 1035 label_end_of_loop: | |
| 1036 coding->consumed = coding->consumed_char = src_base - source; | |
| 1037 coding->produced = dst - destination; | |
| 1038 } | |
| 1039 | |
| 1040 | |
| 1041 /* Encode composition data stored at DATA into a special byte sequence | |
| 1042 starting by 0x80. Update CODING->cmp_data_start and maybe | |
| 1043 CODING->cmp_data for the next call. */ | |
| 1044 | |
| 1045 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data) \ | |
| 1046 do { \ | |
| 1047 unsigned char buf[1024], *p0 = buf, *p; \ | |
| 1048 int len = data[0]; \ | |
| 1049 int i; \ | |
| 1050 \ | |
| 1051 buf[0] = 0x80; \ | |
| 1052 buf[1] = 0xF0 + data[3]; /* METHOD */ \ | |
| 1053 buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */ \ | |
| 1054 p = buf + 4; \ | |
| 1055 if (data[3] == COMPOSITION_WITH_RULE \ | |
| 1056 || data[3] == COMPOSITION_WITH_RULE_ALTCHARS) \ | |
| 1057 { \ | |
| 1058 p += CHAR_STRING (data[4], p); \ | |
| 1059 for (i = 5; i < len; i += 2) \ | |
| 1060 { \ | |
| 1061 int gref, nref; \ | |
| 1062 COMPOSITION_DECODE_RULE (data[i], gref, nref); \ | |
| 1063 *p++ = 0x20 + gref; \ | |
| 1064 *p++ = 0x20 + nref; \ | |
| 1065 p += CHAR_STRING (data[i + 1], p); \ | |
| 1066 } \ | |
| 1067 } \ | |
| 1068 else \ | |
| 1069 { \ | |
| 1070 for (i = 4; i < len; i++) \ | |
| 1071 p += CHAR_STRING (data[i], p); \ | |
| 1072 } \ | |
| 1073 buf[2] = 0xA0 + (p - buf); /* COMPONENTS-BYTES */ \ | |
| 1074 \ | |
| 1075 if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src)) \ | |
| 1076 { \ | |
| 1077 coding->result = CODING_FINISH_INSUFFICIENT_DST; \ | |
| 1078 goto label_end_of_loop; \ | |
| 1079 } \ | |
| 1080 while (p0 < p) \ | |
| 1081 *dst++ = *p0++; \ | |
| 1082 coding->cmp_data_start += data[0]; \ | |
| 1083 if (coding->cmp_data_start == coding->cmp_data->used \ | |
| 1084 && coding->cmp_data->next) \ | |
| 1085 { \ | |
| 1086 coding->cmp_data = coding->cmp_data->next; \ | |
| 1087 coding->cmp_data_start = 0; \ | |
| 1088 } \ | |
| 1089 } while (0) | |
| 1090 | |
| 1091 | |
| 1092 static void encode_eol P_ ((struct coding_system *, const unsigned char *, | |
| 1093 unsigned char *, int, int)); | |
| 1094 | |
| 1095 static void | |
| 1096 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes) | |
| 1097 struct coding_system *coding; | |
| 1098 unsigned char *source, *destination; | |
| 1099 int src_bytes, dst_bytes; | |
| 1100 { | |
| 1101 unsigned char *src = source; | |
| 1102 unsigned char *src_end = source + src_bytes; | |
| 1103 unsigned char *dst = destination; | |
| 1104 unsigned char *dst_end = destination + dst_bytes; | |
| 1105 unsigned char *src_base; | |
| 1106 int c; | |
| 1107 int char_offset; | |
| 1108 int *data; | |
| 1109 | |
| 1110 Lisp_Object translation_table; | |
| 1111 | |
| 1112 translation_table = Qnil; | |
| 1113 | |
| 1114 /* Optimization for the case that there's no composition. */ | |
| 1115 if (!coding->cmp_data || coding->cmp_data->used == 0) | |
| 1116 { | |
| 1117 encode_eol (coding, source, destination, src_bytes, dst_bytes); | |
| 1118 return; | |
| 1119 } | |
| 1120 | |
| 1121 char_offset = coding->cmp_data->char_offset; | |
| 1122 data = coding->cmp_data->data + coding->cmp_data_start; | |
| 1123 while (1) | |
| 1124 { | |
| 1125 src_base = src; | |
| 1126 | |
| 1127 /* If SRC starts a composition, encode the information about the | |
| 1128 composition in advance. */ | |
| 1129 if (coding->cmp_data_start < coding->cmp_data->used | |
| 1130 && char_offset + coding->consumed_char == data[1]) | |
| 1131 { | |
| 1132 ENCODE_COMPOSITION_EMACS_MULE (coding, data); | |
| 1133 char_offset = coding->cmp_data->char_offset; | |
| 1134 data = coding->cmp_data->data + coding->cmp_data_start; | |
| 1135 } | |
| 1136 | |
| 1137 ONE_MORE_CHAR (c); | |
| 1138 if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF | |
| 1139 || coding->eol_type == CODING_EOL_CR)) | |
| 1140 { | |
| 1141 if (coding->eol_type == CODING_EOL_CRLF) | |
| 1142 EMIT_TWO_BYTES ('\r', c); | |
| 1143 else | |
| 1144 EMIT_ONE_BYTE ('\r'); | |
| 1145 } | |
| 1146 else if (SINGLE_BYTE_CHAR_P (c)) | |
| 1147 { | |
| 1148 if (coding->flags && ! ASCII_BYTE_P (c)) | |
| 1149 { | |
| 1150 /* As we are auto saving, retain the multibyte form for | |
| 1151 8-bit chars. */ | |
| 1152 unsigned char buf[MAX_MULTIBYTE_LENGTH]; | |
| 1153 int bytes = CHAR_STRING (c, buf); | |
| 1154 | |
| 1155 if (bytes == 1) | |
| 1156 EMIT_ONE_BYTE (buf[0]); | |
| 1157 else | |
| 1158 EMIT_TWO_BYTES (buf[0], buf[1]); | |
| 1159 } | |
| 1160 else | |
| 1161 EMIT_ONE_BYTE (c); | |
| 1162 } | |
| 1163 else | |
| 1164 EMIT_BYTES (src_base, src); | |
| 1165 coding->consumed_char++; | |
| 1166 } | |
| 1167 label_end_of_loop: | |
| 1168 coding->consumed = src_base - source; | |
| 1169 coding->produced = coding->produced_char = dst - destination; | |
| 1170 return; | |
| 1171 } | 2146 } |
| 1172 | 2147 |
| 1173 | 2148 |
| 1174 /*** 3. ISO2022 handlers ***/ | 2149 /*** 7. ISO2022 handlers ***/ |
| 1175 | 2150 |
| 1176 /* The following note describes the coding system ISO2022 briefly. | 2151 /* The following note describes the coding system ISO2022 briefly. |
| 1177 Since the intention of this note is to help understand the | 2152 Since the intention of this note is to help understand the |
| 1178 functions in this file, some parts are NOT ACCURATE or are OVERLY | 2153 functions in this file, some parts are NOT ACCURATE or are OVERLY |
| 1179 SIMPLIFIED. For thorough understanding, please refer to the | 2154 SIMPLIFIED. For thorough understanding, please refer to the |
| 1299 Emacs accepts them on decoding, and produces them on encoding | 2274 Emacs accepts them on decoding, and produces them on encoding |
| 1300 CHARS96 character sets in a coding system which is characterized as | 2275 CHARS96 character sets in a coding system which is characterized as |
| 1301 7-bit environment, non-locking-shift, and non-single-shift. | 2276 7-bit environment, non-locking-shift, and non-single-shift. |
| 1302 | 2277 |
| 1303 Note (**): If <F> is '@', 'A', or 'B', the intermediate character | 2278 Note (**): If <F> is '@', 'A', or 'B', the intermediate character |
| 1304 '(' can be omitted. We refer to this as "short-form" hereafter. | 2279 '(' must be omitted. We refer to this as "short-form" hereafter. |
| 1305 | 2280 |
| 1306 Now you may notice that there are a lot of ways of encoding the | 2281 Now you may notice that there are a lot of ways of encoding the |
| 1307 same multilingual text in ISO2022. Actually, there exist many | 2282 same multilingual text in ISO2022. Actually, there exist many |
| 1308 coding systems such as Compound Text (used in X11's inter client | 2283 coding systems such as Compound Text (used in X11's inter client |
| 1309 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR | 2284 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR |
| 1329 o ESC '3' -- start relative composition with alternate chars (**) | 2304 o ESC '3' -- start relative composition with alternate chars (**) |
| 1330 o ESC '4' -- start rule-base composition with alternate chars (**) | 2305 o ESC '4' -- start rule-base composition with alternate chars (**) |
| 1331 Since these are not standard escape sequences of any ISO standard, | 2306 Since these are not standard escape sequences of any ISO standard, |
| 1332 the use of them with these meanings is restricted to Emacs only. | 2307 the use of them with these meanings is restricted to Emacs only. |
| 1333 | 2308 |
| 1334 (*) This form is used only in Emacs 20.5 and older versions, | 2309 (*) This form is used only in Emacs 20.7 and older versions, |
| 1335 but the newer versions can safely decode it. | 2310 but newer versions can safely decode it. |
| 1336 (**) This form is used only in Emacs 21.1 and newer versions, | 2311 (**) This form is used only in Emacs 21.1 and newer versions, |
| 1337 and the older versions can't decode it. | 2312 and older versions can't decode it. |
| 1338 | 2313 |
| 1339 Here's a list of example usages of these composition escape | 2314 Here's a list of example usages of these composition escape |
| 1340 sequences (categorized by `enum composition_method'). | 2315 sequences (categorized by `enum composition_method'). |
| 1341 | 2316 |
| 1342 COMPOSITION_RELATIVE: | 2317 COMPOSITION_RELATIVE: |
| 1348 COMPOSITION_WITH_RULE_ALTCHARS: | 2323 COMPOSITION_WITH_RULE_ALTCHARS: |
| 1349 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */ | 2324 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */ |
| 1350 | 2325 |
| 1351 enum iso_code_class_type iso_code_class[256]; | 2326 enum iso_code_class_type iso_code_class[256]; |
| 1352 | 2327 |
| 1353 #define CHARSET_OK(idx, charset, c) \ | 2328 #define SAFE_CHARSET_P(coding, id) \ |
| 1354 (coding_system_table[idx] \ | 2329 ((id) <= (coding)->max_charset_id \ |
| 1355 && (charset == CHARSET_ASCII \ | 2330 && (coding)->safe_charsets[id] >= 0) |
| 1356 || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \ | 2331 |
| 1357 CODING_SAFE_CHAR_P (safe_chars, c))) \ | 2332 |
| 1358 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \ | 2333 #define SHIFT_OUT_OK(category) \ |
| 1359 charset) \ | 2334 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0) |
| 1360 != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)) | 2335 |
| 1361 | 2336 static void |
| 1362 #define SHIFT_OUT_OK(idx) \ | 2337 setup_iso_safe_charsets (attrs) |
| 1363 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0) | 2338 Lisp_Object attrs; |
| 1364 | 2339 { |
| 1365 #define COMPOSITION_OK(idx) \ | 2340 Lisp_Object charset_list, safe_charsets; |
| 1366 (coding_system_table[idx]->composing != COMPOSITION_DISABLED) | 2341 Lisp_Object request; |
| 2342 Lisp_Object reg_usage; | |
| 2343 Lisp_Object tail; | |
| 2344 int reg94, reg96; | |
| 2345 int flags = XINT (AREF (attrs, coding_attr_iso_flags)); | |
| 2346 int max_charset_id; | |
| 2347 | |
| 2348 charset_list = CODING_ATTR_CHARSET_LIST (attrs); | |
| 2349 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT) | |
| 2350 && ! EQ (charset_list, Viso_2022_charset_list)) | |
| 2351 { | |
| 2352 CODING_ATTR_CHARSET_LIST (attrs) | |
| 2353 = charset_list = Viso_2022_charset_list; | |
| 2354 ASET (attrs, coding_attr_safe_charsets, Qnil); | |
| 2355 } | |
| 2356 | |
| 2357 if (STRINGP (AREF (attrs, coding_attr_safe_charsets))) | |
| 2358 return; | |
| 2359 | |
| 2360 max_charset_id = 0; | |
| 2361 for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) | |
| 2362 { | |
| 2363 int id = XINT (XCAR (tail)); | |
| 2364 if (max_charset_id < id) | |
| 2365 max_charset_id = id; | |
| 2366 } | |
| 2367 | |
| 2368 safe_charsets = Fmake_string (make_number (max_charset_id + 1), | |
| 2369 make_number (255)); | |
| 2370 request = AREF (attrs, coding_attr_iso_request); | |
| 2371 reg_usage = AREF (attrs, coding_attr_iso_usage); | |
| 2372 reg94 = XINT (XCAR (reg_usage)); | |
| 2373 reg96 = XINT (XCDR (reg_usage)); | |
| 2374 | |
| 2375 for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) | |
| 2376 { | |
| 2377 Lisp_Object id; | |
| 2378 Lisp_Object reg; | |
| 2379 struct charset *charset; | |
| 2380 | |
| 2381 id = XCAR (tail); | |
| 2382 charset = CHARSET_FROM_ID (XINT (id)); | |
| 2383 reg = Fcdr (Fassq (id, request)); | |
| 2384 if (! NILP (reg)) | |
| 2385 SSET (safe_charsets, XINT (id), XINT (reg)); | |
| 2386 else if (charset->iso_chars_96) | |
| 2387 { | |
| 2388 if (reg96 < 4) | |
| 2389 SSET (safe_charsets, XINT (id), reg96); | |
| 2390 } | |
| 2391 else | |
| 2392 { | |
| 2393 if (reg94 < 4) | |
| 2394 SSET (safe_charsets, XINT (id), reg94); | |
| 2395 } | |
| 2396 } | |
| 2397 ASET (attrs, coding_attr_safe_charsets, safe_charsets); | |
| 2398 } | |
| 2399 | |
| 1367 | 2400 |
| 1368 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2401 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 1369 Check if a text is encoded in ISO2022. If it is, return an | 2402 Check if a text is encoded in one of ISO-2022 based coding systems. |
| 1370 integer in which appropriate flag bits any of: | 2403 If it is, return 1, else return 0. */ |
| 1371 CODING_CATEGORY_MASK_ISO_7 | |
| 1372 CODING_CATEGORY_MASK_ISO_7_TIGHT | |
| 1373 CODING_CATEGORY_MASK_ISO_8_1 | |
| 1374 CODING_CATEGORY_MASK_ISO_8_2 | |
| 1375 CODING_CATEGORY_MASK_ISO_7_ELSE | |
| 1376 CODING_CATEGORY_MASK_ISO_8_ELSE | |
| 1377 are set. If a code which should never appear in ISO2022 is found, | |
| 1378 returns 0. */ | |
| 1379 | 2404 |
| 1380 static int | 2405 static int |
| 1381 detect_coding_iso2022 (src, src_end, multibytep) | 2406 detect_coding_iso_2022 (coding, detect_info) |
| 1382 unsigned char *src, *src_end; | 2407 struct coding_system *coding; |
| 1383 int multibytep; | 2408 struct coding_detection_info *detect_info; |
| 1384 { | 2409 { |
| 1385 int mask = CODING_CATEGORY_MASK_ISO; | 2410 const unsigned char *src = coding->source, *src_base = src; |
| 1386 int mask_found = 0; | 2411 const unsigned char *src_end = coding->source + coding->src_bytes; |
| 1387 int reg[4], shift_out = 0, single_shifting = 0; | 2412 int multibytep = coding->src_multibyte; |
| 1388 int c, c1, charset; | 2413 int single_shifting = 0; |
| 1389 /* Dummy for ONE_MORE_BYTE. */ | 2414 int id; |
| 1390 struct coding_system dummy_coding; | 2415 int c, c1; |
| 1391 struct coding_system *coding = &dummy_coding; | 2416 int consumed_chars = 0; |
| 1392 Lisp_Object safe_chars; | 2417 int i; |
| 1393 | 2418 int rejected = 0; |
| 1394 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; | 2419 int found = 0; |
| 1395 while (mask && src < src_end) | 2420 |
| 1396 { | 2421 detect_info->checked |= CATEGORY_MASK_ISO; |
| 1397 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2422 |
| 1398 retry: | 2423 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++) |
| 2424 { | |
| 2425 struct coding_system *this = &(coding_categories[i]); | |
| 2426 Lisp_Object attrs, val; | |
| 2427 | |
| 2428 attrs = CODING_ID_ATTRS (this->id); | |
| 2429 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT | |
| 2430 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list)) | |
| 2431 setup_iso_safe_charsets (attrs); | |
| 2432 val = CODING_ATTR_SAFE_CHARSETS (attrs); | |
| 2433 this->max_charset_id = SCHARS (val) - 1; | |
| 2434 this->safe_charsets = (char *) SDATA (val); | |
| 2435 } | |
| 2436 | |
| 2437 /* A coding system of this category is always ASCII compatible. */ | |
| 2438 src += coding->head_ascii; | |
| 2439 | |
| 2440 while (rejected != CATEGORY_MASK_ISO) | |
| 2441 { | |
| 2442 ONE_MORE_BYTE (c); | |
| 1399 switch (c) | 2443 switch (c) |
| 1400 { | 2444 { |
| 1401 case ISO_CODE_ESC: | 2445 case ISO_CODE_ESC: |
| 1402 if (inhibit_iso_escape_detection) | 2446 if (inhibit_iso_escape_detection) |
| 1403 break; | 2447 break; |
| 1404 single_shifting = 0; | 2448 single_shifting = 0; |
| 1405 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2449 ONE_MORE_BYTE (c); |
| 1406 if (c >= '(' && c <= '/') | 2450 if (c >= '(' && c <= '/') |
| 1407 { | 2451 { |
| 1408 /* Designation sequence for a charset of dimension 1. */ | 2452 /* Designation sequence for a charset of dimension 1. */ |
| 1409 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | 2453 ONE_MORE_BYTE (c1); |
| 1410 if (c1 < ' ' || c1 >= 0x80 | 2454 if (c1 < ' ' || c1 >= 0x80 |
| 1411 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) | 2455 || (id = iso_charset_table[0][c >= ','][c1]) < 0) |
| 1412 /* Invalid designation sequence. Just ignore. */ | 2456 /* Invalid designation sequence. Just ignore. */ |
| 1413 break; | 2457 break; |
| 1414 reg[(c - '(') % 4] = charset; | |
| 1415 } | 2458 } |
| 1416 else if (c == '$') | 2459 else if (c == '$') |
| 1417 { | 2460 { |
| 1418 /* Designation sequence for a charset of dimension 2. */ | 2461 /* Designation sequence for a charset of dimension 2. */ |
| 1419 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2462 ONE_MORE_BYTE (c); |
| 1420 if (c >= '@' && c <= 'B') | 2463 if (c >= '@' && c <= 'B') |
| 1421 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ | 2464 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ |
| 1422 reg[0] = charset = iso_charset_table[1][0][c]; | 2465 id = iso_charset_table[1][0][c]; |
| 1423 else if (c >= '(' && c <= '/') | 2466 else if (c >= '(' && c <= '/') |
| 1424 { | 2467 { |
| 1425 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | 2468 ONE_MORE_BYTE (c1); |
| 1426 if (c1 < ' ' || c1 >= 0x80 | 2469 if (c1 < ' ' || c1 >= 0x80 |
| 1427 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) | 2470 || (id = iso_charset_table[1][c >= ','][c1]) < 0) |
| 1428 /* Invalid designation sequence. Just ignore. */ | 2471 /* Invalid designation sequence. Just ignore. */ |
| 1429 break; | 2472 break; |
| 1430 reg[(c - '(') % 4] = charset; | |
| 1431 } | 2473 } |
| 1432 else | 2474 else |
| 1433 /* Invalid designation sequence. Just ignore. */ | 2475 /* Invalid designation sequence. Just ignore it. */ |
| 1434 break; | 2476 break; |
| 1435 } | 2477 } |
| 1436 else if (c == 'N' || c == 'O') | 2478 else if (c == 'N' || c == 'O') |
| 1437 { | 2479 { |
| 1438 /* ESC <Fe> for SS2 or SS3. */ | 2480 /* ESC <Fe> for SS2 or SS3. */ |
| 1439 mask &= CODING_CATEGORY_MASK_ISO_7_ELSE; | 2481 single_shifting = 1; |
| 2482 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT; | |
| 1440 break; | 2483 break; |
| 1441 } | 2484 } |
| 1442 else if (c >= '0' && c <= '4') | 2485 else if (c >= '0' && c <= '4') |
| 1443 { | 2486 { |
| 1444 /* ESC <Fp> for start/end composition. */ | 2487 /* ESC <Fp> for start/end composition. */ |
| 1445 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7)) | 2488 found |= CATEGORY_MASK_ISO; |
| 1446 mask_found |= CODING_CATEGORY_MASK_ISO_7; | |
| 1447 else | |
| 1448 mask &= ~CODING_CATEGORY_MASK_ISO_7; | |
| 1449 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT)) | |
| 1450 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT; | |
| 1451 else | |
| 1452 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT; | |
| 1453 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1)) | |
| 1454 mask_found |= CODING_CATEGORY_MASK_ISO_8_1; | |
| 1455 else | |
| 1456 mask &= ~CODING_CATEGORY_MASK_ISO_8_1; | |
| 1457 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2)) | |
| 1458 mask_found |= CODING_CATEGORY_MASK_ISO_8_2; | |
| 1459 else | |
| 1460 mask &= ~CODING_CATEGORY_MASK_ISO_8_2; | |
| 1461 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)) | |
| 1462 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE; | |
| 1463 else | |
| 1464 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE; | |
| 1465 if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)) | |
| 1466 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE; | |
| 1467 else | |
| 1468 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE; | |
| 1469 break; | 2489 break; |
| 1470 } | 2490 } |
| 1471 else | 2491 else |
| 1472 /* Invalid escape sequence. Just ignore. */ | 2492 { |
| 1473 break; | 2493 /* Invalid escape sequence. Just ignore it. */ |
| 2494 break; | |
| 2495 } | |
| 1474 | 2496 |
| 1475 /* We found a valid designation sequence for CHARSET. */ | 2497 /* We found a valid designation sequence for CHARSET. */ |
| 1476 mask &= ~CODING_CATEGORY_MASK_ISO_8BIT; | 2498 rejected |= CATEGORY_MASK_ISO_8BIT; |
| 1477 c = MAKE_CHAR (charset, 0, 0); | 2499 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7], |
| 1478 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c)) | 2500 id)) |
| 1479 mask_found |= CODING_CATEGORY_MASK_ISO_7; | 2501 found |= CATEGORY_MASK_ISO_7; |
| 1480 else | 2502 else |
| 1481 mask &= ~CODING_CATEGORY_MASK_ISO_7; | 2503 rejected |= CATEGORY_MASK_ISO_7; |
| 1482 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c)) | 2504 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight], |
| 1483 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT; | 2505 id)) |
| 2506 found |= CATEGORY_MASK_ISO_7_TIGHT; | |
| 1484 else | 2507 else |
| 1485 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT; | 2508 rejected |= CATEGORY_MASK_ISO_7_TIGHT; |
| 1486 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c)) | 2509 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else], |
| 1487 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE; | 2510 id)) |
| 2511 found |= CATEGORY_MASK_ISO_7_ELSE; | |
| 1488 else | 2512 else |
| 1489 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE; | 2513 rejected |= CATEGORY_MASK_ISO_7_ELSE; |
| 1490 if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c)) | 2514 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else], |
| 1491 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE; | 2515 id)) |
| 2516 found |= CATEGORY_MASK_ISO_8_ELSE; | |
| 1492 else | 2517 else |
| 1493 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE; | 2518 rejected |= CATEGORY_MASK_ISO_8_ELSE; |
| 1494 break; | 2519 break; |
| 1495 | 2520 |
| 1496 case ISO_CODE_SO: | 2521 case ISO_CODE_SO: |
| 2522 case ISO_CODE_SI: | |
| 2523 /* Locking shift out/in. */ | |
| 1497 if (inhibit_iso_escape_detection) | 2524 if (inhibit_iso_escape_detection) |
| 1498 break; | 2525 break; |
| 1499 single_shifting = 0; | 2526 single_shifting = 0; |
| 1500 if (shift_out == 0 | 2527 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT; |
| 1501 && (reg[1] >= 0 | 2528 found |= CATEGORY_MASK_ISO_ELSE; |
| 1502 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE) | |
| 1503 || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))) | |
| 1504 { | |
| 1505 /* Locking shift out. */ | |
| 1506 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT; | |
| 1507 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT; | |
| 1508 } | |
| 1509 break; | 2529 break; |
| 1510 | 2530 |
| 1511 case ISO_CODE_SI: | 2531 case ISO_CODE_CSI: |
| 2532 /* Control sequence introducer. */ | |
| 2533 single_shifting = 0; | |
| 2534 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE; | |
| 2535 found |= CATEGORY_MASK_ISO_8_ELSE; | |
| 2536 goto check_extra_latin; | |
| 2537 | |
| 2538 | |
| 2539 case ISO_CODE_SS2: | |
| 2540 case ISO_CODE_SS3: | |
| 2541 /* Single shift. */ | |
| 1512 if (inhibit_iso_escape_detection) | 2542 if (inhibit_iso_escape_detection) |
| 1513 break; | 2543 break; |
| 1514 single_shifting = 0; | 2544 single_shifting = 1; |
| 1515 if (shift_out == 1) | 2545 rejected |= CATEGORY_MASK_ISO_7BIT; |
| 1516 { | 2546 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) |
| 1517 /* Locking shift in. */ | 2547 & CODING_ISO_FLAG_SINGLE_SHIFT) |
| 1518 mask &= ~CODING_CATEGORY_MASK_ISO_7BIT; | 2548 found |= CATEGORY_MASK_ISO_8_1; |
| 1519 mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT; | 2549 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) |
| 1520 } | 2550 & CODING_ISO_FLAG_SINGLE_SHIFT) |
| 1521 break; | 2551 found |= CATEGORY_MASK_ISO_8_2; |
| 1522 | 2552 goto check_extra_latin; |
| 1523 case ISO_CODE_CSI: | |
| 1524 single_shifting = 0; | |
| 1525 case ISO_CODE_SS2: | |
| 1526 case ISO_CODE_SS3: | |
| 1527 { | |
| 1528 int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE; | |
| 1529 | |
| 1530 if (inhibit_iso_escape_detection) | |
| 1531 break; | |
| 1532 if (c != ISO_CODE_CSI) | |
| 1533 { | |
| 1534 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | |
| 1535 & CODING_FLAG_ISO_SINGLE_SHIFT) | |
| 1536 newmask |= CODING_CATEGORY_MASK_ISO_8_1; | |
| 1537 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags | |
| 1538 & CODING_FLAG_ISO_SINGLE_SHIFT) | |
| 1539 newmask |= CODING_CATEGORY_MASK_ISO_8_2; | |
| 1540 single_shifting = 1; | |
| 1541 } | |
| 1542 if (VECTORP (Vlatin_extra_code_table) | |
| 1543 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | |
| 1544 { | |
| 1545 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | |
| 1546 & CODING_FLAG_ISO_LATIN_EXTRA) | |
| 1547 newmask |= CODING_CATEGORY_MASK_ISO_8_1; | |
| 1548 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags | |
| 1549 & CODING_FLAG_ISO_LATIN_EXTRA) | |
| 1550 newmask |= CODING_CATEGORY_MASK_ISO_8_2; | |
| 1551 } | |
| 1552 mask &= newmask; | |
| 1553 mask_found |= newmask; | |
| 1554 } | |
| 1555 break; | |
| 1556 | 2553 |
| 1557 default: | 2554 default: |
| 1558 if (c < 0x80) | 2555 if (c < 0x80) |
| 1559 { | 2556 { |
| 1560 single_shifting = 0; | 2557 single_shifting = 0; |
| 1561 break; | 2558 break; |
| 1562 } | 2559 } |
| 1563 else if (c < 0xA0) | 2560 if (c >= 0xA0) |
| 1564 { | 2561 { |
| 1565 single_shifting = 0; | 2562 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE; |
| 1566 if (VECTORP (Vlatin_extra_code_table) | 2563 found |= CATEGORY_MASK_ISO_8_1; |
| 1567 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | |
| 1568 { | |
| 1569 int newmask = 0; | |
| 1570 | |
| 1571 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | |
| 1572 & CODING_FLAG_ISO_LATIN_EXTRA) | |
| 1573 newmask |= CODING_CATEGORY_MASK_ISO_8_1; | |
| 1574 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags | |
| 1575 & CODING_FLAG_ISO_LATIN_EXTRA) | |
| 1576 newmask |= CODING_CATEGORY_MASK_ISO_8_2; | |
| 1577 mask &= newmask; | |
| 1578 mask_found |= newmask; | |
| 1579 } | |
| 1580 else | |
| 1581 return 0; | |
| 1582 } | |
| 1583 else | |
| 1584 { | |
| 1585 mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT | |
| 1586 | CODING_CATEGORY_MASK_ISO_7_ELSE); | |
| 1587 mask_found |= CODING_CATEGORY_MASK_ISO_8_1; | |
| 1588 /* Check the length of succeeding codes of the range | 2564 /* Check the length of succeeding codes of the range |
| 1589 0xA0..0xFF. If the byte length is odd, we exclude | 2565 0xA0..0xFF. If the byte length is even, we include |
| 1590 CODING_CATEGORY_MASK_ISO_8_2. We can check this only | 2566 CATEGORY_MASK_ISO_8_2 in `found'. We can check this |
| 1591 when we are not single shifting. */ | 2567 only when we are not single shifting. */ |
| 1592 if (!single_shifting | 2568 if (! single_shifting |
| 1593 && mask & CODING_CATEGORY_MASK_ISO_8_2) | 2569 && ! (rejected & CATEGORY_MASK_ISO_8_2)) |
| 1594 { | 2570 { |
| 1595 int i = 1; | 2571 int i = 1; |
| 1596 | |
| 1597 c = -1; | |
| 1598 while (src < src_end) | 2572 while (src < src_end) |
| 1599 { | 2573 { |
| 1600 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2574 ONE_MORE_BYTE (c); |
| 1601 if (c < 0xA0) | 2575 if (c < 0xA0) |
| 1602 break; | 2576 break; |
| 1603 i++; | 2577 i++; |
| 1604 } | 2578 } |
| 1605 | 2579 |
| 1606 if (i & 1 && src < src_end) | 2580 if (i & 1 && src < src_end) |
| 1607 mask &= ~CODING_CATEGORY_MASK_ISO_8_2; | 2581 rejected |= CATEGORY_MASK_ISO_8_2; |
| 1608 else | 2582 else |
| 1609 mask_found |= CODING_CATEGORY_MASK_ISO_8_2; | 2583 found |= CATEGORY_MASK_ISO_8_2; |
| 1610 if (c >= 0) | |
| 1611 /* This means that we have read one extra byte. */ | |
| 1612 goto retry; | |
| 1613 } | 2584 } |
| 2585 break; | |
| 1614 } | 2586 } |
| 1615 break; | 2587 check_extra_latin: |
| 1616 } | 2588 single_shifting = 0; |
| 1617 } | 2589 if (! VECTORP (Vlatin_extra_code_table) |
| 1618 label_end_of_loop: | 2590 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
| 1619 return (mask & mask_found); | 2591 { |
| 1620 } | 2592 rejected = CATEGORY_MASK_ISO; |
| 1621 | 2593 break; |
| 1622 /* Decode a character of which charset is CHARSET, the 1st position | 2594 } |
| 1623 code is C1, the 2nd position code is C2, and return the decoded | 2595 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) |
| 1624 character code. If the variable `translation_table' is non-nil, | 2596 & CODING_ISO_FLAG_LATIN_EXTRA) |
| 1625 returned the translated code. */ | 2597 found |= CATEGORY_MASK_ISO_8_1; |
| 1626 | 2598 else |
| 1627 #define DECODE_ISO_CHARACTER(charset, c1, c2) \ | 2599 rejected |= CATEGORY_MASK_ISO_8_1; |
| 1628 (NILP (translation_table) \ | 2600 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2]) |
| 1629 ? MAKE_CHAR (charset, c1, c2) \ | 2601 & CODING_ISO_FLAG_LATIN_EXTRA) |
| 1630 : translate_char (translation_table, -1, charset, c1, c2)) | 2602 found |= CATEGORY_MASK_ISO_8_2; |
| 2603 else | |
| 2604 rejected |= CATEGORY_MASK_ISO_8_2; | |
| 2605 } | |
| 2606 } | |
| 2607 detect_info->rejected |= CATEGORY_MASK_ISO; | |
| 2608 return 0; | |
| 2609 | |
| 2610 no_more_source: | |
| 2611 detect_info->rejected |= rejected; | |
| 2612 detect_info->found |= (found & ~rejected); | |
| 2613 return 1; | |
| 2614 } | |
| 2615 | |
| 1631 | 2616 |
| 1632 /* Set designation state into CODING. */ | 2617 /* Set designation state into CODING. */ |
| 1633 #define DECODE_DESIGNATION(reg, dimension, chars, final_char) \ | 2618 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \ |
| 1634 do { \ | 2619 do { \ |
| 1635 int charset, c; \ | 2620 int id, prev; \ |
| 1636 \ | 2621 \ |
| 1637 if (final_char < '0' || final_char >= 128) \ | 2622 if (final < '0' || final >= 128 \ |
| 1638 goto label_invalid_code; \ | 2623 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \ |
| 1639 charset = ISO_CHARSET_TABLE (make_number (dimension), \ | 2624 || !SAFE_CHARSET_P (coding, id)) \ |
| 1640 make_number (chars), \ | 2625 { \ |
| 1641 make_number (final_char)); \ | 2626 CODING_ISO_DESIGNATION (coding, reg) = -2; \ |
| 1642 c = MAKE_CHAR (charset, 0, 0); \ | 2627 goto invalid_code; \ |
| 1643 if (charset >= 0 \ | 2628 } \ |
| 1644 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \ | 2629 prev = CODING_ISO_DESIGNATION (coding, reg); \ |
| 1645 || CODING_SAFE_CHAR_P (safe_chars, c))) \ | 2630 if (id == charset_jisx0201_roman) \ |
| 1646 { \ | 2631 { \ |
| 1647 if (coding->spec.iso2022.last_invalid_designation_register == 0 \ | 2632 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \ |
| 1648 && reg == 0 \ | 2633 id = charset_ascii; \ |
| 1649 && charset == CHARSET_ASCII) \ | 2634 } \ |
| 1650 { \ | 2635 else if (id == charset_jisx0208_1978) \ |
| 1651 /* We should insert this designation sequence as is so \ | 2636 { \ |
| 1652 that it is surely written back to a file. */ \ | 2637 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \ |
| 1653 coding->spec.iso2022.last_invalid_designation_register = -1; \ | 2638 id = charset_jisx0208; \ |
| 1654 goto label_invalid_code; \ | 2639 } \ |
| 1655 } \ | 2640 CODING_ISO_DESIGNATION (coding, reg) = id; \ |
| 1656 coding->spec.iso2022.last_invalid_designation_register = -1; \ | 2641 /* If there was an invalid designation to REG previously, and this \ |
| 1657 if ((coding->mode & CODING_MODE_DIRECTION) \ | 2642 designation is ASCII to REG, we should keep this designation \ |
| 1658 && CHARSET_REVERSE_CHARSET (charset) >= 0) \ | 2643 sequence. */ \ |
| 1659 charset = CHARSET_REVERSE_CHARSET (charset); \ | 2644 if (prev == -2 && id == charset_ascii) \ |
| 1660 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \ | 2645 goto invalid_code; \ |
| 1661 } \ | |
| 1662 else \ | |
| 1663 { \ | |
| 1664 coding->spec.iso2022.last_invalid_designation_register = reg; \ | |
| 1665 goto label_invalid_code; \ | |
| 1666 } \ | |
| 1667 } while (0) | 2646 } while (0) |
| 1668 | 2647 |
| 1669 /* Allocate a memory block for storing information about compositions. | 2648 |
| 1670 The block is chained to the already allocated blocks. */ | 2649 #define MAYBE_FINISH_COMPOSITION() \ |
| 1671 | 2650 do { \ |
| 1672 void | 2651 int i; \ |
| 1673 coding_allocate_composition_data (coding, char_offset) | 2652 if (composition_state == COMPOSING_NO) \ |
| 1674 struct coding_system *coding; | 2653 break; \ |
| 1675 int char_offset; | 2654 /* It is assured that we have enough room for producing \ |
| 1676 { | 2655 characters stored in the table `components'. */ \ |
| 1677 struct composition_data *cmp_data | 2656 if (charbuf + component_idx > charbuf_end) \ |
| 1678 = (struct composition_data *) xmalloc (sizeof *cmp_data); | 2657 goto no_more_source; \ |
| 1679 | 2658 composition_state = COMPOSING_NO; \ |
| 1680 cmp_data->char_offset = char_offset; | 2659 if (method == COMPOSITION_RELATIVE \ |
| 1681 cmp_data->used = 0; | 2660 || method == COMPOSITION_WITH_ALTCHARS) \ |
| 1682 cmp_data->prev = coding->cmp_data; | 2661 { \ |
| 1683 cmp_data->next = NULL; | 2662 for (i = 0; i < component_idx; i++) \ |
| 1684 if (coding->cmp_data) | 2663 *charbuf++ = components[i]; \ |
| 1685 coding->cmp_data->next = cmp_data; | 2664 char_offset += component_idx; \ |
| 1686 coding->cmp_data = cmp_data; | 2665 } \ |
| 1687 coding->cmp_data_start = 0; | 2666 else \ |
| 1688 } | 2667 { \ |
| 2668 for (i = 0; i < component_idx; i += 2) \ | |
| 2669 *charbuf++ = components[i]; \ | |
| 2670 char_offset += (component_idx / 2) + 1; \ | |
| 2671 } \ | |
| 2672 } while (0) | |
| 2673 | |
| 1689 | 2674 |
| 1690 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. | 2675 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4. |
| 1691 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1 | 2676 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1 |
| 1692 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 | 2677 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1 |
| 1693 ESC 3 : altchar composition : ESC 3 ALT ... ESC 0 CHAR ... ESC 1 | 2678 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1 |
| 1694 ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1 | 2679 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1 |
| 1695 */ | 2680 */ |
| 1696 | 2681 |
| 1697 #define DECODE_COMPOSITION_START(c1) \ | 2682 #define DECODE_COMPOSITION_START(c1) \ |
| 1698 do { \ | |
| 1699 if (coding->composing == COMPOSITION_DISABLED) \ | |
| 1700 { \ | |
| 1701 *dst++ = ISO_CODE_ESC; \ | |
| 1702 *dst++ = c1 & 0x7f; \ | |
| 1703 coding->produced_char += 2; \ | |
| 1704 } \ | |
| 1705 else if (!COMPOSING_P (coding)) \ | |
| 1706 { \ | |
| 1707 /* This is surely the start of a composition. We must be sure \ | |
| 1708 that coding->cmp_data has enough space to store the \ | |
| 1709 information about the composition. If not, terminate the \ | |
| 1710 current decoding loop, allocate one more memory block for \ | |
| 1711 coding->cmp_data in the caller, then start the decoding \ | |
| 1712 loop again. We can't allocate memory here directly because \ | |
| 1713 it may cause buffer/string relocation. */ \ | |
| 1714 if (!coding->cmp_data \ | |
| 1715 || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \ | |
| 1716 >= COMPOSITION_DATA_SIZE)) \ | |
| 1717 { \ | |
| 1718 coding->result = CODING_FINISH_INSUFFICIENT_CMP; \ | |
| 1719 goto label_end_of_loop; \ | |
| 1720 } \ | |
| 1721 coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE \ | |
| 1722 : c1 == '2' ? COMPOSITION_WITH_RULE \ | |
| 1723 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \ | |
| 1724 : COMPOSITION_WITH_RULE_ALTCHARS); \ | |
| 1725 CODING_ADD_COMPOSITION_START (coding, coding->produced_char, \ | |
| 1726 coding->composing); \ | |
| 1727 coding->composition_rule_follows = 0; \ | |
| 1728 } \ | |
| 1729 else \ | |
| 1730 { \ | |
| 1731 /* We are already handling a composition. If the method is \ | |
| 1732 the following two, the codes following the current escape \ | |
| 1733 sequence are actual characters stored in a buffer. */ \ | |
| 1734 if (coding->composing == COMPOSITION_WITH_ALTCHARS \ | |
| 1735 || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) \ | |
| 1736 { \ | |
| 1737 coding->composing = COMPOSITION_RELATIVE; \ | |
| 1738 coding->composition_rule_follows = 0; \ | |
| 1739 } \ | |
| 1740 } \ | |
| 1741 } while (0) | |
| 1742 | |
| 1743 /* Handle composition end sequence ESC 1. */ | |
| 1744 | |
| 1745 #define DECODE_COMPOSITION_END(c1) \ | |
| 1746 do { \ | 2683 do { \ |
| 1747 if (! COMPOSING_P (coding)) \ | 2684 if (c1 == '0' \ |
| 2685 && composition_state == COMPOSING_COMPONENT_RULE) \ | |
| 1748 { \ | 2686 { \ |
| 1749 *dst++ = ISO_CODE_ESC; \ | 2687 component_len = component_idx; \ |
| 1750 *dst++ = c1; \ | 2688 composition_state = COMPOSING_CHAR; \ |
| 1751 coding->produced_char += 2; \ | |
| 1752 } \ | 2689 } \ |
| 1753 else \ | 2690 else \ |
| 1754 { \ | 2691 { \ |
| 1755 CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \ | 2692 const unsigned char *p; \ |
| 1756 coding->composing = COMPOSITION_NO; \ | 2693 \ |
| 2694 MAYBE_FINISH_COMPOSITION (); \ | |
| 2695 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \ | |
| 2696 goto no_more_source; \ | |
| 2697 for (p = src; p < src_end - 1; p++) \ | |
| 2698 if (*p == ISO_CODE_ESC && p[1] == '1') \ | |
| 2699 break; \ | |
| 2700 if (p == src_end - 1) \ | |
| 2701 { \ | |
| 2702 if (coding->mode & CODING_MODE_LAST_BLOCK) \ | |
| 2703 goto invalid_code; \ | |
| 2704 goto no_more_source; \ | |
| 2705 } \ | |
| 2706 \ | |
| 2707 /* This is surely the start of a composition. */ \ | |
| 2708 method = (c1 == '0' ? COMPOSITION_RELATIVE \ | |
| 2709 : c1 == '2' ? COMPOSITION_WITH_RULE \ | |
| 2710 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \ | |
| 2711 : COMPOSITION_WITH_RULE_ALTCHARS); \ | |
| 2712 composition_state = (c1 <= '2' ? COMPOSING_CHAR \ | |
| 2713 : COMPOSING_COMPONENT_CHAR); \ | |
| 2714 component_idx = component_len = 0; \ | |
| 1757 } \ | 2715 } \ |
| 1758 } while (0) | 2716 } while (0) |
| 2717 | |
| 2718 | |
| 2719 /* Handle composition end sequence ESC 1. */ | |
| 2720 | |
| 2721 #define DECODE_COMPOSITION_END() \ | |
| 2722 do { \ | |
| 2723 int nchars = (component_len > 0 ? component_idx - component_len \ | |
| 2724 : method == COMPOSITION_RELATIVE ? component_idx \ | |
| 2725 : (component_idx + 1) / 2); \ | |
| 2726 int i; \ | |
| 2727 int *saved_charbuf = charbuf; \ | |
| 2728 int from = char_offset; \ | |
| 2729 int to = from + nchars; \ | |
| 2730 \ | |
| 2731 ADD_COMPOSITION_DATA (charbuf, from, to, method); \ | |
| 2732 if (method != COMPOSITION_RELATIVE) \ | |
| 2733 { \ | |
| 2734 if (component_len == 0) \ | |
| 2735 for (i = 0; i < component_idx; i++) \ | |
| 2736 *charbuf++ = components[i]; \ | |
| 2737 else \ | |
| 2738 for (i = 0; i < component_len; i++) \ | |
| 2739 *charbuf++ = components[i]; \ | |
| 2740 *saved_charbuf = saved_charbuf - charbuf; \ | |
| 2741 } \ | |
| 2742 if (method == COMPOSITION_WITH_RULE) \ | |
| 2743 for (i = 0; i < component_idx; i += 2, char_offset++) \ | |
| 2744 *charbuf++ = components[i]; \ | |
| 2745 else \ | |
| 2746 for (i = component_len; i < component_idx; i++, char_offset++) \ | |
| 2747 *charbuf++ = components[i]; \ | |
| 2748 coding->annotated = 1; \ | |
| 2749 composition_state = COMPOSING_NO; \ | |
| 2750 } while (0) | |
| 2751 | |
| 1759 | 2752 |
| 1760 /* Decode a composition rule from the byte C1 (and maybe one more byte | 2753 /* Decode a composition rule from the byte C1 (and maybe one more byte |
| 1761 from SRC) and store one encoded composition rule in | 2754 from SRC) and store the encoded composition rule back into |
| 1762 coding->cmp_data. */ | 2755 C1 (0 if the code is out of the valid range). */ |
| 1763 | 2756 |
| 1764 #define DECODE_COMPOSITION_RULE(c1) \ | 2757 #define DECODE_COMPOSITION_RULE(c1) \ |
| 1765 do { \ | 2758 do { \ |
| 1766 int rule = 0; \ | |
| 1767 (c1) -= 32; \ | 2759 (c1) -= 32; \ |
| 1768 if (c1 < 81) /* old format (before ver.21) */ \ | 2760 if (c1 < 81) /* old format (before ver.21) */ \ |
| 1769 { \ | 2761 { \ |
| 1770 int gref = (c1) / 9; \ | 2762 int gref = (c1) / 9; \ |
| 1771 int nref = (c1) % 9; \ | 2763 int nref = (c1) % 9; \ |
| 1772 if (gref == 4) gref = 10; \ | 2764 if (gref == 4) gref = 10; \ |
| 1773 if (nref == 4) nref = 10; \ | 2765 if (nref == 4) nref = 10; \ |
| 1774 rule = COMPOSITION_ENCODE_RULE (gref, nref); \ | 2766 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \ |
| 1775 } \ | 2767 } \ |
| 1776 else if (c1 < 93) /* new format (after ver.21) */ \ | 2768 else if (c1 < 93) /* new format (after ver.21) */ \ |
| 1777 { \ | 2769 { \ |
| 1778 ONE_MORE_BYTE (c2); \ | 2770 ONE_MORE_BYTE (c2); \ |
| 1779 rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \ | 2771 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \ |
| 1780 } \ | 2772 } \ |
| 1781 CODING_ADD_COMPOSITION_COMPONENT (coding, rule); \ | 2773 else \ |
| 1782 coding->composition_rule_follows = 0; \ | 2774 c1 = 0; \ |
| 1783 } while (0) | 2775 } while (0) |
| 1784 | 2776 |
| 1785 | 2777 |
| 1786 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ | 2778 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ |
| 1787 | 2779 |
| 1788 static void | 2780 static void |
| 1789 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) | 2781 decode_coding_iso_2022 (coding) |
| 1790 struct coding_system *coding; | 2782 struct coding_system *coding; |
| 1791 unsigned char *source, *destination; | 2783 { |
| 1792 int src_bytes, dst_bytes; | 2784 const unsigned char *src = coding->source + coding->consumed; |
| 1793 { | 2785 const unsigned char *src_end = coding->source + coding->src_bytes; |
| 1794 unsigned char *src = source; | 2786 const unsigned char *src_base; |
| 1795 unsigned char *src_end = source + src_bytes; | 2787 int *charbuf = coding->charbuf; |
| 1796 unsigned char *dst = destination; | 2788 int *charbuf_end |
| 1797 unsigned char *dst_end = destination + dst_bytes; | 2789 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH; |
| 2790 int consumed_chars = 0, consumed_chars_base; | |
| 2791 int multibytep = coding->src_multibyte; | |
| 1798 /* Charsets invoked to graphic plane 0 and 1 respectively. */ | 2792 /* Charsets invoked to graphic plane 0 and 1 respectively. */ |
| 1799 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); | 2793 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); |
| 1800 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); | 2794 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1); |
| 1801 /* SRC_BASE remembers the start position in source in each loop. | 2795 struct charset *charset; |
| 1802 The loop will be exited when there's not enough source code | 2796 int c; |
| 1803 (within macro ONE_MORE_BYTE), or when there's not enough | 2797 /* For handling composition sequence. */ |
| 1804 destination area to produce a character (within macro | 2798 #define COMPOSING_NO 0 |
| 1805 EMIT_CHAR). */ | 2799 #define COMPOSING_CHAR 1 |
| 1806 unsigned char *src_base; | 2800 #define COMPOSING_RULE 2 |
| 1807 int c, charset; | 2801 #define COMPOSING_COMPONENT_CHAR 3 |
| 1808 Lisp_Object translation_table; | 2802 #define COMPOSING_COMPONENT_RULE 4 |
| 1809 Lisp_Object safe_chars; | 2803 |
| 1810 | 2804 int composition_state = COMPOSING_NO; |
| 1811 safe_chars = coding_safe_chars (coding->symbol); | 2805 enum composition_method method; |
| 1812 | 2806 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1]; |
| 1813 if (NILP (Venable_character_translation)) | 2807 int component_idx; |
| 1814 translation_table = Qnil; | 2808 int component_len; |
| 1815 else | 2809 Lisp_Object attrs, eol_type, charset_list; |
| 1816 { | 2810 int char_offset = coding->produced_char; |
| 1817 translation_table = coding->translation_table_for_decode; | 2811 int last_offset = char_offset; |
| 1818 if (NILP (translation_table)) | 2812 int last_id = charset_ascii; |
| 1819 translation_table = Vstandard_translation_table_for_decode; | 2813 |
| 1820 } | 2814 CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 1821 | 2815 setup_iso_safe_charsets (attrs); |
| 1822 coding->result = CODING_FINISH_NORMAL; | |
| 1823 | 2816 |
| 1824 while (1) | 2817 while (1) |
| 1825 { | 2818 { |
| 1826 int c1, c2; | 2819 int c1, c2; |
| 1827 | 2820 |
| 1828 src_base = src; | 2821 src_base = src; |
| 2822 consumed_chars_base = consumed_chars; | |
| 2823 | |
| 2824 if (charbuf >= charbuf_end) | |
| 2825 break; | |
| 2826 | |
| 1829 ONE_MORE_BYTE (c1); | 2827 ONE_MORE_BYTE (c1); |
| 1830 | 2828 |
| 1831 /* We produce no character or one character. */ | 2829 /* We produce at most one character. */ |
| 1832 switch (iso_code_class [c1]) | 2830 switch (iso_code_class [c1]) |
| 1833 { | 2831 { |
| 1834 case ISO_0x20_or_0x7F: | 2832 case ISO_0x20_or_0x7F: |
| 1835 if (COMPOSING_P (coding) && coding->composition_rule_follows) | 2833 if (composition_state != COMPOSING_NO) |
| 1836 { | 2834 { |
| 1837 DECODE_COMPOSITION_RULE (c1); | 2835 if (composition_state == COMPOSING_RULE |
| 1838 continue; | 2836 || composition_state == COMPOSING_COMPONENT_RULE) |
| 1839 } | |
| 1840 if (charset0 < 0 || CHARSET_CHARS (charset0) == 94) | |
| 1841 { | |
| 1842 /* This is SPACE or DEL. */ | |
| 1843 charset = CHARSET_ASCII; | |
| 1844 break; | |
| 1845 } | |
| 1846 /* This is a graphic character, we fall down ... */ | |
| 1847 | |
| 1848 case ISO_graphic_plane_0: | |
| 1849 if (COMPOSING_P (coding) && coding->composition_rule_follows) | |
| 1850 { | |
| 1851 DECODE_COMPOSITION_RULE (c1); | |
| 1852 continue; | |
| 1853 } | |
| 1854 charset = charset0; | |
| 1855 break; | |
| 1856 | |
| 1857 case ISO_0xA0_or_0xFF: | |
| 1858 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94 | |
| 1859 || coding->flags & CODING_FLAG_ISO_SEVEN_BITS) | |
| 1860 goto label_invalid_code; | |
| 1861 /* This is a graphic character, we fall down ... */ | |
| 1862 | |
| 1863 case ISO_graphic_plane_1: | |
| 1864 if (charset1 < 0) | |
| 1865 goto label_invalid_code; | |
| 1866 charset = charset1; | |
| 1867 break; | |
| 1868 | |
| 1869 case ISO_control_0: | |
| 1870 if (COMPOSING_P (coding)) | |
| 1871 DECODE_COMPOSITION_END ('1'); | |
| 1872 | |
| 1873 /* All ISO2022 control characters in this class have the | |
| 1874 same representation in Emacs internal format. */ | |
| 1875 if (c1 == '\n' | |
| 1876 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) | |
| 1877 && (coding->eol_type == CODING_EOL_CR | |
| 1878 || coding->eol_type == CODING_EOL_CRLF)) | |
| 1879 { | |
| 1880 coding->result = CODING_FINISH_INCONSISTENT_EOL; | |
| 1881 goto label_end_of_loop; | |
| 1882 } | |
| 1883 charset = CHARSET_ASCII; | |
| 1884 break; | |
| 1885 | |
| 1886 case ISO_control_1: | |
| 1887 if (COMPOSING_P (coding)) | |
| 1888 DECODE_COMPOSITION_END ('1'); | |
| 1889 goto label_invalid_code; | |
| 1890 | |
| 1891 case ISO_carriage_return: | |
| 1892 if (COMPOSING_P (coding)) | |
| 1893 DECODE_COMPOSITION_END ('1'); | |
| 1894 | |
| 1895 if (coding->eol_type == CODING_EOL_CR) | |
| 1896 c1 = '\n'; | |
| 1897 else if (coding->eol_type == CODING_EOL_CRLF) | |
| 1898 { | |
| 1899 ONE_MORE_BYTE (c1); | |
| 1900 if (c1 != ISO_CODE_LF) | |
| 1901 { | 2837 { |
| 1902 src--; | 2838 DECODE_COMPOSITION_RULE (c1); |
| 1903 c1 = '\r'; | 2839 components[component_idx++] = c1; |
| 2840 composition_state--; | |
| 2841 continue; | |
| 1904 } | 2842 } |
| 1905 } | 2843 } |
| 1906 charset = CHARSET_ASCII; | 2844 if (charset_id_0 < 0 |
| 2845 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0))) | |
| 2846 /* This is SPACE or DEL. */ | |
| 2847 charset = CHARSET_FROM_ID (charset_ascii); | |
| 2848 else | |
| 2849 charset = CHARSET_FROM_ID (charset_id_0); | |
| 1907 break; | 2850 break; |
| 1908 | 2851 |
| 2852 case ISO_graphic_plane_0: | |
| 2853 if (composition_state != COMPOSING_NO) | |
| 2854 { | |
| 2855 if (composition_state == COMPOSING_RULE | |
| 2856 || composition_state == COMPOSING_COMPONENT_RULE) | |
| 2857 { | |
| 2858 DECODE_COMPOSITION_RULE (c1); | |
| 2859 components[component_idx++] = c1; | |
| 2860 composition_state--; | |
| 2861 continue; | |
| 2862 } | |
| 2863 } | |
| 2864 charset = CHARSET_FROM_ID (charset_id_0); | |
| 2865 break; | |
| 2866 | |
| 2867 case ISO_0xA0_or_0xFF: | |
| 2868 if (charset_id_1 < 0 | |
| 2869 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1)) | |
| 2870 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) | |
| 2871 goto invalid_code; | |
| 2872 /* This is a graphic character, we fall down ... */ | |
| 2873 | |
| 2874 case ISO_graphic_plane_1: | |
| 2875 if (charset_id_1 < 0) | |
| 2876 goto invalid_code; | |
| 2877 charset = CHARSET_FROM_ID (charset_id_1); | |
| 2878 break; | |
| 2879 | |
| 2880 case ISO_carriage_return: | |
| 2881 if (c1 == '\r') | |
| 2882 { | |
| 2883 if (EQ (eol_type, Qdos)) | |
| 2884 { | |
| 2885 if (src == src_end) | |
| 2886 { | |
| 2887 coding->result = CODING_RESULT_INSUFFICIENT_SRC; | |
| 2888 goto no_more_source; | |
| 2889 } | |
| 2890 if (*src == '\n') | |
| 2891 ONE_MORE_BYTE (c1); | |
| 2892 } | |
| 2893 else if (EQ (eol_type, Qmac)) | |
| 2894 c1 = '\n'; | |
| 2895 } | |
| 2896 /* fall through */ | |
| 2897 | |
| 2898 case ISO_control_0: | |
| 2899 MAYBE_FINISH_COMPOSITION (); | |
| 2900 charset = CHARSET_FROM_ID (charset_ascii); | |
| 2901 break; | |
| 2902 | |
| 2903 case ISO_control_1: | |
| 2904 MAYBE_FINISH_COMPOSITION (); | |
| 2905 goto invalid_code; | |
| 2906 | |
| 1909 case ISO_shift_out: | 2907 case ISO_shift_out: |
| 1910 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) | 2908 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT) |
| 1911 || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0) | 2909 || CODING_ISO_DESIGNATION (coding, 1) < 0) |
| 1912 goto label_invalid_code; | 2910 goto invalid_code; |
| 1913 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; | 2911 CODING_ISO_INVOCATION (coding, 0) = 1; |
| 1914 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); | 2912 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); |
| 1915 continue; | 2913 continue; |
| 1916 | 2914 |
| 1917 case ISO_shift_in: | 2915 case ISO_shift_in: |
| 1918 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)) | 2916 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)) |
| 1919 goto label_invalid_code; | 2917 goto invalid_code; |
| 1920 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; | 2918 CODING_ISO_INVOCATION (coding, 0) = 0; |
| 1921 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); | 2919 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); |
| 1922 continue; | 2920 continue; |
| 1923 | 2921 |
| 1924 case ISO_single_shift_2_7: | 2922 case ISO_single_shift_2_7: |
| 1925 case ISO_single_shift_2: | 2923 case ISO_single_shift_2: |
| 1926 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)) | 2924 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)) |
| 1927 goto label_invalid_code; | 2925 goto invalid_code; |
| 1928 /* SS2 is handled as an escape sequence of ESC 'N' */ | 2926 /* SS2 is handled as an escape sequence of ESC 'N' */ |
| 1929 c1 = 'N'; | 2927 c1 = 'N'; |
| 1930 goto label_escape_sequence; | 2928 goto label_escape_sequence; |
| 1931 | 2929 |
| 1932 case ISO_single_shift_3: | 2930 case ISO_single_shift_3: |
| 1933 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)) | 2931 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)) |
| 1934 goto label_invalid_code; | 2932 goto invalid_code; |
| 1935 /* SS3 is handled as an escape sequence of ESC 'O' */ | 2933 /* SS3 is handled as an escape sequence of ESC 'O' */ |
| 1936 c1 = 'O'; | 2934 c1 = 'O'; |
| 1937 goto label_escape_sequence; | 2935 goto label_escape_sequence; |
| 1938 | 2936 |
| 1939 case ISO_control_sequence_introducer: | 2937 case ISO_control_sequence_introducer: |
| 1942 goto label_escape_sequence; | 2940 goto label_escape_sequence; |
| 1943 | 2941 |
| 1944 case ISO_escape: | 2942 case ISO_escape: |
| 1945 ONE_MORE_BYTE (c1); | 2943 ONE_MORE_BYTE (c1); |
| 1946 label_escape_sequence: | 2944 label_escape_sequence: |
| 1947 /* Escape sequences handled by Emacs are invocation, | 2945 /* Escape sequences handled here are invocation, |
| 1948 designation, direction specification, and character | 2946 designation, direction specification, and character |
| 1949 composition specification. */ | 2947 composition specification. */ |
| 1950 switch (c1) | 2948 switch (c1) |
| 1951 { | 2949 { |
| 1952 case '&': /* revision of following character set */ | 2950 case '&': /* revision of following character set */ |
| 1953 ONE_MORE_BYTE (c1); | 2951 ONE_MORE_BYTE (c1); |
| 1954 if (!(c1 >= '@' && c1 <= '~')) | 2952 if (!(c1 >= '@' && c1 <= '~')) |
| 1955 goto label_invalid_code; | 2953 goto invalid_code; |
| 1956 ONE_MORE_BYTE (c1); | 2954 ONE_MORE_BYTE (c1); |
| 1957 if (c1 != ISO_CODE_ESC) | 2955 if (c1 != ISO_CODE_ESC) |
| 1958 goto label_invalid_code; | 2956 goto invalid_code; |
| 1959 ONE_MORE_BYTE (c1); | 2957 ONE_MORE_BYTE (c1); |
| 1960 goto label_escape_sequence; | 2958 goto label_escape_sequence; |
| 1961 | 2959 |
| 1962 case '$': /* designation of 2-byte character set */ | 2960 case '$': /* designation of 2-byte character set */ |
| 1963 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION)) | 2961 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION)) |
| 1964 goto label_invalid_code; | 2962 goto invalid_code; |
| 1965 ONE_MORE_BYTE (c1); | 2963 ONE_MORE_BYTE (c1); |
| 1966 if (c1 >= '@' && c1 <= 'B') | 2964 if (c1 >= '@' && c1 <= 'B') |
| 1967 { /* designation of JISX0208.1978, GB2312.1980, | 2965 { /* designation of JISX0208.1978, GB2312.1980, |
| 1968 or JISX0208.1980 */ | 2966 or JISX0208.1980 */ |
| 1969 DECODE_DESIGNATION (0, 2, 94, c1); | 2967 DECODE_DESIGNATION (0, 2, 0, c1); |
| 1970 } | 2968 } |
| 1971 else if (c1 >= 0x28 && c1 <= 0x2B) | 2969 else if (c1 >= 0x28 && c1 <= 0x2B) |
| 1972 { /* designation of DIMENSION2_CHARS94 character set */ | 2970 { /* designation of DIMENSION2_CHARS94 character set */ |
| 1973 ONE_MORE_BYTE (c2); | 2971 ONE_MORE_BYTE (c2); |
| 1974 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2); | 2972 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2); |
| 1975 } | 2973 } |
| 1976 else if (c1 >= 0x2C && c1 <= 0x2F) | 2974 else if (c1 >= 0x2C && c1 <= 0x2F) |
| 1977 { /* designation of DIMENSION2_CHARS96 character set */ | 2975 { /* designation of DIMENSION2_CHARS96 character set */ |
| 1978 ONE_MORE_BYTE (c2); | 2976 ONE_MORE_BYTE (c2); |
| 1979 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2); | 2977 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2); |
| 1980 } | 2978 } |
| 1981 else | 2979 else |
| 1982 goto label_invalid_code; | 2980 goto invalid_code; |
| 1983 /* We must update these variables now. */ | 2981 /* We must update these variables now. */ |
| 1984 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); | 2982 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); |
| 1985 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); | 2983 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1); |
| 1986 continue; | 2984 continue; |
| 1987 | 2985 |
| 1988 case 'n': /* invocation of locking-shift-2 */ | 2986 case 'n': /* invocation of locking-shift-2 */ |
| 1989 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) | 2987 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT) |
| 1990 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0) | 2988 || CODING_ISO_DESIGNATION (coding, 2) < 0) |
| 1991 goto label_invalid_code; | 2989 goto invalid_code; |
| 1992 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; | 2990 CODING_ISO_INVOCATION (coding, 0) = 2; |
| 1993 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); | 2991 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); |
| 1994 continue; | 2992 continue; |
| 1995 | 2993 |
| 1996 case 'o': /* invocation of locking-shift-3 */ | 2994 case 'o': /* invocation of locking-shift-3 */ |
| 1997 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT) | 2995 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT) |
| 1998 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0) | 2996 || CODING_ISO_DESIGNATION (coding, 3) < 0) |
| 1999 goto label_invalid_code; | 2997 goto invalid_code; |
| 2000 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; | 2998 CODING_ISO_INVOCATION (coding, 0) = 3; |
| 2001 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); | 2999 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); |
| 2002 continue; | 3000 continue; |
| 2003 | 3001 |
| 2004 case 'N': /* invocation of single-shift-2 */ | 3002 case 'N': /* invocation of single-shift-2 */ |
| 2005 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) | 3003 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) |
| 2006 || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0) | 3004 || CODING_ISO_DESIGNATION (coding, 2) < 0) |
| 2007 goto label_invalid_code; | 3005 goto invalid_code; |
| 2008 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2); | 3006 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2)); |
| 2009 ONE_MORE_BYTE (c1); | 3007 ONE_MORE_BYTE (c1); |
| 2010 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) | 3008 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) |
| 2011 goto label_invalid_code; | 3009 goto invalid_code; |
| 2012 break; | 3010 break; |
| 2013 | 3011 |
| 2014 case 'O': /* invocation of single-shift-3 */ | 3012 case 'O': /* invocation of single-shift-3 */ |
| 2015 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) | 3013 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) |
| 2016 || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0) | 3014 || CODING_ISO_DESIGNATION (coding, 3) < 0) |
| 2017 goto label_invalid_code; | 3015 goto invalid_code; |
| 2018 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3); | 3016 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3)); |
| 2019 ONE_MORE_BYTE (c1); | 3017 ONE_MORE_BYTE (c1); |
| 2020 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) | 3018 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)) |
| 2021 goto label_invalid_code; | 3019 goto invalid_code; |
| 2022 break; | 3020 break; |
| 2023 | 3021 |
| 2024 case '0': case '2': case '3': case '4': /* start composition */ | 3022 case '0': case '2': case '3': case '4': /* start composition */ |
| 3023 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)) | |
| 3024 goto invalid_code; | |
| 2025 DECODE_COMPOSITION_START (c1); | 3025 DECODE_COMPOSITION_START (c1); |
| 2026 continue; | 3026 continue; |
| 2027 | 3027 |
| 2028 case '1': /* end composition */ | 3028 case '1': /* end composition */ |
| 2029 DECODE_COMPOSITION_END (c1); | 3029 if (composition_state == COMPOSING_NO) |
| 3030 goto invalid_code; | |
| 3031 DECODE_COMPOSITION_END (); | |
| 2030 continue; | 3032 continue; |
| 2031 | 3033 |
| 2032 case '[': /* specification of direction */ | 3034 case '[': /* specification of direction */ |
| 2033 if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION) | 3035 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION) |
| 2034 goto label_invalid_code; | 3036 goto invalid_code; |
| 2035 /* For the moment, nested direction is not supported. | 3037 /* For the moment, nested direction is not supported. |
| 2036 So, `coding->mode & CODING_MODE_DIRECTION' zero means | 3038 So, `coding->mode & CODING_MODE_DIRECTION' zero means |
| 2037 left-to-right, and nonzero means right-to-left. */ | 3039 left-to-right, and nonzero means right-to-left. */ |
| 2038 ONE_MORE_BYTE (c1); | 3040 ONE_MORE_BYTE (c1); |
| 2039 switch (c1) | 3041 switch (c1) |
| 2040 { | 3042 { |
| 2041 case ']': /* end of the current direction */ | 3043 case ']': /* end of the current direction */ |
| 2042 coding->mode &= ~CODING_MODE_DIRECTION; | 3044 coding->mode &= ~CODING_MODE_DIRECTION; |
| 2045 case '1': /* start of left-to-right direction */ | 3047 case '1': /* start of left-to-right direction */ |
| 2046 ONE_MORE_BYTE (c1); | 3048 ONE_MORE_BYTE (c1); |
| 2047 if (c1 == ']') | 3049 if (c1 == ']') |
| 2048 coding->mode &= ~CODING_MODE_DIRECTION; | 3050 coding->mode &= ~CODING_MODE_DIRECTION; |
| 2049 else | 3051 else |
| 2050 goto label_invalid_code; | 3052 goto invalid_code; |
| 2051 break; | 3053 break; |
| 2052 | 3054 |
| 2053 case '2': /* start of right-to-left direction */ | 3055 case '2': /* start of right-to-left direction */ |
| 2054 ONE_MORE_BYTE (c1); | 3056 ONE_MORE_BYTE (c1); |
| 2055 if (c1 == ']') | 3057 if (c1 == ']') |
| 2056 coding->mode |= CODING_MODE_DIRECTION; | 3058 coding->mode |= CODING_MODE_DIRECTION; |
| 2057 else | 3059 else |
| 2058 goto label_invalid_code; | 3060 goto invalid_code; |
| 2059 break; | 3061 break; |
| 2060 | 3062 |
| 2061 default: | 3063 default: |
| 2062 goto label_invalid_code; | 3064 goto invalid_code; |
| 2063 } | 3065 } |
| 2064 continue; | 3066 continue; |
| 2065 | 3067 |
| 2066 case '%': | 3068 case '%': |
| 2067 if (COMPOSING_P (coding)) | |
| 2068 DECODE_COMPOSITION_END ('1'); | |
| 2069 ONE_MORE_BYTE (c1); | 3069 ONE_MORE_BYTE (c1); |
| 2070 if (c1 == '/') | 3070 if (c1 == '/') |
| 2071 { | 3071 { |
| 2072 /* CTEXT extended segment: | 3072 /* CTEXT extended segment: |
| 2073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- | 3073 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- |
| 2074 We keep these bytes as is for the moment. | 3074 We keep these bytes as is for the moment. |
| 2075 They may be decoded by post-read-conversion. */ | 3075 They may be decoded by post-read-conversion. */ |
| 2076 int dim, M, L; | 3076 int dim, M, L; |
| 2077 int size, required; | 3077 int size; |
| 2078 int produced_chars; | 3078 |
| 2079 | |
| 2080 ONE_MORE_BYTE (dim); | 3079 ONE_MORE_BYTE (dim); |
| 2081 ONE_MORE_BYTE (M); | 3080 ONE_MORE_BYTE (M); |
| 2082 ONE_MORE_BYTE (L); | 3081 ONE_MORE_BYTE (L); |
| 2083 size = ((M - 128) * 128) + (L - 128); | 3082 size = ((M - 128) * 128) + (L - 128); |
| 2084 required = 8 + size * 2; | 3083 if (charbuf + 8 + size > charbuf_end) |
| 2085 if (dst + required > (dst_bytes ? dst_end : src)) | 3084 goto break_loop; |
| 2086 goto label_end_of_loop; | 3085 *charbuf++ = ISO_CODE_ESC; |
| 2087 *dst++ = ISO_CODE_ESC; | 3086 *charbuf++ = '%'; |
| 2088 *dst++ = '%'; | 3087 *charbuf++ = '/'; |
| 2089 *dst++ = '/'; | 3088 *charbuf++ = dim; |
| 2090 *dst++ = dim; | 3089 *charbuf++ = BYTE8_TO_CHAR (M); |
| 2091 produced_chars = 4; | 3090 *charbuf++ = BYTE8_TO_CHAR (L); |
| 2092 dst += CHAR_STRING (M, dst), produced_chars++; | |
| 2093 dst += CHAR_STRING (L, dst), produced_chars++; | |
| 2094 while (size-- > 0) | 3091 while (size-- > 0) |
| 2095 { | 3092 { |
| 2096 ONE_MORE_BYTE (c1); | 3093 ONE_MORE_BYTE (c1); |
| 2097 dst += CHAR_STRING (c1, dst), produced_chars++; | 3094 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); |
| 2098 } | 3095 } |
| 2099 coding->produced_char += produced_chars; | |
| 2100 } | 3096 } |
| 2101 else if (c1 == 'G') | 3097 else if (c1 == 'G') |
| 2102 { | 3098 { |
| 2103 unsigned char *d = dst; | |
| 2104 int produced_chars; | |
| 2105 | |
| 2106 /* XFree86 extension for embedding UTF-8 in CTEXT: | 3099 /* XFree86 extension for embedding UTF-8 in CTEXT: |
| 2107 ESC % G --UTF-8-BYTES-- ESC % @ | 3100 ESC % G --UTF-8-BYTES-- ESC % @ |
| 2108 We keep these bytes as is for the moment. | 3101 We keep these bytes as is for the moment. |
| 2109 They may be decoded by post-read-conversion. */ | 3102 They may be decoded by post-read-conversion. */ |
| 2110 if (d + 6 > (dst_bytes ? dst_end : src)) | 3103 int *p = charbuf; |
| 2111 goto label_end_of_loop; | 3104 |
| 2112 *d++ = ISO_CODE_ESC; | 3105 if (p + 6 > charbuf_end) |
| 2113 *d++ = '%'; | 3106 goto break_loop; |
| 2114 *d++ = 'G'; | 3107 *p++ = ISO_CODE_ESC; |
| 2115 produced_chars = 3; | 3108 *p++ = '%'; |
| 2116 while (d + 1 < (dst_bytes ? dst_end : src)) | 3109 *p++ = 'G'; |
| 3110 while (p < charbuf_end) | |
| 2117 { | 3111 { |
| 2118 ONE_MORE_BYTE (c1); | 3112 ONE_MORE_BYTE (c1); |
| 2119 if (c1 == ISO_CODE_ESC | 3113 if (c1 == ISO_CODE_ESC |
| 2120 && src + 1 < src_end | 3114 && src + 1 < src_end |
| 2121 && src[0] == '%' | 3115 && src[0] == '%' |
| 2122 && src[1] == '@') | 3116 && src[1] == '@') |
| 2123 break; | 3117 break; |
| 2124 d += CHAR_STRING (c1, d), produced_chars++; | 3118 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1); |
| 2125 } | 3119 } |
| 2126 if (d + 3 > (dst_bytes ? dst_end : src)) | 3120 if (p + 3 > charbuf_end) |
| 2127 goto label_end_of_loop; | 3121 goto break_loop; |
| 2128 *d++ = ISO_CODE_ESC; | 3122 *p++ = ISO_CODE_ESC; |
| 2129 *d++ = '%'; | 3123 *p++ = '%'; |
| 2130 *d++ = '@'; | 3124 *p++ = '@'; |
| 2131 dst = d; | 3125 charbuf = p; |
| 2132 coding->produced_char += produced_chars + 3; | |
| 2133 } | 3126 } |
| 2134 else | 3127 else |
| 2135 goto label_invalid_code; | 3128 goto invalid_code; |
| 2136 continue; | 3129 continue; |
| 3130 break; | |
| 2137 | 3131 |
| 2138 default: | 3132 default: |
| 2139 if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION)) | 3133 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION)) |
| 2140 goto label_invalid_code; | 3134 goto invalid_code; |
| 2141 if (c1 >= 0x28 && c1 <= 0x2B) | 3135 if (c1 >= 0x28 && c1 <= 0x2B) |
| 2142 { /* designation of DIMENSION1_CHARS94 character set */ | 3136 { /* designation of DIMENSION1_CHARS94 character set */ |
| 2143 ONE_MORE_BYTE (c2); | 3137 ONE_MORE_BYTE (c2); |
| 2144 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2); | 3138 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2); |
| 2145 } | 3139 } |
| 2146 else if (c1 >= 0x2C && c1 <= 0x2F) | 3140 else if (c1 >= 0x2C && c1 <= 0x2F) |
| 2147 { /* designation of DIMENSION1_CHARS96 character set */ | 3141 { /* designation of DIMENSION1_CHARS96 character set */ |
| 2148 ONE_MORE_BYTE (c2); | 3142 ONE_MORE_BYTE (c2); |
| 2149 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2); | 3143 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2); |
| 2150 } | 3144 } |
| 2151 else | 3145 else |
| 2152 goto label_invalid_code; | 3146 goto invalid_code; |
| 2153 /* We must update these variables now. */ | 3147 /* We must update these variables now. */ |
| 2154 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0); | 3148 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); |
| 2155 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1); | 3149 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1); |
| 2156 continue; | 3150 continue; |
| 2157 } | 3151 } |
| 2158 } | 3152 } |
| 2159 | 3153 |
| 3154 if (charset->id != charset_ascii | |
| 3155 && last_id != charset->id) | |
| 3156 { | |
| 3157 if (last_id != charset_ascii) | |
| 3158 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | |
| 3159 last_id = charset->id; | |
| 3160 last_offset = char_offset; | |
| 3161 } | |
| 3162 | |
| 2160 /* Now we know CHARSET and 1st position code C1 of a character. | 3163 /* Now we know CHARSET and 1st position code C1 of a character. |
| 2161 Produce a multibyte sequence for that character while getting | 3164 Produce a decoded character while getting 2nd position code |
| 2162 2nd position code C2 if necessary. */ | 3165 C2 if necessary. */ |
| 2163 if (CHARSET_DIMENSION (charset) == 2) | 3166 c1 &= 0x7F; |
| 3167 if (CHARSET_DIMENSION (charset) > 1) | |
| 2164 { | 3168 { |
| 2165 ONE_MORE_BYTE (c2); | 3169 ONE_MORE_BYTE (c2); |
| 2166 if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0) | 3170 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)) |
| 2167 /* C2 is not in a valid range. */ | 3171 /* C2 is not in a valid range. */ |
| 2168 goto label_invalid_code; | 3172 goto invalid_code; |
| 2169 } | 3173 c1 = (c1 << 8) | (c2 & 0x7F); |
| 2170 c = DECODE_ISO_CHARACTER (charset, c1, c2); | 3174 if (CHARSET_DIMENSION (charset) > 2) |
| 2171 EMIT_CHAR (c); | 3175 { |
| 3176 ONE_MORE_BYTE (c2); | |
| 3177 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)) | |
| 3178 /* C2 is not in a valid range. */ | |
| 3179 goto invalid_code; | |
| 3180 c1 = (c1 << 8) | (c2 & 0x7F); | |
| 3181 } | |
| 3182 } | |
| 3183 | |
| 3184 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c); | |
| 3185 if (c < 0) | |
| 3186 { | |
| 3187 MAYBE_FINISH_COMPOSITION (); | |
| 3188 for (; src_base < src; src_base++, char_offset++) | |
| 3189 { | |
| 3190 if (ASCII_BYTE_P (*src_base)) | |
| 3191 *charbuf++ = *src_base; | |
| 3192 else | |
| 3193 *charbuf++ = BYTE8_TO_CHAR (*src_base); | |
| 3194 } | |
| 3195 } | |
| 3196 else if (composition_state == COMPOSING_NO) | |
| 3197 { | |
| 3198 *charbuf++ = c; | |
| 3199 char_offset++; | |
| 3200 } | |
| 3201 else | |
| 3202 { | |
| 3203 components[component_idx++] = c; | |
| 3204 if (method == COMPOSITION_WITH_RULE | |
| 3205 || (method == COMPOSITION_WITH_RULE_ALTCHARS | |
| 3206 && composition_state == COMPOSING_COMPONENT_CHAR)) | |
| 3207 composition_state++; | |
| 3208 } | |
| 2172 continue; | 3209 continue; |
| 2173 | 3210 |
| 2174 label_invalid_code: | 3211 invalid_code: |
| 3212 MAYBE_FINISH_COMPOSITION (); | |
| 3213 src = src_base; | |
| 3214 consumed_chars = consumed_chars_base; | |
| 3215 ONE_MORE_BYTE (c); | |
| 3216 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | |
| 3217 char_offset++; | |
| 2175 coding->errors++; | 3218 coding->errors++; |
| 2176 if (COMPOSING_P (coding)) | 3219 continue; |
| 2177 DECODE_COMPOSITION_END ('1'); | 3220 |
| 2178 src = src_base; | 3221 break_loop: |
| 2179 c = *src++; | 3222 break; |
| 2180 EMIT_CHAR (c); | 3223 } |
| 2181 } | 3224 |
| 2182 | 3225 no_more_source: |
| 2183 label_end_of_loop: | 3226 if (last_id != charset_ascii) |
| 2184 coding->consumed = coding->consumed_char = src_base - source; | 3227 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); |
| 2185 coding->produced = dst - destination; | 3228 coding->consumed_char += consumed_chars_base; |
| 2186 return; | 3229 coding->consumed = src_base - coding->source; |
| 3230 coding->charbuf_used = charbuf - coding->charbuf; | |
| 2187 } | 3231 } |
| 2188 | 3232 |
| 2189 | 3233 |
| 2190 /* ISO2022 encoding stuff. */ | 3234 /* ISO2022 encoding stuff. */ |
| 2191 | 3235 |
| 2192 /* | 3236 /* |
| 2193 It is not enough to say just "ISO2022" on encoding, we have to | 3237 It is not enough to say just "ISO2022" on encoding, we have to |
| 2194 specify more details. In Emacs, each ISO2022 coding system | 3238 specify more details. In Emacs, each coding system of ISO2022 |
| 2195 variant has the following specifications: | 3239 variant has the following specifications: |
| 2196 1. Initial designation to G0 through G3. | 3240 1. Initial designation to G0 through G3. |
| 2197 2. Allows short-form designation? | 3241 2. Allows short-form designation? |
| 2198 3. ASCII should be designated to G0 before control characters? | 3242 3. ASCII should be designated to G0 before control characters? |
| 2199 4. ASCII should be designated to G0 at end of line? | 3243 4. ASCII should be designated to G0 at end of line? |
| 2200 5. 7-bit environment or 8-bit environment? | 3244 5. 7-bit environment or 8-bit environment? |
| 2201 6. Use locking-shift? | 3245 6. Use locking-shift? |
| 2202 7. Use Single-shift? | 3246 7. Use Single-shift? |
| 2203 And the following two are only for Japanese: | 3247 And the following two are only for Japanese: |
| 2204 8. Use ASCII in place of JIS0201-1976-Roman? | 3248 8. Use ASCII in place of JIS0201-1976-Roman? |
| 2205 9. Use JISX0208-1983 in place of JISX0208-1978? | 3249 9. Use JISX0208-1983 in place of JISX0208-1978? |
| 2206 These specifications are encoded in `coding->flags' as flag bits | 3250 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits |
| 2207 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more | 3251 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more |
| 2208 details. | 3252 details. |
| 2209 */ | 3253 */ |
| 2210 | 3254 |
| 2211 /* Produce codes (escape sequence) for designating CHARSET to graphic | 3255 /* Produce codes (escape sequence) for designating CHARSET to graphic |
| 2212 register REG at DST, and increment DST. If <final-char> of CHARSET is | 3256 register REG at DST, and increment DST. If <final-char> of CHARSET is |
| 2213 '@', 'A', or 'B' and the coding system CODING allows, produce | 3257 '@', 'A', or 'B' and the coding system CODING allows, produce |
| 2214 designation sequence of short-form. */ | 3258 designation sequence of short-form. */ |
| 2215 | 3259 |
| 2216 #define ENCODE_DESIGNATION(charset, reg, coding) \ | 3260 #define ENCODE_DESIGNATION(charset, reg, coding) \ |
| 2217 do { \ | 3261 do { \ |
| 2218 unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \ | 3262 unsigned char final_char = CHARSET_ISO_FINAL (charset); \ |
| 2219 char *intermediate_char_94 = "()*+"; \ | 3263 char *intermediate_char_94 = "()*+"; \ |
| 2220 char *intermediate_char_96 = ",-./"; \ | 3264 char *intermediate_char_96 = ",-./"; \ |
| 2221 int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset); \ | 3265 int revision = -1; \ |
| 2222 \ | 3266 int c; \ |
| 2223 if (revision < 255) \ | 3267 \ |
| 3268 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \ | |
| 3269 revision = CHARSET_ISO_REVISION (charset); \ | |
| 3270 \ | |
| 3271 if (revision >= 0) \ | |
| 2224 { \ | 3272 { \ |
| 2225 *dst++ = ISO_CODE_ESC; \ | 3273 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \ |
| 2226 *dst++ = '&'; \ | 3274 EMIT_ONE_BYTE ('@' + revision); \ |
| 2227 *dst++ = '@' + revision; \ | |
| 2228 } \ | 3275 } \ |
| 2229 *dst++ = ISO_CODE_ESC; \ | 3276 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \ |
| 2230 if (CHARSET_DIMENSION (charset) == 1) \ | 3277 if (CHARSET_DIMENSION (charset) == 1) \ |
| 2231 { \ | 3278 { \ |
| 2232 if (CHARSET_CHARS (charset) == 94) \ | 3279 if (! CHARSET_ISO_CHARS_96 (charset)) \ |
| 2233 *dst++ = (unsigned char) (intermediate_char_94[reg]); \ | 3280 c = intermediate_char_94[reg]; \ |
| 2234 else \ | 3281 else \ |
| 2235 *dst++ = (unsigned char) (intermediate_char_96[reg]); \ | 3282 c = intermediate_char_96[reg]; \ |
| 3283 EMIT_ONE_ASCII_BYTE (c); \ | |
| 2236 } \ | 3284 } \ |
| 2237 else \ | 3285 else \ |
| 2238 { \ | 3286 { \ |
| 2239 *dst++ = '$'; \ | 3287 EMIT_ONE_ASCII_BYTE ('$'); \ |
| 2240 if (CHARSET_CHARS (charset) == 94) \ | 3288 if (! CHARSET_ISO_CHARS_96 (charset)) \ |
| 2241 { \ | 3289 { \ |
| 2242 if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM) \ | 3290 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \ |
| 2243 || reg != 0 \ | 3291 || reg != 0 \ |
| 2244 || final_char < '@' || final_char > 'B') \ | 3292 || final_char < '@' || final_char > 'B') \ |
| 2245 *dst++ = (unsigned char) (intermediate_char_94[reg]); \ | 3293 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \ |
| 2246 } \ | 3294 } \ |
| 2247 else \ | 3295 else \ |
| 2248 *dst++ = (unsigned char) (intermediate_char_96[reg]); \ | 3296 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \ |
| 2249 } \ | 3297 } \ |
| 2250 *dst++ = final_char; \ | 3298 EMIT_ONE_ASCII_BYTE (final_char); \ |
| 2251 CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; \ | 3299 \ |
| 3300 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \ | |
| 2252 } while (0) | 3301 } while (0) |
| 3302 | |
| 2253 | 3303 |
| 2254 /* The following two macros produce codes (control character or escape | 3304 /* The following two macros produce codes (control character or escape |
| 2255 sequence) for ISO2022 single-shift functions (single-shift-2 and | 3305 sequence) for ISO2022 single-shift functions (single-shift-2 and |
| 2256 single-shift-3). */ | 3306 single-shift-3). */ |
| 2257 | 3307 |
| 2258 #define ENCODE_SINGLE_SHIFT_2 \ | 3308 #define ENCODE_SINGLE_SHIFT_2 \ |
| 2259 do { \ | 3309 do { \ |
| 2260 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ | 3310 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ |
| 2261 *dst++ = ISO_CODE_ESC, *dst++ = 'N'; \ | 3311 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \ |
| 2262 else \ | 3312 else \ |
| 2263 *dst++ = ISO_CODE_SS2; \ | 3313 EMIT_ONE_BYTE (ISO_CODE_SS2); \ |
| 2264 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \ | 3314 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \ |
| 2265 } while (0) | 3315 } while (0) |
| 2266 | 3316 |
| 2267 #define ENCODE_SINGLE_SHIFT_3 \ | 3317 |
| 2268 do { \ | 3318 #define ENCODE_SINGLE_SHIFT_3 \ |
| 2269 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ | 3319 do { \ |
| 2270 *dst++ = ISO_CODE_ESC, *dst++ = 'O'; \ | 3320 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ |
| 2271 else \ | 3321 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \ |
| 2272 *dst++ = ISO_CODE_SS3; \ | 3322 else \ |
| 2273 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \ | 3323 EMIT_ONE_BYTE (ISO_CODE_SS3); \ |
| 3324 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \ | |
| 2274 } while (0) | 3325 } while (0) |
| 3326 | |
| 2275 | 3327 |
| 2276 /* The following four macros produce codes (control character or | 3328 /* The following four macros produce codes (control character or |
| 2277 escape sequence) for ISO2022 locking-shift functions (shift-in, | 3329 escape sequence) for ISO2022 locking-shift functions (shift-in, |
| 2278 shift-out, locking-shift-2, and locking-shift-3). */ | 3330 shift-out, locking-shift-2, and locking-shift-3). */ |
| 2279 | 3331 |
| 2280 #define ENCODE_SHIFT_IN \ | 3332 #define ENCODE_SHIFT_IN \ |
| 2281 do { \ | 3333 do { \ |
| 2282 *dst++ = ISO_CODE_SI; \ | 3334 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \ |
| 2283 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \ | 3335 CODING_ISO_INVOCATION (coding, 0) = 0; \ |
| 2284 } while (0) | 3336 } while (0) |
| 2285 | 3337 |
| 2286 #define ENCODE_SHIFT_OUT \ | 3338 |
| 2287 do { \ | 3339 #define ENCODE_SHIFT_OUT \ |
| 2288 *dst++ = ISO_CODE_SO; \ | 3340 do { \ |
| 2289 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \ | 3341 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \ |
| 3342 CODING_ISO_INVOCATION (coding, 0) = 1; \ | |
| 2290 } while (0) | 3343 } while (0) |
| 2291 | 3344 |
| 2292 #define ENCODE_LOCKING_SHIFT_2 \ | 3345 |
| 2293 do { \ | 3346 #define ENCODE_LOCKING_SHIFT_2 \ |
| 2294 *dst++ = ISO_CODE_ESC, *dst++ = 'n'; \ | 3347 do { \ |
| 2295 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \ | 3348 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \ |
| 3349 CODING_ISO_INVOCATION (coding, 0) = 2; \ | |
| 2296 } while (0) | 3350 } while (0) |
| 2297 | 3351 |
| 2298 #define ENCODE_LOCKING_SHIFT_3 \ | 3352 |
| 2299 do { \ | 3353 #define ENCODE_LOCKING_SHIFT_3 \ |
| 2300 *dst++ = ISO_CODE_ESC, *dst++ = 'o'; \ | 3354 do { \ |
| 2301 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \ | 3355 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \ |
| 3356 CODING_ISO_INVOCATION (coding, 0) = 3; \ | |
| 2302 } while (0) | 3357 } while (0) |
| 3358 | |
| 2303 | 3359 |
| 2304 /* Produce codes for a DIMENSION1 character whose character set is | 3360 /* Produce codes for a DIMENSION1 character whose character set is |
| 2305 CHARSET and whose position-code is C1. Designation and invocation | 3361 CHARSET and whose position-code is C1. Designation and invocation |
| 2306 sequences are also produced in advance if necessary. */ | 3362 sequences are also produced in advance if necessary. */ |
| 2307 | 3363 |
| 2308 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \ | 3364 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \ |
| 2309 do { \ | 3365 do { \ |
| 2310 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \ | 3366 int id = CHARSET_ID (charset); \ |
| 3367 \ | |
| 3368 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \ | |
| 3369 && id == charset_ascii) \ | |
| 2311 { \ | 3370 { \ |
| 2312 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ | 3371 id = charset_jisx0201_roman; \ |
| 2313 *dst++ = c1 & 0x7F; \ | 3372 charset = CHARSET_FROM_ID (id); \ |
| 3373 } \ | |
| 3374 \ | |
| 3375 if (CODING_ISO_SINGLE_SHIFTING (coding)) \ | |
| 3376 { \ | |
| 3377 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ | |
| 3378 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \ | |
| 2314 else \ | 3379 else \ |
| 2315 *dst++ = c1 | 0x80; \ | 3380 EMIT_ONE_BYTE (c1 | 0x80); \ |
| 2316 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \ | 3381 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \ |
| 2317 break; \ | 3382 break; \ |
| 2318 } \ | 3383 } \ |
| 2319 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \ | 3384 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \ |
| 2320 { \ | 3385 { \ |
| 2321 *dst++ = c1 & 0x7F; \ | 3386 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \ |
| 2322 break; \ | 3387 break; \ |
| 2323 } \ | 3388 } \ |
| 2324 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \ | 3389 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \ |
| 2325 { \ | 3390 { \ |
| 2326 *dst++ = c1 | 0x80; \ | 3391 EMIT_ONE_BYTE (c1 | 0x80); \ |
| 2327 break; \ | 3392 break; \ |
| 2328 } \ | 3393 } \ |
| 2329 else \ | 3394 else \ |
| 2330 /* Since CHARSET is not yet invoked to any graphic planes, we \ | 3395 /* Since CHARSET is not yet invoked to any graphic planes, we \ |
| 2331 must invoke it, or, at first, designate it to some graphic \ | 3396 must invoke it, or, at first, designate it to some graphic \ |
| 2332 register. Then repeat the loop to actually produce the \ | 3397 register. Then repeat the loop to actually produce the \ |
| 2333 character. */ \ | 3398 character. */ \ |
| 2334 dst = encode_invocation_designation (charset, coding, dst); \ | 3399 dst = encode_invocation_designation (charset, coding, dst, \ |
| 3400 &produced_chars); \ | |
| 2335 } while (1) | 3401 } while (1) |
| 3402 | |
| 2336 | 3403 |
| 2337 /* Produce codes for a DIMENSION2 character whose character set is | 3404 /* Produce codes for a DIMENSION2 character whose character set is |
| 2338 CHARSET and whose position-codes are C1 and C2. Designation and | 3405 CHARSET and whose position-codes are C1 and C2. Designation and |
| 2339 invocation codes are also produced in advance if necessary. */ | 3406 invocation codes are also produced in advance if necessary. */ |
| 2340 | 3407 |
| 2341 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \ | 3408 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \ |
| 2342 do { \ | 3409 do { \ |
| 2343 if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \ | 3410 int id = CHARSET_ID (charset); \ |
| 3411 \ | |
| 3412 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \ | |
| 3413 && id == charset_jisx0208) \ | |
| 2344 { \ | 3414 { \ |
| 2345 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \ | 3415 id = charset_jisx0208_1978; \ |
| 2346 *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F; \ | 3416 charset = CHARSET_FROM_ID (id); \ |
| 3417 } \ | |
| 3418 \ | |
| 3419 if (CODING_ISO_SINGLE_SHIFTING (coding)) \ | |
| 3420 { \ | |
| 3421 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \ | |
| 3422 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \ | |
| 2347 else \ | 3423 else \ |
| 2348 *dst++ = c1 | 0x80, *dst++ = c2 | 0x80; \ | 3424 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \ |
| 2349 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \ | 3425 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \ |
| 2350 break; \ | 3426 break; \ |
| 2351 } \ | 3427 } \ |
| 2352 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \ | 3428 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \ |
| 2353 { \ | 3429 { \ |
| 2354 *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; \ | 3430 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \ |
| 2355 break; \ | 3431 break; \ |
| 2356 } \ | 3432 } \ |
| 2357 else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \ | 3433 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \ |
| 2358 { \ | 3434 { \ |
| 2359 *dst++ = c1 | 0x80, *dst++= c2 | 0x80; \ | 3435 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \ |
| 2360 break; \ | 3436 break; \ |
| 2361 } \ | 3437 } \ |
| 2362 else \ | 3438 else \ |
| 2363 /* Since CHARSET is not yet invoked to any graphic planes, we \ | 3439 /* Since CHARSET is not yet invoked to any graphic planes, we \ |
| 2364 must invoke it, or, at first, designate it to some graphic \ | 3440 must invoke it, or, at first, designate it to some graphic \ |
| 2365 register. Then repeat the loop to actually produce the \ | 3441 register. Then repeat the loop to actually produce the \ |
| 2366 character. */ \ | 3442 character. */ \ |
| 2367 dst = encode_invocation_designation (charset, coding, dst); \ | 3443 dst = encode_invocation_designation (charset, coding, dst, \ |
| 3444 &produced_chars); \ | |
| 2368 } while (1) | 3445 } while (1) |
| 2369 | 3446 |
| 2370 #define ENCODE_ISO_CHARACTER(c) \ | 3447 |
| 2371 do { \ | 3448 #define ENCODE_ISO_CHARACTER(charset, c) \ |
| 2372 int charset, c1, c2; \ | 3449 do { \ |
| 2373 \ | 3450 int code = ENCODE_CHAR ((charset),(c)); \ |
| 2374 SPLIT_CHAR (c, charset, c1, c2); \ | 3451 \ |
| 2375 if (CHARSET_DEFINED_P (charset)) \ | 3452 if (CHARSET_DIMENSION (charset) == 1) \ |
| 2376 { \ | 3453 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \ |
| 2377 if (CHARSET_DIMENSION (charset) == 1) \ | 3454 else \ |
| 2378 { \ | 3455 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \ |
| 2379 if (charset == CHARSET_ASCII \ | |
| 2380 && coding->flags & CODING_FLAG_ISO_USE_ROMAN) \ | |
| 2381 charset = charset_latin_jisx0201; \ | |
| 2382 ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1); \ | |
| 2383 } \ | |
| 2384 else \ | |
| 2385 { \ | |
| 2386 if (charset == charset_jisx0208 \ | |
| 2387 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS) \ | |
| 2388 charset = charset_jisx0208_1978; \ | |
| 2389 ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2); \ | |
| 2390 } \ | |
| 2391 } \ | |
| 2392 else \ | |
| 2393 { \ | |
| 2394 *dst++ = c1; \ | |
| 2395 if (c2 >= 0) \ | |
| 2396 *dst++ = c2; \ | |
| 2397 } \ | |
| 2398 } while (0) | 3456 } while (0) |
| 2399 | 3457 |
| 2400 | 3458 |
| 2401 /* Instead of encoding character C, produce one or two `?'s. */ | |
| 2402 | |
| 2403 #define ENCODE_UNSAFE_CHARACTER(c) \ | |
| 2404 do { \ | |
| 2405 ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \ | |
| 2406 if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \ | |
| 2407 ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \ | |
| 2408 } while (0) | |
| 2409 | |
| 2410 | |
| 2411 /* Produce designation and invocation codes at a place pointed by DST | 3459 /* Produce designation and invocation codes at a place pointed by DST |
| 2412 to use CHARSET. The element `spec.iso2022' of *CODING is updated. | 3460 to use CHARSET. The element `spec.iso_2022' of *CODING is updated. |
| 2413 Return new DST. */ | 3461 Return new DST. */ |
| 2414 | 3462 |
| 2415 unsigned char * | 3463 unsigned char * |
| 2416 encode_invocation_designation (charset, coding, dst) | 3464 encode_invocation_designation (charset, coding, dst, p_nchars) |
| 2417 int charset; | 3465 struct charset *charset; |
| 2418 struct coding_system *coding; | 3466 struct coding_system *coding; |
| 2419 unsigned char *dst; | 3467 unsigned char *dst; |
| 2420 { | 3468 int *p_nchars; |
| 3469 { | |
| 3470 int multibytep = coding->dst_multibyte; | |
| 3471 int produced_chars = *p_nchars; | |
| 2421 int reg; /* graphic register number */ | 3472 int reg; /* graphic register number */ |
| 3473 int id = CHARSET_ID (charset); | |
| 2422 | 3474 |
| 2423 /* At first, check designations. */ | 3475 /* At first, check designations. */ |
| 2424 for (reg = 0; reg < 4; reg++) | 3476 for (reg = 0; reg < 4; reg++) |
| 2425 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg)) | 3477 if (id == CODING_ISO_DESIGNATION (coding, reg)) |
| 2426 break; | 3478 break; |
| 2427 | 3479 |
| 2428 if (reg >= 4) | 3480 if (reg >= 4) |
| 2429 { | 3481 { |
| 2430 /* CHARSET is not yet designated to any graphic registers. */ | 3482 /* CHARSET is not yet designated to any graphic registers. */ |
| 2431 /* At first check the requested designation. */ | 3483 /* At first check the requested designation. */ |
| 2432 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset); | 3484 reg = CODING_ISO_REQUEST (coding, id); |
| 2433 if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION) | 3485 if (reg < 0) |
| 2434 /* Since CHARSET requests no special designation, designate it | 3486 /* Since CHARSET requests no special designation, designate it |
| 2435 to graphic register 0. */ | 3487 to graphic register 0. */ |
| 2436 reg = 0; | 3488 reg = 0; |
| 2437 | 3489 |
| 2438 ENCODE_DESIGNATION (charset, reg, coding); | 3490 ENCODE_DESIGNATION (charset, reg, coding); |
| 2439 } | 3491 } |
| 2440 | 3492 |
| 2441 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg | 3493 if (CODING_ISO_INVOCATION (coding, 0) != reg |
| 2442 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg) | 3494 && CODING_ISO_INVOCATION (coding, 1) != reg) |
| 2443 { | 3495 { |
| 2444 /* Since the graphic register REG is not invoked to any graphic | 3496 /* Since the graphic register REG is not invoked to any graphic |
| 2445 planes, invoke it to graphic plane 0. */ | 3497 planes, invoke it to graphic plane 0. */ |
| 2446 switch (reg) | 3498 switch (reg) |
| 2447 { | 3499 { |
| 2452 case 1: /* graphic register 1 */ | 3504 case 1: /* graphic register 1 */ |
| 2453 ENCODE_SHIFT_OUT; | 3505 ENCODE_SHIFT_OUT; |
| 2454 break; | 3506 break; |
| 2455 | 3507 |
| 2456 case 2: /* graphic register 2 */ | 3508 case 2: /* graphic register 2 */ |
| 2457 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) | 3509 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) |
| 2458 ENCODE_SINGLE_SHIFT_2; | 3510 ENCODE_SINGLE_SHIFT_2; |
| 2459 else | 3511 else |
| 2460 ENCODE_LOCKING_SHIFT_2; | 3512 ENCODE_LOCKING_SHIFT_2; |
| 2461 break; | 3513 break; |
| 2462 | 3514 |
| 2463 case 3: /* graphic register 3 */ | 3515 case 3: /* graphic register 3 */ |
| 2464 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT) | 3516 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT) |
| 2465 ENCODE_SINGLE_SHIFT_3; | 3517 ENCODE_SINGLE_SHIFT_3; |
| 2466 else | 3518 else |
| 2467 ENCODE_LOCKING_SHIFT_3; | 3519 ENCODE_LOCKING_SHIFT_3; |
| 2468 break; | 3520 break; |
| 2469 } | 3521 } |
| 2470 } | 3522 } |
| 2471 | 3523 |
| 3524 *p_nchars = produced_chars; | |
| 2472 return dst; | 3525 return dst; |
| 2473 } | 3526 } |
| 2474 | |
| 2475 /* Produce 2-byte codes for encoded composition rule RULE. */ | |
| 2476 | |
| 2477 #define ENCODE_COMPOSITION_RULE(rule) \ | |
| 2478 do { \ | |
| 2479 int gref, nref; \ | |
| 2480 COMPOSITION_DECODE_RULE (rule, gref, nref); \ | |
| 2481 *dst++ = 32 + 81 + gref; \ | |
| 2482 *dst++ = 32 + nref; \ | |
| 2483 } while (0) | |
| 2484 | |
| 2485 /* Produce codes for indicating the start of a composition sequence | |
| 2486 (ESC 0, ESC 3, or ESC 4). DATA points to an array of integers | |
| 2487 which specify information about the composition. See the comment | |
| 2488 in coding.h for the format of DATA. */ | |
| 2489 | |
| 2490 #define ENCODE_COMPOSITION_START(coding, data) \ | |
| 2491 do { \ | |
| 2492 coding->composing = data[3]; \ | |
| 2493 *dst++ = ISO_CODE_ESC; \ | |
| 2494 if (coding->composing == COMPOSITION_RELATIVE) \ | |
| 2495 *dst++ = '0'; \ | |
| 2496 else \ | |
| 2497 { \ | |
| 2498 *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS \ | |
| 2499 ? '3' : '4'); \ | |
| 2500 coding->cmp_data_index = coding->cmp_data_start + 4; \ | |
| 2501 coding->composition_rule_follows = 0; \ | |
| 2502 } \ | |
| 2503 } while (0) | |
| 2504 | |
| 2505 /* Produce codes for indicating the end of the current composition. */ | |
| 2506 | |
| 2507 #define ENCODE_COMPOSITION_END(coding, data) \ | |
| 2508 do { \ | |
| 2509 *dst++ = ISO_CODE_ESC; \ | |
| 2510 *dst++ = '1'; \ | |
| 2511 coding->cmp_data_start += data[0]; \ | |
| 2512 coding->composing = COMPOSITION_NO; \ | |
| 2513 if (coding->cmp_data_start == coding->cmp_data->used \ | |
| 2514 && coding->cmp_data->next) \ | |
| 2515 { \ | |
| 2516 coding->cmp_data = coding->cmp_data->next; \ | |
| 2517 coding->cmp_data_start = 0; \ | |
| 2518 } \ | |
| 2519 } while (0) | |
| 2520 | |
| 2521 /* Produce composition start sequence ESC 0. Here, this sequence | |
| 2522 doesn't mean the start of a new composition but means that we have | |
| 2523 just produced components (alternate chars and composition rules) of | |
| 2524 the composition and the actual text follows in SRC. */ | |
| 2525 | |
| 2526 #define ENCODE_COMPOSITION_FAKE_START(coding) \ | |
| 2527 do { \ | |
| 2528 *dst++ = ISO_CODE_ESC; \ | |
| 2529 *dst++ = '0'; \ | |
| 2530 coding->composing = COMPOSITION_RELATIVE; \ | |
| 2531 } while (0) | |
| 2532 | 3527 |
| 2533 /* The following three macros produce codes for indicating direction | 3528 /* The following three macros produce codes for indicating direction |
| 2534 of text. */ | 3529 of text. */ |
| 2535 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \ | 3530 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \ |
| 2536 do { \ | 3531 do { \ |
| 2537 if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS) \ | 3532 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \ |
| 2538 *dst++ = ISO_CODE_ESC, *dst++ = '['; \ | 3533 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \ |
| 2539 else \ | 3534 else \ |
| 2540 *dst++ = ISO_CODE_CSI; \ | 3535 EMIT_ONE_BYTE (ISO_CODE_CSI); \ |
| 2541 } while (0) | 3536 } while (0) |
| 2542 | 3537 |
| 2543 #define ENCODE_DIRECTION_R2L \ | 3538 |
| 2544 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '2', *dst++ = ']' | 3539 #define ENCODE_DIRECTION_R2L() \ |
| 2545 | 3540 do { \ |
| 2546 #define ENCODE_DIRECTION_L2R \ | 3541 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \ |
| 2547 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst), *dst++ = '0', *dst++ = ']' | 3542 EMIT_TWO_ASCII_BYTES ('2', ']'); \ |
| 3543 } while (0) | |
| 3544 | |
| 3545 | |
| 3546 #define ENCODE_DIRECTION_L2R() \ | |
| 3547 do { \ | |
| 3548 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \ | |
| 3549 EMIT_TWO_ASCII_BYTES ('0', ']'); \ | |
| 3550 } while (0) | |
| 3551 | |
| 2548 | 3552 |
| 2549 /* Produce codes for designation and invocation to reset the graphic | 3553 /* Produce codes for designation and invocation to reset the graphic |
| 2550 planes and registers to initial state. */ | 3554 planes and registers to initial state. */ |
| 2551 #define ENCODE_RESET_PLANE_AND_REGISTER \ | 3555 #define ENCODE_RESET_PLANE_AND_REGISTER() \ |
| 2552 do { \ | 3556 do { \ |
| 2553 int reg; \ | 3557 int reg; \ |
| 2554 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \ | 3558 struct charset *charset; \ |
| 2555 ENCODE_SHIFT_IN; \ | 3559 \ |
| 2556 for (reg = 0; reg < 4; reg++) \ | 3560 if (CODING_ISO_INVOCATION (coding, 0) != 0) \ |
| 2557 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0 \ | 3561 ENCODE_SHIFT_IN; \ |
| 2558 && (CODING_SPEC_ISO_DESIGNATION (coding, reg) \ | 3562 for (reg = 0; reg < 4; reg++) \ |
| 2559 != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))) \ | 3563 if (CODING_ISO_INITIAL (coding, reg) >= 0 \ |
| 2560 ENCODE_DESIGNATION \ | 3564 && (CODING_ISO_DESIGNATION (coding, reg) \ |
| 2561 (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \ | 3565 != CODING_ISO_INITIAL (coding, reg))) \ |
| 3566 { \ | |
| 3567 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \ | |
| 3568 ENCODE_DESIGNATION (charset, reg, coding); \ | |
| 3569 } \ | |
| 2562 } while (0) | 3570 } while (0) |
| 3571 | |
| 2563 | 3572 |
| 2564 /* Produce designation sequences of charsets in the line started from | 3573 /* Produce designation sequences of charsets in the line started from |
| 2565 SRC to a place pointed by DST, and return updated DST. | 3574 SRC to a place pointed by DST, and return updated DST. |
| 2566 | 3575 |
| 2567 If the current block ends before any end-of-line, we may fail to | 3576 If the current block ends before any end-of-line, we may fail to |
| 2568 find all the necessary designations. */ | 3577 find all the necessary designations. */ |
| 2569 | 3578 |
| 2570 static unsigned char * | 3579 static unsigned char * |
| 2571 encode_designation_at_bol (coding, translation_table, src, src_end, dst) | 3580 encode_designation_at_bol (coding, charbuf, charbuf_end, dst) |
| 2572 struct coding_system *coding; | 3581 struct coding_system *coding; |
| 2573 Lisp_Object translation_table; | 3582 int *charbuf, *charbuf_end; |
| 2574 unsigned char *src, *src_end, *dst; | 3583 unsigned char *dst; |
| 2575 { | 3584 { |
| 2576 int charset, c, found = 0, reg; | 3585 struct charset *charset; |
| 2577 /* Table of charsets to be designated to each graphic register. */ | 3586 /* Table of charsets to be designated to each graphic register. */ |
| 2578 int r[4]; | 3587 int r[4]; |
| 3588 int c, found = 0, reg; | |
| 3589 int produced_chars = 0; | |
| 3590 int multibytep = coding->dst_multibyte; | |
| 3591 Lisp_Object attrs; | |
| 3592 Lisp_Object charset_list; | |
| 3593 | |
| 3594 attrs = CODING_ID_ATTRS (coding->id); | |
| 3595 charset_list = CODING_ATTR_CHARSET_LIST (attrs); | |
| 3596 if (EQ (charset_list, Qiso_2022)) | |
| 3597 charset_list = Viso_2022_charset_list; | |
| 2579 | 3598 |
| 2580 for (reg = 0; reg < 4; reg++) | 3599 for (reg = 0; reg < 4; reg++) |
| 2581 r[reg] = -1; | 3600 r[reg] = -1; |
| 2582 | 3601 |
| 2583 while (found < 4) | 3602 while (found < 4) |
| 2584 { | 3603 { |
| 2585 ONE_MORE_CHAR (c); | 3604 int id; |
| 3605 | |
| 3606 c = *charbuf++; | |
| 2586 if (c == '\n') | 3607 if (c == '\n') |
| 2587 break; | 3608 break; |
| 2588 | 3609 charset = char_charset (c, charset_list, NULL); |
| 2589 charset = CHAR_CHARSET (c); | 3610 id = CHARSET_ID (charset); |
| 2590 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset); | 3611 reg = CODING_ISO_REQUEST (coding, id); |
| 2591 if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0) | 3612 if (reg >= 0 && r[reg] < 0) |
| 2592 { | 3613 { |
| 2593 found++; | 3614 found++; |
| 2594 r[reg] = charset; | 3615 r[reg] = id; |
| 2595 } | 3616 } |
| 2596 } | 3617 } |
| 2597 | 3618 |
| 2598 label_end_of_loop: | |
| 2599 if (found) | 3619 if (found) |
| 2600 { | 3620 { |
| 2601 for (reg = 0; reg < 4; reg++) | 3621 for (reg = 0; reg < 4; reg++) |
| 2602 if (r[reg] >= 0 | 3622 if (r[reg] >= 0 |
| 2603 && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg]) | 3623 && CODING_ISO_DESIGNATION (coding, reg) != r[reg]) |
| 2604 ENCODE_DESIGNATION (r[reg], reg, coding); | 3624 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding); |
| 2605 } | 3625 } |
| 2606 | 3626 |
| 2607 return dst; | 3627 return dst; |
| 2608 } | 3628 } |
| 2609 | 3629 |
| 2610 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */ | 3630 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */ |
| 2611 | 3631 |
| 2612 static void | 3632 static int |
| 2613 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) | 3633 encode_coding_iso_2022 (coding) |
| 2614 struct coding_system *coding; | 3634 struct coding_system *coding; |
| 2615 unsigned char *source, *destination; | 3635 { |
| 2616 int src_bytes, dst_bytes; | 3636 int multibytep = coding->dst_multibyte; |
| 2617 { | 3637 int *charbuf = coding->charbuf; |
| 2618 unsigned char *src = source; | 3638 int *charbuf_end = charbuf + coding->charbuf_used; |
| 2619 unsigned char *src_end = source + src_bytes; | 3639 unsigned char *dst = coding->destination + coding->produced; |
| 2620 unsigned char *dst = destination; | 3640 unsigned char *dst_end = coding->destination + coding->dst_bytes; |
| 2621 unsigned char *dst_end = destination + dst_bytes; | 3641 int safe_room = 16; |
| 2622 /* Since the maximum bytes produced by each loop is 20, we subtract 19 | 3642 int bol_designation |
| 2623 from DST_END to assure overflow checking is necessary only at the | 3643 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL |
| 2624 head of loop. */ | 3644 && CODING_ISO_BOL (coding)); |
| 2625 unsigned char *adjusted_dst_end = dst_end - 19; | 3645 int produced_chars = 0; |
| 2626 /* SRC_BASE remembers the start position in source in each loop. | 3646 Lisp_Object attrs, eol_type, charset_list; |
| 2627 The loop will be exited when there's not enough source text to | 3647 int ascii_compatible; |
| 2628 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when | |
| 2629 there's not enough destination area to produce encoded codes | |
| 2630 (within macro EMIT_BYTES). */ | |
| 2631 unsigned char *src_base; | |
| 2632 int c; | 3648 int c; |
| 2633 Lisp_Object translation_table; | 3649 int preferred_charset_id = -1; |
| 2634 Lisp_Object safe_chars; | 3650 |
| 2635 | 3651 CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 2636 if (coding->flags & CODING_FLAG_ISO_SAFE) | 3652 setup_iso_safe_charsets (attrs); |
| 2637 coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR; | 3653 /* Charset list may have been changed. */ |
| 2638 | 3654 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \ |
| 2639 safe_chars = coding_safe_chars (coding->symbol); | 3655 coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs)); |
| 2640 | 3656 |
| 2641 if (NILP (Venable_character_translation)) | 3657 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); |
| 2642 translation_table = Qnil; | 3658 |
| 2643 else | 3659 while (charbuf < charbuf_end) |
| 2644 { | 3660 { |
| 2645 translation_table = coding->translation_table_for_encode; | 3661 ASSURE_DESTINATION (safe_room); |
| 2646 if (NILP (translation_table)) | 3662 |
| 2647 translation_table = Vstandard_translation_table_for_encode; | 3663 if (bol_designation) |
| 2648 } | 3664 { |
| 2649 | 3665 unsigned char *dst_prev = dst; |
| 2650 coding->consumed_char = 0; | 3666 |
| 2651 coding->errors = 0; | |
| 2652 while (1) | |
| 2653 { | |
| 2654 src_base = src; | |
| 2655 | |
| 2656 if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19))) | |
| 2657 { | |
| 2658 coding->result = CODING_FINISH_INSUFFICIENT_DST; | |
| 2659 break; | |
| 2660 } | |
| 2661 | |
| 2662 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL | |
| 2663 && CODING_SPEC_ISO_BOL (coding)) | |
| 2664 { | |
| 2665 /* We have to produce designation sequences if any now. */ | 3667 /* We have to produce designation sequences if any now. */ |
| 2666 dst = encode_designation_at_bol (coding, translation_table, | 3668 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst); |
| 2667 src, src_end, dst); | 3669 bol_designation = 0; |
| 2668 CODING_SPEC_ISO_BOL (coding) = 0; | 3670 /* We are sure that designation sequences are all ASCII bytes. */ |
| 2669 } | 3671 produced_chars += dst - dst_prev; |
| 2670 | 3672 } |
| 2671 /* Check composition start and end. */ | 3673 |
| 2672 if (coding->composing != COMPOSITION_DISABLED | 3674 c = *charbuf++; |
| 2673 && coding->cmp_data_start < coding->cmp_data->used) | 3675 |
| 2674 { | 3676 if (c < 0) |
| 2675 struct composition_data *cmp_data = coding->cmp_data; | 3677 { |
| 2676 int *data = cmp_data->data + coding->cmp_data_start; | 3678 /* Handle an annotation. */ |
| 2677 int this_pos = cmp_data->char_offset + coding->consumed_char; | 3679 switch (*charbuf) |
| 2678 | |
| 2679 if (coding->composing == COMPOSITION_RELATIVE) | |
| 2680 { | 3680 { |
| 2681 if (this_pos == data[2]) | 3681 case CODING_ANNOTATE_COMPOSITION_MASK: |
| 3682 /* Not yet implemented. */ | |
| 3683 break; | |
| 3684 case CODING_ANNOTATE_CHARSET_MASK: | |
| 3685 preferred_charset_id = charbuf[3]; | |
| 3686 if (preferred_charset_id >= 0 | |
| 3687 && NILP (Fmemq (make_number (preferred_charset_id), | |
| 3688 charset_list))) | |
| 3689 preferred_charset_id = -1; | |
| 3690 break; | |
| 3691 default: | |
| 3692 abort (); | |
| 3693 } | |
| 3694 charbuf += -c - 1; | |
| 3695 continue; | |
| 3696 } | |
| 3697 | |
| 3698 /* Now encode the character C. */ | |
| 3699 if (c < 0x20 || c == 0x7F) | |
| 3700 { | |
| 3701 if (c == '\n' | |
| 3702 || (c == '\r' && EQ (eol_type, Qmac))) | |
| 3703 { | |
| 3704 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL) | |
| 3705 ENCODE_RESET_PLANE_AND_REGISTER (); | |
| 3706 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL) | |
| 2682 { | 3707 { |
| 2683 ENCODE_COMPOSITION_END (coding, data); | 3708 int i; |
| 2684 cmp_data = coding->cmp_data; | 3709 |
| 2685 data = cmp_data->data + coding->cmp_data_start; | 3710 for (i = 0; i < 4; i++) |
| 3711 CODING_ISO_DESIGNATION (coding, i) | |
| 3712 = CODING_ISO_INITIAL (coding, i); | |
| 3713 } | |
| 3714 bol_designation | |
| 3715 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL; | |
| 3716 } | |
| 3717 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL) | |
| 3718 ENCODE_RESET_PLANE_AND_REGISTER (); | |
| 3719 EMIT_ONE_ASCII_BYTE (c); | |
| 3720 } | |
| 3721 else if (ASCII_CHAR_P (c)) | |
| 3722 { | |
| 3723 if (ascii_compatible) | |
| 3724 EMIT_ONE_ASCII_BYTE (c); | |
| 3725 else | |
| 3726 { | |
| 3727 struct charset *charset = CHARSET_FROM_ID (charset_ascii); | |
| 3728 ENCODE_ISO_CHARACTER (charset, c); | |
| 3729 } | |
| 3730 } | |
| 3731 else if (CHAR_BYTE8_P (c)) | |
| 3732 { | |
| 3733 c = CHAR_TO_BYTE8 (c); | |
| 3734 EMIT_ONE_BYTE (c); | |
| 3735 } | |
| 3736 else | |
| 3737 { | |
| 3738 struct charset *charset; | |
| 3739 | |
| 3740 if (preferred_charset_id >= 0) | |
| 3741 { | |
| 3742 charset = CHARSET_FROM_ID (preferred_charset_id); | |
| 3743 if (! CHAR_CHARSET_P (c, charset)) | |
| 3744 charset = char_charset (c, charset_list, NULL); | |
| 3745 } | |
| 3746 else | |
| 3747 charset = char_charset (c, charset_list, NULL); | |
| 3748 if (!charset) | |
| 3749 { | |
| 3750 if (coding->mode & CODING_MODE_SAFE_ENCODING) | |
| 3751 { | |
| 3752 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION; | |
| 3753 charset = CHARSET_FROM_ID (charset_ascii); | |
| 3754 } | |
| 3755 else | |
| 3756 { | |
| 3757 c = coding->default_char; | |
| 3758 charset = char_charset (c, charset_list, NULL); | |
| 2686 } | 3759 } |
| 2687 } | 3760 } |
| 2688 else if (COMPOSING_P (coding)) | 3761 ENCODE_ISO_CHARACTER (charset, c); |
| 2689 { | 3762 } |
| 2690 /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR */ | 3763 } |
| 2691 if (coding->cmp_data_index == coding->cmp_data_start + data[0]) | 3764 |
| 2692 /* We have consumed components of the composition. | 3765 if (coding->mode & CODING_MODE_LAST_BLOCK |
| 2693 What follows in SRC is the composition's base | 3766 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL) |
| 2694 text. */ | 3767 { |
| 2695 ENCODE_COMPOSITION_FAKE_START (coding); | 3768 ASSURE_DESTINATION (safe_room); |
| 2696 else | 3769 ENCODE_RESET_PLANE_AND_REGISTER (); |
| 2697 { | 3770 } |
| 2698 int c = cmp_data->data[coding->cmp_data_index++]; | 3771 coding->result = CODING_RESULT_SUCCESS; |
| 2699 if (coding->composition_rule_follows) | 3772 CODING_ISO_BOL (coding) = bol_designation; |
| 2700 { | 3773 coding->produced_char += produced_chars; |
| 2701 ENCODE_COMPOSITION_RULE (c); | 3774 coding->produced = dst - coding->destination; |
| 2702 coding->composition_rule_follows = 0; | 3775 return 0; |
| 2703 } | |
| 2704 else | |
| 2705 { | |
| 2706 if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR | |
| 2707 && ! CODING_SAFE_CHAR_P (safe_chars, c)) | |
| 2708 ENCODE_UNSAFE_CHARACTER (c); | |
| 2709 else | |
| 2710 ENCODE_ISO_CHARACTER (c); | |
| 2711 if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS) | |
| 2712 coding->composition_rule_follows = 1; | |
| 2713 } | |
| 2714 continue; | |
| 2715 } | |
| 2716 } | |
| 2717 if (!COMPOSING_P (coding)) | |
| 2718 { | |
| 2719 if (this_pos == data[1]) | |
| 2720 { | |
| 2721 ENCODE_COMPOSITION_START (coding, data); | |
| 2722 continue; | |
| 2723 } | |
| 2724 } | |
| 2725 } | |
| 2726 | |
| 2727 ONE_MORE_CHAR (c); | |
| 2728 | |
| 2729 /* Now encode the character C. */ | |
| 2730 if (c < 0x20 || c == 0x7F) | |
| 2731 { | |
| 2732 if (c == '\r') | |
| 2733 { | |
| 2734 if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)) | |
| 2735 { | |
| 2736 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL) | |
| 2737 ENCODE_RESET_PLANE_AND_REGISTER; | |
| 2738 *dst++ = c; | |
| 2739 continue; | |
| 2740 } | |
| 2741 /* fall down to treat '\r' as '\n' ... */ | |
| 2742 c = '\n'; | |
| 2743 } | |
| 2744 if (c == '\n') | |
| 2745 { | |
| 2746 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL) | |
| 2747 ENCODE_RESET_PLANE_AND_REGISTER; | |
| 2748 if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL) | |
| 2749 bcopy (coding->spec.iso2022.initial_designation, | |
| 2750 coding->spec.iso2022.current_designation, | |
| 2751 sizeof coding->spec.iso2022.initial_designation); | |
| 2752 if (coding->eol_type == CODING_EOL_LF | |
| 2753 || coding->eol_type == CODING_EOL_UNDECIDED) | |
| 2754 *dst++ = ISO_CODE_LF; | |
| 2755 else if (coding->eol_type == CODING_EOL_CRLF) | |
| 2756 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF; | |
| 2757 else | |
| 2758 *dst++ = ISO_CODE_CR; | |
| 2759 CODING_SPEC_ISO_BOL (coding) = 1; | |
| 2760 } | |
| 2761 else | |
| 2762 { | |
| 2763 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL) | |
| 2764 ENCODE_RESET_PLANE_AND_REGISTER; | |
| 2765 *dst++ = c; | |
| 2766 } | |
| 2767 } | |
| 2768 else if (ASCII_BYTE_P (c)) | |
| 2769 ENCODE_ISO_CHARACTER (c); | |
| 2770 else if (SINGLE_BYTE_CHAR_P (c)) | |
| 2771 { | |
| 2772 *dst++ = c; | |
| 2773 coding->errors++; | |
| 2774 } | |
| 2775 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR | |
| 2776 && ! CODING_SAFE_CHAR_P (safe_chars, c)) | |
| 2777 ENCODE_UNSAFE_CHARACTER (c); | |
| 2778 else | |
| 2779 ENCODE_ISO_CHARACTER (c); | |
| 2780 | |
| 2781 coding->consumed_char++; | |
| 2782 } | |
| 2783 | |
| 2784 label_end_of_loop: | |
| 2785 coding->consumed = src_base - source; | |
| 2786 coding->produced = coding->produced_char = dst - destination; | |
| 2787 } | 3776 } |
| 2788 | 3777 |
| 2789 | 3778 |
| 2790 /*** 4. SJIS and BIG5 handlers ***/ | 3779 /*** 8,9. SJIS and BIG5 handlers ***/ |
| 2791 | 3780 |
| 2792 /* Although SJIS and BIG5 are not ISO coding systems, they are used | 3781 /* Although SJIS and BIG5 are not ISO's coding system, they are used |
| 2793 quite widely. So, for the moment, Emacs supports them in the bare | 3782 quite widely. So, for the moment, Emacs supports them in the bare |
| 2794 C code. But, in the future, they may be supported only by CCL. */ | 3783 C code. But, in the future, they may be supported only by CCL. */ |
| 2795 | 3784 |
| 2796 /* SJIS is a coding system encoding three character sets: ASCII, right | 3785 /* SJIS is a coding system encoding three character sets: ASCII, right |
| 2797 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded | 3786 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded |
| 2798 as is. A character of charset katakana-jisx0201 is encoded by | 3787 as is. A character of charset katakana-jisx0201 is encoded by |
| 2799 "position-code + 0x80". A character of charset japanese-jisx0208 | 3788 "position-code + 0x80". A character of charset japanese-jisx0208 |
| 2800 is encoded in 2-byte but two position-codes are divided and shifted | 3789 is encoded in 2-byte but two position-codes are divided and shifted |
| 2801 so that it fits in the range below. | 3790 so that it fit in the range below. |
| 2802 | 3791 |
| 2803 --- CODE RANGE of SJIS --- | 3792 --- CODE RANGE of SJIS --- |
| 2804 (character set) (range) | 3793 (character set) (range) |
| 2805 ASCII 0x00 .. 0x7F | 3794 ASCII 0x00 .. 0x7F |
| 2806 KATAKANA-JISX0201 0xA1 .. 0xDF | 3795 KATAKANA-JISX0201 0xA0 .. 0xDF |
| 2807 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF | 3796 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF |
| 2808 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC | 3797 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC |
| 2809 ------------------------------- | 3798 ------------------------------- |
| 2810 | 3799 |
| 2811 */ | 3800 */ |
| 2812 | 3801 |
| 2813 /* BIG5 is a coding system encoding two character sets: ASCII and | 3802 /* BIG5 is a coding system encoding two character sets: ASCII and |
| 2814 Big5. An ASCII character is encoded as is. Big5 is a two-byte | 3803 Big5. An ASCII character is encoded as is. Big5 is a two-byte |
| 2815 character set and is encoded in two bytes. | 3804 character set and is encoded in two-byte. |
| 2816 | 3805 |
| 2817 --- CODE RANGE of BIG5 --- | 3806 --- CODE RANGE of BIG5 --- |
| 2818 (character set) (range) | 3807 (character set) (range) |
| 2819 ASCII 0x00 .. 0x7F | 3808 ASCII 0x00 .. 0x7F |
| 2820 Big5 (1st byte) 0xA1 .. 0xFE | 3809 Big5 (1st byte) 0xA1 .. 0xFE |
| 2821 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE | 3810 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE |
| 2822 -------------------------- | 3811 -------------------------- |
| 2823 | 3812 |
| 2824 Since the number of characters in Big5 is larger than maximum | 3813 */ |
| 2825 characters in Emacs' charset (96x96), it can't be handled as one | |
| 2826 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1' | |
| 2827 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former | |
| 2828 contains frequently used characters and the latter contains less | |
| 2829 frequently used characters. */ | |
| 2830 | |
| 2831 /* Macros to decode or encode a character of Big5 in BIG5. B1 and B2 | |
| 2832 are the 1st and 2nd position-codes of Big5 in BIG5 coding system. | |
| 2833 C1 and C2 are the 1st and 2nd position-codes of Emacs' internal | |
| 2834 format. CHARSET is `charset_big5_1' or `charset_big5_2'. */ | |
| 2835 | |
| 2836 /* Number of Big5 characters which have the same code in 1st byte. */ | |
| 2837 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40) | |
| 2838 | |
| 2839 #define DECODE_BIG5(b1, b2, charset, c1, c2) \ | |
| 2840 do { \ | |
| 2841 unsigned int temp \ | |
| 2842 = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62); \ | |
| 2843 if (b1 < 0xC9) \ | |
| 2844 charset = charset_big5_1; \ | |
| 2845 else \ | |
| 2846 { \ | |
| 2847 charset = charset_big5_2; \ | |
| 2848 temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \ | |
| 2849 } \ | |
| 2850 c1 = temp / (0xFF - 0xA1) + 0x21; \ | |
| 2851 c2 = temp % (0xFF - 0xA1) + 0x21; \ | |
| 2852 } while (0) | |
| 2853 | |
| 2854 #define ENCODE_BIG5(charset, c1, c2, b1, b2) \ | |
| 2855 do { \ | |
| 2856 unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21); \ | |
| 2857 if (charset == charset_big5_2) \ | |
| 2858 temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \ | |
| 2859 b1 = temp / BIG5_SAME_ROW + 0xA1; \ | |
| 2860 b2 = temp % BIG5_SAME_ROW; \ | |
| 2861 b2 += b2 < 0x3F ? 0x40 : 0x62; \ | |
| 2862 } while (0) | |
| 2863 | 3814 |
| 2864 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 3815 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 2865 Check if a text is encoded in SJIS. If it is, return | 3816 Check if a text is encoded in SJIS. If it is, return |
| 2866 CODING_CATEGORY_MASK_SJIS, else return 0. */ | 3817 CATEGORY_MASK_SJIS, else return 0. */ |
| 2867 | 3818 |
| 2868 static int | 3819 static int |
| 2869 detect_coding_sjis (src, src_end, multibytep) | 3820 detect_coding_sjis (coding, detect_info) |
| 2870 unsigned char *src, *src_end; | 3821 struct coding_system *coding; |
| 2871 int multibytep; | 3822 struct coding_detection_info *detect_info; |
| 2872 { | 3823 { |
| 3824 const unsigned char *src = coding->source, *src_base = src; | |
| 3825 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 3826 int multibytep = coding->src_multibyte; | |
| 3827 int consumed_chars = 0; | |
| 3828 int found = 0; | |
| 2873 int c; | 3829 int c; |
| 2874 /* Dummy for ONE_MORE_BYTE. */ | 3830 int incomplete; |
| 2875 struct coding_system dummy_coding; | 3831 |
| 2876 struct coding_system *coding = &dummy_coding; | 3832 detect_info->checked |= CATEGORY_MASK_SJIS; |
| 3833 /* A coding system of this category is always ASCII compatible. */ | |
| 3834 src += coding->head_ascii; | |
| 2877 | 3835 |
| 2878 while (1) | 3836 while (1) |
| 2879 { | 3837 { |
| 2880 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 3838 incomplete = 0; |
| 3839 ONE_MORE_BYTE (c); | |
| 3840 incomplete = 1; | |
| 2881 if (c < 0x80) | 3841 if (c < 0x80) |
| 2882 continue; | 3842 continue; |
| 2883 if (c == 0x80 || c == 0xA0 || c > 0xEF) | 3843 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) |
| 2884 return 0; | 3844 { |
| 2885 if (c <= 0x9F || c >= 0xE0) | 3845 ONE_MORE_BYTE (c); |
| 2886 { | |
| 2887 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | |
| 2888 if (c < 0x40 || c == 0x7F || c > 0xFC) | 3846 if (c < 0x40 || c == 0x7F || c > 0xFC) |
| 2889 return 0; | 3847 break; |
| 2890 } | 3848 found = CATEGORY_MASK_SJIS; |
| 2891 } | 3849 } |
| 2892 label_end_of_loop: | 3850 else if (c >= 0xA0 && c < 0xE0) |
| 2893 return CODING_CATEGORY_MASK_SJIS; | 3851 found = CATEGORY_MASK_SJIS; |
| 3852 else | |
| 3853 break; | |
| 3854 } | |
| 3855 detect_info->rejected |= CATEGORY_MASK_SJIS; | |
| 3856 return 0; | |
| 3857 | |
| 3858 no_more_source: | |
| 3859 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) | |
| 3860 { | |
| 3861 detect_info->rejected |= CATEGORY_MASK_SJIS; | |
| 3862 return 0; | |
| 3863 } | |
| 3864 detect_info->found |= found; | |
| 3865 return 1; | |
| 2894 } | 3866 } |
| 2895 | 3867 |
| 2896 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 3868 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 2897 Check if a text is encoded in BIG5. If it is, return | 3869 Check if a text is encoded in BIG5. If it is, return |
| 2898 CODING_CATEGORY_MASK_BIG5, else return 0. */ | 3870 CATEGORY_MASK_BIG5, else return 0. */ |
| 2899 | 3871 |
| 2900 static int | 3872 static int |
| 2901 detect_coding_big5 (src, src_end, multibytep) | 3873 detect_coding_big5 (coding, detect_info) |
| 2902 unsigned char *src, *src_end; | 3874 struct coding_system *coding; |
| 2903 int multibytep; | 3875 struct coding_detection_info *detect_info; |
| 2904 { | 3876 { |
| 3877 const unsigned char *src = coding->source, *src_base = src; | |
| 3878 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 3879 int multibytep = coding->src_multibyte; | |
| 3880 int consumed_chars = 0; | |
| 3881 int found = 0; | |
| 2905 int c; | 3882 int c; |
| 2906 /* Dummy for ONE_MORE_BYTE. */ | 3883 int incomplete; |
| 2907 struct coding_system dummy_coding; | 3884 |
| 2908 struct coding_system *coding = &dummy_coding; | 3885 detect_info->checked |= CATEGORY_MASK_BIG5; |
| 3886 /* A coding system of this category is always ASCII compatible. */ | |
| 3887 src += coding->head_ascii; | |
| 2909 | 3888 |
| 2910 while (1) | 3889 while (1) |
| 2911 { | 3890 { |
| 2912 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 3891 incomplete = 0; |
| 3892 ONE_MORE_BYTE (c); | |
| 3893 incomplete = 1; | |
| 2913 if (c < 0x80) | 3894 if (c < 0x80) |
| 2914 continue; | 3895 continue; |
| 2915 if (c < 0xA1 || c > 0xFE) | 3896 if (c >= 0xA1) |
| 2916 return 0; | 3897 { |
| 2917 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 3898 ONE_MORE_BYTE (c); |
| 2918 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE) | 3899 if (c < 0x40 || (c >= 0x7F && c <= 0xA0)) |
| 2919 return 0; | 3900 return 0; |
| 2920 } | 3901 found = CATEGORY_MASK_BIG5; |
| 2921 label_end_of_loop: | 3902 } |
| 2922 return CODING_CATEGORY_MASK_BIG5; | |
| 2923 } | |
| 2924 | |
| 2925 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| 2926 Check if a text is encoded in UTF-8. If it is, return | |
| 2927 CODING_CATEGORY_MASK_UTF_8, else return 0. */ | |
| 2928 | |
| 2929 #define UTF_8_1_OCTET_P(c) ((c) < 0x80) | |
| 2930 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) | |
| 2931 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0) | |
| 2932 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) | |
| 2933 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) | |
| 2934 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) | |
| 2935 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC) | |
| 2936 | |
| 2937 static int | |
| 2938 detect_coding_utf_8 (src, src_end, multibytep) | |
| 2939 unsigned char *src, *src_end; | |
| 2940 int multibytep; | |
| 2941 { | |
| 2942 unsigned char c; | |
| 2943 int seq_maybe_bytes; | |
| 2944 /* Dummy for ONE_MORE_BYTE. */ | |
| 2945 struct coding_system dummy_coding; | |
| 2946 struct coding_system *coding = &dummy_coding; | |
| 2947 | |
| 2948 while (1) | |
| 2949 { | |
| 2950 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | |
| 2951 if (UTF_8_1_OCTET_P (c)) | |
| 2952 continue; | |
| 2953 else if (UTF_8_2_OCTET_LEADING_P (c)) | |
| 2954 seq_maybe_bytes = 1; | |
| 2955 else if (UTF_8_3_OCTET_LEADING_P (c)) | |
| 2956 seq_maybe_bytes = 2; | |
| 2957 else if (UTF_8_4_OCTET_LEADING_P (c)) | |
| 2958 seq_maybe_bytes = 3; | |
| 2959 else if (UTF_8_5_OCTET_LEADING_P (c)) | |
| 2960 seq_maybe_bytes = 4; | |
| 2961 else if (UTF_8_6_OCTET_LEADING_P (c)) | |
| 2962 seq_maybe_bytes = 5; | |
| 2963 else | 3903 else |
| 2964 return 0; | 3904 break; |
| 2965 | 3905 } |
| 2966 do | 3906 detect_info->rejected |= CATEGORY_MASK_BIG5; |
| 2967 { | |
| 2968 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | |
| 2969 if (!UTF_8_EXTRA_OCTET_P (c)) | |
| 2970 return 0; | |
| 2971 seq_maybe_bytes--; | |
| 2972 } | |
| 2973 while (seq_maybe_bytes > 0); | |
| 2974 } | |
| 2975 | |
| 2976 label_end_of_loop: | |
| 2977 return CODING_CATEGORY_MASK_UTF_8; | |
| 2978 } | |
| 2979 | |
| 2980 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| 2981 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or | |
| 2982 Little Endian (otherwise). If it is, return | |
| 2983 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE, | |
| 2984 else return 0. */ | |
| 2985 | |
| 2986 #define UTF_16_INVALID_P(val) \ | |
| 2987 (((val) == 0xFFFE) \ | |
| 2988 || ((val) == 0xFFFF)) | |
| 2989 | |
| 2990 #define UTF_16_HIGH_SURROGATE_P(val) \ | |
| 2991 (((val) & 0xD800) == 0xD800) | |
| 2992 | |
| 2993 #define UTF_16_LOW_SURROGATE_P(val) \ | |
| 2994 (((val) & 0xDC00) == 0xDC00) | |
| 2995 | |
| 2996 static int | |
| 2997 detect_coding_utf_16 (src, src_end, multibytep) | |
| 2998 unsigned char *src, *src_end; | |
| 2999 int multibytep; | |
| 3000 { | |
| 3001 unsigned char c1, c2; | |
| 3002 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */ | |
| 3003 struct coding_system dummy_coding; | |
| 3004 struct coding_system *coding = &dummy_coding; | |
| 3005 | |
| 3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | |
| 3007 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep); | |
| 3008 | |
| 3009 if ((c1 == 0xFF) && (c2 == 0xFE)) | |
| 3010 return CODING_CATEGORY_MASK_UTF_16_LE; | |
| 3011 else if ((c1 == 0xFE) && (c2 == 0xFF)) | |
| 3012 return CODING_CATEGORY_MASK_UTF_16_BE; | |
| 3013 | |
| 3014 label_end_of_loop: | |
| 3015 return 0; | 3907 return 0; |
| 3908 | |
| 3909 no_more_source: | |
| 3910 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) | |
| 3911 { | |
| 3912 detect_info->rejected |= CATEGORY_MASK_BIG5; | |
| 3913 return 0; | |
| 3914 } | |
| 3915 detect_info->found |= found; | |
| 3916 return 1; | |
| 3016 } | 3917 } |
| 3017 | 3918 |
| 3018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". | 3919 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". |
| 3019 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ | 3920 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ |
| 3020 | 3921 |
| 3021 static void | 3922 static void |
| 3022 decode_coding_sjis_big5 (coding, source, destination, | 3923 decode_coding_sjis (coding) |
| 3023 src_bytes, dst_bytes, sjis_p) | |
| 3024 struct coding_system *coding; | 3924 struct coding_system *coding; |
| 3025 unsigned char *source, *destination; | 3925 { |
| 3026 int src_bytes, dst_bytes; | 3926 const unsigned char *src = coding->source + coding->consumed; |
| 3027 int sjis_p; | 3927 const unsigned char *src_end = coding->source + coding->src_bytes; |
| 3028 { | 3928 const unsigned char *src_base; |
| 3029 unsigned char *src = source; | 3929 int *charbuf = coding->charbuf; |
| 3030 unsigned char *src_end = source + src_bytes; | 3930 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; |
| 3031 unsigned char *dst = destination; | 3931 int consumed_chars = 0, consumed_chars_base; |
| 3032 unsigned char *dst_end = destination + dst_bytes; | 3932 int multibytep = coding->src_multibyte; |
| 3033 /* SRC_BASE remembers the start position in source in each loop. | 3933 struct charset *charset_roman, *charset_kanji, *charset_kana; |
| 3034 The loop will be exited when there's not enough source code | 3934 Lisp_Object attrs, eol_type, charset_list, val; |
| 3035 (within macro ONE_MORE_BYTE), or when there's not enough | 3935 int char_offset = coding->produced_char; |
| 3036 destination area to produce a character (within macro | 3936 int last_offset = char_offset; |
| 3037 EMIT_CHAR). */ | 3937 int last_id = charset_ascii; |
| 3038 unsigned char *src_base; | 3938 |
| 3039 Lisp_Object translation_table; | 3939 CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 3040 | 3940 |
| 3041 if (NILP (Venable_character_translation)) | 3941 val = charset_list; |
| 3042 translation_table = Qnil; | 3942 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); |
| 3043 else | 3943 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); |
| 3044 { | 3944 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))); |
| 3045 translation_table = coding->translation_table_for_decode; | 3945 |
| 3046 if (NILP (translation_table)) | |
| 3047 translation_table = Vstandard_translation_table_for_decode; | |
| 3048 } | |
| 3049 | |
| 3050 coding->produced_char = 0; | |
| 3051 while (1) | 3946 while (1) |
| 3052 { | 3947 { |
| 3053 int c, charset, c1, c2; | 3948 int c, c1; |
| 3054 | 3949 |
| 3055 src_base = src; | 3950 src_base = src; |
| 3056 ONE_MORE_BYTE (c1); | 3951 consumed_chars_base = consumed_chars; |
| 3057 | 3952 |
| 3058 if (c1 < 0x80) | 3953 if (charbuf >= charbuf_end) |
| 3059 { | 3954 break; |
| 3060 charset = CHARSET_ASCII; | 3955 |
| 3061 if (c1 < 0x20) | 3956 ONE_MORE_BYTE (c); |
| 3957 | |
| 3958 if (c == '\r') | |
| 3959 { | |
| 3960 if (EQ (eol_type, Qdos)) | |
| 3062 { | 3961 { |
| 3063 if (c1 == '\r') | 3962 if (src == src_end) |
| 3064 { | 3963 { |
| 3065 if (coding->eol_type == CODING_EOL_CRLF) | 3964 coding->result = CODING_RESULT_INSUFFICIENT_SRC; |
| 3066 { | 3965 goto no_more_source; |
| 3067 ONE_MORE_BYTE (c2); | |
| 3068 if (c2 == '\n') | |
| 3069 c1 = c2; | |
| 3070 else | |
| 3071 /* To process C2 again, SRC is subtracted by 1. */ | |
| 3072 src--; | |
| 3073 } | |
| 3074 else if (coding->eol_type == CODING_EOL_CR) | |
| 3075 c1 = '\n'; | |
| 3076 } | 3966 } |
| 3077 else if (c1 == '\n' | 3967 if (*src == '\n') |
| 3078 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) | 3968 ONE_MORE_BYTE (c); |
| 3079 && (coding->eol_type == CODING_EOL_CR | |
| 3080 || coding->eol_type == CODING_EOL_CRLF)) | |
| 3081 { | |
| 3082 coding->result = CODING_FINISH_INCONSISTENT_EOL; | |
| 3083 goto label_end_of_loop; | |
| 3084 } | |
| 3085 } | 3969 } |
| 3970 else if (EQ (eol_type, Qmac)) | |
| 3971 c = '\n'; | |
| 3086 } | 3972 } |
| 3087 else | 3973 else |
| 3088 { | 3974 { |
| 3089 if (sjis_p) | 3975 struct charset *charset; |
| 3976 | |
| 3977 if (c < 0x80) | |
| 3978 charset = charset_roman; | |
| 3979 else | |
| 3090 { | 3980 { |
| 3091 if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF) | 3981 if (c >= 0xF0) |
| 3092 goto label_invalid_code; | 3982 goto invalid_code; |
| 3093 if (c1 <= 0x9F || c1 >= 0xE0) | 3983 if (c < 0xA0 || c >= 0xE0) |
| 3094 { | 3984 { |
| 3095 /* SJIS -> JISX0208 */ | 3985 /* SJIS -> JISX0208 */ |
| 3096 ONE_MORE_BYTE (c2); | 3986 ONE_MORE_BYTE (c1); |
| 3097 if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC) | 3987 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC) |
| 3098 goto label_invalid_code; | 3988 goto invalid_code; |
| 3099 DECODE_SJIS (c1, c2, c1, c2); | 3989 c = (c << 8) | c1; |
| 3100 charset = charset_jisx0208; | 3990 SJIS_TO_JIS (c); |
| 3991 charset = charset_kanji; | |
| 3992 } | |
| 3993 else if (c > 0xA0) | |
| 3994 { | |
| 3995 /* SJIS -> JISX0201-Kana */ | |
| 3996 c &= 0x7F; | |
| 3997 charset = charset_kana; | |
| 3101 } | 3998 } |
| 3102 else | 3999 else |
| 3103 /* SJIS -> JISX0201-Kana */ | 4000 goto invalid_code; |
| 3104 charset = charset_katakana_jisx0201; | |
| 3105 } | 4001 } |
| 4002 if (charset->id != charset_ascii | |
| 4003 && last_id != charset->id) | |
| 4004 { | |
| 4005 if (last_id != charset_ascii) | |
| 4006 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | |
| 4007 last_id = charset->id; | |
| 4008 last_offset = char_offset; | |
| 4009 } | |
| 4010 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); | |
| 4011 } | |
| 4012 *charbuf++ = c; | |
| 4013 char_offset++; | |
| 4014 continue; | |
| 4015 | |
| 4016 invalid_code: | |
| 4017 src = src_base; | |
| 4018 consumed_chars = consumed_chars_base; | |
| 4019 ONE_MORE_BYTE (c); | |
| 4020 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | |
| 4021 char_offset++; | |
| 4022 coding->errors++; | |
| 4023 } | |
| 4024 | |
| 4025 no_more_source: | |
| 4026 if (last_id != charset_ascii) | |
| 4027 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | |
| 4028 coding->consumed_char += consumed_chars_base; | |
| 4029 coding->consumed = src_base - coding->source; | |
| 4030 coding->charbuf_used = charbuf - coding->charbuf; | |
| 4031 } | |
| 4032 | |
| 4033 static void | |
| 4034 decode_coding_big5 (coding) | |
| 4035 struct coding_system *coding; | |
| 4036 { | |
| 4037 const unsigned char *src = coding->source + coding->consumed; | |
| 4038 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 4039 const unsigned char *src_base; | |
| 4040 int *charbuf = coding->charbuf; | |
| 4041 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; | |
| 4042 int consumed_chars = 0, consumed_chars_base; | |
| 4043 int multibytep = coding->src_multibyte; | |
| 4044 struct charset *charset_roman, *charset_big5; | |
| 4045 Lisp_Object attrs, eol_type, charset_list, val; | |
| 4046 int char_offset = coding->produced_char; | |
| 4047 int last_offset = char_offset; | |
| 4048 int last_id = charset_ascii; | |
| 4049 | |
| 4050 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 4051 val = charset_list; | |
| 4052 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | |
| 4053 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val))); | |
| 4054 | |
| 4055 while (1) | |
| 4056 { | |
| 4057 int c, c1; | |
| 4058 | |
| 4059 src_base = src; | |
| 4060 consumed_chars_base = consumed_chars; | |
| 4061 | |
| 4062 if (charbuf >= charbuf_end) | |
| 4063 break; | |
| 4064 | |
| 4065 ONE_MORE_BYTE (c); | |
| 4066 | |
| 4067 if (c == '\r') | |
| 4068 { | |
| 4069 if (EQ (eol_type, Qdos)) | |
| 4070 { | |
| 4071 if (src == src_end) | |
| 4072 { | |
| 4073 coding->result = CODING_RESULT_INSUFFICIENT_SRC; | |
| 4074 goto no_more_source; | |
| 4075 } | |
| 4076 if (*src == '\n') | |
| 4077 ONE_MORE_BYTE (c); | |
| 4078 } | |
| 4079 else if (EQ (eol_type, Qmac)) | |
| 4080 c = '\n'; | |
| 4081 } | |
| 4082 else | |
| 4083 { | |
| 4084 struct charset *charset; | |
| 4085 if (c < 0x80) | |
| 4086 charset = charset_roman; | |
| 3106 else | 4087 else |
| 3107 { | 4088 { |
| 3108 /* BIG5 -> Big5 */ | 4089 /* BIG5 -> Big5 */ |
| 3109 if (c1 < 0xA0 || c1 > 0xFE) | 4090 if (c < 0xA1 || c > 0xFE) |
| 3110 goto label_invalid_code; | 4091 goto invalid_code; |
| 3111 ONE_MORE_BYTE (c2); | 4092 ONE_MORE_BYTE (c1); |
| 3112 if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE) | 4093 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE) |
| 3113 goto label_invalid_code; | 4094 goto invalid_code; |
| 3114 DECODE_BIG5 (c1, c2, charset, c1, c2); | 4095 c = c << 8 | c1; |
| 4096 charset = charset_big5; | |
| 3115 } | 4097 } |
| 3116 } | 4098 if (charset->id != charset_ascii |
| 3117 | 4099 && last_id != charset->id) |
| 3118 c = DECODE_ISO_CHARACTER (charset, c1, c2); | 4100 { |
| 3119 EMIT_CHAR (c); | 4101 if (last_id != charset_ascii) |
| 4102 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | |
| 4103 last_id = charset->id; | |
| 4104 last_offset = char_offset; | |
| 4105 } | |
| 4106 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); | |
| 4107 } | |
| 4108 | |
| 4109 *charbuf++ = c; | |
| 4110 char_offset++; | |
| 3120 continue; | 4111 continue; |
| 3121 | 4112 |
| 3122 label_invalid_code: | 4113 invalid_code: |
| 4114 src = src_base; | |
| 4115 consumed_chars = consumed_chars_base; | |
| 4116 ONE_MORE_BYTE (c); | |
| 4117 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); | |
| 4118 char_offset++; | |
| 3123 coding->errors++; | 4119 coding->errors++; |
| 3124 src = src_base; | 4120 } |
| 3125 c = *src++; | 4121 |
| 3126 EMIT_CHAR (c); | 4122 no_more_source: |
| 3127 } | 4123 if (last_id != charset_ascii) |
| 3128 | 4124 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); |
| 3129 label_end_of_loop: | 4125 coding->consumed_char += consumed_chars_base; |
| 3130 coding->consumed = coding->consumed_char = src_base - source; | 4126 coding->consumed = src_base - coding->source; |
| 3131 coding->produced = dst - destination; | 4127 coding->charbuf_used = charbuf - coding->charbuf; |
| 3132 return; | |
| 3133 } | 4128 } |
| 3134 | 4129 |
| 3135 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". | 4130 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". |
| 3136 This function can encode charsets `ascii', `katakana-jisx0201', | 4131 This function can encode charsets `ascii', `katakana-jisx0201', |
| 3137 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We | 4132 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We |
| 3138 are sure that all these charsets are registered as official charset | 4133 are sure that all these charsets are registered as official charset |
| 3139 (i.e. do not have extended leading-codes). Characters of other | 4134 (i.e. do not have extended leading-codes). Characters of other |
| 3140 charsets are produced without any encoding. If SJIS_P is 1, encode | 4135 charsets are produced without any encoding. If SJIS_P is 1, encode |
| 3141 SJIS text, else encode BIG5 text. */ | 4136 SJIS text, else encode BIG5 text. */ |
| 3142 | 4137 |
| 3143 static void | 4138 static int |
| 3144 encode_coding_sjis_big5 (coding, source, destination, | 4139 encode_coding_sjis (coding) |
| 3145 src_bytes, dst_bytes, sjis_p) | |
| 3146 struct coding_system *coding; | 4140 struct coding_system *coding; |
| 3147 unsigned char *source, *destination; | 4141 { |
| 3148 int src_bytes, dst_bytes; | 4142 int multibytep = coding->dst_multibyte; |
| 3149 int sjis_p; | 4143 int *charbuf = coding->charbuf; |
| 3150 { | 4144 int *charbuf_end = charbuf + coding->charbuf_used; |
| 3151 unsigned char *src = source; | 4145 unsigned char *dst = coding->destination + coding->produced; |
| 3152 unsigned char *src_end = source + src_bytes; | 4146 unsigned char *dst_end = coding->destination + coding->dst_bytes; |
| 3153 unsigned char *dst = destination; | 4147 int safe_room = 4; |
| 3154 unsigned char *dst_end = destination + dst_bytes; | 4148 int produced_chars = 0; |
| 3155 /* SRC_BASE remembers the start position in source in each loop. | 4149 Lisp_Object attrs, eol_type, charset_list, val; |
| 3156 The loop will be exited when there's not enough source text to | 4150 int ascii_compatible; |
| 3157 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when | 4151 struct charset *charset_roman, *charset_kanji, *charset_kana; |
| 3158 there's not enough destination area to produce encoded codes | 4152 int c; |
| 3159 (within macro EMIT_BYTES). */ | 4153 |
| 3160 unsigned char *src_base; | 4154 CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 3161 Lisp_Object translation_table; | 4155 val = charset_list; |
| 3162 | 4156 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); |
| 3163 if (NILP (Venable_character_translation)) | 4157 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); |
| 3164 translation_table = Qnil; | 4158 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))); |
| 3165 else | 4159 |
| 3166 { | 4160 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); |
| 3167 translation_table = coding->translation_table_for_encode; | 4161 |
| 3168 if (NILP (translation_table)) | 4162 while (charbuf < charbuf_end) |
| 3169 translation_table = Vstandard_translation_table_for_encode; | 4163 { |
| 3170 } | 4164 ASSURE_DESTINATION (safe_room); |
| 3171 | 4165 c = *charbuf++; |
| 3172 while (1) | |
| 3173 { | |
| 3174 int c, charset, c1, c2; | |
| 3175 | |
| 3176 src_base = src; | |
| 3177 ONE_MORE_CHAR (c); | |
| 3178 | |
| 3179 /* Now encode the character C. */ | 4166 /* Now encode the character C. */ |
| 3180 if (SINGLE_BYTE_CHAR_P (c)) | 4167 if (ASCII_CHAR_P (c) && ascii_compatible) |
| 3181 { | 4168 EMIT_ONE_ASCII_BYTE (c); |
| 3182 switch (c) | 4169 else if (CHAR_BYTE8_P (c)) |
| 4170 { | |
| 4171 c = CHAR_TO_BYTE8 (c); | |
| 4172 EMIT_ONE_BYTE (c); | |
| 4173 } | |
| 4174 else | |
| 4175 { | |
| 4176 unsigned code; | |
| 4177 struct charset *charset = char_charset (c, charset_list, &code); | |
| 4178 | |
| 4179 if (!charset) | |
| 3183 { | 4180 { |
| 3184 case '\r': | 4181 if (coding->mode & CODING_MODE_SAFE_ENCODING) |
| 3185 if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY)) | |
| 3186 { | 4182 { |
| 3187 EMIT_ONE_BYTE (c); | 4183 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION; |
| 3188 break; | 4184 charset = CHARSET_FROM_ID (charset_ascii); |
| 3189 } | |
| 3190 c = '\n'; | |
| 3191 case '\n': | |
| 3192 if (coding->eol_type == CODING_EOL_CRLF) | |
| 3193 { | |
| 3194 EMIT_TWO_BYTES ('\r', c); | |
| 3195 break; | |
| 3196 } | |
| 3197 else if (coding->eol_type == CODING_EOL_CR) | |
| 3198 c = '\r'; | |
| 3199 default: | |
| 3200 EMIT_ONE_BYTE (c); | |
| 3201 } | |
| 3202 } | |
| 3203 else | |
| 3204 { | |
| 3205 SPLIT_CHAR (c, charset, c1, c2); | |
| 3206 if (sjis_p) | |
| 3207 { | |
| 3208 if (charset == charset_jisx0208 | |
| 3209 || charset == charset_jisx0208_1978) | |
| 3210 { | |
| 3211 ENCODE_SJIS (c1, c2, c1, c2); | |
| 3212 EMIT_TWO_BYTES (c1, c2); | |
| 3213 } | |
| 3214 else if (charset == charset_katakana_jisx0201) | |
| 3215 EMIT_ONE_BYTE (c1 | 0x80); | |
| 3216 else if (charset == charset_latin_jisx0201) | |
| 3217 EMIT_ONE_BYTE (c1); | |
| 3218 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR) | |
| 3219 { | |
| 3220 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); | |
| 3221 if (CHARSET_WIDTH (charset) > 1) | |
| 3222 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); | |
| 3223 } | 4185 } |
| 3224 else | 4186 else |
| 3225 /* There's no way other than producing the internal | 4187 { |
| 3226 codes as is. */ | 4188 c = coding->default_char; |
| 3227 EMIT_BYTES (src_base, src); | 4189 charset = char_charset (c, charset_list, &code); |
| 4190 } | |
| 4191 } | |
| 4192 if (code == CHARSET_INVALID_CODE (charset)) | |
| 4193 abort (); | |
| 4194 if (charset == charset_kanji) | |
| 4195 { | |
| 4196 int c1, c2; | |
| 4197 JIS_TO_SJIS (code); | |
| 4198 c1 = code >> 8, c2 = code & 0xFF; | |
| 4199 EMIT_TWO_BYTES (c1, c2); | |
| 4200 } | |
| 4201 else if (charset == charset_kana) | |
| 4202 EMIT_ONE_BYTE (code | 0x80); | |
| 4203 else | |
| 4204 EMIT_ONE_ASCII_BYTE (code & 0x7F); | |
| 4205 } | |
| 4206 } | |
| 4207 coding->result = CODING_RESULT_SUCCESS; | |
| 4208 coding->produced_char += produced_chars; | |
| 4209 coding->produced = dst - coding->destination; | |
| 4210 return 0; | |
| 4211 } | |
| 4212 | |
| 4213 static int | |
| 4214 encode_coding_big5 (coding) | |
| 4215 struct coding_system *coding; | |
| 4216 { | |
| 4217 int multibytep = coding->dst_multibyte; | |
| 4218 int *charbuf = coding->charbuf; | |
| 4219 int *charbuf_end = charbuf + coding->charbuf_used; | |
| 4220 unsigned char *dst = coding->destination + coding->produced; | |
| 4221 unsigned char *dst_end = coding->destination + coding->dst_bytes; | |
| 4222 int safe_room = 4; | |
| 4223 int produced_chars = 0; | |
| 4224 Lisp_Object attrs, eol_type, charset_list, val; | |
| 4225 int ascii_compatible; | |
| 4226 struct charset *charset_roman, *charset_big5; | |
| 4227 int c; | |
| 4228 | |
| 4229 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 4230 val = charset_list; | |
| 4231 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | |
| 4232 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val))); | |
| 4233 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); | |
| 4234 | |
| 4235 while (charbuf < charbuf_end) | |
| 4236 { | |
| 4237 ASSURE_DESTINATION (safe_room); | |
| 4238 c = *charbuf++; | |
| 4239 /* Now encode the character C. */ | |
| 4240 if (ASCII_CHAR_P (c) && ascii_compatible) | |
| 4241 EMIT_ONE_ASCII_BYTE (c); | |
| 4242 else if (CHAR_BYTE8_P (c)) | |
| 4243 { | |
| 4244 c = CHAR_TO_BYTE8 (c); | |
| 4245 EMIT_ONE_BYTE (c); | |
| 4246 } | |
| 4247 else | |
| 4248 { | |
| 4249 unsigned code; | |
| 4250 struct charset *charset = char_charset (c, charset_list, &code); | |
| 4251 | |
| 4252 if (! charset) | |
| 4253 { | |
| 4254 if (coding->mode & CODING_MODE_SAFE_ENCODING) | |
| 4255 { | |
| 4256 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION; | |
| 4257 charset = CHARSET_FROM_ID (charset_ascii); | |
| 4258 } | |
| 4259 else | |
| 4260 { | |
| 4261 c = coding->default_char; | |
| 4262 charset = char_charset (c, charset_list, &code); | |
| 4263 } | |
| 4264 } | |
| 4265 if (code == CHARSET_INVALID_CODE (charset)) | |
| 4266 abort (); | |
| 4267 if (charset == charset_big5) | |
| 4268 { | |
| 4269 int c1, c2; | |
| 4270 | |
| 4271 c1 = code >> 8, c2 = code & 0xFF; | |
| 4272 EMIT_TWO_BYTES (c1, c2); | |
| 4273 } | |
| 4274 else | |
| 4275 EMIT_ONE_ASCII_BYTE (code & 0x7F); | |
| 4276 } | |
| 4277 } | |
| 4278 coding->result = CODING_RESULT_SUCCESS; | |
| 4279 coding->produced_char += produced_chars; | |
| 4280 coding->produced = dst - coding->destination; | |
| 4281 return 0; | |
| 4282 } | |
| 4283 | |
| 4284 | |
| 4285 /*** 10. CCL handlers ***/ | |
| 4286 | |
| 4287 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| | 4288 Check if a text is encoded in a coding system whose |
| | 4289 encoder/decoder are written as CCL programs. If it is, return |
| 4290 CATEGORY_MASK_CCL, else return 0. */ | |
| 4291 | |
| 4292 static int | |
| 4293 detect_coding_ccl (coding, detect_info) | |
| 4294 struct coding_system *coding; | |
| 4295 struct coding_detection_info *detect_info; | |
| 4296 { | |
| 4297 const unsigned char *src = coding->source, *src_base = src; | |
| 4298 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 4299 int multibytep = coding->src_multibyte; | |
| 4300 int consumed_chars = 0; | |
| 4301 int found = 0; | |
| 4302 unsigned char *valids = CODING_CCL_VALIDS (coding); | |
| 4303 int head_ascii = coding->head_ascii; | |
| 4304 Lisp_Object attrs; | |
| 4305 | |
| 4306 detect_info->checked |= CATEGORY_MASK_CCL; | |
| 4307 | |
| 4308 coding = &coding_categories[coding_category_ccl]; | |
| 4309 attrs = CODING_ID_ATTRS (coding->id); | |
| 4310 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) | |
| 4311 src += head_ascii; | |
| 4312 | |
| 4313 while (1) | |
| 4314 { | |
| 4315 int c; | |
| 4316 ONE_MORE_BYTE (c); | |
| 4317 if (! valids[c]) | |
| 4318 break; | |
| 4319 if ((valids[c] > 1)) | |
| 4320 found = CATEGORY_MASK_CCL; | |
| 4321 } | |
| 4322 detect_info->rejected |= CATEGORY_MASK_CCL; | |
| 4323 return 0; | |
| 4324 | |
| 4325 no_more_source: | |
| 4326 detect_info->found |= found; | |
| 4327 return 1; | |
| 4328 } | |
| 4329 | |
| 4330 static void | |
| 4331 decode_coding_ccl (coding) | |
| 4332 struct coding_system *coding; | |
| 4333 { | |
| 4334 const unsigned char *src = coding->source + coding->consumed; | |
| 4335 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 4336 int *charbuf = coding->charbuf; | |
| 4337 int *charbuf_end = charbuf + coding->charbuf_size; | |
| 4338 int consumed_chars = 0; | |
| 4339 int multibytep = coding->src_multibyte; | |
| 4340 struct ccl_program ccl; | |
| 4341 int source_charbuf[1024]; | |
| 4342 int source_byteidx[1024]; | |
| 4343 Lisp_Object attrs, eol_type, charset_list; | |
| 4344 | |
| 4345 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 4346 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding)); | |
| 4347 | |
| 4348 while (src < src_end) | |
| 4349 { | |
| 4350 const unsigned char *p = src; | |
| 4351 int *source, *source_end; | |
| 4352 int i = 0; | |
| 4353 | |
| 4354 if (multibytep) | |
| 4355 while (i < 1024 && p < src_end) | |
| 4356 { | |
| 4357 source_byteidx[i] = p - src; | |
| 4358 source_charbuf[i++] = STRING_CHAR_ADVANCE (p); | |
| 4359 } | |
| 4360 else | |
| 4361 while (i < 1024 && p < src_end) | |
| 4362 source_charbuf[i++] = *p++; | |
| 4363 | |
| 4364 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK) | |
| 4365 ccl.last_block = 1; | |
| 4366 | |
| 4367 source = source_charbuf; | |
| 4368 source_end = source + i; | |
| 4369 while (source < source_end) | |
| 4370 { | |
| 4371 ccl_driver (&ccl, source, charbuf, | |
| 4372 source_end - source, charbuf_end - charbuf, | |
| 4373 charset_list); | |
| 4374 source += ccl.consumed; | |
| 4375 charbuf += ccl.produced; | |
| 4376 if (ccl.status != CCL_STAT_SUSPEND_BY_DST) | |
| 4377 break; | |
| 4378 } | |
| 4379 if (source < source_end) | |
| 4380 src += source_byteidx[source - source_charbuf]; | |
| 4381 else | |
| 4382 src = p; | |
| 4383 consumed_chars += source - source_charbuf; | |
| 4384 | |
| 4385 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC | |
| 4386 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC) | |
| 4387 break; | |
| 4388 } | |
| 4389 | |
| 4390 switch (ccl.status) | |
| 4391 { | |
| 4392 case CCL_STAT_SUSPEND_BY_SRC: | |
| 4393 coding->result = CODING_RESULT_INSUFFICIENT_SRC; | |
| 4394 break; | |
| 4395 case CCL_STAT_SUSPEND_BY_DST: | |
| 4396 break; | |
| 4397 case CCL_STAT_QUIT: | |
| 4398 case CCL_STAT_INVALID_CMD: | |
| 4399 coding->result = CODING_RESULT_INTERRUPT; | |
| 4400 break; | |
| 4401 default: | |
| 4402 coding->result = CODING_RESULT_SUCCESS; | |
| 4403 break; | |
| 4404 } | |
| 4405 coding->consumed_char += consumed_chars; | |
| 4406 coding->consumed = src - coding->source; | |
| 4407 coding->charbuf_used = charbuf - coding->charbuf; | |
| 4408 } | |
| 4409 | |
| 4410 static int | |
| 4411 encode_coding_ccl (coding) | |
| 4412 struct coding_system *coding; | |
| 4413 { | |
| 4414 struct ccl_program ccl; | |
| 4415 int multibytep = coding->dst_multibyte; | |
| 4416 int *charbuf = coding->charbuf; | |
| 4417 int *charbuf_end = charbuf + coding->charbuf_used; | |
| 4418 unsigned char *dst = coding->destination + coding->produced; | |
| 4419 unsigned char *dst_end = coding->destination + coding->dst_bytes; | |
| 4420 unsigned char *adjusted_dst_end = dst_end - 1; | |
| 4421 int destination_charbuf[1024]; | |
| 4422 int i, produced_chars = 0; | |
| 4423 Lisp_Object attrs, eol_type, charset_list; | |
| 4424 | |
| 4425 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 4426 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding)); | |
| 4427 | |
| 4428 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK; | |
| 4429 ccl.dst_multibyte = coding->dst_multibyte; | |
| 4430 | |
| 4431 while (charbuf < charbuf_end && dst < adjusted_dst_end) | |
| 4432 { | |
| 4433 int dst_bytes = dst_end - dst; | |
| 4434 if (dst_bytes > 1024) | |
| 4435 dst_bytes = 1024; | |
| 4436 | |
| 4437 ccl_driver (&ccl, charbuf, destination_charbuf, | |
| 4438 charbuf_end - charbuf, dst_bytes, charset_list); | |
| 4439 charbuf += ccl.consumed; | |
| 4440 if (multibytep) | |
| 4441 for (i = 0; i < ccl.produced; i++) | |
| 4442 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF); | |
| 4443 else | |
| 4444 { | |
| 4445 for (i = 0; i < ccl.produced; i++) | |
| 4446 *dst++ = destination_charbuf[i] & 0xFF; | |
| 4447 produced_chars += ccl.produced; | |
| 4448 } | |
| 4449 } | |
| 4450 | |
| 4451 switch (ccl.status) | |
| 4452 { | |
| 4453 case CCL_STAT_SUSPEND_BY_SRC: | |
| 4454 coding->result = CODING_RESULT_INSUFFICIENT_SRC; | |
| 4455 break; | |
| 4456 case CCL_STAT_SUSPEND_BY_DST: | |
| 4457 coding->result = CODING_RESULT_INSUFFICIENT_DST; | |
| 4458 break; | |
| 4459 case CCL_STAT_QUIT: | |
| 4460 case CCL_STAT_INVALID_CMD: | |
| 4461 coding->result = CODING_RESULT_INTERRUPT; | |
| 4462 break; | |
| 4463 default: | |
| 4464 coding->result = CODING_RESULT_SUCCESS; | |
| 4465 break; | |
| 4466 } | |
| 4467 | |
| 4468 coding->produced_char += produced_chars; | |
| 4469 coding->produced = dst - coding->destination; | |
| 4470 return 0; | |
| 4471 } | |
| 4472 | |
| 4473 | |
| 4474 | |
| 4475 /*** 10, 11. no-conversion handlers ***/ | |
| 4476 | |
| 4477 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ | |
| 4478 | |
| 4479 static void | |
| 4480 decode_coding_raw_text (coding) | |
| 4481 struct coding_system *coding; | |
| 4482 { | |
| 4483 coding->chars_at_source = 1; | |
| 4484 coding->consumed_char = 0; | |
| 4485 coding->consumed = 0; | |
| 4486 coding->result = CODING_RESULT_SUCCESS; | |
| 4487 } | |
| 4488 | |
| 4489 static int | |
| 4490 encode_coding_raw_text (coding) | |
| 4491 struct coding_system *coding; | |
| 4492 { | |
| 4493 int multibytep = coding->dst_multibyte; | |
| 4494 int *charbuf = coding->charbuf; | |
| 4495 int *charbuf_end = coding->charbuf + coding->charbuf_used; | |
| 4496 unsigned char *dst = coding->destination + coding->produced; | |
| 4497 unsigned char *dst_end = coding->destination + coding->dst_bytes; | |
| 4498 int produced_chars = 0; | |
| 4499 int c; | |
| 4500 | |
| 4501 if (multibytep) | |
| 4502 { | |
| 4503 int safe_room = MAX_MULTIBYTE_LENGTH * 2; | |
| 4504 | |
| 4505 if (coding->src_multibyte) | |
| 4506 while (charbuf < charbuf_end) | |
| 4507 { | |
| 4508 ASSURE_DESTINATION (safe_room); | |
| 4509 c = *charbuf++; | |
| 4510 if (ASCII_CHAR_P (c)) | |
| 4511 EMIT_ONE_ASCII_BYTE (c); | |
| 4512 else if (CHAR_BYTE8_P (c)) | |
| 4513 { | |
| 4514 c = CHAR_TO_BYTE8 (c); | |
| 4515 EMIT_ONE_BYTE (c); | |
| 4516 } | |
| 4517 else | |
| 4518 { | |
| 4519 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str; | |
| 4520 | |
| 4521 CHAR_STRING_ADVANCE (c, p1); | |
| 4522 while (p0 < p1) | |
| 4523 { | |
| 4524 EMIT_ONE_BYTE (*p0); | |
| 4525 p0++; | |
| 4526 } | |
| 4527 } | |
| 4528 } | |
| 4529 else | |
| 4530 while (charbuf < charbuf_end) | |
| 4531 { | |
| 4532 ASSURE_DESTINATION (safe_room); | |
| 4533 c = *charbuf++; | |
| 4534 EMIT_ONE_BYTE (c); | |
| 4535 } | |
| 4536 } | |
| 4537 else | |
| 4538 { | |
| 4539 if (coding->src_multibyte) | |
| 4540 { | |
| 4541 int safe_room = MAX_MULTIBYTE_LENGTH; | |
| 4542 | |
| 4543 while (charbuf < charbuf_end) | |
| 4544 { | |
| 4545 ASSURE_DESTINATION (safe_room); | |
| 4546 c = *charbuf++; | |
| 4547 if (ASCII_CHAR_P (c)) | |
| 4548 *dst++ = c; | |
| 4549 else if (CHAR_BYTE8_P (c)) | |
| 4550 *dst++ = CHAR_TO_BYTE8 (c); | |
| 4551 else | |
| 4552 CHAR_STRING_ADVANCE (c, dst); | |
| 4553 produced_chars++; | |
| 4554 } | |
| 4555 } | |
| 4556 else | |
| 4557 { | |
| 4558 ASSURE_DESTINATION (charbuf_end - charbuf); | |
| 4559 while (charbuf < charbuf_end && dst < dst_end) | |
| 4560 *dst++ = *charbuf++; | |
| 4561 produced_chars = dst - (coding->destination + coding->dst_bytes); | |
| 4562 } | |
| 4563 } | |
| 4564 coding->result = CODING_RESULT_SUCCESS; | |
| 4565 coding->produced_char += produced_chars; | |
| 4566 coding->produced = dst - coding->destination; | |
| 4567 return 0; | |
| 4568 } | |
| 4569 | |
| 4570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| 4571 Check if a text is encoded in a charset-based coding system. If it | |
| 4572 is, return 1, else return 0. */ | |
| 4573 | |
| 4574 static int | |
| 4575 detect_coding_charset (coding, detect_info) | |
| 4576 struct coding_system *coding; | |
| 4577 struct coding_detection_info *detect_info; | |
| 4578 { | |
| 4579 const unsigned char *src = coding->source, *src_base = src; | |
| 4580 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 4581 int multibytep = coding->src_multibyte; | |
| 4582 int consumed_chars = 0; | |
| 4583 Lisp_Object attrs, valids; | |
| 4584 int found = 0; | |
| 4585 | |
| 4586 detect_info->checked |= CATEGORY_MASK_CHARSET; | |
| 4587 | |
| 4588 coding = &coding_categories[coding_category_charset]; | |
| 4589 attrs = CODING_ID_ATTRS (coding->id); | |
| 4590 valids = AREF (attrs, coding_attr_charset_valids); | |
| 4591 | |
| 4592 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) | |
| 4593 src += coding->head_ascii; | |
| 4594 | |
| 4595 while (1) | |
| 4596 { | |
| 4597 int c; | |
| 4598 | |
| 4599 ONE_MORE_BYTE (c); | |
| 4600 if (NILP (AREF (valids, c))) | |
| 4601 break; | |
| 4602 if (c >= 0x80) | |
| 4603 found = CATEGORY_MASK_CHARSET; | |
| 4604 } | |
| 4605 detect_info->rejected |= CATEGORY_MASK_CHARSET; | |
| 4606 return 0; | |
| 4607 | |
| 4608 no_more_source: | |
| 4609 detect_info->found |= found; | |
| 4610 return 1; | |
| 4611 } | |
| 4612 | |
| 4613 static void | |
| 4614 decode_coding_charset (coding) | |
| 4615 struct coding_system *coding; | |
| 4616 { | |
| 4617 const unsigned char *src = coding->source + coding->consumed; | |
| 4618 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 4619 const unsigned char *src_base; | |
| 4620 int *charbuf = coding->charbuf; | |
| 4621 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH; | |
| 4622 int consumed_chars = 0, consumed_chars_base; | |
| 4623 int multibytep = coding->src_multibyte; | |
| 4624 Lisp_Object attrs, eol_type, charset_list, valids; | |
| 4625 int char_offset = coding->produced_char; | |
| 4626 int last_offset = char_offset; | |
| 4627 int last_id = charset_ascii; | |
| 4628 | |
| 4629 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 4630 valids = AREF (attrs, coding_attr_charset_valids); | |
| 4631 | |
| 4632 while (1) | |
| 4633 { | |
| 4634 int c; | |
| 4635 | |
| 4636 src_base = src; | |
| 4637 consumed_chars_base = consumed_chars; | |
| 4638 | |
| 4639 if (charbuf >= charbuf_end) | |
| 4640 break; | |
| 4641 | |
| 4642 ONE_MORE_BYTE (c); | |
| 4643 if (c == '\r') | |
| 4644 { | |
| 4645 /* Here we assume that no charset maps '\r' to something | |
| 4646 else. */ | |
| 4647 if (EQ (eol_type, Qdos)) | |
| 4648 { | |
| 4649 if (src == src_end) | |
| 4650 { | |
| 4651 coding->result = CODING_RESULT_INSUFFICIENT_SRC; | |
| 4652 goto no_more_source; | |
| 4653 } | |
| 4654 if (*src == '\n') | |
| 4655 ONE_MORE_BYTE (c); | |
| 4656 } | |
| 4657 else if (EQ (eol_type, Qmac)) | |
| 4658 c = '\n'; | |
| 4659 } | |
| 4660 else | |
| 4661 { | |
| 4662 Lisp_Object val; | |
| 4663 struct charset *charset; | |
| 4664 int dim; | |
| 4665 int len = 1; | |
| 4666 unsigned code = c; | |
| 4667 | |
| 4668 val = AREF (valids, c); | |
| 4669 if (NILP (val)) | |
| 4670 goto invalid_code; | |
| 4671 if (INTEGERP (val)) | |
| 4672 { | |
| 4673 charset = CHARSET_FROM_ID (XFASTINT (val)); | |
| 4674 dim = CHARSET_DIMENSION (charset); | |
| 4675 while (len < dim) | |
| 4676 { | |
| 4677 ONE_MORE_BYTE (c); | |
| 4678 code = (code << 8) | c; | |
| 4679 len++; | |
| 4680 } | |
| 4681 CODING_DECODE_CHAR (coding, src, src_base, src_end, | |
| 4682 charset, code, c); | |
| 3228 } | 4683 } |
| 3229 else | 4684 else |
| 3230 { | 4685 { |
| 3231 if (charset == charset_big5_1 || charset == charset_big5_2) | 4686 /* VAL is a list of charset IDs. It is assured that the |
| 4687 list is sorted by charset dimensions (smaller one | |
| 4688 comes first). */ | |
| 4689 while (CONSP (val)) | |
| 3232 { | 4690 { |
| 3233 ENCODE_BIG5 (charset, c1, c2, c1, c2); | 4691 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val))); |
| 3234 EMIT_TWO_BYTES (c1, c2); | 4692 dim = CHARSET_DIMENSION (charset); |
| 3235 } | 4693 while (len < dim) |
| 3236 else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR) | 4694 { |
| 3237 { | 4695 ONE_MORE_BYTE (c); |
| 3238 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); | 4696 code = (code << 8) | c; |
| 3239 if (CHARSET_WIDTH (charset) > 1) | 4697 len++; |
| 3240 EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); | 4698 } |
| 3241 } | 4699 CODING_DECODE_CHAR (coding, src, src_base, |
| 3242 else | 4700 src_end, charset, code, c); |
| 3243 /* There's no way other than producing the internal | 4701 if (c >= 0) |
| 3244 codes as is. */ | 4702 break; |
| 3245 EMIT_BYTES (src_base, src); | 4703 val = XCDR (val); |
| 3246 } | |
| 3247 } | |
| 3248 coding->consumed_char++; | |
| 3249 } | |
| 3250 | |
| 3251 label_end_of_loop: | |
| 3252 coding->consumed = src_base - source; | |
| 3253 coding->produced = coding->produced_char = dst - destination; | |
| 3254 } | |
| 3255 | |
| 3256 | |
| 3257 /*** 5. CCL handlers ***/ | |
| 3258 | |
| 3259 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| 3260 Check if a text is encoded in a coding system of which | |
| 3261 encoder/decoder are written in CCL program. If it is, return | |
| 3262 CODING_CATEGORY_MASK_CCL, else return 0. */ | |
| 3263 | |
| 3264 static int | |
| 3265 detect_coding_ccl (src, src_end, multibytep) | |
| 3266 unsigned char *src, *src_end; | |
| 3267 int multibytep; | |
| 3268 { | |
| 3269 unsigned char *valid; | |
| 3270 int c; | |
| 3271 /* Dummy for ONE_MORE_BYTE. */ | |
| 3272 struct coding_system dummy_coding; | |
| 3273 struct coding_system *coding = &dummy_coding; | |
| 3274 | |
| 3275 /* No coding system is assigned to coding-category-ccl. */ | |
| 3276 if (!coding_system_table[CODING_CATEGORY_IDX_CCL]) | |
| 3277 return 0; | |
| 3278 | |
| 3279 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; | |
| 3280 while (1) | |
| 3281 { | |
| 3282 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | |
| 3283 if (! valid[c]) | |
| 3284 return 0; | |
| 3285 } | |
| 3286 label_end_of_loop: | |
| 3287 return CODING_CATEGORY_MASK_CCL; | |
| 3288 } | |
| 3289 | |
| 3290 | |
| 3291 /*** 6. End-of-line handlers ***/ | |
| 3292 | |
| 3293 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ | |
| 3294 | |
| 3295 static void | |
| 3296 decode_eol (coding, source, destination, src_bytes, dst_bytes) | |
| 3297 struct coding_system *coding; | |
| 3298 unsigned char *source, *destination; | |
| 3299 int src_bytes, dst_bytes; | |
| 3300 { | |
| 3301 unsigned char *src = source; | |
| 3302 unsigned char *dst = destination; | |
| 3303 unsigned char *src_end = src + src_bytes; | |
| 3304 unsigned char *dst_end = dst + dst_bytes; | |
| 3305 Lisp_Object translation_table; | |
| 3306 /* SRC_BASE remembers the start position in source in each loop. | |
| 3307 The loop will be exited when there's not enough source code | |
| 3308 (within macro ONE_MORE_BYTE), or when there's not enough | |
| 3309 destination area to produce a character (within macro | |
| 3310 EMIT_CHAR). */ | |
| 3311 unsigned char *src_base; | |
| 3312 int c; | |
| 3313 | |
| 3314 translation_table = Qnil; | |
| 3315 switch (coding->eol_type) | |
| 3316 { | |
| 3317 case CODING_EOL_CRLF: | |
| 3318 while (1) | |
| 3319 { | |
| 3320 src_base = src; | |
| 3321 ONE_MORE_BYTE (c); | |
| 3322 if (c == '\r') | |
| 3323 { | |
| 3324 ONE_MORE_BYTE (c); | |
| 3325 if (c != '\n') | |
| 3326 { | |
| 3327 src--; | |
| 3328 c = '\r'; | |
| 3329 } | 4704 } |
| 3330 } | 4705 } |
| 3331 else if (c == '\n' | 4706 if (c < 0) |
| 3332 && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)) | 4707 goto invalid_code; |
| 4708 if (charset->id != charset_ascii | |
| 4709 && last_id != charset->id) | |
| 3333 { | 4710 { |
| 3334 coding->result = CODING_FINISH_INCONSISTENT_EOL; | 4711 if (last_id != charset_ascii) |
| 3335 goto label_end_of_loop; | 4712 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); |
| 4713 last_id = charset->id; | |
| 4714 last_offset = char_offset; | |
| 3336 } | 4715 } |
| 3337 EMIT_CHAR (c); | 4716 } |
| 3338 } | 4717 *charbuf++ = c; |
| 3339 break; | 4718 char_offset++; |
| 3340 | 4719 continue; |
| 3341 case CODING_EOL_CR: | 4720 |
| 3342 while (1) | 4721 invalid_code: |
| 3343 { | 4722 src = src_base; |
| 3344 src_base = src; | 4723 consumed_chars = consumed_chars_base; |
| 3345 ONE_MORE_BYTE (c); | 4724 ONE_MORE_BYTE (c); |
| 3346 if (c == '\n') | 4725 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); |
| 4726 char_offset++; | |
| 4727 coding->errors++; | |
| 4728 } | |
| 4729 | |
| 4730 no_more_source: | |
| 4731 if (last_id != charset_ascii) | |
| 4732 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id); | |
| 4733 coding->consumed_char += consumed_chars_base; | |
| 4734 coding->consumed = src_base - coding->source; | |
| 4735 coding->charbuf_used = charbuf - coding->charbuf; | |
| 4736 } | |
| 4737 | |
| 4738 static int | |
| 4739 encode_coding_charset (coding) | |
| 4740 struct coding_system *coding; | |
| 4741 { | |
| 4742 int multibytep = coding->dst_multibyte; | |
| 4743 int *charbuf = coding->charbuf; | |
| 4744 int *charbuf_end = charbuf + coding->charbuf_used; | |
| 4745 unsigned char *dst = coding->destination + coding->produced; | |
| 4746 unsigned char *dst_end = coding->destination + coding->dst_bytes; | |
| 4747 int safe_room = MAX_MULTIBYTE_LENGTH; | |
| 4748 int produced_chars = 0; | |
| 4749 Lisp_Object attrs, eol_type, charset_list; | |
| 4750 int ascii_compatible; | |
| 4751 int c; | |
| 4752 | |
| 4753 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | |
| 4754 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); | |
| 4755 | |
| 4756 while (charbuf < charbuf_end) | |
| 4757 { | |
| 4758 struct charset *charset; | |
| 4759 unsigned code; | |
| 4760 | |
| 4761 ASSURE_DESTINATION (safe_room); | |
| 4762 c = *charbuf++; | |
| 4763 if (ascii_compatible && ASCII_CHAR_P (c)) | |
| 4764 EMIT_ONE_ASCII_BYTE (c); | |
| 4765 else if (CHAR_BYTE8_P (c)) | |
| 4766 { | |
| 4767 c = CHAR_TO_BYTE8 (c); | |
| 4768 EMIT_ONE_BYTE (c); | |
| 4769 } | |
| 4770 else | |
| 4771 { | |
| 4772 charset = char_charset (c, charset_list, &code); | |
| 4773 if (charset) | |
| 3347 { | 4774 { |
| 3348 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) | 4775 if (CHARSET_DIMENSION (charset) == 1) |
| 3349 { | 4776 EMIT_ONE_BYTE (code); |
| 3350 coding->result = CODING_FINISH_INCONSISTENT_EOL; | 4777 else if (CHARSET_DIMENSION (charset) == 2) |
| 3351 goto label_end_of_loop; | 4778 EMIT_TWO_BYTES (code >> 8, code & 0xFF); |
| 3352 } | 4779 else if (CHARSET_DIMENSION (charset) == 3) |
| 4780 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF); | |
| 4781 else | |
| 4782 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF, | |
| 4783 (code >> 8) & 0xFF, code & 0xFF); | |
| 3353 } | 4784 } |
| 3354 else if (c == '\r') | |
| 3355 c = '\n'; | |
| 3356 EMIT_CHAR (c); | |
| 3357 } | |
| 3358 break; | |
| 3359 | |
| 3360 default: /* no need for EOL handling */ | |
| 3361 while (1) | |
| 3362 { | |
| 3363 src_base = src; | |
| 3364 ONE_MORE_BYTE (c); | |
| 3365 EMIT_CHAR (c); | |
| 3366 } | |
| 3367 } | |
| 3368 | |
| 3369 label_end_of_loop: | |
| 3370 coding->consumed = coding->consumed_char = src_base - source; | |
| 3371 coding->produced = dst - destination; | |
| 3372 return; | |
| 3373 } | |
| 3374 | |
| 3375 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode | |
| 3376 format of end-of-line according to `coding->eol_type'. It also | |
| 3377 convert multibyte form 8-bit characters to unibyte if | |
| 3378 CODING->src_multibyte is nonzero. If `coding->mode & | |
| 3379 CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text | |
| 3380 also means end-of-line. */ | |
| 3381 | |
| 3382 static void | |
| 3383 encode_eol (coding, source, destination, src_bytes, dst_bytes) | |
| 3384 struct coding_system *coding; | |
| 3385 const unsigned char *source; | |
| 3386 unsigned char *destination; | |
| 3387 int src_bytes, dst_bytes; | |
| 3388 { | |
| 3389 const unsigned char *src = source; | |
| 3390 unsigned char *dst = destination; | |
| 3391 const unsigned char *src_end = src + src_bytes; | |
| 3392 unsigned char *dst_end = dst + dst_bytes; | |
| 3393 Lisp_Object translation_table; | |
| 3394 /* SRC_BASE remembers the start position in source in each loop. | |
| 3395 The loop will be exited when there's not enough source text to | |
| 3396 analyze multi-byte codes (within macro ONE_MORE_CHAR), or when | |
| 3397 there's not enough destination area to produce encoded codes | |
| 3398 (within macro EMIT_BYTES). */ | |
| 3399 const unsigned char *src_base; | |
| 3400 unsigned char *tmp; | |
| 3401 int c; | |
| 3402 int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY; | |
| 3403 | |
| 3404 translation_table = Qnil; | |
| 3405 if (coding->src_multibyte | |
| 3406 && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL) | |
| 3407 { | |
| 3408 src_end--; | |
| 3409 src_bytes--; | |
| 3410 coding->result = CODING_FINISH_INSUFFICIENT_SRC; | |
| 3411 } | |
| 3412 | |
| 3413 if (coding->eol_type == CODING_EOL_CRLF) | |
| 3414 { | |
| 3415 while (src < src_end) | |
| 3416 { | |
| 3417 src_base = src; | |
| 3418 c = *src++; | |
| 3419 if (c >= 0x20) | |
| 3420 EMIT_ONE_BYTE (c); | |
| 3421 else if (c == '\n' || (c == '\r' && selective_display)) | |
| 3422 EMIT_TWO_BYTES ('\r', '\n'); | |
| 3423 else | 4785 else |
| 3424 EMIT_ONE_BYTE (c); | 4786 { |
| 3425 } | 4787 if (coding->mode & CODING_MODE_SAFE_ENCODING) |
| 3426 src_base = src; | 4788 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION; |
| 3427 label_end_of_loop: | 4789 else |
| 3428 ; | 4790 c = coding->default_char; |
| 3429 } | 4791 EMIT_ONE_BYTE (c); |
| 3430 else | 4792 } |
| 3431 { | 4793 } |
| 3432 if (!dst_bytes || src_bytes <= dst_bytes) | 4794 } |
| 3433 { | 4795 |
| 3434 safe_bcopy (src, dst, src_bytes); | 4796 coding->result = CODING_RESULT_SUCCESS; |
| 3435 src_base = src_end; | 4797 coding->produced_char += produced_chars; |
| 3436 dst += src_bytes; | 4798 coding->produced = dst - coding->destination; |
| 3437 } | 4799 return 0; |
| 3438 else | |
| 3439 { | |
| 3440 if (coding->src_multibyte | |
| 3441 && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL) | |
| 3442 dst_bytes--; | |
| 3443 safe_bcopy (src, dst, dst_bytes); | |
| 3444 src_base = src + dst_bytes; | |
| 3445 dst = destination + dst_bytes; | |
| 3446 coding->result = CODING_FINISH_INSUFFICIENT_DST; | |
| 3447 } | |
| 3448 if (coding->eol_type == CODING_EOL_CR) | |
| 3449 { | |
| 3450 for (tmp = destination; tmp < dst; tmp++) | |
| 3451 if (*tmp == '\n') *tmp = '\r'; | |
| 3452 } | |
| 3453 else if (selective_display) | |
| 3454 { | |
| 3455 for (tmp = destination; tmp < dst; tmp++) | |
| 3456 if (*tmp == '\r') *tmp = '\n'; | |
| 3457 } | |
| 3458 } | |
| 3459 if (coding->src_multibyte) | |
| 3460 dst = destination + str_as_unibyte (destination, dst - destination); | |
| 3461 | |
| 3462 coding->consumed = src_base - source; | |
| 3463 coding->produced = dst - destination; | |
| 3464 coding->produced_char = coding->produced; | |
| 3465 } | 4800 } |
| 3466 | 4801 |
| 3467 | 4802 |
| 3468 /*** 7. C library functions ***/ | 4803 /*** 7. C library functions ***/ |
| 3469 | 4804 |
| 3470 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which | 4805 /* Setup coding context CODING from information about CODING_SYSTEM. |
| 3471 has a property `coding-system'. The value of this property is a | 4806 If CODING_SYSTEM is nil, `no-conversion' is assumed. If |
| 3472 vector of length 5 (called the coding-vector). Among elements of | 4807 CODING_SYSTEM is invalid, signal an error. */ |
| 3473 this vector, the first (element[0]) and the fifth (element[4]) | 4808 |
| 3474 carry important information for decoding/encoding. Before | 4809 void |
| 3475 decoding/encoding, this information should be set in fields of a | |
| 3476 structure of type `coding_system'. | |
| 3477 | |
| 3478 The value of the property `coding-system' can be a symbol of another | |
| 3479 subsidiary coding-system. In that case, Emacs gets coding-vector | |
| 3480 from that symbol. | |
| 3481 | |
| 3482 `element[0]' contains information to be set in `coding->type'. The | |
| 3483 value and its meaning is as follows: | |
| 3484 | |
| 3485 0 -- coding_type_emacs_mule | |
| 3486 1 -- coding_type_sjis | |
| 3487 2 -- coding_type_iso2022 | |
| 3488 3 -- coding_type_big5 | |
| 3489 4 -- coding_type_ccl encoder/decoder written in CCL | |
| 3490 nil -- coding_type_no_conversion | |
| 3491 t -- coding_type_undecided (automatic conversion on decoding, | |
| 3492 no-conversion on encoding) | |
| 3493 | |
| 3494 `element[4]' contains information to be set in `coding->flags' and | |
| 3495 `coding->spec'. The meaning varies by `coding->type'. | |
| 3496 | |
| 3497 If `coding->type' is `coding_type_iso2022', element[4] is a vector | |
| 3498 of length 32 (of which the first 13 sub-elements are used now). | |
| 3499 Meanings of these sub-elements are: | |
| 3500 | |
| 3501 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022' | |
| 3502 If the value is an integer of valid charset, the charset is | |
| 3503 assumed to be designated to graphic register N initially. | |
| 3504 | |
| 3505 If the value is minus, it is a minus value of charset which | |
| 3506 reserves graphic register N, which means that the charset is | |
| 3507 not designated initially but should be designated to graphic | |
| 3508 register N just before encoding a character in that charset. | |
| 3509 | |
| 3510 If the value is nil, graphic register N is never used on | |
| 3511 encoding. | |
| 3512 | |
| 3513 sub-element[N] where N is 4 through 11: to be set in `coding->flags' | |
| 3514 Each value takes t or nil. See the section ISO2022 of | |
| 3515 `coding.h' for more information. | |
| 3516 | |
| 3517 If `coding->type' is `coding_type_big5', element[4] is t to denote | |
| 3518 BIG5-ETen or nil to denote BIG5-HKU. | |
| 3519 | |
| 3520 If `coding->type' takes the other value, element[4] is ignored. | |
| 3521 | |
| 3522 Emacs Lisp's coding systems also carry information about format of | |
| 3523 end-of-line in a value of property `eol-type'. If the value is | |
| 3524 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2 | |
| 3525 means CODING_EOL_CR. If it is not integer, it should be a vector | |
| 3526 of subsidiary coding systems of which property `eol-type' has one | |
| 3527 of the above values. | |
| 3528 | |
| 3529 */ | |
| 3530 | |
| 3531 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL | |
| 3532 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING | |
| 3533 is setup so that no conversion is necessary and return -1, else | |
| 3534 return 0. */ | |
| 3535 | |
| 3536 int | |
| 3537 setup_coding_system (coding_system, coding) | 4810 setup_coding_system (coding_system, coding) |
| 3538 Lisp_Object coding_system; | 4811 Lisp_Object coding_system; |
| 3539 struct coding_system *coding; | 4812 struct coding_system *coding; |
| 3540 { | 4813 { |
| 3541 Lisp_Object coding_spec, coding_type, eol_type, plist; | 4814 Lisp_Object attrs; |
| 4815 Lisp_Object eol_type; | |
| 4816 Lisp_Object coding_type; | |
| 3542 Lisp_Object val; | 4817 Lisp_Object val; |
| 3543 | 4818 |
| 3544 /* At first, zero clear all members. */ | |
| 3545 bzero (coding, sizeof (struct coding_system)); | |
| 3546 | |
| 3547 /* Initialize some fields required for all kinds of coding systems. */ | |
| 3548 coding->symbol = coding_system; | |
| 3549 coding->heading_ascii = -1; | |
| 3550 coding->post_read_conversion = coding->pre_write_conversion = Qnil; | |
| 3551 coding->composing = COMPOSITION_DISABLED; | |
| 3552 coding->cmp_data = NULL; | |
| 3553 | |
| 3554 if (NILP (coding_system)) | 4819 if (NILP (coding_system)) |
| 3555 goto label_invalid_coding_system; | 4820 coding_system = Qno_conversion; |
| 3556 | 4821 |
| 3557 coding_spec = Fget (coding_system, Qcoding_system); | 4822 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id); |
| 3558 | 4823 |
| 3559 if (!VECTORP (coding_spec) | 4824 attrs = CODING_ID_ATTRS (coding->id); |
| 3560 || XVECTOR (coding_spec)->size != 5 | 4825 eol_type = CODING_ID_EOL_TYPE (coding->id); |
| 3561 || !CONSP (XVECTOR (coding_spec)->contents[3])) | 4826 |
| 3562 goto label_invalid_coding_system; | 4827 coding->mode = 0; |
| 3563 | 4828 coding->head_ascii = -1; |
| 3564 eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type); | 4829 coding->common_flags |
| 4830 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0); | |
| 4831 if (! NILP (CODING_ATTR_POST_READ (attrs))) | |
| 4832 coding->common_flags |= CODING_REQUIRE_DECODING_MASK; | |
| 4833 if (! NILP (CODING_ATTR_PRE_WRITE (attrs))) | |
| 4834 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK; | |
| 4835 if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs))) | |
| 4836 coding->common_flags |= CODING_FOR_UNIBYTE_MASK; | |
| 4837 | |
| 4838 val = CODING_ATTR_SAFE_CHARSETS (attrs); | |
| 4839 coding->max_charset_id = SCHARS (val) - 1; | |
| 4840 coding->safe_charsets = (char *) SDATA (val); | |
| 4841 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs)); | |
| 4842 | |
| 4843 coding_type = CODING_ATTR_TYPE (attrs); | |
| 4844 if (EQ (coding_type, Qundecided)) | |
| 4845 { | |
| 4846 coding->detector = NULL; | |
| 4847 coding->decoder = decode_coding_raw_text; | |
| 4848 coding->encoder = encode_coding_raw_text; | |
| 4849 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; | |
| 4850 } | |
| 4851 else if (EQ (coding_type, Qiso_2022)) | |
| 4852 { | |
| 4853 int i; | |
| 4854 int flags = XINT (AREF (attrs, coding_attr_iso_flags)); | |
| 4855 | |
| 4856 /* Invoke graphic register 0 to plane 0. */ | |
| 4857 CODING_ISO_INVOCATION (coding, 0) = 0; | |
| 4858 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */ | |
| 4859 CODING_ISO_INVOCATION (coding, 1) | |
| 4860 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1); | |
| 4861 /* Setup the initial status of designation. */ | |
| 4862 for (i = 0; i < 4; i++) | |
| 4863 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i); | |
| 4864 /* Not single shifting initially. */ | |
| 4865 CODING_ISO_SINGLE_SHIFTING (coding) = 0; | |
| 4866 /* Beginning of buffer should also be regarded as bol. */ | |
| 4867 CODING_ISO_BOL (coding) = 1; | |
| 4868 coding->detector = detect_coding_iso_2022; | |
| 4869 coding->decoder = decode_coding_iso_2022; | |
| 4870 coding->encoder = encode_coding_iso_2022; | |
| 4871 if (flags & CODING_ISO_FLAG_SAFE) | |
| 4872 coding->mode |= CODING_MODE_SAFE_ENCODING; | |
| 4873 coding->common_flags | |
| 4874 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK | |
| 4875 | CODING_REQUIRE_FLUSHING_MASK); | |
| 4876 if (flags & CODING_ISO_FLAG_COMPOSITION) | |
| 4877 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK; | |
| 4878 if (flags & CODING_ISO_FLAG_DESIGNATION) | |
| 4879 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK; | |
| 4880 if (flags & CODING_ISO_FLAG_FULL_SUPPORT) | |
| 4881 { | |
| 4882 setup_iso_safe_charsets (attrs); | |
| 4883 val = CODING_ATTR_SAFE_CHARSETS (attrs); | |
| 4884 coding->max_charset_id = SCHARS (val) - 1; | |
| 4885 coding->safe_charsets = (char *) SDATA (val); | |
| 4886 } | |
| 4887 CODING_ISO_FLAGS (coding) = flags; | |
| 4888 } | |
| 4889 else if (EQ (coding_type, Qcharset)) | |
| 4890 { | |
| 4891 coding->detector = detect_coding_charset; | |
| 4892 coding->decoder = decode_coding_charset; | |
| 4893 coding->encoder = encode_coding_charset; | |
| 4894 coding->common_flags | |
| 4895 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | |
| 4896 } | |
| 4897 else if (EQ (coding_type, Qutf_8)) | |
| 4898 { | |
| 4899 coding->detector = detect_coding_utf_8; | |
| 4900 coding->decoder = decode_coding_utf_8; | |
| 4901 coding->encoder = encode_coding_utf_8; | |
| 4902 coding->common_flags | |
| 4903 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | |
| 4904 } | |
| 4905 else if (EQ (coding_type, Qutf_16)) | |
| 4906 { | |
| 4907 val = AREF (attrs, coding_attr_utf_16_bom); | |
| 4908 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom | |
| 4909 : EQ (val, Qt) ? utf_16_with_bom | |
| 4910 : utf_16_without_bom); | |
| 4911 val = AREF (attrs, coding_attr_utf_16_endian); | |
| 4912 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian | |
| 4913 : utf_16_little_endian); | |
| 4914 CODING_UTF_16_SURROGATE (coding) = 0; | |
| 4915 coding->detector = detect_coding_utf_16; | |
| 4916 coding->decoder = decode_coding_utf_16; | |
| 4917 coding->encoder = encode_coding_utf_16; | |
| 4918 coding->common_flags | |
| 4919 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | |
| 4920 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom) | |
| 4921 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; | |
| 4922 } | |
| 4923 else if (EQ (coding_type, Qccl)) | |
| 4924 { | |
| 4925 coding->detector = detect_coding_ccl; | |
| 4926 coding->decoder = decode_coding_ccl; | |
| 4927 coding->encoder = encode_coding_ccl; | |
| 4928 coding->common_flags | |
| 4929 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK | |
| 4930 | CODING_REQUIRE_FLUSHING_MASK); | |
| 4931 } | |
| 4932 else if (EQ (coding_type, Qemacs_mule)) | |
| 4933 { | |
| 4934 coding->detector = detect_coding_emacs_mule; | |
| 4935 coding->decoder = decode_coding_emacs_mule; | |
| 4936 coding->encoder = encode_coding_emacs_mule; | |
| 4937 coding->common_flags | |
| 4938 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | |
| 4939 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full)) | |
| 4940 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list)) | |
| 4941 { | |
| 4942 Lisp_Object tail, safe_charsets; | |
| 4943 int max_charset_id = 0; | |
| 4944 | |
| 4945 for (tail = Vemacs_mule_charset_list; CONSP (tail); | |
| 4946 tail = XCDR (tail)) | |
| 4947 if (max_charset_id < XFASTINT (XCAR (tail))) | |
| 4948 max_charset_id = XFASTINT (XCAR (tail)); | |
| 4949 safe_charsets = Fmake_string (make_number (max_charset_id + 1), | |
| 4950 make_number (255)); | |
| 4951 for (tail = Vemacs_mule_charset_list; CONSP (tail); | |
| 4952 tail = XCDR (tail)) | |
| 4953 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0); | |
| 4954 coding->max_charset_id = max_charset_id; | |
| 4955 coding->safe_charsets = (char *) SDATA (safe_charsets); | |
| 4956 } | |
| 4957 } | |
| 4958 else if (EQ (coding_type, Qshift_jis)) | |
| 4959 { | |
| 4960 coding->detector = detect_coding_sjis; | |
| 4961 coding->decoder = decode_coding_sjis; | |
| 4962 coding->encoder = encode_coding_sjis; | |
| 4963 coding->common_flags | |
| 4964 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | |
| 4965 } | |
| 4966 else if (EQ (coding_type, Qbig5)) | |
| 4967 { | |
| 4968 coding->detector = detect_coding_big5; | |
| 4969 coding->decoder = decode_coding_big5; | |
| 4970 coding->encoder = encode_coding_big5; | |
| 4971 coding->common_flags | |
| 4972 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | |
| 4973 } | |
| 4974 else /* EQ (coding_type, Qraw_text) */ | |
| 4975 { | |
| 4976 coding->detector = NULL; | |
| 4977 coding->decoder = decode_coding_raw_text; | |
| 4978 coding->encoder = encode_coding_raw_text; | |
| 4979 } | |
| 4980 | |
| 4981 return; | |
| 4982 } | |
| 4983 | |
| 4984 /* Return raw-text or one of its subsidiaries that has the same | |
| 4985 eol_type as CODING-SYSTEM. */ | |
| 4986 | |
| 4987 Lisp_Object | |
| 4988 raw_text_coding_system (coding_system) | |
| 4989 Lisp_Object coding_system; | |
| 4990 { | |
| 4991 Lisp_Object spec, attrs; | |
| 4992 Lisp_Object eol_type, raw_text_eol_type; | |
| 4993 | |
| 4994 if (NILP (coding_system)) | |
| 4995 return Qraw_text; | |
| 4996 spec = CODING_SYSTEM_SPEC (coding_system); | |
| 4997 attrs = AREF (spec, 0); | |
| 4998 | |
| 4999 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text)) | |
| 5000 return coding_system; | |
| 5001 | |
| 5002 eol_type = AREF (spec, 2); | |
| 3565 if (VECTORP (eol_type)) | 5003 if (VECTORP (eol_type)) |
| 3566 { | 5004 return Qraw_text; |
| 3567 coding->eol_type = CODING_EOL_UNDECIDED; | 5005 spec = CODING_SYSTEM_SPEC (Qraw_text); |
| 3568 coding->common_flags = CODING_REQUIRE_DETECTION_MASK; | 5006 raw_text_eol_type = AREF (spec, 2); |
| 3569 } | 5007 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0) |
| 3570 else if (XFASTINT (eol_type) == 1) | 5008 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1) |
| 3571 { | 5009 : AREF (raw_text_eol_type, 2)); |
| 3572 coding->eol_type = CODING_EOL_CRLF; | 5010 } |
| 3573 coding->common_flags | 5011 |
| 3574 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; | 5012 |
| 3575 } | 5013 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT |
| 3576 else if (XFASTINT (eol_type) == 2) | 5014 does, return one of the subsidiary that has the same eol-spec as |
| 3577 { | 5015 PARENT. Otherwise, return CODING_SYSTEM. */ |
| 3578 coding->eol_type = CODING_EOL_CR; | 5016 |
| 3579 coding->common_flags | 5017 Lisp_Object |
| 3580 = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; | 5018 coding_inherit_eol_type (coding_system, parent) |
| 3581 } | 5019 Lisp_Object coding_system, parent; |
| 3582 else | 5020 { |
| 3583 coding->eol_type = CODING_EOL_LF; | 5021 Lisp_Object spec, attrs, eol_type; |
| 3584 | 5022 |
| 3585 coding_type = XVECTOR (coding_spec)->contents[0]; | 5023 if (NILP (coding_system)) |
| 3586 /* Try short cut. */ | 5024 coding_system = Qraw_text; |
| 3587 if (SYMBOLP (coding_type)) | 5025 spec = CODING_SYSTEM_SPEC (coding_system); |
| 3588 { | 5026 attrs = AREF (spec, 0); |
| 3589 if (EQ (coding_type, Qt)) | 5027 eol_type = AREF (spec, 2); |
| 3590 { | 5028 if (VECTORP (eol_type) |
| 3591 coding->type = coding_type_undecided; | 5029 && ! NILP (parent)) |
| 3592 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; | 5030 { |
| 3593 } | 5031 Lisp_Object parent_spec; |
| 3594 else | 5032 Lisp_Object parent_eol_type; |
| 3595 coding->type = coding_type_no_conversion; | 5033 |
| 3596 /* Initialize this member. Any thing other than | 5034 parent_spec |
| 3597 CODING_CATEGORY_IDX_UTF_16_BE and | 5035 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system); |
| 3598 CODING_CATEGORY_IDX_UTF_16_LE are ok because they have | 5036 parent_eol_type = AREF (parent_spec, 2); |
| 3599 special treatment in detect_eol. */ | 5037 if (EQ (parent_eol_type, Qunix)) |
| 3600 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE; | 5038 coding_system = AREF (eol_type, 0); |
| 3601 | 5039 else if (EQ (parent_eol_type, Qdos)) |
| 3602 return 0; | 5040 coding_system = AREF (eol_type, 1); |
| 3603 } | 5041 else if (EQ (parent_eol_type, Qmac)) |
| 3604 | 5042 coding_system = AREF (eol_type, 2); |
| 3605 /* Get values of coding system properties: | 5043 } |
| 3606 `post-read-conversion', `pre-write-conversion', | 5044 return coding_system; |
| 3607 `translation-table-for-decode', `translation-table-for-encode'. */ | |
| 3608 plist = XVECTOR (coding_spec)->contents[3]; | |
| 3609 /* Pre & post conversion functions should be disabled if | |
| 3610 inhibit_pre_post_conversion is nonzero. This is the case that a code | |
| 3611 conversion function is called while those functions are running. */ | |
| 3612 if (! inhibit_pre_post_conversion) | |
| 3613 { | |
| 3614 coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion); | |
| 3615 coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion); | |
| 3616 } | |
| 3617 val = Fplist_get (plist, Qtranslation_table_for_decode); | |
| 3618 if (SYMBOLP (val)) | |
| 3619 val = Fget (val, Qtranslation_table_for_decode); | |
| 3620 coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil; | |
| 3621 val = Fplist_get (plist, Qtranslation_table_for_encode); | |
| 3622 if (SYMBOLP (val)) | |
| 3623 val = Fget (val, Qtranslation_table_for_encode); | |
| 3624 coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil; | |
| 3625 val = Fplist_get (plist, Qcoding_category); | |
| 3626 if (!NILP (val)) | |
| 3627 { | |
| 3628 val = Fget (val, Qcoding_category_index); | |
| 3629 if (INTEGERP (val)) | |
| 3630 coding->category_idx = XINT (val); | |
| 3631 else | |
| 3632 goto label_invalid_coding_system; | |
| 3633 } | |
| 3634 else | |
| 3635 goto label_invalid_coding_system; | |
| 3636 | |
| 3637 /* If the coding system has non-nil `composition' property, enable | |
| 3638 composition handling. */ | |
| 3639 val = Fplist_get (plist, Qcomposition); | |
| 3640 if (!NILP (val)) | |
| 3641 coding->composing = COMPOSITION_NO; | |
| 3642 | |
| 3643 switch (XFASTINT (coding_type)) | |
| 3644 { | |
| 3645 case 0: | |
| 3646 coding->type = coding_type_emacs_mule; | |
| 3647 coding->common_flags | |
| 3648 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; | |
| 3649 if (!NILP (coding->post_read_conversion)) | |
| 3650 coding->common_flags |= CODING_REQUIRE_DECODING_MASK; | |
| 3651 if (!NILP (coding->pre_write_conversion)) | |
| 3652 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK; | |
| 3653 break; | |
| 3654 | |
| 3655 case 1: | |
| 3656 coding->type = coding_type_sjis; | |
| 3657 coding->common_flags | |
| 3658 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; | |
| 3659 break; | |
| 3660 | |
| 3661 case 2: | |
| 3662 coding->type = coding_type_iso2022; | |
| 3663 coding->common_flags | |
| 3664 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; | |
| 3665 { | |
| 3666 Lisp_Object val, temp; | |
| 3667 Lisp_Object *flags; | |
| 3668 int i, charset, reg_bits = 0; | |
| 3669 | |
| 3670 val = XVECTOR (coding_spec)->contents[4]; | |
| 3671 | |
| 3672 if (!VECTORP (val) || XVECTOR (val)->size != 32) | |
| 3673 goto label_invalid_coding_system; | |
| 3674 | |
| 3675 flags = XVECTOR (val)->contents; | |
| 3676 coding->flags | |
| 3677 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM) | |
| 3678 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL) | |
| 3679 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL) | |
| 3680 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS) | |
| 3681 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT) | |
| 3682 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT) | |
| 3683 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN) | |
| 3684 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS) | |
| 3685 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION) | |
| 3686 | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL) | |
| 3687 | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL) | |
| 3688 | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE) | |
| 3689 | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA) | |
| 3690 ); | |
| 3691 | |
| 3692 /* Invoke graphic register 0 to plane 0. */ | |
| 3693 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; | |
| 3694 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */ | |
| 3695 CODING_SPEC_ISO_INVOCATION (coding, 1) | |
| 3696 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1); | |
| 3697 /* Not single shifting at first. */ | |
| 3698 CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; | |
| 3699 /* Beginning of buffer should also be regarded as bol. */ | |
| 3700 CODING_SPEC_ISO_BOL (coding) = 1; | |
| 3701 | |
| 3702 for (charset = 0; charset <= MAX_CHARSET; charset++) | |
| 3703 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255; | |
| 3704 val = Vcharset_revision_alist; | |
| 3705 while (CONSP (val)) | |
| 3706 { | |
| 3707 charset = get_charset_id (Fcar_safe (XCAR (val))); | |
| 3708 if (charset >= 0 | |
| 3709 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp)) | |
| 3710 && (i = XINT (temp), (i >= 0 && (i + '@') < 128))) | |
| 3711 CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i; | |
| 3712 val = XCDR (val); | |
| 3713 } | |
| 3714 | |
| 3715 /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations. | |
| 3716 FLAGS[REG] can be one of below: | |
| 3717 integer CHARSET: CHARSET occupies register I, | |
| 3718 t: designate nothing to REG initially, but can be used | |
| 3719 by any charsets, | |
| 3720 list of integer, nil, or t: designate the first | |
| 3721 element (if integer) to REG initially, the remaining | |
| 3722 elements (if integer) is designated to REG on request, | |
| 3723 if an element is t, REG can be used by any charsets, | |
| 3724 nil: REG is never used. */ | |
| 3725 for (charset = 0; charset <= MAX_CHARSET; charset++) | |
| 3726 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) | |
| 3727 = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION; | |
| 3728 for (i = 0; i < 4; i++) | |
| 3729 { | |
| 3730 if ((INTEGERP (flags[i]) | |
| 3731 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))) | |
| 3732 || (charset = get_charset_id (flags[i])) >= 0) | |
| 3733 { | |
| 3734 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset; | |
| 3735 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i; | |
| 3736 } | |
| 3737 else if (EQ (flags[i], Qt)) | |
| 3738 { | |
| 3739 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1; | |
| 3740 reg_bits |= 1 << i; | |
| 3741 coding->flags |= CODING_FLAG_ISO_DESIGNATION; | |
| 3742 } | |
| 3743 else if (CONSP (flags[i])) | |
| 3744 { | |
| 3745 Lisp_Object tail; | |
| 3746 tail = flags[i]; | |
| 3747 | |
| 3748 coding->flags |= CODING_FLAG_ISO_DESIGNATION; | |
| 3749 if ((INTEGERP (XCAR (tail)) | |
| 3750 && (charset = XINT (XCAR (tail)), | |
| 3751 CHARSET_VALID_P (charset))) | |
| 3752 || (charset = get_charset_id (XCAR (tail))) >= 0) | |
| 3753 { | |
| 3754 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset; | |
| 3755 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i; | |
| 3756 } | |
| 3757 else | |
| 3758 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1; | |
| 3759 tail = XCDR (tail); | |
| 3760 while (CONSP (tail)) | |
| 3761 { | |
| 3762 if ((INTEGERP (XCAR (tail)) | |
| 3763 && (charset = XINT (XCAR (tail)), | |
| 3764 CHARSET_VALID_P (charset))) | |
| 3765 || (charset = get_charset_id (XCAR (tail))) >= 0) | |
| 3766 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) | |
| 3767 = i; | |
| 3768 else if (EQ (XCAR (tail), Qt)) | |
| 3769 reg_bits |= 1 << i; | |
| 3770 tail = XCDR (tail); | |
| 3771 } | |
| 3772 } | |
| 3773 else | |
| 3774 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1; | |
| 3775 | |
| 3776 CODING_SPEC_ISO_DESIGNATION (coding, i) | |
| 3777 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i); | |
| 3778 } | |
| 3779 | |
| 3780 if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)) | |
| 3781 { | |
| 3782 /* REG 1 can be used only by locking shift in 7-bit env. */ | |
| 3783 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) | |
| 3784 reg_bits &= ~2; | |
| 3785 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)) | |
| 3786 /* Without any shifting, only REG 0 and 1 can be used. */ | |
| 3787 reg_bits &= 3; | |
| 3788 } | |
| 3789 | |
| 3790 if (reg_bits) | |
| 3791 for (charset = 0; charset <= MAX_CHARSET; charset++) | |
| 3792 { | |
| 3793 if (CHARSET_DEFINED_P (charset) | |
| 3794 && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) | |
| 3795 == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)) | |
| 3796 { | |
| 3797 /* There exist some default graphic registers to be | |
| 3798 used by CHARSET. */ | |
| 3799 | |
| 3800 /* We had better avoid designating a charset of | |
| 3801 CHARS96 to REG 0 as far as possible. */ | |
| 3802 if (CHARSET_CHARS (charset) == 96) | |
| 3803 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) | |
| 3804 = (reg_bits & 2 | |
| 3805 ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0))); | |
| 3806 else | |
| 3807 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) | |
| 3808 = (reg_bits & 1 | |
| 3809 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3))); | |
| 3810 } | |
| 3811 } | |
| 3812 } | |
| 3813 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK; | |
| 3814 coding->spec.iso2022.last_invalid_designation_register = -1; | |
| 3815 break; | |
| 3816 | |
| 3817 case 3: | |
| 3818 coding->type = coding_type_big5; | |
| 3819 coding->common_flags | |
| 3820 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; | |
| 3821 coding->flags | |
| 3822 = (NILP (XVECTOR (coding_spec)->contents[4]) | |
| 3823 ? CODING_FLAG_BIG5_HKU | |
| 3824 : CODING_FLAG_BIG5_ETEN); | |
| 3825 break; | |
| 3826 | |
| 3827 case 4: | |
| 3828 coding->type = coding_type_ccl; | |
| 3829 coding->common_flags | |
| 3830 |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK; | |
| 3831 { | |
| 3832 val = XVECTOR (coding_spec)->contents[4]; | |
| 3833 if (! CONSP (val) | |
| 3834 || setup_ccl_program (&(coding->spec.ccl.decoder), | |
| 3835 XCAR (val)) < 0 | |
| 3836 || setup_ccl_program (&(coding->spec.ccl.encoder), | |
| 3837 XCDR (val)) < 0) | |
| 3838 goto label_invalid_coding_system; | |
| 3839 | |
| 3840 bzero (coding->spec.ccl.valid_codes, 256); | |
| 3841 val = Fplist_get (plist, Qvalid_codes); | |
| 3842 if (CONSP (val)) | |
| 3843 { | |
| 3844 Lisp_Object this; | |
| 3845 | |
| 3846 for (; CONSP (val); val = XCDR (val)) | |
| 3847 { | |
| 3848 this = XCAR (val); | |
| 3849 if (INTEGERP (this) | |
| 3850 && XINT (this) >= 0 && XINT (this) < 256) | |
| 3851 coding->spec.ccl.valid_codes[XINT (this)] = 1; | |
| 3852 else if (CONSP (this) | |
| 3853 && INTEGERP (XCAR (this)) | |
| 3854 && INTEGERP (XCDR (this))) | |
| 3855 { | |
| 3856 int start = XINT (XCAR (this)); | |
| 3857 int end = XINT (XCDR (this)); | |
| 3858 | |
| 3859 if (start >= 0 && start <= end && end < 256) | |
| 3860 while (start <= end) | |
| 3861 coding->spec.ccl.valid_codes[start++] = 1; | |
| 3862 } | |
| 3863 } | |
| 3864 } | |
| 3865 } | |
| 3866 coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK; | |
| 3867 coding->spec.ccl.cr_carryover = 0; | |
| 3868 coding->spec.ccl.eight_bit_carryover[0] = 0; | |
| 3869 break; | |
| 3870 | |
| 3871 case 5: | |
| 3872 coding->type = coding_type_raw_text; | |
| 3873 break; | |
| 3874 | |
| 3875 default: | |
| 3876 goto label_invalid_coding_system; | |
| 3877 } | |
| 3878 return 0; | |
| 3879 | |
| 3880 label_invalid_coding_system: | |
| 3881 coding->type = coding_type_no_conversion; | |
| 3882 coding->category_idx = CODING_CATEGORY_IDX_BINARY; | |
| 3883 coding->common_flags = 0; | |
| 3884 coding->eol_type = CODING_EOL_LF; | |
| 3885 coding->pre_write_conversion = coding->post_read_conversion = Qnil; | |
| 3886 return -1; | |
| 3887 } | |
| 3888 | |
| 3889 /* Free memory blocks allocated for storing composition information. */ | |
| 3890 | |
| 3891 void | |
| 3892 coding_free_composition_data (coding) | |
| 3893 struct coding_system *coding; | |
| 3894 { | |
| 3895 struct composition_data *cmp_data = coding->cmp_data, *next; | |
| 3896 | |
| 3897 if (!cmp_data) | |
| 3898 return; | |
| 3899 /* Memory blocks are chained. At first, rewind to the first, then, | |
| 3900 free blocks one by one. */ | |
| 3901 while (cmp_data->prev) | |
| 3902 cmp_data = cmp_data->prev; | |
| 3903 while (cmp_data) | |
| 3904 { | |
| 3905 next = cmp_data->next; | |
| 3906 xfree (cmp_data); | |
| 3907 cmp_data = next; | |
| 3908 } | |
| 3909 coding->cmp_data = NULL; | |
| 3910 } | |
| 3911 | |
| 3912 /* Set `char_offset' member of all memory blocks pointed by | |
| 3913 coding->cmp_data to POS. */ | |
| 3914 | |
| 3915 void | |
| 3916 coding_adjust_composition_offset (coding, pos) | |
| 3917 struct coding_system *coding; | |
| 3918 int pos; | |
| 3919 { | |
| 3920 struct composition_data *cmp_data; | |
| 3921 | |
| 3922 for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next) | |
| 3923 cmp_data->char_offset = pos; | |
| 3924 } | |
| 3925 | |
| 3926 /* Setup raw-text or one of its subsidiaries in the structure | |
| 3927 coding_system CODING according to the already setup value eol_type | |
| 3928 in CODING. CODING should be setup for some coding system in | |
| 3929 advance. */ | |
| 3930 | |
| 3931 void | |
| 3932 setup_raw_text_coding_system (coding) | |
| 3933 struct coding_system *coding; | |
| 3934 { | |
| 3935 if (coding->type != coding_type_raw_text) | |
| 3936 { | |
| 3937 coding->symbol = Qraw_text; | |
| 3938 coding->type = coding_type_raw_text; | |
| 3939 if (coding->eol_type != CODING_EOL_UNDECIDED) | |
| 3940 { | |
| 3941 Lisp_Object subsidiaries; | |
| 3942 subsidiaries = Fget (Qraw_text, Qeol_type); | |
| 3943 | |
| 3944 if (VECTORP (subsidiaries) | |
| 3945 && XVECTOR (subsidiaries)->size == 3) | |
| 3946 coding->symbol | |
| 3947 = XVECTOR (subsidiaries)->contents[coding->eol_type]; | |
| 3948 } | |
| 3949 setup_coding_system (coding->symbol, coding); | |
| 3950 } | |
| 3951 return; | |
| 3952 } | 5045 } |
| 3953 | 5046 |
| 3954 /* Emacs has a mechanism to automatically detect a coding system if it | 5047 /* Emacs has a mechanism to automatically detect a coding system if it |
| 3955 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But, | 5048 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But, |
| 3956 it's impossible to distinguish some coding systems accurately | 5049 it's impossible to distinguish some coding systems accurately |
| 3999 symbol) `japanese-iso-8bit' by default. | 5092 symbol) `japanese-iso-8bit' by default. |
| 4000 | 5093 |
| 4001 o coding-category-iso-7-else | 5094 o coding-category-iso-7-else |
| 4002 | 5095 |
| 4003 The category for a coding system which has the same code range | 5096 The category for a coding system which has the same code range |
| 4004 as ISO2022 of 7-bit environment but uses locking shift or | 5097 as ISO2022 of 7-bit environment but uses locking shift or |
| 4005 single shift functions. Assigned the coding-system (Lisp | 5098 single shift functions. Assigned the coding-system (Lisp |
| 4006 symbol) `iso-2022-7bit-lock' by default. | 5099 symbol) `iso-2022-7bit-lock' by default. |
| 4007 | 5100 |
| 4008 o coding-category-iso-8-else | 5101 o coding-category-iso-8-else |
| 4009 | 5102 |
| 4010 The category for a coding system which has the same code range | 5103 The category for a coding system which has the same code range |
| 4011 as ISO2022 of 8-bit environment but uses locking shift or | 5104 as ISO2022 of 8-bit environment but uses locking shift or |
| 4012 single shift functions. Assigned the coding-system (Lisp | 5105 single shift functions. Assigned the coding-system (Lisp |
| 4013 symbol) `iso-2022-8bit-ss2' by default. | 5106 symbol) `iso-2022-8bit-ss2' by default. |
| 4014 | 5107 |
| 4015 o coding-category-big5 | 5108 o coding-category-big5 |
| 4016 | 5109 |
| 4049 The category for a coding system not categorized in any of the | 5142 The category for a coding system not categorized in any of the |
| 4050 above. Assigned the coding-system (Lisp symbol) | 5143 above. Assigned the coding-system (Lisp symbol) |
| 4051 `no-conversion' by default. | 5144 `no-conversion' by default. |
| 4052 | 5145 |
| 4053 Each of them is a Lisp symbol and the value is an actual | 5146 Each of them is a Lisp symbol and the value is an actual |
| 4054 `coding-system' (this is also a Lisp symbol) assigned by a user. | 5147 `coding-system' (this is also a Lisp symbol) assigned by a user. |
| 4055 What Emacs does actually is to detect a category of coding system. | 5148 What Emacs does actually is to detect a category of coding system. |
| 4056 Then, it uses a `coding-system' assigned to it. If Emacs can't | 5149 Then, it uses a `coding-system' assigned to it. If Emacs can't |
| 4057 decide a single possible category, it selects a category of the | 5150 decide only one possible category, it selects a category of the |
| 4058 highest priority. Priorities of categories are also specified by a | 5151 highest priority. Priorities of categories are also specified by a |
| 4059 user in a Lisp variable `coding-category-list'. | 5152 user in a Lisp variable `coding-category-list'. |
| 4060 | 5153 |
| 4061 */ | 5154 */ |
| 4062 | 5155 |
| 4063 static | 5156 #define EOL_SEEN_NONE 0 |
| 4064 int ascii_skip_code[256]; | 5157 #define EOL_SEEN_LF 1 |
| 4065 | 5158 #define EOL_SEEN_CR 2 |
| 4066 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded. | 5159 #define EOL_SEEN_CRLF 4 |
| 4067 If it detects possible coding systems, return an integer in which | 5160 |
| 4068 appropriate flag bits are set. Flag bits are defined by macros | 5161 /* Detect how end-of-line of a text of length SRC_BYTES pointed by |
| 4069 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL, | 5162 SOURCE is encoded. If CATEGORY is one of |
| 4070 it should point the table `coding_priorities'. In that case, only | 5163 coding_category_utf_16_XXXX, assume that CR and LF are encoded by |
| 4071 the flag bit for a coding system of the highest priority is set in | 5164 two bytes, else they are encoded by one byte. |
| 4072 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the | 5165 |
| 4073 range 0x80..0x9F are in multibyte form. | 5166 Return one of EOL_SEEN_XXX. */ |
| 4074 | 5167 |
| 4075 How many ASCII characters are at the head is returned as *SKIP. */ | 5168 #define MAX_EOL_CHECK_COUNT 3 |
| 4076 | 5169 |
| 4077 static int | 5170 static int |
| 4078 detect_coding_mask (source, src_bytes, priorities, skip, multibytep) | 5171 detect_eol (source, src_bytes, category) |
| 4079 unsigned char *source; | 5172 unsigned char *source; |
| 4080 int src_bytes, *priorities, *skip; | 5173 EMACS_INT src_bytes; |
| 4081 int multibytep; | 5174 enum coding_category category; |
| 4082 { | |
| 4083 register unsigned char c; | |
| 4084 unsigned char *src = source, *src_end = source + src_bytes; | |
| 4085 unsigned int mask, utf16_examined_p, iso2022_examined_p; | |
| 4086 int i; | |
| 4087 | |
| 4088 /* At first, skip all ASCII characters and control characters except | |
| 4089 for three ISO2022 specific control characters. */ | |
| 4090 ascii_skip_code[ISO_CODE_SO] = 0; | |
| 4091 ascii_skip_code[ISO_CODE_SI] = 0; | |
| 4092 ascii_skip_code[ISO_CODE_ESC] = 0; | |
| 4093 | |
| 4094 label_loop_detect_coding: | |
| 4095 while (src < src_end && ascii_skip_code[*src]) src++; | |
| 4096 *skip = src - source; | |
| 4097 | |
| 4098 if (src >= src_end) | |
| 4099 /* We found nothing other than ASCII. There's nothing to do. */ | |
| 4100 return 0; | |
| 4101 | |
| 4102 c = *src; | |
| 4103 /* The text seems to be encoded in some multilingual coding system. | |
| 4104 Now, try to find in which coding system the text is encoded. */ | |
| 4105 if (c < 0x80) | |
| 4106 { | |
| 4107 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ | |
| 4108 /* C is an ISO2022 specific control code of C0. */ | |
| 4109 mask = detect_coding_iso2022 (src, src_end, multibytep); | |
| 4110 if (mask == 0) | |
| 4111 { | |
| 4112 /* No valid ISO2022 code follows C. Try again. */ | |
| 4113 src++; | |
| 4114 if (c == ISO_CODE_ESC) | |
| 4115 ascii_skip_code[ISO_CODE_ESC] = 1; | |
| 4116 else | |
| 4117 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1; | |
| 4118 goto label_loop_detect_coding; | |
| 4119 } | |
| 4120 if (priorities) | |
| 4121 { | |
| 4122 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | |
| 4123 { | |
| 4124 if (mask & priorities[i]) | |
| 4125 return priorities[i]; | |
| 4126 } | |
| 4127 return CODING_CATEGORY_MASK_RAW_TEXT; | |
| 4128 } | |
| 4129 } | |
| 4130 else | |
| 4131 { | |
| 4132 int try; | |
| 4133 | |
| 4134 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) | |
| 4135 c = src[1] - 0x20; | |
| 4136 | |
| 4137 if (c < 0xA0) | |
| 4138 { | |
| 4139 /* C is the first byte of SJIS character code, | |
| 4140 or a leading-code of Emacs' internal format (emacs-mule), | |
| 4141 or the first byte of UTF-16. */ | |
| 4142 try = (CODING_CATEGORY_MASK_SJIS | |
| 4143 | CODING_CATEGORY_MASK_EMACS_MULE | |
| 4144 | CODING_CATEGORY_MASK_UTF_16_BE | |
| 4145 | CODING_CATEGORY_MASK_UTF_16_LE); | |
| 4146 | |
| 4147 /* Or, if C is a special latin extra code, | |
| 4148 or is an ISO2022 specific control code of C1 (SS2 or SS3), | |
| 4149 or is an ISO2022 control-sequence-introducer (CSI), | |
| 4150 we should also consider the possibility of ISO2022 codings. */ | |
| 4151 if ((VECTORP (Vlatin_extra_code_table) | |
| 4152 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | |
| 4153 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) | |
| 4154 || (c == ISO_CODE_CSI | |
| 4155 && (src < src_end | |
| 4156 && (*src == ']' | |
| 4157 || ((*src == '0' || *src == '1' || *src == '2') | |
| 4158 && src + 1 < src_end | |
| 4159 && src[1] == ']'))))) | |
| 4160 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE | |
| 4161 | CODING_CATEGORY_MASK_ISO_8BIT); | |
| 4162 } | |
| 4163 else | |
| 4164 /* C is a character of ISO2022 in graphic plane right, | |
| 4165 or a SJIS's 1-byte character code (i.e. JISX0201), | |
| 4166 or the first byte of BIG5's 2-byte code, | |
| 4167 or the first byte of UTF-8/16. */ | |
| 4168 try = (CODING_CATEGORY_MASK_ISO_8_ELSE | |
| 4169 | CODING_CATEGORY_MASK_ISO_8BIT | |
| 4170 | CODING_CATEGORY_MASK_SJIS | |
| 4171 | CODING_CATEGORY_MASK_BIG5 | |
| 4172 | CODING_CATEGORY_MASK_UTF_8 | |
| 4173 | CODING_CATEGORY_MASK_UTF_16_BE | |
| 4174 | CODING_CATEGORY_MASK_UTF_16_LE); | |
| 4175 | |
| 4176 /* Or, we may have to consider the possibility of CCL. */ | |
| 4177 if (coding_system_table[CODING_CATEGORY_IDX_CCL] | |
| 4178 && (coding_system_table[CODING_CATEGORY_IDX_CCL] | |
| 4179 ->spec.ccl.valid_codes)[c]) | |
| 4180 try |= CODING_CATEGORY_MASK_CCL; | |
| 4181 | |
| 4182 mask = 0; | |
| 4183 utf16_examined_p = iso2022_examined_p = 0; | |
| 4184 if (priorities) | |
| 4185 { | |
| 4186 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | |
| 4187 { | |
| 4188 if (!iso2022_examined_p | |
| 4189 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) | |
| 4190 { | |
| 4191 mask |= detect_coding_iso2022 (src, src_end, multibytep); | |
| 4192 iso2022_examined_p = 1; | |
| 4193 } | |
| 4194 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) | |
| 4195 mask |= detect_coding_sjis (src, src_end, multibytep); | |
| 4196 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) | |
| 4197 mask |= detect_coding_utf_8 (src, src_end, multibytep); | |
| 4198 else if (!utf16_examined_p | |
| 4199 && (priorities[i] & try & | |
| 4200 CODING_CATEGORY_MASK_UTF_16_BE_LE)) | |
| 4201 { | |
| 4202 mask |= detect_coding_utf_16 (src, src_end, multibytep); | |
| 4203 utf16_examined_p = 1; | |
| 4204 } | |
| 4205 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5) | |
| 4206 mask |= detect_coding_big5 (src, src_end, multibytep); | |
| 4207 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) | |
| 4208 mask |= detect_coding_emacs_mule (src, src_end, multibytep); | |
| 4209 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) | |
| 4210 mask |= detect_coding_ccl (src, src_end, multibytep); | |
| 4211 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) | |
| 4212 mask |= CODING_CATEGORY_MASK_RAW_TEXT; | |
| 4213 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) | |
| 4214 mask |= CODING_CATEGORY_MASK_BINARY; | |
| 4215 if (mask & priorities[i]) | |
| 4216 return priorities[i]; | |
| 4217 } | |
| 4218 return CODING_CATEGORY_MASK_RAW_TEXT; | |
| 4219 } | |
| 4220 if (try & CODING_CATEGORY_MASK_ISO) | |
| 4221 mask |= detect_coding_iso2022 (src, src_end, multibytep); | |
| 4222 if (try & CODING_CATEGORY_MASK_SJIS) | |
| 4223 mask |= detect_coding_sjis (src, src_end, multibytep); | |
| 4224 if (try & CODING_CATEGORY_MASK_BIG5) | |
| 4225 mask |= detect_coding_big5 (src, src_end, multibytep); | |
| 4226 if (try & CODING_CATEGORY_MASK_UTF_8) | |
| 4227 mask |= detect_coding_utf_8 (src, src_end, multibytep); | |
| 4228 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE) | |
| 4229 mask |= detect_coding_utf_16 (src, src_end, multibytep); | |
| 4230 if (try & CODING_CATEGORY_MASK_EMACS_MULE) | |
| 4231 mask |= detect_coding_emacs_mule (src, src_end, multibytep); | |
| 4232 if (try & CODING_CATEGORY_MASK_CCL) | |
| 4233 mask |= detect_coding_ccl (src, src_end, multibytep); | |
| 4234 } | |
| 4235 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY); | |
| 4236 } | |
| 4237 | |
| 4238 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded. | |
| 4239 The information of the detected coding system is set in CODING. */ | |
| 4240 | |
| 4241 void | |
| 4242 detect_coding (coding, src, src_bytes) | |
| 4243 struct coding_system *coding; | |
| 4244 const unsigned char *src; | |
| 4245 int src_bytes; | |
| 4246 { | |
| 4247 unsigned int idx; | |
| 4248 int skip, mask; | |
| 4249 Lisp_Object val; | |
| 4250 | |
| 4251 val = Vcoding_category_list; | |
| 4252 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip, | |
| 4253 coding->src_multibyte); | |
| 4254 coding->heading_ascii = skip; | |
| 4255 | |
| 4256 if (!mask) return; | |
| 4257 | |
| 4258 /* We found a single coding system of the highest priority in MASK. */ | |
| 4259 idx = 0; | |
| 4260 while (mask && ! (mask & 1)) mask >>= 1, idx++; | |
| 4261 if (! mask) | |
| 4262 idx = CODING_CATEGORY_IDX_RAW_TEXT; | |
| 4263 | |
| 4264 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]); | |
| 4265 | |
| 4266 if (coding->eol_type != CODING_EOL_UNDECIDED) | |
| 4267 { | |
| 4268 Lisp_Object tmp; | |
| 4269 | |
| 4270 tmp = Fget (val, Qeol_type); | |
| 4271 if (VECTORP (tmp)) | |
| 4272 val = XVECTOR (tmp)->contents[coding->eol_type]; | |
| 4273 } | |
| 4274 | |
| 4275 /* Setup this new coding system while preserving some slots. */ | |
| 4276 { | |
| 4277 int src_multibyte = coding->src_multibyte; | |
| 4278 int dst_multibyte = coding->dst_multibyte; | |
| 4279 | |
| 4280 setup_coding_system (val, coding); | |
| 4281 coding->src_multibyte = src_multibyte; | |
| 4282 coding->dst_multibyte = dst_multibyte; | |
| 4283 coding->heading_ascii = skip; | |
| 4284 } | |
| 4285 } | |
| 4286 | |
| 4287 /* Detect how end-of-line of a text of length SRC_BYTES pointed by | |
| 4288 SOURCE is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF, | |
| 4289 CODING_EOL_CR, and CODING_EOL_UNDECIDED. | |
| 4290 | |
| 4291 How many non-eol characters are at the head is returned as *SKIP. */ | |
| 4292 | |
| 4293 #define MAX_EOL_CHECK_COUNT 3 | |
| 4294 | |
| 4295 static int | |
| 4296 detect_eol_type (source, src_bytes, skip) | |
| 4297 unsigned char *source; | |
| 4298 int src_bytes, *skip; | |
| 4299 { | 5175 { |
| 4300 unsigned char *src = source, *src_end = src + src_bytes; | 5176 unsigned char *src = source, *src_end = src + src_bytes; |
| 4301 unsigned char c; | 5177 unsigned char c; |
| 4302 int total = 0; /* How many end-of-lines are found so far. */ | 5178 int total = 0; |
| 4303 int eol_type = CODING_EOL_UNDECIDED; | 5179 int eol_seen = EOL_SEEN_NONE; |
| 4304 int this_eol_type; | 5180 |
| 4305 | 5181 if ((1 << category) & CATEGORY_MASK_UTF_16) |
| 4306 *skip = 0; | 5182 { |
| 4307 | 5183 int msb, lsb; |
| 4308 while (src < src_end && total < MAX_EOL_CHECK_COUNT) | 5184 |
| 4309 { | 5185 msb = category == (coding_category_utf_16_le |
| 4310 c = *src++; | 5186 | coding_category_utf_16_le_nosig); |
| 4311 if (c == '\n' || c == '\r') | 5187 lsb = 1 - msb; |
| 4312 { | 5188 |
| 4313 if (*skip == 0) | 5189 while (src + 1 < src_end) |
| 4314 *skip = src - 1 - source; | 5190 { |
| 4315 total++; | 5191 c = src[lsb]; |
| 4316 if (c == '\n') | 5192 if (src[msb] == 0 && (c == '\n' || c == '\r')) |
| 4317 this_eol_type = CODING_EOL_LF; | 5193 { |
| 4318 else if (src >= src_end || *src != '\n') | 5194 int this_eol; |
| 4319 this_eol_type = CODING_EOL_CR; | 5195 |
| 5196 if (c == '\n') | |
| 5197 this_eol = EOL_SEEN_LF; | |
| 5198 else if (src + 3 >= src_end | |
| 5199 || src[msb + 2] != 0 | |
| 5200 || src[lsb + 2] != '\n') | |
| 5201 this_eol = EOL_SEEN_CR; | |
| 5202 else | |
| 5203 this_eol = EOL_SEEN_CRLF; | |
| 5204 | |
| 5205 if (eol_seen == EOL_SEEN_NONE) | |
| 5206 /* This is the first end-of-line. */ | |
| 5207 eol_seen = this_eol; | |
| 5208 else if (eol_seen != this_eol) | |
| 5209 { | |
| 5210 /* The found type is different from what was found before. */ | |
| 5211 eol_seen = EOL_SEEN_LF; | |
| 5212 break; | |
| 5213 } | |
| 5214 if (++total == MAX_EOL_CHECK_COUNT) | |
| 5215 break; | |
| 5216 } | |
| 5217 src += 2; | |
| 5218 } | |
| 5219 } | |
| 5220 else | |
| 5221 { | |
| 5222 while (src < src_end) | |
| 5223 { | |
| 5224 c = *src++; | |
| 5225 if (c == '\n' || c == '\r') | |
| 5226 { | |
| 5227 int this_eol; | |
| 5228 | |
| 5229 if (c == '\n') | |
| 5230 this_eol = EOL_SEEN_LF; | |
| 5231 else if (src >= src_end || *src != '\n') | |
| 5232 this_eol = EOL_SEEN_CR; | |
| 5233 else | |
| 5234 this_eol = EOL_SEEN_CRLF, src++; | |
| 5235 | |
| 5236 if (eol_seen == EOL_SEEN_NONE) | |
| 5237 /* This is the first end-of-line. */ | |
| 5238 eol_seen = this_eol; | |
| 5239 else if (eol_seen != this_eol) | |
| 5240 { | |
| 5241 /* The found type is different from what was found before. */ | |
| 5242 eol_seen = EOL_SEEN_LF; | |
| 5243 break; | |
| 5244 } | |
| 5245 if (++total == MAX_EOL_CHECK_COUNT) | |
| 5246 break; | |
| 5247 } | |
| 5248 } | |
| 5249 } | |
| 5250 return eol_seen; | |
| 5251 } | |
| 5252 | |
| 5253 | |
| 5254 static void | |
| 5255 adjust_coding_eol_type (coding, eol_seen) | |
| 5256 struct coding_system *coding; | |
| 5257 int eol_seen; | |
| 5258 { | |
| 5259 Lisp_Object eol_type; | |
| 5260 | |
| 5261 eol_type = CODING_ID_EOL_TYPE (coding->id); | |
| 5262 if (eol_seen & EOL_SEEN_LF) | |
| 5263 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0)); | |
| 5264 else if (eol_seen & EOL_SEEN_CRLF) | |
| 5265 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1)); | |
| 5266 else if (eol_seen & EOL_SEEN_CR) | |
| 5267 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2)); | |
| 5268 } | |
| 5269 | |
| 5270 /* Detect how a text specified in CODING is encoded. If a coding | |
| 5271 system is detected, update fields of CODING by the detected coding | |
| 5272 system. */ | |
| 5273 | |
| 5274 void | |
| 5275 detect_coding (coding) | |
| 5276 struct coding_system *coding; | |
| 5277 { | |
| 5278 const unsigned char *src, *src_end; | |
| 5279 Lisp_Object attrs, coding_type; | |
| 5280 | |
| 5281 coding->consumed = coding->consumed_char = 0; | |
| 5282 coding->produced = coding->produced_char = 0; | |
| 5283 coding_set_source (coding); | |
| 5284 | |
| 5285 src_end = coding->source + coding->src_bytes; | |
| 5286 | |
| 5287 /* If we have not yet decided the text encoding type, detect it | |
| 5288 now. */ | |
| 5289 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) | |
| 5290 { | |
| 5291 int c, i; | |
| 5292 | |
| 5293 for (src = coding->source; src < src_end; src++) | |
| 5294 { | |
| 5295 c = *src; | |
| 5296 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC | |
| 5297 || c == ISO_CODE_SI | |
| 5298 || c == ISO_CODE_SO))) | |
| 5299 break; | |
| 5300 } | |
| 5301 coding->head_ascii = src - (coding->source + coding->consumed); | |
| 5302 | |
| 5303 if (coding->head_ascii < coding->src_bytes) | |
| 5304 { | |
| 5305 struct coding_detection_info detect_info; | |
| 5306 enum coding_category category; | |
| 5307 struct coding_system *this; | |
| 5308 | |
| 5309 detect_info.checked = detect_info.found = detect_info.rejected = 0; | |
| 5310 for (i = 0; i < coding_category_raw_text; i++) | |
| 5311 { | |
| 5312 category = coding_priorities[i]; | |
| 5313 this = coding_categories + category; | |
| 5314 if (this->id < 0) | |
| 5315 { | |
| 5316 /* No coding system of this category is defined. */ | |
| 5317 detect_info.rejected |= (1 << category); | |
| 5318 } | |
| 5319 else if (category >= coding_category_raw_text) | |
| 5320 continue; | |
| 5321 else if (detect_info.checked & (1 << category)) | |
| 5322 { | |
| 5323 if (detect_info.found & (1 << category)) | |
| 5324 break; | |
| 5325 } | |
| 5326 else if ((*(this->detector)) (coding, &detect_info) | |
| 5327 && detect_info.found & (1 << category)) | |
| 5328 break; | |
| 5329 } | |
| 5330 if (i < coding_category_raw_text) | |
| 5331 setup_coding_system (CODING_ID_NAME (this->id), coding); | |
| 5332 else if (detect_info.rejected == CATEGORY_MASK_ANY) | |
| 5333 setup_coding_system (Qraw_text, coding); | |
| 5334 else if (detect_info.rejected) | |
| 5335 for (i = 0; i < coding_category_raw_text; i++) | |
| 5336 if (! (detect_info.rejected & (1 << coding_priorities[i]))) | |
| 5337 { | |
| 5338 this = coding_categories + coding_priorities[i]; | |
| 5339 setup_coding_system (CODING_ID_NAME (this->id), coding); | |
| 5340 break; | |
| 5341 } | |
| 5342 } | |
| 5343 } | |
| 5344 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16)) | |
| 5345 { | |
| 5346 Lisp_Object coding_systems; | |
| 5347 struct coding_detection_info detect_info; | |
| 5348 | |
| 5349 coding_systems | |
| 5350 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom); | |
| 5351 detect_info.found = detect_info.rejected = 0; | |
| 5352 if (CONSP (coding_systems) | |
| 5353 && detect_coding_utf_16 (coding, &detect_info) | |
| 5354 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE | |
| 5355 | CATEGORY_MASK_UTF_16_BE))) | |
| 5356 { | |
| 5357 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
| 5358 setup_coding_system (XCAR (coding_systems), coding); | |
| 4320 else | 5359 else |
| 4321 this_eol_type = CODING_EOL_CRLF, src++; | 5360 setup_coding_system (XCDR (coding_systems), coding); |
| 4322 | 5361 } |
| 4323 if (eol_type == CODING_EOL_UNDECIDED) | 5362 } |
| 4324 /* This is the first end-of-line. */ | 5363 |
| 4325 eol_type = this_eol_type; | 5364 attrs = CODING_ID_ATTRS (coding->id); |
| 4326 else if (eol_type != this_eol_type) | 5365 coding_type = CODING_ATTR_TYPE (attrs); |
| 5366 | |
| 5367 /* If we have not yet decided the EOL type, detect it now. But, the | |
| 5368 detection is impossible for a CCL based coding system, in which | |
| 5369 case, we detct the EOL type after decoding. */ | |
| 5370 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)) | |
| 5371 && ! EQ (coding_type, Qccl)) | |
| 5372 { | |
| 5373 int eol_seen = detect_eol (coding->source, coding->src_bytes, | |
| 5374 XINT (CODING_ATTR_CATEGORY (attrs))); | |
| 5375 | |
| 5376 if (eol_seen != EOL_SEEN_NONE) | |
| 5377 adjust_coding_eol_type (coding, eol_seen); | |
| 5378 } | |
| 5379 } | |
| 5380 | |
| 5381 | |
| 5382 static void | |
| 5383 decode_eol (coding) | |
| 5384 struct coding_system *coding; | |
| 5385 { | |
| 5386 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))) | |
| 5387 { | |
| 5388 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos); | |
| 5389 unsigned char *pend = p + coding->produced; | |
| 5390 int eol_seen = EOL_SEEN_NONE; | |
| 5391 | |
| 5392 for (; p < pend; p++) | |
| 5393 { | |
| 5394 if (*p == '\n') | |
| 5395 eol_seen |= EOL_SEEN_LF; | |
| 5396 else if (*p == '\r') | |
| 4327 { | 5397 { |
| 4328 /* The found type is different from what found before. */ | 5398 if (p + 1 < pend && *(p + 1) == '\n') |
| 4329 eol_type = CODING_EOL_INCONSISTENT; | 5399 { |
| 4330 break; | 5400 eol_seen |= EOL_SEEN_CRLF; |
| 5401 p++; | |
| 5402 } | |
| 5403 else | |
| 5404 eol_seen |= EOL_SEEN_CR; | |
| 4331 } | 5405 } |
| 4332 } | 5406 } |
| 4333 } | 5407 if (eol_seen != EOL_SEEN_NONE) |
| 4334 | 5408 adjust_coding_eol_type (coding, eol_seen); |
| 4335 if (*skip == 0) | 5409 } |
| 4336 *skip = src_end - source; | 5410 |
| 4337 return eol_type; | 5411 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac)) |
| 4338 } | 5412 { |
| 4339 | 5413 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos); |
| 4340 /* Like detect_eol_type, but detect EOL type in 2-octet | 5414 unsigned char *pend = p + coding->produced; |
| 4341 big-endian/little-endian format for coding systems utf-16-be and | 5415 |
| 4342 utf-16-le. */ | 5416 for (; p < pend; p++) |
| 5417 if (*p == '\r') | |
| 5418 *p = '\n'; | |
| 5419 } | |
| 5420 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos)) | |
| 5421 { | |
| 5422 unsigned char *p, *pbeg, *pend; | |
| 5423 Lisp_Object undo_list; | |
| 5424 | |
| 5425 move_gap_both (coding->dst_pos + coding->produced_char, | |
| 5426 coding->dst_pos_byte + coding->produced); | |
| 5427 undo_list = current_buffer->undo_list; | |
| 5428 current_buffer->undo_list = Qt; | |
| 5429 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0); | |
| 5430 current_buffer->undo_list = undo_list; | |
| 5431 pbeg = GPT_ADDR; | |
| 5432 pend = pbeg + coding->produced; | |
| 5433 | |
| 5434 for (p = pend - 1; p >= pbeg; p--) | |
| 5435 if (*p == '\r') | |
| 5436 { | |
| 5437 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1); | |
| 5438 pend--; | |
| 5439 } | |
| 5440 coding->produced_char -= coding->produced - (pend - pbeg); | |
| 5441 coding->produced = pend - pbeg; | |
| 5442 insert_from_gap (coding->produced_char, coding->produced); | |
| 5443 } | |
| 5444 } | |
| 5445 | |
| 5446 static void | |
| 5447 translate_chars (coding, table) | |
| 5448 struct coding_system *coding; | |
| 5449 Lisp_Object table; | |
| 5450 { | |
| 5451 int *charbuf = coding->charbuf; | |
| 5452 int *charbuf_end = charbuf + coding->charbuf_used; | |
| 5453 int c; | |
| 5454 | |
| 5455 if (coding->chars_at_source) | |
| 5456 return; | |
| 5457 | |
| 5458 while (charbuf < charbuf_end) | |
| 5459 { | |
| 5460 c = *charbuf; | |
| 5461 if (c < 0) | |
| 5462 charbuf += c; | |
| 5463 else | |
| 5464 *charbuf++ = translate_char (table, c); | |
| 5465 } | |
| 5466 } | |
| 4343 | 5467 |
| 4344 static int | 5468 static int |
| 4345 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p) | 5469 produce_chars (coding) |
| 4346 unsigned char *source; | 5470 struct coding_system *coding; |
| 4347 int src_bytes, *skip, big_endian_p; | 5471 { |
| 4348 { | 5472 unsigned char *dst = coding->destination + coding->produced; |
| 4349 unsigned char *src = source, *src_end = src + src_bytes; | 5473 unsigned char *dst_end = coding->destination + coding->dst_bytes; |
| 4350 unsigned int c1, c2; | 5474 int produced; |
| 4351 int total = 0; /* How many end-of-lines are found so far. */ | 5475 int produced_chars = 0; |
| 4352 int eol_type = CODING_EOL_UNDECIDED; | 5476 |
| 4353 int this_eol_type; | 5477 if (! coding->chars_at_source) |
| 4354 int msb, lsb; | 5478 { |
| 4355 | 5479 /* Characters are in coding->charbuf. */ |
| 4356 if (big_endian_p) | 5480 int *buf = coding->charbuf; |
| 4357 msb = 0, lsb = 1; | 5481 int *buf_end = buf + coding->charbuf_used; |
| 4358 else | 5482 unsigned char *adjusted_dst_end; |
| 4359 msb = 1, lsb = 0; | 5483 |
| 4360 | 5484 if (BUFFERP (coding->src_object) |
| 4361 *skip = 0; | 5485 && EQ (coding->src_object, coding->dst_object)) |
| 4362 | 5486 dst_end = ((unsigned char *) coding->source) + coding->consumed; |
| 4363 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT) | 5487 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH; |
| 4364 { | 5488 |
| 4365 c1 = (src[msb] << 8) | (src[lsb]); | 5489 while (buf < buf_end) |
| 4366 src += 2; | 5490 { |
| 4367 | 5491 int c = *buf++; |
| 4368 if (c1 == '\n' || c1 == '\r') | 5492 |
| 4369 { | 5493 if (dst >= adjusted_dst_end) |
| 4370 if (*skip == 0) | |
| 4371 *skip = src - 2 - source; | |
| 4372 total++; | |
| 4373 if (c1 == '\n') | |
| 4374 { | 5494 { |
| 4375 this_eol_type = CODING_EOL_LF; | 5495 dst = alloc_destination (coding, |
| 5496 buf_end - buf + MAX_MULTIBYTE_LENGTH, | |
| 5497 dst); | |
| 5498 dst_end = coding->destination + coding->dst_bytes; | |
| 5499 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH; | |
| 5500 } | |
| 5501 if (c >= 0) | |
| 5502 { | |
| 5503 if (coding->dst_multibyte | |
| 5504 || ! CHAR_BYTE8_P (c)) | |
| 5505 CHAR_STRING_ADVANCE (c, dst); | |
| 5506 else | |
| 5507 *dst++ = CHAR_TO_BYTE8 (c); | |
| 5508 produced_chars++; | |
| 4376 } | 5509 } |
| 4377 else | 5510 else |
| 5511 /* This is an annotation datum. (-C) is the length of | |
| 5512 it. */ | |
| 5513 buf += -c - 1; | |
| 5514 } | |
| 5515 } | |
| 5516 else | |
| 5517 { | |
| 5518 const unsigned char *src = coding->source; | |
| 5519 const unsigned char *src_end = src + coding->src_bytes; | |
| 5520 Lisp_Object eol_type; | |
| 5521 | |
| 5522 eol_type = CODING_ID_EOL_TYPE (coding->id); | |
| 5523 | |
| 5524 if (coding->src_multibyte != coding->dst_multibyte) | |
| 5525 { | |
| 5526 if (coding->src_multibyte) | |
| 4378 { | 5527 { |
| 4379 if ((src + 1) >= src_end) | 5528 int multibytep = 1; |
| 5529 int consumed_chars; | |
| 5530 | |
| 5531 while (1) | |
| 4380 { | 5532 { |
| 4381 this_eol_type = CODING_EOL_CR; | 5533 const unsigned char *src_base = src; |
| 5534 int c; | |
| 5535 | |
| 5536 ONE_MORE_BYTE (c); | |
| 5537 if (c == '\r') | |
| 5538 { | |
| 5539 if (EQ (eol_type, Qdos)) | |
| 5540 { | |
| 5541 if (src == src_end) | |
| 5542 { | |
| 5543 coding->result = CODING_RESULT_INSUFFICIENT_SRC; | |
| 5544 goto no_more_source; | |
| 5545 } | |
| 5546 if (*src == '\n') | |
| 5547 c = *src++; | |
| 5548 } | |
| 5549 else if (EQ (eol_type, Qmac)) | |
| 5550 c = '\n'; | |
| 5551 } | |
| 5552 if (dst == dst_end) | |
| 5553 { | |
| 5554 coding->consumed = src - coding->source; | |
| 5555 | |
| 5556 if (EQ (coding->src_object, coding->dst_object)) | |
| 5557 dst_end = (unsigned char *) src; | |
| 5558 if (dst == dst_end) | |
| 5559 { | |
| 5560 dst = alloc_destination (coding, src_end - src + 1, | |
| 5561 dst); | |
| 5562 dst_end = coding->destination + coding->dst_bytes; | |
| 5563 coding_set_source (coding); | |
| 5564 src = coding->source + coding->consumed; | |
| 5565 src_end = coding->source + coding->src_bytes; | |
| 5566 } | |
| 5567 } | |
| 5568 *dst++ = c; | |
| 5569 produced_chars++; | |
| 5570 } | |
| 5571 no_more_source: | |
| 5572 ; | |
| 5573 } | |
| 5574 else | |
| 5575 while (src < src_end) | |
| 5576 { | |
| 5577 int multibytep = 1; | |
| 5578 int c = *src++; | |
| 5579 | |
| 5580 if (c == '\r') | |
| 5581 { | |
| 5582 if (EQ (eol_type, Qdos)) | |
| 5583 { | |
| 5584 if (src < src_end | |
| 5585 && *src == '\n') | |
| 5586 c = *src++; | |
| 5587 } | |
| 5588 else if (EQ (eol_type, Qmac)) | |
| 5589 c = '\n'; | |
| 5590 } | |
| 5591 if (dst >= dst_end - 1) | |
| 5592 { | |
| 5593 coding->consumed = src - coding->source; | |
| 5594 | |
| 5595 if (EQ (coding->src_object, coding->dst_object)) | |
| 5596 dst_end = (unsigned char *) src; | |
| 5597 if (dst >= dst_end - 1) | |
| 5598 { | |
| 5599 dst = alloc_destination (coding, src_end - src + 2, | |
| 5600 dst); | |
| 5601 dst_end = coding->destination + coding->dst_bytes; | |
| 5602 coding_set_source (coding); | |
| 5603 src = coding->source + coding->consumed; | |
| 5604 src_end = coding->source + coding->src_bytes; | |
| 5605 } | |
| 5606 } | |
| 5607 EMIT_ONE_BYTE (c); | |
| 5608 } | |
| 5609 } | |
| 5610 else | |
| 5611 { | |
| 5612 if (!EQ (coding->src_object, coding->dst_object)) | |
| 5613 { | |
| 5614 int require = coding->src_bytes - coding->dst_bytes; | |
| 5615 | |
| 5616 if (require > 0) | |
| 5617 { | |
| 5618 EMACS_INT offset = src - coding->source; | |
| 5619 | |
| 5620 dst = alloc_destination (coding, require, dst); | |
| 5621 coding_set_source (coding); | |
| 5622 src = coding->source + offset; | |
| 5623 src_end = coding->source + coding->src_bytes; | |
| 5624 } | |
| 5625 } | |
| 5626 produced_chars = coding->src_chars; | |
| 5627 while (src < src_end) | |
| 5628 { | |
| 5629 int c = *src++; | |
| 5630 | |
| 5631 if (c == '\r') | |
| 5632 { | |
| 5633 if (EQ (eol_type, Qdos)) | |
| 5634 { | |
| 5635 if (src < src_end | |
| 5636 && *src == '\n') | |
| 5637 c = *src++; | |
| 5638 produced_chars--; | |
| 5639 } | |
| 5640 else if (EQ (eol_type, Qmac)) | |
| 5641 c = '\n'; | |
| 5642 } | |
| 5643 *dst++ = c; | |
| 5644 } | |
| 5645 } | |
| 5646 coding->consumed = coding->src_bytes; | |
| 5647 coding->consumed_char = coding->src_chars; | |
| 5648 } | |
| 5649 | |
| 5650 produced = dst - (coding->destination + coding->produced); | |
| 5651 if (BUFFERP (coding->dst_object)) | |
| 5652 insert_from_gap (produced_chars, produced); | |
| 5653 coding->produced += produced; | |
| 5654 coding->produced_char += produced_chars; | |
| 5655 return produced_chars; | |
| 5656 } | |
| 5657 | |
| 5658 /* Compose text in CODING->object according to the annotation data at | |
| 5659 CHARBUF. CHARBUF is an array: | |
| 5660 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ] | |
| 5661 */ | |
| 5662 | |
| 5663 static INLINE void | |
| 5664 produce_composition (coding, charbuf) | |
| 5665 struct coding_system *coding; | |
| 5666 int *charbuf; | |
| 5667 { | |
| 5668 int len; | |
| 5669 EMACS_INT from, to; | |
| 5670 enum composition_method method; | |
| 5671 Lisp_Object components; | |
| 5672 | |
| 5673 len = -charbuf[0]; | |
| 5674 from = coding->dst_pos + charbuf[2]; | |
| 5675 to = coding->dst_pos + charbuf[3]; | |
| 5676 method = (enum composition_method) (charbuf[4]); | |
| 5677 | |
| 5678 if (method == COMPOSITION_RELATIVE) | |
| 5679 components = Qnil; | |
| 5680 else | |
| 5681 { | |
| 5682 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1]; | |
| 5683 int i; | |
| 5684 | |
| 5685 len -= 5; | |
| 5686 charbuf += 5; | |
| 5687 for (i = 0; i < len; i++) | |
| 5688 args[i] = make_number (charbuf[i]); | |
| 5689 components = (method == COMPOSITION_WITH_ALTCHARS | |
| 5690 ? Fstring (len, args) : Fvector (len, args)); | |
| 5691 } | |
| 5692 compose_text (from, to, components, Qnil, coding->dst_object); | |
| 5693 } | |
| 5694 | |
| 5695 | |
| 5696 /* Put `charset' property on text in CODING->object according to | |
| 5697 the annotation data at CHARBUF. CHARBUF is an array: | |
| 5698 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ] | |
| 5699 */ | |
| 5700 | |
| 5701 static INLINE void | |
| 5702 produce_charset (coding, charbuf) | |
| 5703 struct coding_system *coding; | |
| 5704 int *charbuf; | |
| 5705 { | |
| 5706 EMACS_INT from = coding->dst_pos + charbuf[2]; | |
| 5707 EMACS_INT to = coding->dst_pos + charbuf[3]; | |
| 5708 struct charset *charset = CHARSET_FROM_ID (charbuf[4]); | |
| 5709 | |
| 5710 Fput_text_property (make_number (from), make_number (to), | |
| 5711 Qcharset, CHARSET_NAME (charset), | |
| 5712 coding->dst_object); | |
| 5713 } | |
| 5714 | |
| 5715 | |
/* Maximum number of elements tried for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf with alloca and record its
   element count in CODING->charbuf_size.  Starting from CHARBUF_SIZE
   elements, the size is halved after each failed attempt while it
   stays above 1024.  If no attempt succeeds, set CODING->result to
   CODING_RESULT_INSUFFICIENT_MEM and make the calling function
   return that value.

   Because the memory comes from alloca and the macro may execute a
   `return', it must be used directly in the body of a function that
   returns an integer, and the work area is only valid until that
   function returns.  CODING is evaluated several times, so it must
   be free of side effects.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        (coding)->result = CODING_RESULT_INSUFFICIENT_MEM;              \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
| 5737 | |
| 5738 | |
| 5739 static void | |
| 5740 produce_annotation (coding) | |
| 5741 struct coding_system *coding; | |
| 5742 { | |
| 5743 int *charbuf = coding->charbuf; | |
| 5744 int *charbuf_end = charbuf + coding->charbuf_used; | |
| 5745 | |
| 5746 if (NILP (coding->dst_object)) | |
| 5747 return; | |
| 5748 | |
| 5749 while (charbuf < charbuf_end) | |
| 5750 { | |
| 5751 if (*charbuf >= 0) | |
| 5752 charbuf++; | |
| 5753 else | |
| 5754 { | |
| 5755 int len = -*charbuf; | |
| 5756 switch (charbuf[1]) | |
| 5757 { | |
| 5758 case CODING_ANNOTATE_COMPOSITION_MASK: | |
| 5759 produce_composition (coding, charbuf); | |
| 5760 break; | |
| 5761 case CODING_ANNOTATE_CHARSET_MASK: | |
| 5762 produce_charset (coding, charbuf); | |
| 5763 break; | |
| 5764 default: | |
| 5765 abort (); | |
| 5766 } | |
| 5767 charbuf += len; | |
| 5768 } | |
| 5769 } | |
| 5770 } | |
| 5771 | |
| 5772 /* Decode the data at CODING->src_object into CODING->dst_object. | |
| 5773 CODING->src_object is a buffer, a string, or nil. | |
| 5774 CODING->dst_object is a buffer. | |
| 5775 | |
| 5776 If CODING->src_object is a buffer, it must be the current buffer. | |
| 5777 In this case, if CODING->src_pos is positive, it is a position of | |
| 5778 the source text in the buffer, otherwise, the source text is in the | |
| 5779 gap area of the buffer, and CODING->src_pos specifies the offset of | |
| 5780 the text from GPT (which must be the same as PT). If this is the | |
| 5781 same buffer as CODING->dst_object, CODING->src_pos must be | |
| 5782 negative. | |
| 5783 | |
| 5784 If CODING->src_object is a string, CODING->src_pos in an index to | |
| 5785 that string. | |
| 5786 | |
| 5787 If CODING->src_object is nil, CODING->source must already point to | |
| 5788 the non-relocatable memory area. In this case, CODING->src_pos is | |
| 5789 an offset from CODING->source. | |
| 5790 | |
| 5791 The decoded data is inserted at the current point of the buffer | |
| 5792 CODING->dst_object. | |
| 5793 */ | |
| 5794 | |
| 5795 static int | |
| 5796 decode_coding (coding) | |
| 5797 struct coding_system *coding; | |
| 5798 { | |
| 5799 Lisp_Object attrs; | |
| 5800 | |
| 5801 if (BUFFERP (coding->src_object) | |
| 5802 && coding->src_pos > 0 | |
| 5803 && coding->src_pos < GPT | |
| 5804 && coding->src_pos + coding->src_chars > GPT) | |
| 5805 move_gap_both (coding->src_pos, coding->src_pos_byte); | |
| 5806 | |
| 5807 if (BUFFERP (coding->dst_object)) | |
| 5808 { | |
| 5809 if (current_buffer != XBUFFER (coding->dst_object)) | |
| 5810 set_buffer_internal (XBUFFER (coding->dst_object)); | |
| 5811 if (GPT != PT) | |
| 5812 move_gap_both (PT, PT_BYTE); | |
| 5813 } | |
| 5814 | |
| 5815 coding->consumed = coding->consumed_char = 0; | |
| 5816 coding->produced = coding->produced_char = 0; | |
| 5817 coding->chars_at_source = 0; | |
| 5818 coding->result = CODING_RESULT_SUCCESS; | |
| 5819 coding->errors = 0; | |
| 5820 | |
| 5821 ALLOC_CONVERSION_WORK_AREA (coding); | |
| 5822 | |
| 5823 attrs = CODING_ID_ATTRS (coding->id); | |
| 5824 | |
| 5825 do | |
| 5826 { | |
| 5827 coding_set_source (coding); | |
| 5828 coding->annotated = 0; | |
| 5829 (*(coding->decoder)) (coding); | |
| 5830 if (!NILP (CODING_ATTR_DECODE_TBL (attrs))) | |
| 5831 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs)); | |
| 5832 else if (!NILP (Vstandard_translation_table_for_decode)) | |
| 5833 translate_chars (coding, Vstandard_translation_table_for_decode); | |
| 5834 coding_set_destination (coding); | |
| 5835 produce_chars (coding); | |
| 5836 if (coding->annotated) | |
| 5837 produce_annotation (coding); | |
| 5838 } | |
| 5839 while (coding->consumed < coding->src_bytes | |
| 5840 && ! coding->result); | |
| 5841 | |
| 5842 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl) | |
| 5843 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id)) | |
| 5844 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)) | |
| 5845 decode_eol (coding); | |
| 5846 | |
| 5847 coding->carryover_bytes = 0; | |
| 5848 if (coding->consumed < coding->src_bytes) | |
| 5849 { | |
| 5850 int nbytes = coding->src_bytes - coding->consumed; | |
| 5851 const unsigned char *src; | |
| 5852 | |
| 5853 coding_set_source (coding); | |
| 5854 coding_set_destination (coding); | |
| 5855 src = coding->source + coding->consumed; | |
| 5856 | |
| 5857 if (coding->mode & CODING_MODE_LAST_BLOCK) | |
| 5858 { | |
| 5859 /* Flush out unprocessed data as binary chars. We are sure | |
| 5860 that the number of data is less than the size of | |
| 5861 coding->charbuf. */ | |
| 5862 while (nbytes-- > 0) | |
| 5863 { | |
| 5864 int c = *src++; | |
| 5865 | |
| 5866 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c); | |
| 5867 } | |
| 5868 produce_chars (coding); | |
| 5869 } | |
| 5870 else | |
| 5871 { | |
| 5872 /* Record unprocessed bytes in coding->carryover. We are | |
| 5873 sure that the number of data is less than the size of | |
| 5874 coding->carryover. */ | |
| 5875 unsigned char *p = coding->carryover; | |
| 5876 | |
| 5877 coding->carryover_bytes = nbytes; | |
| 5878 while (nbytes-- > 0) | |
| 5879 *p++ = *src++; | |
| 5880 } | |
| 5881 coding->consumed = coding->src_bytes; | |
| 5882 } | |
| 5883 | |
| 5884 return coding->result; | |
| 5885 } | |
| 5886 | |
| 5887 | |
| 5888 /* Extract an annotation datum from a composition starting at POS and | |
| 5889 ending before LIMIT of CODING->src_object (buffer or string), store | |
| 5890 the data in BUF, set *STOP to a starting position of the next | |
| 5891 composition (if any) or to LIMIT, and return the address of the | |
| 5892 next element of BUF. | |
| 5893 | |
| 5894 If such an annotation is not found, set *STOP to a starting | |
| 5895 position of a composition after POS (if any) or to LIMIT, and | |
| 5896 return BUF. */ | |
| 5897 | |
| 5898 static INLINE int * | |
| 5899 handle_composition_annotation (pos, limit, coding, buf, stop) | |
| 5900 EMACS_INT pos, limit; | |
| 5901 struct coding_system *coding; | |
| 5902 int *buf; | |
| 5903 EMACS_INT *stop; | |
| 5904 { | |
| 5905 EMACS_INT start, end; | |
| 5906 Lisp_Object prop; | |
| 5907 | |
| 5908 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object) | |
| 5909 || end > limit) | |
| 5910 *stop = limit; | |
| 5911 else if (start > pos) | |
| 5912 *stop = start; | |
| 5913 else | |
| 5914 { | |
| 5915 if (start == pos) | |
| 5916 { | |
| 5917 /* We found a composition. Store the corresponding | |
| 5918 annotation data in BUF. */ | |
| 5919 int *head = buf; | |
| 5920 enum composition_method method = COMPOSITION_METHOD (prop); | |
| 5921 int nchars = COMPOSITION_LENGTH (prop); | |
| 5922 | |
| 5923 ADD_COMPOSITION_DATA (buf, 0, nchars, method); | |
| 5924 if (method != COMPOSITION_RELATIVE) | |
| 5925 { | |
| 5926 Lisp_Object components; | |
| 5927 int len, i, i_byte; | |
| 5928 | |
| 5929 components = COMPOSITION_COMPONENTS (prop); | |
| 5930 if (VECTORP (components)) | |
| 5931 { | |
| 5932 len = XVECTOR (components)->size; | |
| 5933 for (i = 0; i < len; i++) | |
| 5934 *buf++ = XINT (AREF (components, i)); | |
| 5935 } | |
| 5936 else if (STRINGP (components)) | |
| 5937 { | |
| 5938 len = SCHARS (components); | |
| 5939 i = i_byte = 0; | |
| 5940 while (i < len) | |
| 5941 { | |
| 5942 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte); | |
| 5943 buf++; | |
| 5944 } | |
| 5945 } | |
| 5946 else if (INTEGERP (components)) | |
| 5947 { | |
| 5948 len = 1; | |
| 5949 *buf++ = XINT (components); | |
| 5950 } | |
| 5951 else if (CONSP (components)) | |
| 5952 { | |
| 5953 for (len = 0; CONSP (components); | |
| 5954 len++, components = XCDR (components)) | |
| 5955 *buf++ = XINT (XCAR (components)); | |
| 4382 } | 5956 } |
| 4383 else | 5957 else |
| 4384 { | 5958 abort (); |
| 4385 c2 = (src[msb] << 8) | (src[lsb]); | 5959 *head -= len; |
| 4386 if (c2 == '\n') | |
| 4387 this_eol_type = CODING_EOL_CRLF, src += 2; | |
| 4388 else | |
| 4389 this_eol_type = CODING_EOL_CR; | |
| 4390 } | |
| 4391 } | 5960 } |
| 4392 | 5961 } |
| 4393 if (eol_type == CODING_EOL_UNDECIDED) | 5962 |
| 4394 /* This is the first end-of-line. */ | 5963 if (find_composition (end, limit, &start, &end, &prop, |
| 4395 eol_type = this_eol_type; | 5964 coding->src_object) |
| 4396 else if (eol_type != this_eol_type) | 5965 && end <= limit) |
| 5966 *stop = start; | |
| 5967 else | |
| 5968 *stop = limit; | |
| 5969 } | |
| 5970 return buf; | |
| 5971 } | |
| 5972 | |
| 5973 | |
| 5974 /* Extract an annotation datum from a text property `charset' at POS of | |
| 5975 CODING->src_object (buffer of string), store the data in BUF, set | |
| 5976 *STOP to the position where the value of `charset' property changes | |
| 5977 (limiting by LIMIT), and return the address of the next element of | |
| 5978 BUF. | |
| 5979 | |
| 5980 If the property value is nil, set *STOP to the position where the | |
| 5981 property value is non-nil (limiting by LIMIT), and return BUF. */ | |
| 5982 | |
| 5983 static INLINE int * | |
| 5984 handle_charset_annotation (pos, limit, coding, buf, stop) | |
| 5985 EMACS_INT pos, limit; | |
| 5986 struct coding_system *coding; | |
| 5987 int *buf; | |
| 5988 EMACS_INT *stop; | |
| 5989 { | |
| 5990 Lisp_Object val, next; | |
| 5991 int id; | |
| 5992 | |
| 5993 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object); | |
| 5994 if (! NILP (val) && CHARSETP (val)) | |
| 5995 id = XINT (CHARSET_SYMBOL_ID (val)); | |
| 5996 else | |
| 5997 id = -1; | |
| 5998 ADD_CHARSET_DATA (buf, 0, 0, id); | |
| 5999 next = Fnext_single_property_change (make_number (pos), Qcharset, | |
| 6000 coding->src_object, | |
| 6001 make_number (limit)); | |
| 6002 *stop = XINT (next); | |
| 6003 return buf; | |
| 6004 } | |
| 6005 | |
| 6006 | |
| 6007 static void | |
| 6008 consume_chars (coding) | |
| 6009 struct coding_system *coding; | |
| 6010 { | |
| 6011 int *buf = coding->charbuf; | |
| 6012 int *buf_end = coding->charbuf + coding->charbuf_size; | |
| 6013 const unsigned char *src = coding->source + coding->consumed; | |
| 6014 const unsigned char *src_end = coding->source + coding->src_bytes; | |
| 6015 EMACS_INT pos = coding->src_pos + coding->consumed_char; | |
| 6016 EMACS_INT end_pos = coding->src_pos + coding->src_chars; | |
| 6017 int multibytep = coding->src_multibyte; | |
| 6018 Lisp_Object eol_type; | |
| 6019 int c; | |
| 6020 EMACS_INT stop, stop_composition, stop_charset; | |
| 6021 | |
| 6022 eol_type = CODING_ID_EOL_TYPE (coding->id); | |
| 6023 if (VECTORP (eol_type)) | |
| 6024 eol_type = Qunix; | |
| 6025 | |
| 6026 /* Note: composition handling is not yet implemented. */ | |
| 6027 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; | |
| 6028 | |
| 6029 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK) | |
| 6030 stop = stop_composition = pos; | |
| 6031 else | |
| 6032 stop = stop_composition = end_pos; | |
| 6033 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK) | |
| 6034 stop = stop_charset = pos; | |
| 6035 else | |
| 6036 stop_charset = end_pos; | |
| 6037 | |
| 6038 /* Compensate for CRLF and annotation. */ | |
| 6039 buf_end -= 1 + MAX_ANNOTATION_LENGTH; | |
| 6040 while (buf < buf_end) | |
| 6041 { | |
| 6042 if (pos == stop) | |
| 6043 { | |
| 6044 if (pos == end_pos) | |
| 6045 break; | |
| 6046 if (pos == stop_composition) | |
| 6047 buf = handle_composition_annotation (pos, end_pos, coding, | |
| 6048 buf, &stop_composition); | |
| 6049 if (pos == stop_charset) | |
| 6050 buf = handle_charset_annotation (pos, end_pos, coding, | |
| 6051 buf, &stop_charset); | |
| 6052 stop = (stop_composition < stop_charset | |
| 6053 ? stop_composition : stop_charset); | |
| 6054 } | |
| 6055 | |
| 6056 if (! multibytep) | |
| 6057 { | |
| 6058 EMACS_INT bytes; | |
| 6059 | |
| 6060 if (! CODING_FOR_UNIBYTE (coding) | |
| 6061 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0) | |
| 6062 c = STRING_CHAR_ADVANCE (src), pos += bytes; | |
| 6063 else | |
| 6064 c = *src++, pos++; | |
| 6065 } | |
| 6066 else | |
| 6067 c = STRING_CHAR_ADVANCE (src), pos++; | |
| 6068 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY)) | |
| 6069 c = '\n'; | |
| 6070 if (! EQ (eol_type, Qunix)) | |
| 6071 { | |
| 6072 if (c == '\n') | |
| 4397 { | 6073 { |
| 4398 /* The found type is different from what found before. */ | 6074 if (EQ (eol_type, Qdos)) |
| 4399 eol_type = CODING_EOL_INCONSISTENT; | 6075 *buf++ = '\r'; |
| 4400 break; | 6076 else |
| 6077 c = '\r'; | |
| 4401 } | 6078 } |
| 4402 } | 6079 } |
| 4403 } | 6080 *buf++ = c; |
| 4404 | 6081 } |
| 4405 if (*skip == 0) | 6082 |
| 4406 *skip = src_end - source; | 6083 coding->consumed = src - coding->source; |
| 4407 return eol_type; | 6084 coding->consumed_char = pos - coding->src_pos; |
| 4408 } | 6085 coding->charbuf_used = buf - coding->charbuf; |
| 4409 | 6086 coding->chars_at_source = 0; |
| 4410 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC | 6087 } |
| 4411 is encoded. If it detects an appropriate format of end-of-line, it | 6088 |
| 4412 sets the information in *CODING. */ | 6089 |
| 6090 /* Encode the text at CODING->src_object into CODING->dst_object. | |
| 6091 CODING->src_object is a buffer or a string. | |
| 6092 CODING->dst_object is a buffer or nil. | |
| 6093 | |
| 6094 If CODING->src_object is a buffer, it must be the current buffer. | |
| 6095 In this case, if CODING->src_pos is positive, it is a position of | |
| 6096 the source text in the buffer, otherwise. the source text is in the | |
| 6097 gap area of the buffer, and coding->src_pos specifies the offset of | |
| 6098 the text from GPT (which must be the same as PT). If this is the | |
| 6099 same buffer as CODING->dst_object, CODING->src_pos must be | |
| 6100 negative and CODING should not have `pre-write-conversion'. | |
| 6101 | |
| 6102 If CODING->src_object is a string, CODING should not have | |
| 6103 `pre-write-conversion'. | |
| 6104 | |
| 6105 If CODING->dst_object is a buffer, the encoded data is inserted at | |
| 6106 the current point of that buffer. | |
| 6107 | |
| 6108 If CODING->dst_object is nil, the encoded data is placed at the | |
| 6109 memory area specified by CODING->destination. */ | |
| 6110 | |
| 6111 static int | |
| 6112 encode_coding (coding) | |
| 6113 struct coding_system *coding; | |
| 6114 { | |
| 6115 Lisp_Object attrs; | |
| 6116 | |
| 6117 attrs = CODING_ID_ATTRS (coding->id); | |
| 6118 | |
| 6119 if (BUFFERP (coding->dst_object)) | |
| 6120 { | |
| 6121 set_buffer_internal (XBUFFER (coding->dst_object)); | |
| 6122 coding->dst_multibyte | |
| 6123 = ! NILP (current_buffer->enable_multibyte_characters); | |
| 6124 } | |
| 6125 | |
| 6126 coding->consumed = coding->consumed_char = 0; | |
| 6127 coding->produced = coding->produced_char = 0; | |
| 6128 coding->result = CODING_RESULT_SUCCESS; | |
| 6129 coding->errors = 0; | |
| 6130 | |
| 6131 ALLOC_CONVERSION_WORK_AREA (coding); | |
| 6132 | |
| 6133 do { | |
| 6134 coding_set_source (coding); | |
| 6135 consume_chars (coding); | |
| 6136 | |
| 6137 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs))) | |
| 6138 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs)); | |
| 6139 else if (!NILP (Vstandard_translation_table_for_encode)) | |
| 6140 translate_chars (coding, Vstandard_translation_table_for_encode); | |
| 6141 | |
| 6142 coding_set_destination (coding); | |
| 6143 (*(coding->encoder)) (coding); | |
| 6144 } while (coding->consumed_char < coding->src_chars); | |
| 6145 | |
| 6146 if (BUFFERP (coding->dst_object)) | |
| 6147 insert_from_gap (coding->produced_char, coding->produced); | |
| 6148 | |
| 6149 return (coding->result); | |
| 6150 } | |
| 6151 | |
| 6152 | |
| 6153 /* Stack of working buffers used in code conversion. A nil element | |
| 6154 means that the code conversion of that level is not using a working | |
| 6155 buffer. */ | |
| 6156 Lisp_Object Vcode_conversion_work_buf_list; | |
| 6157 | |
| 6158 /* A working buffer used by the top level conversion. */ | |
| 6159 Lisp_Object Vcode_conversion_reused_work_buf; | |
| 6160 | |
| 6161 | |
| 6162 /* Return a working buffer that can be freely used by the following | |
| 6163 code conversion. MULTIBYTEP specifies the multibyteness of the | |
| 6164 buffer. */ | |
| 6165 | |
| 6166 Lisp_Object | |
| 6167 make_conversion_work_buffer (multibytep, depth) | |
| 6168 int multibytep, depth; | |
| 6169 { | |
| 6170 struct buffer *current = current_buffer; | |
| 6171 Lisp_Object buf, name; | |
| 6172 | |
| 6173 if (depth == 0) | |
| 6174 { | |
| 6175 if (NILP (Vcode_conversion_reused_work_buf)) | |
| 6176 Vcode_conversion_reused_work_buf | |
| 6177 = Fget_buffer_create (build_string (" *code-converting-work<0>*")); | |
| 6178 buf = Vcode_conversion_reused_work_buf; | |
| 6179 } | |
| 6180 else | |
| 6181 { | |
| 6182 if (depth < 0) | |
| 6183 { | |
| 6184 name = build_string (" *code-converting-work*"); | |
| 6185 name = Fgenerate_new_buffer_name (name, Qnil); | |
| 6186 } | |
| 6187 else | |
| 6188 { | |
| 6189 char str[128]; | |
| 6190 | |
| 6191 sprintf (str, " *code-converting-work*<%d>", depth); | |
| 6192 name = build_string (str); | |
| 6193 } | |
| 6194 buf = Fget_buffer_create (name); | |
| 6195 } | |
| 6196 set_buffer_internal (XBUFFER (buf)); | |
| 6197 current_buffer->undo_list = Qt; | |
| 6198 Ferase_buffer (); | |
| 6199 Fset_buffer_multibyte (multibytep ? Qt : Qnil); | |
| 6200 set_buffer_internal (current); | |
| 6201 return buf; | |
| 6202 } | |
| 6203 | |
| 6204 static Lisp_Object | |
| 6205 code_conversion_restore (buffer) | |
| 6206 Lisp_Object buffer; | |
| 6207 { | |
| 6208 Lisp_Object workbuf; | |
| 6209 | |
| 6210 workbuf = XCAR (Vcode_conversion_work_buf_list); | |
| 6211 if (! NILP (workbuf) | |
| 6212 && ! EQ (workbuf, Vcode_conversion_reused_work_buf) | |
| 6213 && ! NILP (Fbuffer_live_p (workbuf))) | |
| 6214 Fkill_buffer (workbuf); | |
| 6215 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list); | |
| 6216 set_buffer_internal (XBUFFER (buffer)); | |
| 6217 return Qnil; | |
| 6218 } | |
| 6219 | |
| 6220 static Lisp_Object | |
| 6221 code_conversion_save (buffer, with_work_buf, multibyte) | |
| 6222 Lisp_Object buffer; | |
| 6223 int with_work_buf, multibyte; | |
| 6224 { | |
| 6225 Lisp_Object workbuf; | |
| 6226 | |
| 6227 if (with_work_buf) | |
| 6228 { | |
| 6229 int depth = XINT (Flength (Vcode_conversion_work_buf_list)); | |
| 6230 | |
| 6231 workbuf = make_conversion_work_buffer (multibyte, depth); | |
| 6232 } | |
| 6233 else | |
| 6234 workbuf = Qnil; | |
| 6235 Vcode_conversion_work_buf_list | |
| 6236 = Fcons (workbuf, Vcode_conversion_work_buf_list); | |
| 6237 record_unwind_protect (code_conversion_restore, buffer); | |
| 6238 return workbuf; | |
| 6239 } | |
| 6240 | |
| 6241 int | |
| 6242 decode_coding_gap (coding, chars, bytes) | |
| 6243 struct coding_system *coding; | |
| 6244 EMACS_INT chars, bytes; | |
| 6245 { | |
| 6246 int count = specpdl_ptr - specpdl; | |
| 6247 Lisp_Object attrs; | |
| 6248 Lisp_Object buffer; | |
| 6249 | |
| 6250 buffer = Fcurrent_buffer (); | |
| 6251 code_conversion_save (buffer, 0, 0); | |
| 6252 | |
| 6253 coding->src_object = buffer; | |
| 6254 coding->src_chars = chars; | |
| 6255 coding->src_bytes = bytes; | |
| 6256 coding->src_pos = -chars; | |
| 6257 coding->src_pos_byte = -bytes; | |
| 6258 coding->src_multibyte = chars < bytes; | |
| 6259 coding->dst_object = buffer; | |
| 6260 coding->dst_pos = PT; | |
| 6261 coding->dst_pos_byte = PT_BYTE; | |
| 6262 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters); | |
| 6263 coding->mode |= CODING_MODE_LAST_BLOCK; | |
| 6264 | |
| 6265 if (CODING_REQUIRE_DETECTION (coding)) | |
| 6266 detect_coding (coding); | |
| 6267 | |
| 6268 decode_coding (coding); | |
| 6269 | |
| 6270 attrs = CODING_ID_ATTRS (coding->id); | |
| 6271 if (! NILP (CODING_ATTR_POST_READ (attrs))) | |
| 6272 { | |
| 6273 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE; | |
| 6274 Lisp_Object val; | |
| 6275 | |
| 6276 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte); | |
| 6277 val = call1 (CODING_ATTR_POST_READ (attrs), | |
| 6278 make_number (coding->produced_char)); | |
| 6279 CHECK_NATNUM (val); | |
| 6280 coding->produced_char += Z - prev_Z; | |
| 6281 coding->produced += Z_BYTE - prev_Z_BYTE; | |
| 6282 } | |
| 6283 | |
| 6284 unbind_to (count, Qnil); | |
| 6285 return coding->result; | |
| 6286 } | |
| 6287 | |
| 6288 int | |
| 6289 encode_coding_gap (coding, chars, bytes) | |
| 6290 struct coding_system *coding; | |
| 6291 EMACS_INT chars, bytes; | |
| 6292 { | |
| 6293 int count = specpdl_ptr - specpdl; | |
| 6294 Lisp_Object buffer; | |
| 6295 | |
| 6296 buffer = Fcurrent_buffer (); | |
| 6297 code_conversion_save (buffer, 0, 0); | |
| 6298 | |
| 6299 coding->src_object = buffer; | |
| 6300 coding->src_chars = chars; | |
| 6301 coding->src_bytes = bytes; | |
| 6302 coding->src_pos = -chars; | |
| 6303 coding->src_pos_byte = -bytes; | |
| 6304 coding->src_multibyte = chars < bytes; | |
| 6305 coding->dst_object = coding->src_object; | |
| 6306 coding->dst_pos = PT; | |
| 6307 coding->dst_pos_byte = PT_BYTE; | |
| 6308 | |
| 6309 encode_coding (coding); | |
| 6310 | |
| 6311 unbind_to (count, Qnil); | |
| 6312 return coding->result; | |
| 6313 } | |
| 6314 | |
| 6315 | |
| 6316 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in | |
| 6317 SRC_OBJECT into DST_OBJECT by coding context CODING. | |
| 6318 | |
| 6319 SRC_OBJECT is a buffer, a string, or Qnil. | |
| 6320 | |
| 6321 If it is a buffer, the text is at point of the buffer. FROM and TO | |
| 6322 are positions in the buffer. | |
| 6323 | |
| 6324 If it is a string, the text is at the beginning of the string. | |
| 6325 FROM and TO are indices to the string. | |
| 6326 | |
| 6327 If it is nil, the text is at coding->source. FROM and TO are | |
| 6328 indices to coding->source. | |
| 6329 | |
| 6330 DST_OBJECT is a buffer, Qt, or Qnil. | |
| 6331 | |
| 6332 If it is a buffer, the decoded text is inserted at point of the | |
| 6333 buffer. If the buffer is the same as SRC_OBJECT, the source text | |
| 6334 is deleted. | |
| 6335 | |
| 6336 If it is Qt, a string is made from the decoded text, and | |
| 6337 set in CODING->dst_object. | |
| 6338 | |
| 6339 If it is Qnil, the decoded text is stored at CODING->destination. | |
| 6340 The caller must allocate CODING->dst_bytes bytes at | |
| 6341 CODING->destination by xmalloc. If the decoded text is longer than | |
| 6342 CODING->dst_bytes, CODING->destination is relocated by xrealloc. | |
| 6343 */ | |
| 4413 | 6344 |
| 4414 void | 6345 void |
| 4415 detect_eol (coding, src, src_bytes) | 6346 decode_coding_object (coding, src_object, from, from_byte, to, to_byte, |
| 6347 dst_object) | |
| 4416 struct coding_system *coding; | 6348 struct coding_system *coding; |
| 4417 const unsigned char *src; | 6349 Lisp_Object src_object; |
| 4418 int src_bytes; | 6350 EMACS_INT from, from_byte, to, to_byte; |
| 4419 { | 6351 Lisp_Object dst_object; |
| 4420 Lisp_Object val; | 6352 { |
| 4421 int skip; | 6353 int count = specpdl_ptr - specpdl; |
| 4422 int eol_type; | 6354 unsigned char *destination; |
| 4423 | 6355 EMACS_INT dst_bytes; |
| 4424 switch (coding->category_idx) | 6356 EMACS_INT chars = to - from; |
| 4425 { | 6357 EMACS_INT bytes = to_byte - from_byte; |
| 4426 case CODING_CATEGORY_IDX_UTF_16_BE: | 6358 Lisp_Object attrs; |
| 4427 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1); | 6359 Lisp_Object buffer; |
| 4428 break; | 6360 int saved_pt = -1, saved_pt_byte; |
| 4429 case CODING_CATEGORY_IDX_UTF_16_LE: | 6361 |
| 4430 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0); | 6362 buffer = Fcurrent_buffer (); |
| 4431 break; | 6363 |
| 4432 default: | 6364 if (NILP (dst_object)) |
| 4433 eol_type = detect_eol_type (src, src_bytes, &skip); | 6365 { |
| 4434 break; | 6366 destination = coding->destination; |
| 4435 } | 6367 dst_bytes = coding->dst_bytes; |
| 4436 | 6368 } |
| 4437 if (coding->heading_ascii > skip) | 6369 |
| 4438 coding->heading_ascii = skip; | 6370 coding->src_object = src_object; |
| 6371 coding->src_chars = chars; | |
| 6372 coding->src_bytes = bytes; | |
| 6373 coding->src_multibyte = chars < bytes; | |
| 6374 | |
| 6375 if (STRINGP (src_object)) | |
| 6376 { | |
| 6377 coding->src_pos = from; | |
| 6378 coding->src_pos_byte = from_byte; | |
| 6379 } | |
| 6380 else if (BUFFERP (src_object)) | |
| 6381 { | |
| 6382 set_buffer_internal (XBUFFER (src_object)); | |
| 6383 if (from != GPT) | |
| 6384 move_gap_both (from, from_byte); | |
| 6385 if (EQ (src_object, dst_object)) | |
| 6386 { | |
| 6387 saved_pt = PT, saved_pt_byte = PT_BYTE; | |
| 6388 TEMP_SET_PT_BOTH (from, from_byte); | |
| 6389 del_range_both (from, from_byte, to, to_byte, 1); | |
| 6390 coding->src_pos = -chars; | |
| 6391 coding->src_pos_byte = -bytes; | |
| 6392 } | |
| 6393 else | |
| 6394 { | |
| 6395 coding->src_pos = from; | |
| 6396 coding->src_pos_byte = from_byte; | |
| 6397 } | |
| 6398 } | |
| 6399 | |
| 6400 if (CODING_REQUIRE_DETECTION (coding)) | |
| 6401 detect_coding (coding); | |
| 6402 attrs = CODING_ID_ATTRS (coding->id); | |
| 6403 | |
| 6404 if (EQ (dst_object, Qt) | |
| 6405 || (! NILP (CODING_ATTR_POST_READ (attrs)) | |
| 6406 && NILP (dst_object))) | |
| 6407 { | |
| 6408 coding->dst_object = code_conversion_save (buffer, 1, 1); | |
| 6409 coding->dst_pos = BEG; | |
| 6410 coding->dst_pos_byte = BEG_BYTE; | |
| 6411 coding->dst_multibyte = 1; | |
| 6412 } | |
| 6413 else if (BUFFERP (dst_object)) | |
| 6414 { | |
| 6415 code_conversion_save (buffer, 0, 0); | |
| 6416 coding->dst_object = dst_object; | |
| 6417 coding->dst_pos = BUF_PT (XBUFFER (dst_object)); | |
| 6418 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object)); | |
| 6419 coding->dst_multibyte | |
| 6420 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters); | |
| 6421 } | |
| 4439 else | 6422 else |
| 4440 skip = coding->heading_ascii; | 6423 { |
| 4441 | 6424 code_conversion_save (buffer, 0, 0); |
| 4442 if (eol_type == CODING_EOL_UNDECIDED) | 6425 coding->dst_object = Qnil; |
| 4443 return; | 6426 coding->dst_multibyte = 1; |
| 4444 if (eol_type == CODING_EOL_INCONSISTENT) | 6427 } |
| 4445 { | 6428 |
| 4446 #if 0 | 6429 decode_coding (coding); |
| 4447 /* This code is suppressed until we find a better way to | 6430 |
| 4448 distinguish raw text file and binary file. */ | 6431 if (BUFFERP (coding->dst_object)) |
| 4449 | 6432 set_buffer_internal (XBUFFER (coding->dst_object)); |
| 4450 /* If we have already detected that the coding is raw-text, the | 6433 |
| 4451 coding should actually be no-conversion. */ | 6434 if (! NILP (CODING_ATTR_POST_READ (attrs))) |
| 4452 if (coding->type == coding_type_raw_text) | 6435 { |
| 4453 { | 6436 struct gcpro gcpro1, gcpro2; |
| 4454 setup_coding_system (Qno_conversion, coding); | 6437 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE; |
| 4455 return; | 6438 Lisp_Object val; |
| 4456 } | 6439 |
| 4457 /* Else, let's decode only text code anyway. */ | 6440 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte); |
| 4458 #endif /* 0 */ | 6441 GCPRO2 (coding->src_object, coding->dst_object); |
| 4459 eol_type = CODING_EOL_LF; | 6442 val = call1 (CODING_ATTR_POST_READ (attrs), |
| 4460 } | 6443 make_number (coding->produced_char)); |
| 4461 | 6444 UNGCPRO; |
| 4462 val = Fget (coding->symbol, Qeol_type); | 6445 CHECK_NATNUM (val); |
| 4463 if (VECTORP (val) && XVECTOR (val)->size == 3) | 6446 coding->produced_char += Z - prev_Z; |
| 4464 { | 6447 coding->produced += Z_BYTE - prev_Z_BYTE; |
| 4465 int src_multibyte = coding->src_multibyte; | 6448 } |
| 4466 int dst_multibyte = coding->dst_multibyte; | 6449 |
| 4467 struct composition_data *cmp_data = coding->cmp_data; | 6450 if (EQ (dst_object, Qt)) |
| 4468 | 6451 { |
| 4469 setup_coding_system (XVECTOR (val)->contents[eol_type], coding); | 6452 coding->dst_object = Fbuffer_string (); |
| 4470 coding->src_multibyte = src_multibyte; | 6453 } |
| 4471 coding->dst_multibyte = dst_multibyte; | 6454 else if (NILP (dst_object) && BUFFERP (coding->dst_object)) |
| 4472 coding->heading_ascii = skip; | 6455 { |
| 4473 coding->cmp_data = cmp_data; | 6456 set_buffer_internal (XBUFFER (coding->dst_object)); |
| 4474 } | 6457 if (dst_bytes < coding->produced) |
| 4475 } | 6458 { |
| 4476 | 6459 destination |
| 4477 #define CONVERSION_BUFFER_EXTRA_ROOM 256 | 6460 = (unsigned char *) xrealloc (destination, coding->produced); |
| 4478 | 6461 if (! destination) |
| 4479 #define DECODING_BUFFER_MAG(coding) \ | 6462 { |
| 4480 (coding->type == coding_type_iso2022 \ | 6463 coding->result = CODING_RESULT_INSUFFICIENT_DST; |
| 4481 ? 3 \ | 6464 unbind_to (count, Qnil); |
| 4482 : (coding->type == coding_type_ccl \ | 6465 return; |
| 4483 ? coding->spec.ccl.decoder.buf_magnification \ | 6466 } |
| 4484 : 2)) | 6467 if (BEGV < GPT && GPT < BEGV + coding->produced_char) |
| 4485 | 6468 move_gap_both (BEGV, BEGV_BYTE); |
| 4486 /* Return maximum size (bytes) of a buffer enough for decoding | 6469 bcopy (BEGV_ADDR, destination, coding->produced); |
| 4487 SRC_BYTES of text encoded in CODING. */ | 6470 coding->destination = destination; |
| 4488 | 6471 } |
| 4489 int | 6472 } |
| 4490 decoding_buffer_size (coding, src_bytes) | 6473 |
| 6474 if (saved_pt >= 0) | |
| 6475 { | |
| 6476 /* This is the case of: | |
| 6477 (BUFFERP (src_object) && EQ (src_object, dst_object)) | |
| 6478 As we have moved PT while replacing the original buffer | |
| 6479 contents, we must recover it now. */ | |
| 6480 set_buffer_internal (XBUFFER (src_object)); | |
| 6481 if (saved_pt < from) | |
| 6482 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte); | |
| 6483 else if (saved_pt < from + chars) | |
| 6484 TEMP_SET_PT_BOTH (from, from_byte); | |
| 6485 else if (! NILP (current_buffer->enable_multibyte_characters)) | |
| 6486 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars), | |
| 6487 saved_pt_byte + (coding->produced - bytes)); | |
| 6488 else | |
| 6489 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes), | |
| 6490 saved_pt_byte + (coding->produced - bytes)); | |
| 6491 } | |
| 6492 | |
| 6493 unbind_to (count, Qnil); | |
| 6494 } | |
| 6495 | |
| 6496 | |
| 6497 void | |
| 6498 encode_coding_object (coding, src_object, from, from_byte, to, to_byte, | |
| 6499 dst_object) | |
| 4491 struct coding_system *coding; | 6500 struct coding_system *coding; |
| 4492 int src_bytes; | 6501 Lisp_Object src_object; |
| 4493 { | 6502 EMACS_INT from, from_byte, to, to_byte; |
| 4494 return (src_bytes * DECODING_BUFFER_MAG (coding) | 6503 Lisp_Object dst_object; |
| 4495 + CONVERSION_BUFFER_EXTRA_ROOM); | 6504 { |
| 4496 } | 6505 int count = specpdl_ptr - specpdl; |
| 4497 | 6506 EMACS_INT chars = to - from; |
| 4498 /* Return maximum size (bytes) of a buffer enough for encoding | 6507 EMACS_INT bytes = to_byte - from_byte; |
| 4499 SRC_BYTES of text to CODING. */ | 6508 Lisp_Object attrs; |
| 4500 | 6509 Lisp_Object buffer; |
| 4501 int | 6510 int saved_pt = -1, saved_pt_byte; |
| 4502 encoding_buffer_size (coding, src_bytes) | 6511 |
| 4503 struct coding_system *coding; | 6512 buffer = Fcurrent_buffer (); |
| 4504 int src_bytes; | 6513 |
| 4505 { | 6514 coding->src_object = src_object; |
| 4506 int magnification; | 6515 coding->src_chars = chars; |
| 4507 | 6516 coding->src_bytes = bytes; |
| 4508 if (coding->type == coding_type_ccl) | 6517 coding->src_multibyte = chars < bytes; |
| 4509 { | 6518 |
| 4510 magnification = coding->spec.ccl.encoder.buf_magnification; | 6519 attrs = CODING_ID_ATTRS (coding->id); |
| 4511 if (coding->eol_type == CODING_EOL_CRLF) | 6520 |
| 4512 magnification *= 2; | 6521 if (! NILP (CODING_ATTR_PRE_WRITE (attrs))) |
| 4513 } | 6522 { |
| 4514 else if (CODING_REQUIRE_ENCODING (coding)) | 6523 coding->src_object = code_conversion_save (buffer, 1, |
| 4515 magnification = 3; | 6524 coding->src_multibyte); |
| 6525 set_buffer_internal (XBUFFER (coding->src_object)); | |
| 6526 if (STRINGP (src_object)) | |
| 6527 insert_from_string (src_object, from, from_byte, chars, bytes, 0); | |
| 6528 else if (BUFFERP (src_object)) | |
| 6529 insert_from_buffer (XBUFFER (src_object), from, chars, 0); | |
| 6530 else | |
| 6531 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0); | |
| 6532 | |
| 6533 if (EQ (src_object, dst_object)) | |
| 6534 { | |
| 6535 set_buffer_internal (XBUFFER (src_object)); | |
| 6536 saved_pt = PT, saved_pt_byte = PT_BYTE; | |
| 6537 del_range_both (from, from_byte, to, to_byte, 1); | |
| 6538 set_buffer_internal (XBUFFER (coding->src_object)); | |
| 6539 } | |
| 6540 | |
| 6541 call2 (CODING_ATTR_PRE_WRITE (attrs), | |
| 6542 make_number (BEG), make_number (Z)); | |
| 6543 coding->src_object = Fcurrent_buffer (); | |
| 6544 if (BEG != GPT) | |
| 6545 move_gap_both (BEG, BEG_BYTE); | |
| 6546 coding->src_chars = Z - BEG; | |
| 6547 coding->src_bytes = Z_BYTE - BEG_BYTE; | |
| 6548 coding->src_pos = BEG; | |
| 6549 coding->src_pos_byte = BEG_BYTE; | |
| 6550 coding->src_multibyte = Z < Z_BYTE; | |
| 6551 } | |
| 6552 else if (STRINGP (src_object)) | |
| 6553 { | |
| 6554 code_conversion_save (buffer, 0, 0); | |
| 6555 coding->src_pos = from; | |
| 6556 coding->src_pos_byte = from_byte; | |
| 6557 } | |
| 6558 else if (BUFFERP (src_object)) | |
| 6559 { | |
| 6560 code_conversion_save (buffer, 0, 0); | |
| 6561 set_buffer_internal (XBUFFER (src_object)); | |
| 6562 if (EQ (src_object, dst_object)) | |
| 6563 { | |
| 6564 saved_pt = PT, saved_pt_byte = PT_BYTE; | |
| 6565 coding->src_object = del_range_1 (from, to, 1, 1); | |
| 6566 coding->src_pos = 0; | |
| 6567 coding->src_pos_byte = 0; | |
| 6568 } | |
| 6569 else | |
| 6570 { | |
| 6571 if (from < GPT && to >= GPT) | |
| 6572 move_gap_both (from, from_byte); | |
| 6573 coding->src_pos = from; | |
| 6574 coding->src_pos_byte = from_byte; | |
| 6575 } | |
| 6576 } | |
| 4516 else | 6577 else |
| 4517 magnification = 1; | 6578 code_conversion_save (buffer, 0, 0); |
| 4518 | 6579 |
| 4519 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM); | 6580 if (BUFFERP (dst_object)) |
| 4520 } | 6581 { |
| 4521 | 6582 coding->dst_object = dst_object; |
| 4522 /* Working buffer for code conversion. */ | 6583 if (EQ (src_object, dst_object)) |
| 4523 struct conversion_buffer | 6584 { |
| 4524 { | 6585 coding->dst_pos = from; |
| 4525 int size; /* size of data. */ | 6586 coding->dst_pos_byte = from_byte; |
| 4526 int on_stack; /* 1 if allocated by alloca. */ | 6587 } |
| 4527 unsigned char *data; | 6588 else |
| 4528 }; | 6589 { |
| 4529 | 6590 coding->dst_pos = BUF_PT (XBUFFER (dst_object)); |
| 4530 /* Don't use alloca for allocating memory space larger than this, lest | 6591 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object)); |
| 4531 we overflow their stack. */ | 6592 } |
| 4532 #define MAX_ALLOCA 16*1024 | 6593 coding->dst_multibyte |
| 4533 | 6594 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters); |
| 4534 /* Allocate LEN bytes of memory for BUF (struct conversion_buffer). */ | 6595 } |
| 4535 #define allocate_conversion_buffer(buf, len) \ | 6596 else if (EQ (dst_object, Qt)) |
| 4536 do { \ | 6597 { |
| 4537 if (len < MAX_ALLOCA) \ | 6598 coding->dst_object = Qnil; |
| 4538 { \ | 6599 coding->dst_bytes = coding->src_chars; |
| 4539 buf.data = (unsigned char *) alloca (len); \ | 6600 if (coding->dst_bytes == 0) |
| 4540 buf.on_stack = 1; \ | 6601 coding->dst_bytes = 1; |
| 4541 } \ | 6602 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes); |
| 4542 else \ | 6603 coding->dst_multibyte = 0; |
| 4543 { \ | |
| 4544 buf.data = (unsigned char *) xmalloc (len); \ | |
| 4545 buf.on_stack = 0; \ | |
| 4546 } \ | |
| 4547 buf.size = len; \ | |
| 4548 } while (0) | |
| 4549 | |
| 4550 /* Double the allocated memory for *BUF. */ | |
| 4551 static void | |
| 4552 extend_conversion_buffer (buf) | |
| 4553 struct conversion_buffer *buf; | |
| 4554 { | |
| 4555 if (buf->on_stack) | |
| 4556 { | |
| 4557 unsigned char *save = buf->data; | |
| 4558 buf->data = (unsigned char *) xmalloc (buf->size * 2); | |
| 4559 bcopy (save, buf->data, buf->size); | |
| 4560 buf->on_stack = 0; | |
| 4561 } | 6604 } |
| 4562 else | 6605 else |
| 4563 { | 6606 { |
| 4564 buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2); | 6607 coding->dst_object = Qnil; |
| 4565 } | 6608 coding->dst_multibyte = 0; |
| 4566 buf->size *= 2; | 6609 } |
| 4567 } | 6610 |
| 4568 | 6611 encode_coding (coding); |
| 4569 /* Free the allocated memory for BUF if it is not on stack. */ | 6612 |
| 4570 static void | 6613 if (EQ (dst_object, Qt)) |
| 4571 free_conversion_buffer (buf) | 6614 { |
| 4572 struct conversion_buffer *buf; | 6615 if (BUFFERP (coding->dst_object)) |
| 4573 { | 6616 coding->dst_object = Fbuffer_string (); |
| 4574 if (!buf->on_stack) | |
| 4575 xfree (buf->data); | |
| 4576 } | |
| 4577 | |
| 4578 int | |
| 4579 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep) | |
| 4580 struct coding_system *coding; | |
| 4581 unsigned char *source, *destination; | |
| 4582 int src_bytes, dst_bytes, encodep; | |
| 4583 { | |
| 4584 struct ccl_program *ccl | |
| 4585 = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder; | |
| 4586 unsigned char *dst = destination; | |
| 4587 | |
| 4588 ccl->suppress_error = coding->suppress_error; | |
| 4589 ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK; | |
| 4590 if (encodep) | |
| 4591 { | |
| 4592 /* On encoding, EOL format is converted within ccl_driver. For | |
| 4593 that, setup proper information in the structure CCL. */ | |
| 4594 ccl->eol_type = coding->eol_type; | |
| 4595 if (ccl->eol_type ==CODING_EOL_UNDECIDED) | |
| 4596 ccl->eol_type = CODING_EOL_LF; | |
| 4597 ccl->cr_consumed = coding->spec.ccl.cr_carryover; | |
| 4598 ccl->eight_bit_control = coding->dst_multibyte; | |
| 4599 } | |
| 4600 else | |
| 4601 ccl->eight_bit_control = 1; | |
| 4602 ccl->multibyte = coding->src_multibyte; | |
| 4603 if (coding->spec.ccl.eight_bit_carryover[0] != 0) | |
| 4604 { | |
| 4605 /* Move carryover bytes to DESTINATION. */ | |
| 4606 unsigned char *p = coding->spec.ccl.eight_bit_carryover; | |
| 4607 while (*p) | |
| 4608 *dst++ = *p++; | |
| 4609 coding->spec.ccl.eight_bit_carryover[0] = 0; | |
| 4610 if (dst_bytes) | |
| 4611 dst_bytes -= dst - destination; | |
| 4612 } | |
| 4613 | |
| 4614 coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes, | |
| 4615 &(coding->consumed)) | |
| 4616 + dst - destination); | |
| 4617 | |
| 4618 if (encodep) | |
| 4619 { | |
| 4620 coding->produced_char = coding->produced; | |
| 4621 coding->spec.ccl.cr_carryover = ccl->cr_consumed; | |
| 4622 } | |
| 4623 else if (!ccl->eight_bit_control) | |
| 4624 { | |
| 4625 /* The produced bytes forms a valid multibyte sequence. */ | |
| 4626 coding->produced_char | |
| 4627 = multibyte_chars_in_text (destination, coding->produced); | |
| 4628 coding->spec.ccl.eight_bit_carryover[0] = 0; | |
| 4629 } | |
| 4630 else | |
| 4631 { | |
| 4632 /* On decoding, the destination should always be multibyte. But the | |
| 4633 CCL program might have generated an invalid multibyte | |
| 4634 sequence. Here we make such a sequence valid as | |
| 4635 multibyte. */ | |
| 4636 int bytes | |
| 4637 = dst_bytes ? dst_bytes : source + coding->consumed - destination; | |
| 4638 | |
| 4639 if ((coding->consumed < src_bytes | |
| 4640 || !ccl->last_block) | |
| 4641 && coding->produced >= 1 | |
| 4642 && destination[coding->produced - 1] >= 0x80) | |
| 4643 { | |
| 4644 /* We should not convert the trailing 8-bit codes to | |
| 4645 multibyte form even if they don't form a valid | |
| 4646 multibyte sequence. They may form a valid sequence in | |
| 4647 the next call. */ | |
| 4648 int carryover = 0; | |
| 4649 | |
| 4650 if (destination[coding->produced - 1] < 0xA0) | |
| 4651 carryover = 1; | |
| 4652 else if (coding->produced >= 2) | |
| 4653 { | |
| 4654 if (destination[coding->produced - 2] >= 0x80) | |
| 4655 { | |
| 4656 if (destination[coding->produced - 2] < 0xA0) | |
| 4657 carryover = 2; | |
| 4658 else if (coding->produced >= 3 | |
| 4659 && destination[coding->produced - 3] >= 0x80 | |
| 4660 && destination[coding->produced - 3] < 0xA0) | |
| 4661 carryover = 3; | |
| 4662 } | |
| 4663 } | |
| 4664 if (carryover > 0) | |
| 4665 { | |
| 4666 BCOPY_SHORT (destination + coding->produced - carryover, | |
| 4667 coding->spec.ccl.eight_bit_carryover, | |
| 4668 carryover); | |
| 4669 coding->spec.ccl.eight_bit_carryover[carryover] = 0; | |
| 4670 coding->produced -= carryover; | |
| 4671 } | |
| 4672 } | |
| 4673 coding->produced = str_as_multibyte (destination, bytes, | |
| 4674 coding->produced, | |
| 4675 &(coding->produced_char)); | |
| 4676 } | |
| 4677 | |
| 4678 switch (ccl->status) | |
| 4679 { | |
| 4680 case CCL_STAT_SUSPEND_BY_SRC: | |
| 4681 coding->result = CODING_FINISH_INSUFFICIENT_SRC; | |
| 4682 break; | |
| 4683 case CCL_STAT_SUSPEND_BY_DST: | |
| 4684 coding->result = CODING_FINISH_INSUFFICIENT_DST; | |
| 4685 break; | |
| 4686 case CCL_STAT_QUIT: | |
| 4687 case CCL_STAT_INVALID_CMD: | |
| 4688 coding->result = CODING_FINISH_INTERRUPT; | |
| 4689 break; | |
| 4690 default: | |
| 4691 coding->result = CODING_FINISH_NORMAL; | |
| 4692 break; | |
| 4693 } | |
| 4694 return coding->result; | |
| 4695 } | |
| 4696 | |
| 4697 /* Decode EOL format of the text at PTR of BYTES length destructively | |
| 4698 according to CODING->eol_type. This is called after the CCL | |
| 4699 program produced a decoded text at PTR. If we do CRLF->LF | |
| 4700 conversion, update CODING->produced and CODING->produced_char. */ | |
| 4701 | |
| 4702 static void | |
| 4703 decode_eol_post_ccl (coding, ptr, bytes) | |
| 4704 struct coding_system *coding; | |
| 4705 unsigned char *ptr; | |
| 4706 int bytes; | |
| 4707 { | |
| 4708 Lisp_Object val, saved_coding_symbol; | |
| 4709 unsigned char *pend = ptr + bytes; | |
| 4710 int dummy; | |
| 4711 | |
| 4712 /* Remember the current coding system symbol. We set it back when | |
| 4713 an inconsistent EOL is found so that `last-coding-system-used' is | |
| 4714 set to the coding system that doesn't specify EOL conversion. */ | |
| 4715 saved_coding_symbol = coding->symbol; | |
| 4716 | |
| 4717 coding->spec.ccl.cr_carryover = 0; | |
| 4718 if (coding->eol_type == CODING_EOL_UNDECIDED) | |
| 4719 { | |
| 4720 /* Here, to avoid the call of setup_coding_system, we directly | |
| 4721 call detect_eol_type. */ | |
| 4722 coding->eol_type = detect_eol_type (ptr, bytes, &dummy); | |
| 4723 if (coding->eol_type == CODING_EOL_INCONSISTENT) | |
| 4724 coding->eol_type = CODING_EOL_LF; | |
| 4725 if (coding->eol_type != CODING_EOL_UNDECIDED) | |
| 4726 { | |
| 4727 val = Fget (coding->symbol, Qeol_type); | |
| 4728 if (VECTORP (val) && XVECTOR (val)->size == 3) | |
| 4729 coding->symbol = XVECTOR (val)->contents[coding->eol_type]; | |
| 4730 } | |
| 4731 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL; | |
| 4732 } | |
| 4733 | |
| 4734 if (coding->eol_type == CODING_EOL_LF | |
| 4735 || coding->eol_type == CODING_EOL_UNDECIDED) | |
| 4736 { | |
| 4737 /* We have nothing to do. */ | |
| 4738 ptr = pend; | |
| 4739 } | |
| 4740 else if (coding->eol_type == CODING_EOL_CRLF) | |
| 4741 { | |
| 4742 unsigned char *pstart = ptr, *p = ptr; | |
| 4743 | |
| 4744 if (! (coding->mode & CODING_MODE_LAST_BLOCK) | |
| 4745 && *(pend - 1) == '\r') | |
| 4746 { | |
| 4747 /* If the last character is CR, we can't handle it here | |
| 4748 because LF will be in the not-yet-decoded source text. | |
| 4749 Record that the CR is not yet processed. */ | |
| 4750 coding->spec.ccl.cr_carryover = 1; | |
| 4751 coding->produced--; | |
| 4752 coding->produced_char--; | |
| 4753 pend--; | |
| 4754 } | |
| 4755 while (ptr < pend) | |
| 4756 { | |
| 4757 if (*ptr == '\r') | |
| 4758 { | |
| 4759 if (ptr + 1 < pend && *(ptr + 1) == '\n') | |
| 4760 { | |
| 4761 *p++ = '\n'; | |
| 4762 ptr += 2; | |
| 4763 } | |
| 4764 else | |
| 4765 { | |
| 4766 if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) | |
| 4767 goto undo_eol_conversion; | |
| 4768 *p++ = *ptr++; | |
| 4769 } | |
| 4770 } | |
| 4771 else if (*ptr == '\n' | |
| 4772 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) | |
| 4773 goto undo_eol_conversion; | |
| 4774 else | |
| 4775 *p++ = *ptr++; | |
| 4776 continue; | |
| 4777 | |
| 4778 undo_eol_conversion: | |
| 4779 /* We are faced with an inconsistent EOL format at PTR. | |
| 4780 Convert all LFs before PTR back to CRLFs. */ | |
| 4781 for (p--, ptr--; p >= pstart; p--) | |
| 4782 { | |
| 4783 if (*p == '\n') | |
| 4784 *ptr-- = '\n', *ptr-- = '\r'; | |
| 4785 else | |
| 4786 *ptr-- = *p; | |
| 4787 } | |
| 4788 /* If carryover is recorded, cancel it because we don't | |
| 4789 convert CRLF anymore. */ | |
| 4790 if (coding->spec.ccl.cr_carryover) | |
| 4791 { | |
| 4792 coding->spec.ccl.cr_carryover = 0; | |
| 4793 coding->produced++; | |
| 4794 coding->produced_char++; | |
| 4795 pend++; | |
| 4796 } | |
| 4797 p = ptr = pend; | |
| 4798 coding->eol_type = CODING_EOL_LF; | |
| 4799 coding->symbol = saved_coding_symbol; | |
| 4800 } | |
| 4801 if (p < pend) | |
| 4802 { | |
| 4803 /* As each two-byte sequence CRLF was converted to LF, (PEND | |
| 4804 - P) is the number of deleted characters. */ | |
| 4805 coding->produced -= pend - p; | |
| 4806 coding->produced_char -= pend - p; | |
| 4807 } | |
| 4808 } | |
| 4809 else /* i.e. coding->eol_type == CODING_EOL_CR */ | |
| 4810 { | |
| 4811 unsigned char *p = ptr; | |
| 4812 | |
| 4813 for (; ptr < pend; ptr++) | |
| 4814 { | |
| 4815 if (*ptr == '\r') | |
| 4816 *ptr = '\n'; | |
| 4817 else if (*ptr == '\n' | |
| 4818 && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) | |
| 4819 { | |
| 4820 for (; p < ptr; p++) | |
| 4821 { | |
| 4822 if (*p == '\n') | |
| 4823 *p = '\r'; | |
| 4824 } | |
| 4825 ptr = pend; | |
| 4826 coding->eol_type = CODING_EOL_LF; | |
| 4827 coding->symbol = saved_coding_symbol; | |
| 4828 } | |
| 4829 } | |
| 4830 } | |
| 4831 } | |
| 4832 | |
| 4833 /* Decode SRC_BYTES bytes of SOURCE into DESTINATION according to CODING and return CODING->result. See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before | |
| 4834 decoding, it may detect coding system and format of end-of-line if | |
| 4835 those are not yet decided. The source should be unibyte, the | |
| 4836 result is multibyte if CODING->dst_multibyte is nonzero, else | |
| 4837 unibyte. The produced, consumed, errors and result members of CODING are reset before the decoder runs. */ | |
| 4838 | |
| 4839 int | |
| 4840 decode_coding (coding, source, destination, src_bytes, dst_bytes) | |
| 4841 struct coding_system *coding; | |
| 4842 const unsigned char *source; | |
| 4843 unsigned char *destination; | |
| 4844 int src_bytes, dst_bytes; | |
| 4845 { | |
| 4846 int extra = 0; /* Set from CODING->spec.ccl.cr_carryover in the CCL case; offsets DESTINATION there. */ | |
| 4847 | |
| 4848 if (coding->type == coding_type_undecided) | |
| 4849 detect_coding (coding, source, src_bytes); | |
| 4850 | |
| 4851 if (coding->eol_type == CODING_EOL_UNDECIDED | |
| 4852 && coding->type != coding_type_ccl) | |
| 4853 { | |
| 4854 detect_eol (coding, source, src_bytes); | |
| 4855 /* We had better recover the original eol format if we | |
| 4856 encounter an inconsistent eol format while decoding. */ | |
| 4857 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL; | |
| 4858 } | |
| 4859 | |
| 4860 coding->produced = coding->produced_char = 0; | |
| 4861 coding->consumed = coding->consumed_char = 0; | |
| 4862 coding->errors = 0; | |
| 4863 coding->result = CODING_FINISH_NORMAL; | |
| 4864 | |
| 4865 switch (coding->type) /* Dispatch to the decoder for this coding type. */ | |
| 4866 { | |
| 4867 case coding_type_sjis: | |
| 4868 decode_coding_sjis_big5 (coding, source, destination, | |
| 4869 src_bytes, dst_bytes, 1); | |
| 4870 break; | |
| 4871 | |
| 4872 case coding_type_iso2022: | |
| 4873 decode_coding_iso2022 (coding, source, destination, | |
| 4874 src_bytes, dst_bytes); | |
| 4875 break; | |
| 4876 | |
| 4877 case coding_type_big5: | |
| 4878 decode_coding_sjis_big5 (coding, source, destination, | |
| 4879 src_bytes, dst_bytes, 0); | |
| 4880 break; | |
| 4881 | |
| 4882 case coding_type_emacs_mule: | |
| 4883 decode_coding_emacs_mule (coding, source, destination, | |
| 4884 src_bytes, dst_bytes); | |
| 4885 break; | |
| 4886 | |
| 4887 case coding_type_ccl: | |
| 4888 if (coding->spec.ccl.cr_carryover) | |
| 4889 { | |
| 4890 /* Put the CR which was not processed by the previous call | |
| 4891 of decode_eol_post_ccl in DESTINATION. It will be | |
| 4892 decoded together with the following LF by the call to | |
| 4893 decode_eol_post_ccl below. */ | |
| 4894 *destination = '\r'; | |
| 4895 coding->produced++; | |
| 4896 coding->produced_char++; | |
| 4897 dst_bytes--; | |
| 4898 extra = coding->spec.ccl.cr_carryover; | |
| 4899 } | |
| 4900 ccl_coding_driver (coding, source, destination + extra, | |
| 4901 src_bytes, dst_bytes, 0); | |
| 4902 if (coding->eol_type != CODING_EOL_LF) /* Count the carried-over CR as well, then let decode_eol_post_ccl process the produced bytes. */ | |
| 4903 { | |
| 4904 coding->produced += extra; | |
| 4905 coding->produced_char += extra; | |
| 4906 decode_eol_post_ccl (coding, destination, coding->produced); | |
| 4907 } | |
| 4908 break; | |
| 4909 | |
| 4910 default: /* Every other type is handed to decode_eol. */ | |
| 4911 decode_eol (coding, source, destination, src_bytes, dst_bytes); | |
| 4912 } | |
| 4913 | |
| 4914 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC /* Last block and every source byte was consumed: treat as a normal finish. */ | |
| 4915 && coding->mode & CODING_MODE_LAST_BLOCK | |
| 4916 && coding->consumed == src_bytes) | |
| 4917 coding->result = CODING_FINISH_NORMAL; | |
| 4918 | |
| 4919 if (coding->mode & CODING_MODE_LAST_BLOCK /* Last block, but some trailing source bytes were left unconsumed. */ | |
| 4920 && coding->result == CODING_FINISH_INSUFFICIENT_SRC) | |
| 4921 { | |
| 4922 const unsigned char *src = source + coding->consumed; | |
| 4923 unsigned char *dst = destination + coding->produced; | |
| 4924 | |
| 4925 src_bytes -= coding->consumed; /* Number of leftover source bytes. */ | |
| 4926 coding->errors++; | |
| 4927 if (COMPOSING_P (coding)) | |
| 4928 DECODE_COMPOSITION_END ('1'); | |
| 4929 while (src_bytes--) /* Store each leftover byte as a character; CHAR_STRING yields how far to advance DST. */ | |
| 4930 { | |
| 4931 int c = *src++; | |
| 4932 dst += CHAR_STRING (c, dst); | |
| 4933 coding->produced_char++; | |
| 4934 } | |
| 4935 coding->consumed = coding->consumed_char = src - source; | |
| 4936 coding->produced = dst - destination; | |
| 4937 coding->result = CODING_FINISH_NORMAL; | |
| 4938 } | |
| 4939 | |
| 4940 if (!coding->dst_multibyte) /* Unibyte result requested: convert DESTINATION in place; byte and character counts then coincide. */ | |
| 4941 { | |
| 4942 coding->produced = str_as_unibyte (destination, coding->produced); | |
| 4943 coding->produced_char = coding->produced; | |
| 4944 } | |
| 4945 | |
| 4946 return coding->result; | |
| 4947 } | |
| 4948 | |
| 4949 /* Encode SRC_BYTES bytes of SOURCE into DESTINATION according to CODING and return CODING->result. See "GENERAL NOTES about `encode_coding_XXX ()' functions". The | |
| 4950 multibyteness of the source is CODING->src_multibyte, the | |
| 4951 multibyteness of the result is always unibyte. The produced, consumed, errors and result members of CODING are reset before the encoder runs. */ | |
| 4952 | |
| 4953 int | |
| 4954 encode_coding (coding, source, destination, src_bytes, dst_bytes) | |
| 4955 struct coding_system *coding; | |
| 4956 const unsigned char *source; | |
| 4957 unsigned char *destination; | |
| 4958 int src_bytes, dst_bytes; | |
| 4959 { | |
| 4960 coding->produced = coding->produced_char = 0; | |
| 4961 coding->consumed = coding->consumed_char = 0; | |
| 4962 coding->errors = 0; | |
| 4963 coding->result = CODING_FINISH_NORMAL; | |
| 4964 | |
| 4965 switch (coding->type) /* Dispatch to the encoder for this coding type. */ | |
| 4966 { | |
| 4967 case coding_type_sjis: | |
| 4968 encode_coding_sjis_big5 (coding, source, destination, | |
| 4969 src_bytes, dst_bytes, 1); | |
| 4970 break; | |
| 4971 | |
| 4972 case coding_type_iso2022: | |
| 4973 encode_coding_iso2022 (coding, source, destination, | |
| 4974 src_bytes, dst_bytes); | |
| 4975 break; | |
| 4976 | |
| 4977 case coding_type_big5: | |
| 4978 encode_coding_sjis_big5 (coding, source, destination, | |
| 4979 src_bytes, dst_bytes, 0); | |
| 4980 break; | |
| 4981 | |
| 4982 case coding_type_emacs_mule: | |
| 4983 encode_coding_emacs_mule (coding, source, destination, | |
| 4984 src_bytes, dst_bytes); | |
| 4985 break; | |
| 4986 | |
| 4987 case coding_type_ccl: | |
| 4988 ccl_coding_driver (coding, source, destination, | |
| 4989 src_bytes, dst_bytes, 1); | |
| 4990 break; | |
| 4991 | |
| 4992 default: /* Every other type is handed to encode_eol. */ | |
| 4993 encode_eol (coding, source, destination, src_bytes, dst_bytes); | |
| 4994 } | |
| 4995 | |
| 4996 if (coding->mode & CODING_MODE_LAST_BLOCK /* Last block and the encoder reported insufficient source: finish the output here. */ | |
| 4997 && coding->result == CODING_FINISH_INSUFFICIENT_SRC) | |
| 4998 { | |
| 4999 const unsigned char *src = source + coding->consumed; | |
| 5000 unsigned char *dst = destination + coding->produced; | |
| 5001 | |
| 5002 if (coding->type == coding_type_iso2022) | |
| 5003 ENCODE_RESET_PLANE_AND_REGISTER; | |
| 5004 if (COMPOSING_P (coding)) /* A composition is still open: append ESC followed by '1'. */ | |
| 5005 *dst++ = ISO_CODE_ESC, *dst++ = '1'; | |
| 5006 if (coding->consumed < src_bytes) | |
| 5007 { | |
| 5008 int len = src_bytes - coding->consumed; /* Bytes the encoder left unconsumed. */ | |
| 5009 | |
| 5010 BCOPY_SHORT (src, dst, len); /* Copy the remainder as is ... */ | |
| 5011 if (coding->src_multibyte) /* ... and convert the copy to unibyte if the source was multibyte. */ | |
| 5012 len = str_as_unibyte (dst, len); | |
| 5013 dst += len; | |
| 5014 coding->consumed = src_bytes; | |
| 5015 } | |
| 5016 coding->produced = coding->produced_char = dst - destination; | |
| 5017 coding->result = CODING_FINISH_NORMAL; | |
| 5018 } | |
| 5019 | |
| 5020 if (coding->result == CODING_FINISH_INSUFFICIENT_SRC /* Everything was consumed after all: report a normal finish. */ | |
| 5021 && coding->consumed == src_bytes) | |
| 5022 coding->result = CODING_FINISH_NORMAL; | |
| 5023 | |
| 5024 return coding->result; | |
| 5025 } | |
| 5026 | |
| 5027 /* Scan text in the region between *BEG and *END (byte positions), | |
| 5028 skip characters which we don't have to decode by coding system | |
| 5029 CODING at the head and tail, then set *BEG and *END to the region | |
| 5030 of the text we actually have to convert. The caller should move | |
| 5031 the gap out of the region in advance if the region is from a | |
| 5032 buffer. When shrinking is not applicable, return without touching *BEG and *END. | |
| 5033 | |
| 5034 If STR is not NULL, *BEG and *END are indices into STR. */ | |
| 5035 | |
| 5036 static void | |
| 5037 shrink_decoding_region (beg, end, coding, str) | |
| 5038 int *beg, *end; | |
| 5039 struct coding_system *coding; | |
| 5040 unsigned char *str; | |
| 5041 { | |
| 5042 unsigned char *begp_orig, *begp, *endp_orig, *endp, c; | |
| 5043 int eol_conversion; | |
| 5044 Lisp_Object translation_table; | |
| 5045 | |
| 5046 if (coding->type == coding_type_ccl | |
| 5047 || coding->type == coding_type_undecided | |
| 5048 || coding->eol_type != CODING_EOL_LF | |
| 5049 || !NILP (coding->post_read_conversion) | |
| 5050 || coding->composing != COMPOSITION_DISABLED) | |
| 5051 { | |
| 5052 /* We can't skip any data. */ | |
| 5053 return; | |
| 5054 } | |
| 5055 if (coding->type == coding_type_no_conversion | |
| 5056 || coding->type == coding_type_raw_text | |
| 5057 || coding->type == coding_type_emacs_mule) | |
| 5058 { | |
| 5059 /* We need no conversion, but don't have to skip any data here. | |
| 5060 Decoding routine handles them effectively anyway. */ | |
| 5061 return; | |
| 5062 } | |
| 5063 | |
| 5064 translation_table = coding->translation_table_for_decode; | |
| 5065 if (NILP (translation_table) && !NILP (Venable_character_translation)) | |
| 5066 translation_table = Vstandard_translation_table_for_decode; | |
| 5067 if (CHAR_TABLE_P (translation_table)) | |
| 5068 { | |
| 5069 int i; | |
| 5070 for (i = 0; i < 128; i++) /* Look for any ASCII code with a non-nil table entry. */ | |
| 5071 if (!NILP (CHAR_TABLE_REF (translation_table, i))) | |
| 5072 break; | |
| 5073 if (i < 128) | |
| 5074 /* Some ASCII character should be translated. We give up | |
| 5075 shrinking. */ | |
| 5076 return; | |
| 5077 } | |
| 5078 | |
| 5079 if (coding->heading_ascii >= 0) | |
| 5080 /* Detection routine has already found how much we can skip at the | |
| 5081 head. */ | |
| 5082 *beg += coding->heading_ascii; | |
| 5083 | |
| 5084 if (str) | |
| 5085 { | |
| 5086 begp_orig = begp = str + *beg; | |
| 5087 endp_orig = endp = str + *end; | |
| 5088 } | |
| 5089 else | |
| 5090 { | |
| 5091 begp_orig = begp = BYTE_POS_ADDR (*beg); | |
| 5092 endp_orig = endp = begp + *end - *beg; | |
| 5093 } | |
| 5094 | |
| 5095 eol_conversion = (coding->eol_type == CODING_EOL_CR | |
| 5096 || coding->eol_type == CODING_EOL_CRLF); /* NOTE(review): the first early return above already requires CODING_EOL_LF, so this looks always zero here. */ | |
| 5097 | |
| 5098 switch (coding->type) | |
| 5099 { | |
| 5100 case coding_type_sjis: | |
| 5101 case coding_type_big5: | |
| 5102 /* We can skip all ASCII characters at the head. */ | |
| 5103 if (coding->heading_ascii < 0) | |
| 5104 { | |
| 5105 if (eol_conversion) | |
| 5106 while (begp < endp && *begp < 0x80 && *begp != '\r') begp++; | |
| 5107 else | |
| 5108 while (begp < endp && *begp < 0x80) begp++; | |
| 5109 } | |
| 5110 /* We can skip all ASCII characters at the tail except for the | |
| 5111 second byte of SJIS or BIG5 code. */ | |
| 5112 if (eol_conversion) | |
| 5113 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--; | |
| 5114 else | 6617 else |
| 5115 while (begp < endp && endp[-1] < 0x80) endp--; | 6618 { |
| 5116 /* Do not consider LF as ascii if preceded by CR, since that | 6619 coding->dst_object |
| 5117 confuses eol decoding. */ | 6620 = make_unibyte_string ((char *) coding->destination, |
| 5118 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') | 6621 coding->produced); |
| 5119 endp++; | 6622 xfree (coding->destination); |
| 5120 if (begp < endp && endp < endp_orig && endp[-1] >= 0x80) | 6623 } |
| 5121 endp++; | 6624 } |
| 5122 break; | 6625 |
| 5123 | 6626 if (saved_pt >= 0) |
| 5124 case coding_type_iso2022: | 6627 { |
| 5125 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII) | 6628 /* This is the case of: |
| 5126 /* We can't skip any data. */ | 6629 (BUFFERP (src_object) && EQ (src_object, dst_object)) |
| 5127 break; | 6630 As we have moved PT while replacing the original buffer |
| 5128 if (coding->heading_ascii < 0) | 6631 contents, we must recover it now. */ |
| 5129 { | 6632 set_buffer_internal (XBUFFER (src_object)); |
| 5130 /* We can skip all ASCII characters at the head except for a | 6633 if (saved_pt < from) |
| 5131 few control codes. */ | 6634 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte); |
| 5132 while (begp < endp && (c = *begp) < 0x80 | 6635 else if (saved_pt < from + chars) |
| 5133 && c != ISO_CODE_CR && c != ISO_CODE_SO | 6636 TEMP_SET_PT_BOTH (from, from_byte); |
| 5134 && c != ISO_CODE_SI && c != ISO_CODE_ESC | 6637 else if (! NILP (current_buffer->enable_multibyte_characters)) |
| 5135 && (!eol_conversion || c != ISO_CODE_LF)) | 6638 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars), |
| 5136 begp++; | 6639 saved_pt_byte + (coding->produced - bytes)); |
| 5137 } | |
| 5138 switch (coding->category_idx) | |
| 5139 { | |
| 5140 case CODING_CATEGORY_IDX_ISO_8_1: | |
| 5141 case CODING_CATEGORY_IDX_ISO_8_2: | |
| 5142 /* We can skip all ASCII characters at the tail. */ | |
| 5143 if (eol_conversion) | |
| 5144 while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--; | |
| 5145 else | |
| 5146 while (begp < endp && endp[-1] < 0x80) endp--; | |
| 5147 /* Do not consider LF as ascii if preceded by CR, since that | |
| 5148 confuses eol decoding. */ | |
| 5149 if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n') | |
| 5150 endp++; | |
| 5151 break; | |
| 5152 | |
| 5153 case CODING_CATEGORY_IDX_ISO_7: | |
| 5154 case CODING_CATEGORY_IDX_ISO_7_TIGHT: | |
| 5155 { | |
| 5156 /* We can skip all characters at the tail except for 8-bit | |
| 5157 codes and ESC and the following 2-byte at the tail. */ | |
| 5158 unsigned char *eight_bit = NULL; /* Address just past the rightmost 8-bit byte met during the backward scan, if any. */ | |
| 5159 | |
| 5160 if (eol_conversion) | |
| 5161 while (begp < endp | |
| 5162 && (c = endp[-1]) != ISO_CODE_ESC && c != '\r') | |
| 5163 { | |
| 5164 if (!eight_bit && c & 0x80) eight_bit = endp; | |
| 5165 endp--; | |
| 5166 } | |
| 5167 else | |
| 5168 while (begp < endp | |
| 5169 && (c = endp[-1]) != ISO_CODE_ESC) | |
| 5170 { | |
| 5171 if (!eight_bit && c & 0x80) eight_bit = endp; | |
| 5172 endp--; | |
| 5173 } | |
| 5174 /* Do not consider LF as ascii if preceded by CR, since that | |
| 5175 confuses eol decoding. */ | |
| 5176 if (begp < endp && endp < endp_orig | |
| 5177 && endp[-1] == '\r' && endp[0] == '\n') | |
| 5178 endp++; | |
| 5179 if (begp < endp && endp[-1] == ISO_CODE_ESC) | |
| 5180 { | |
| 5181 if (endp + 1 < endp_orig && endp[0] == '(' && endp[1] == 'B') | |
| 5182 /* The two bytes after the ESC at ENDP[-1] are `(' and `B': this | |
| 5183 is an ASCII designation sequence. We can | |
| 5184 surely skip the tail. But, if we have | |
| 5185 encountered an 8-bit code, skip only the codes | |
| 5186 after that. */ | |
| 5187 endp = eight_bit ? eight_bit : endp + 2; | |
| 5188 else | |
| 5189 /* Hmmm, we can't skip the tail. */ | |
| 5190 endp = endp_orig; | |
| 5191 } | |
| 5192 else if (eight_bit) | |
| 5193 endp = eight_bit; | |
| 5194 } | |
| 5195 } | |
| 5196 break; | |
| 5197 | |
| 5198 default: /* NOTE(review): presumably no other coding type is expected to reach this switch. */ | |
| 5199 abort (); | |
| 5200 } | |
| 5201 *beg += begp - begp_orig; /* Report how far the head moved forward ... */ | |
| 5202 *end += endp - endp_orig; /* ... and how far the tail moved back. */ | |
| 5203 return; | |
| 5204 } | |
| 5204 | |
| 5205 /* Like shrink_decoding_region but for encoding: advance *BEG and pull back *END over text that needs no encoding by CODING. If STR is not NULL, *BEG and *END are offsets from STR. */ | |
| 5206 | |
| 5207 static void | |
| 5208 shrink_encoding_region (beg, end, coding, str) | |
| 5209 int *beg, *end; | |
| 5210 struct coding_system *coding; | |
| 5211 unsigned char *str; | |
| 5212 { | |
| 5213 unsigned char *begp_orig, *begp, *endp_orig, *endp; | |
| 5214 int eol_conversion; | |
| 5215 Lisp_Object translation_table; | |
| 5216 | |
| 5217 if (coding->type == coding_type_ccl | |
| 5218 || coding->eol_type == CODING_EOL_CRLF | |
| 5219 || coding->eol_type == CODING_EOL_CR | |
| 5220 || (coding->cmp_data && coding->cmp_data->used > 0)) | |
| 5221 { | |
| 5222 /* We can't skip any data. */ | |
| 5223 return; | |
| 5224 } | |
| 5225 if (coding->type == coding_type_no_conversion | |
| 5226 || coding->type == coding_type_raw_text | |
| 5227 || coding->type == coding_type_emacs_mule | |
| 5228 || coding->type == coding_type_undecided) | |
| 5229 { | |
| 5230 /* We need no conversion, but don't have to skip any data here. | |
| 5231 Encoding routine handles them effectively anyway. */ | |
| 5232 return; | |
| 5233 } | |
| 5234 | |
| 5235 translation_table = coding->translation_table_for_encode; | |
| 5236 if (NILP (translation_table) && !NILP (Venable_character_translation)) | |
| 5237 translation_table = Vstandard_translation_table_for_encode; | |
| 5238 if (CHAR_TABLE_P (translation_table)) | |
| 5239 { | |
| 5240 int i; | |
| 5241 for (i = 0; i < 128; i++) /* Look for any ASCII code with a non-nil table entry. */ | |
| 5242 if (!NILP (CHAR_TABLE_REF (translation_table, i))) | |
| 5243 break; | |
| 5244 if (i < 128) | |
| 5245 /* Some ASCII character should be translated. We give up | |
| 5246 shrinking. */ | |
| 5247 return; | |
| 5248 } | |
| 5249 | |
| 5250 if (str) | |
| 5251 { | |
| 5252 begp_orig = begp = str + *beg; | |
| 5253 endp_orig = endp = str + *end; | |
| 5254 } | |
| 5255 else | |
| 5256 { | |
| 5257 begp_orig = begp = BYTE_POS_ADDR (*beg); | |
| 5258 endp_orig = endp = begp + *end - *beg; | |
| 5259 } | |
| 5260 | |
| 5261 eol_conversion = (coding->eol_type == CODING_EOL_CR | |
| 5262 || coding->eol_type == CODING_EOL_CRLF); /* NOTE(review): both eol types already caused the early return above, so this looks always zero here. */ | |
| 5263 | |
| 5264 /* Here, we don't have to check coding->pre_write_conversion because | |
| 5265 the caller is expected to have handled it already. */ | |
| 5266 switch (coding->type) | |
| 5267 { | |
| 5268 case coding_type_iso2022: | |
| 5269 if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII) | |
| 5270 /* We can't skip any data. */ | |
| 5271 break; | |
| 5272 if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL) | |
| 5273 { | |
| 5274 unsigned char *bol = begp; /* Most recent beginning of line seen; the head skip is cut back to it. */ | |
| 5275 while (begp < endp && *begp < 0x80) | |
| 5276 { | |
| 5277 begp++; | |
| 5278 if (begp[-1] == '\n') | |
| 5279 bol = begp; | |
| 5280 } | |
| 5281 begp = bol; | |
| 5282 goto label_skip_tail; | |
| 5283 } | |
| 5284 /* fall through ... */ | |
| 5285 | |
| 5286 case coding_type_sjis: | |
| 5287 case coding_type_big5: | |
| 5288 /* We can skip all ASCII characters at the head and tail. */ | |
| 5289 if (eol_conversion) | |
| 5290 while (begp < endp && *begp < 0x80 && *begp != '\n') begp++; | |
| 5291 else | 6640 else |
| 5292 while (begp < endp && *begp < 0x80) begp++; | 6641 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes), |
| 5293 label_skip_tail: | 6642 saved_pt_byte + (coding->produced - bytes)); |
| 5294 if (eol_conversion) | 6643 } |
| 5295 while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--; | 6644 |
| 5296 else | 6645 unbind_to (count, Qnil); |
| 5297 while (begp < endp && *(endp - 1) < 0x80) endp--; | 6646 } |
| 5298 break; | 6647 |
| 5299 | |
| 5300 default: /* NOTE(review): presumably no other coding type is expected to reach this switch. */ | |
| 5301 abort (); | |
| 5302 } | |
| 5303 | |
| 5304 *beg += begp - begp_orig; /* Report how far the head moved forward ... */ | |
| 5305 *end += endp - endp_orig; /* ... and how far the tail moved back. */ | |
| 5306 return; | |
| 5307 } | |
| 5308 | |
| 5309 /* As shrinking conversion region requires some overhead, we don't try | |
| 5310 shrinking unless the length of conversion region, *END - *BEG, is greater than this | |
| 5311 value. SHRINK_CONVERSION_REGION below applies that test, then calls shrink_encoding_region if ENCODEP is nonzero, else shrink_decoding_region. */ | |
| 5312 static int shrink_conversion_region_threshhold = 1024; | |
| 5313 | |
| 5314 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep) \ | |
| 5315 do { \ | |
| 5316 if (*(end) - *(beg) > shrink_conversion_region_threshhold) \ | |
| 5317 { \ | |
| 5318 if (encodep) shrink_encoding_region (beg, end, coding, str); \ | |
| 5319 else shrink_decoding_region (beg, end, coding, str); \ | |
| 5320 } \ | |
| 5321 } while (0) | |
| 5322 | |
| 5323 static Lisp_Object /* Unwind handler: clear inhibit_pre_post_conversion and restore Vlast_coding_system_used from ARG. Always returns Qnil. */ | |
| 5324 code_convert_region_unwind (arg) | |
| 5325 Lisp_Object arg; | |
| 5326 { | |
| 5327 inhibit_pre_post_conversion = 0; /* Allow pre-write/post-read conversion functions again. */ | |
| 5328 Vlast_coding_system_used = arg; /* ARG is the value that was saved when the handler was registered. */ | |
| 5329 return Qnil; | |
| 5330 } | |
| 5331 | |
| 5332 /* Store information about all compositions in the range FROM and TO | |
| 5333 of OBJ in memory blocks pointed by CODING->cmp_data. OBJ is a | |
| 5334 buffer or a string, defaults to the current buffer. Do nothing if CODING->composing is COMPOSITION_DISABLED. Positions are recorded relative to FROM. */ | |
| 5335 | |
| 5336 void | |
| 5337 coding_save_composition (coding, from, to, obj) | |
| 5338 struct coding_system *coding; | |
| 5339 int from, to; | |
| 5340 Lisp_Object obj; | |
| 5341 { | |
| 5342 Lisp_Object prop; | |
| 5343 int start, end; | |
| 5344 | |
| 5345 if (coding->composing == COMPOSITION_DISABLED) | |
| 5346 return; | |
| 5347 if (!coding->cmp_data) | |
| 5348 coding_allocate_composition_data (coding, from); | |
| 5349 if (!find_composition (from, to, &start, &end, &prop, obj) /* No composition found, or the first one ends beyond TO: nothing to record. */ | |
| 5350 || end > to) | |
| 5351 return; | |
| 5352 if (start < from /* The first hit began before FROM: look for the next one starting at its end. */ | |
| 5353 && (!find_composition (end, to, &start, &end, &prop, obj) | |
| 5354 || end > to)) | |
| 5355 return; | |
| 5356 coding->composing = COMPOSITION_NO; | |
| 5357 do | |
| 5358 { | |
| 5359 if (COMPOSITION_VALID_P (start, end, prop)) | |
| 5360 { | |
| 5361 enum composition_method method = COMPOSITION_METHOD (prop); | |
| 5362 if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH /* Allocate again when one more maximum-length entry might not fit. */ | |
| 5363 >= COMPOSITION_DATA_SIZE) | |
| 5364 coding_allocate_composition_data (coding, from); | |
| 5365 /* For relative composition, we remember start and end | |
| 5366 positions, for the other compositions, we also remember | |
| 5367 components. */ | |
| 5368 CODING_ADD_COMPOSITION_START (coding, start - from, method); | |
| 5369 if (method != COMPOSITION_RELATIVE) | |
| 5370 { | |
| 5371 /* We must store all the components; they come as a list, a vector, a string, or a single integer. */ | |
| 5372 Lisp_Object val, ch; | |
| 5373 | |
| 5374 val = COMPOSITION_COMPONENTS (prop); | |
| 5375 if (CONSP (val)) | |
| 5376 while (CONSP (val)) | |
| 5377 { | |
| 5378 ch = XCAR (val), val = XCDR (val); | |
| 5379 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch)); | |
| 5380 } | |
| 5381 else if (VECTORP (val) || STRINGP (val)) | |
| 5382 { | |
| 5383 int len = (VECTORP (val) | |
| 5384 ? XVECTOR (val)->size : SCHARS (val)); | |
| 5385 int i; | |
| 5386 for (i = 0; i < len; i++) | |
| 5387 { | |
| 5388 ch = (STRINGP (val) | |
| 5389 ? Faref (val, make_number (i)) | |
| 5390 : XVECTOR (val)->contents[i]); | |
| 5391 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch)); | |
| 5392 } | |
| 5393 } | |
| 5394 else /* INTEGERP (val) */ | |
| 5395 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val)); | |
| 5396 } | |
| 5397 CODING_ADD_COMPOSITION_END (coding, end - from); | |
| 5398 } | |
| 5399 start = end; /* Continue the search right after this composition. */ | |
| 5400 } | |
| 5401 while (start < to | |
| 5402 && find_composition (start, to, &start, &end, &prop, obj) | |
| 5403 && end <= to); | |
| 5404 | |
| 5405 /* Make coding->cmp_data point to the first memory block. */ | |
| 5406 while (coding->cmp_data->prev) | |
| 5407 coding->cmp_data = coding->cmp_data->prev; | |
| 5408 coding->cmp_data_start = 0; | |
| 5409 } | |
| 5410 | |
| 5411 /* Reflect the saved information about compositions to OBJ. | |
| 5412 CODING->cmp_data points to a memory block for the information. OBJ | |
| 5413 is a buffer or a string, defaults to the current buffer. Do nothing if CODING->cmp_data is NULL. Every block in the chain is processed, starting from the first. */ | |
| 5414 | |
| 5415 void | |
| 5416 coding_restore_composition (coding, obj) | |
| 5417 struct coding_system *coding; | |
| 5418 Lisp_Object obj; | |
| 5419 { | |
| 5420 struct composition_data *cmp_data = coding->cmp_data; | |
| 5421 | |
| 5422 if (!cmp_data) | |
| 5423 return; | |
| 5424 | |
| 5425 while (cmp_data->prev) /* Rewind to the first block of the chain. */ | |
| 5426 cmp_data = cmp_data->prev; | |
| 5427 | |
| 5428 while (cmp_data) | |
| 5429 { | |
| 5430 int i; | |
| 5431 | |
| 5432 for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0; /* Each entry begins with its own length, which is the step to the next entry. */ | |
| 5433 i += cmp_data->data[i]) | |
| 5434 { | |
| 5435 int *data = cmp_data->data + i; | |
| 5436 enum composition_method method = (enum composition_method) data[3]; | |
| 5437 Lisp_Object components; | |
| 5438 | |
| 5439 if (method == COMPOSITION_RELATIVE) | |
| 5440 components = Qnil; | |
| 5441 else | |
| 5442 { | |
| 5443 int len = data[0] - 4, j; /* Component values occupy data[4] onward. */ | |
| 5444 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1]; | |
| 5445 | |
| 5446 if (method == COMPOSITION_WITH_RULE_ALTCHARS /* Drop the last value when the count is even for this method. */ | |
| 5447 && len % 2 == 0) | |
| 5448 len --; | |
| 5449 for (j = 0; j < len; j++) | |
| 5450 args[j] = make_number (data[4 + j]); | |
| 5451 components = (method == COMPOSITION_WITH_ALTCHARS | |
| 5452 ? Fstring (len, args) : Fvector (len, args)); | |
| 5453 } | |
| 5454 compose_text (data[1], data[2], components, Qnil, obj); /* NOTE(review): data[1] and data[2] are presumably the start and end positions -- confirm. */ | |
| 5455 } | |
| 5456 cmp_data = cmp_data->next; | |
| 5457 } | |
| 5458 } | |
| 5459 | |
| 5460 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the | |
| 5461 text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by | |
| 5462 coding system CODING, and return the status code of code conversion | |
| 5463 (currently, this value has no meaning). | |
| 5464 | |
| 5465 How many characters (and bytes) are converted to how many | |
| 5466 characters (and bytes) are recorded in members of the structure | |
| 5467 CODING. | |
| 5468 | |
| 5469 If REPLACE is nonzero, we do various things as if the original text | |
| 5470 is deleted and a new text is inserted. See the comments in | |
| 5471 replace_range (insdel.c) to know what we are doing. | |
| 5472 | |
| 5473 If REPLACE is zero, it is assumed that the source text is unibyte. | |
| 5474 Otherwise, it is assumed that the source text is multibyte. */ | |
| 5475 | |
| 5476 int | |
| 5477 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) | |
| 5478 int from, from_byte, to, to_byte, encodep, replace; | |
| 5479 struct coding_system *coding; | |
| 5480 { | |
| 5481 int len = to - from, len_byte = to_byte - from_byte; | |
| 5482 int nchars_del = 0, nbytes_del = 0; | |
| 5483 int require, inserted, inserted_byte; | |
| 5484 int head_skip, tail_skip, total_skip = 0; | |
| 5485 Lisp_Object saved_coding_symbol; | |
| 5486 int first = 1; | |
| 5487 unsigned char *src, *dst; | |
| 5488 Lisp_Object deletion; | |
| 5489 int orig_point = PT, orig_len = len; | |
| 5490 int prev_Z; | |
| 5491 int multibyte_p = !NILP (current_buffer->enable_multibyte_characters); | |
| 5492 | |
| 5493 deletion = Qnil; | |
| 5494 saved_coding_symbol = coding->symbol; | |
| 5495 | |
| 5496 if (from < PT && PT < to) | |
| 5497 { | |
| 5498 TEMP_SET_PT_BOTH (from, from_byte); | |
| 5499 orig_point = from; | |
| 5500 } | |
| 5501 | |
| 5502 if (replace) | |
| 5503 { | |
| 5504 int saved_from = from; | |
| 5505 int saved_inhibit_modification_hooks; | |
| 5506 | |
| 5507 prepare_to_modify_buffer (from, to, &from); | |
| 5508 if (saved_from != from) | |
| 5509 { | |
| 5510 to = from + len; | |
| 5511 from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to); | |
| 5512 len_byte = to_byte - from_byte; | |
| 5513 } | |
| 5514 | |
| 5515 /* The code conversion routine can not preserve text properties | |
| 5516 for now. So, we must remove all text properties in the | |
| 5517 region. Here, we must suppress all modification hooks. */ | |
| 5518 saved_inhibit_modification_hooks = inhibit_modification_hooks; | |
| 5519 inhibit_modification_hooks = 1; | |
| 5520 Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil); | |
| 5521 inhibit_modification_hooks = saved_inhibit_modification_hooks; | |
| 5522 } | |
| 5523 | |
| 5524 if (! encodep && CODING_REQUIRE_DETECTION (coding)) | |
| 5525 { | |
| 5526 /* We must detect encoding of text and eol format. */ | |
| 5527 | |
| 5528 if (from < GPT && to > GPT) | |
| 5529 move_gap_both (from, from_byte); | |
| 5530 if (coding->type == coding_type_undecided) | |
| 5531 { | |
| 5532 detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte); | |
| 5533 if (coding->type == coding_type_undecided) | |
| 5534 { | |
| 5535 /* It seems that the text contains only ASCII, but we | |
| 5536 should not leave it undecided because the deeper | |
| 5537 decoding routine (decode_coding) tries to detect the | |
| 5538 encodings again in vain. */ | |
| 5539 coding->type = coding_type_emacs_mule; | |
| 5540 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE; | |
| 5541 /* As emacs-mule decoder will handle composition, we | |
| 5542 need this setting to allocate coding->cmp_data | |
| 5543 later. */ | |
| 5544 coding->composing = COMPOSITION_NO; | |
| 5545 } | |
| 5546 } | |
| 5547 if (coding->eol_type == CODING_EOL_UNDECIDED | |
| 5548 && coding->type != coding_type_ccl) | |
| 5549 { | |
| 5550 detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte); | |
| 5551 if (coding->eol_type == CODING_EOL_UNDECIDED) | |
| 5552 coding->eol_type = CODING_EOL_LF; | |
| 5553 /* We had better recover the original eol format if we | |
| 5554 encounter an inconsistent eol format while decoding. */ | |
| 5555 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL; | |
| 5556 } | |
| 5557 } | |
| 5558 | |
| 5559 /* Now we convert the text. */ | |
| 5560 | |
| 5561 /* For encoding, we must process pre-write-conversion in advance. */ | |
| 5562 if (! inhibit_pre_post_conversion | |
| 5563 && encodep | |
| 5564 && SYMBOLP (coding->pre_write_conversion) | |
| 5565 && ! NILP (Ffboundp (coding->pre_write_conversion))) | |
| 5566 { | |
| 5567 /* The function in pre-write-conversion may put a new text in a | |
| 5568 new buffer. */ | |
| 5569 struct buffer *prev = current_buffer; | |
| 5570 Lisp_Object new; | |
| 5571 | |
| 5572 record_unwind_protect (code_convert_region_unwind, | |
| 5573 Vlast_coding_system_used); | |
| 5574 /* We should not call any more pre-write/post-read-conversion | |
| 5575 functions while this pre-write-conversion is running. */ | |
| 5576 inhibit_pre_post_conversion = 1; | |
| 5577 call2 (coding->pre_write_conversion, | |
| 5578 make_number (from), make_number (to)); | |
| 5579 inhibit_pre_post_conversion = 0; | |
| 5580 /* Discard the unwind protect. */ | |
| 5581 specpdl_ptr--; | |
| 5582 | |
| 5583 if (current_buffer != prev) | |
| 5584 { | |
| 5585 len = ZV - BEGV; | |
| 5586 new = Fcurrent_buffer (); | |
| 5587 set_buffer_internal_1 (prev); | |
| 5588 del_range_2 (from, from_byte, to, to_byte, 0); | |
| 5589 TEMP_SET_PT_BOTH (from, from_byte); | |
| 5590 insert_from_buffer (XBUFFER (new), 1, len, 0); | |
| 5591 Fkill_buffer (new); | |
| 5592 if (orig_point >= to) | |
| 5593 orig_point += len - orig_len; | |
| 5594 else if (orig_point > from) | |
| 5595 orig_point = from; | |
| 5596 orig_len = len; | |
| 5597 to = from + len; | |
| 5598 from_byte = CHAR_TO_BYTE (from); | |
| 5599 to_byte = CHAR_TO_BYTE (to); | |
| 5600 len_byte = to_byte - from_byte; | |
| 5601 TEMP_SET_PT_BOTH (from, from_byte); | |
| 5602 } | |
| 5603 } | |
| 5604 | |
| 5605 if (replace) | |
| 5606 { | |
| 5607 if (! EQ (current_buffer->undo_list, Qt)) | |
| 5608 deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1); | |
| 5609 else | |
| 5610 { | |
| 5611 nchars_del = to - from; | |
| 5612 nbytes_del = to_byte - from_byte; | |
| 5613 } | |
| 5614 } | |
| 5615 | |
| 5616 if (coding->composing != COMPOSITION_DISABLED) | |
| 5617 { | |
| 5618 if (encodep) | |
| 5619 coding_save_composition (coding, from, to, Fcurrent_buffer ()); | |
| 5620 else | |
| 5621 coding_allocate_composition_data (coding, from); | |
| 5622 } | |
| 5623 | |
| 5624 /* Try to skip the heading and tailing ASCIIs. */ | |
| 5625 if (coding->type != coding_type_ccl) | |
| 5626 { | |
| 5627 int from_byte_orig = from_byte, to_byte_orig = to_byte; | |
| 5628 | |
| 5629 if (from < GPT && GPT < to) | |
| 5630 move_gap_both (from, from_byte); | |
| 5631 SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep); | |
| 5632 if (from_byte == to_byte | |
| 5633 && (encodep || NILP (coding->post_read_conversion)) | |
| 5634 && ! CODING_REQUIRE_FLUSHING (coding)) | |
| 5635 { | |
| 5636 coding->produced = len_byte; | |
| 5637 coding->produced_char = len; | |
| 5638 if (!replace) | |
| 5639 /* We must record and adjust for this new text now. */ | |
| 5640 adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len); | |
| 5641 return 0; | |
| 5642 } | |
| 5643 | |
| 5644 head_skip = from_byte - from_byte_orig; | |
| 5645 tail_skip = to_byte_orig - to_byte; | |
| 5646 total_skip = head_skip + tail_skip; | |
| 5647 from += head_skip; | |
| 5648 to -= tail_skip; | |
| 5649 len -= total_skip; len_byte -= total_skip; | |
| 5650 } | |
| 5651 | |
| 5652 /* For conversion, we must put the gap before the text in addition to | |
| 5653 making the gap larger for efficient decoding. The required gap | |
| 5654 size starts from 2000 which is the magic number used in make_gap. | |
| 5655 But, after one batch of conversion, it will be incremented if we | |
| 5656 find that it is not enough . */ | |
| 5657 require = 2000; | |
| 5658 | |
| 5659 if (GAP_SIZE < require) | |
| 5660 make_gap (require - GAP_SIZE); | |
| 5661 move_gap_both (from, from_byte); | |
| 5662 | |
| 5663 inserted = inserted_byte = 0; | |
| 5664 | |
| 5665 GAP_SIZE += len_byte; | |
| 5666 ZV -= len; | |
| 5667 Z -= len; | |
| 5668 ZV_BYTE -= len_byte; | |
| 5669 Z_BYTE -= len_byte; | |
| 5670 | |
| 5671 if (GPT - BEG < BEG_UNCHANGED) | |
| 5672 BEG_UNCHANGED = GPT - BEG; | |
| 5673 if (Z - GPT < END_UNCHANGED) | |
| 5674 END_UNCHANGED = Z - GPT; | |
| 5675 | |
| 5676 if (!encodep && coding->src_multibyte) | |
| 5677 { | |
| 5678 /* Decoding routines expects that the source text is unibyte. | |
| 5679 We must convert 8-bit characters of multibyte form to | |
| 5680 unibyte. */ | |
| 5681 int len_byte_orig = len_byte; | |
| 5682 len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte); | |
| 5683 if (len_byte < len_byte_orig) | |
| 5684 safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte, | |
| 5685 len_byte); | |
| 5686 coding->src_multibyte = 0; | |
| 5687 } | |
| 5688 | |
| 5689 for (;;) | |
| 5690 { | |
| 5691 int result; | |
| 5692 | |
| 5693 /* The buffer memory is now: | |
| 5694 +--------+converted-text+---------+-------original-text-------+---+ | |
| 5695 |<-from->|<--inserted-->|---------|<--------len_byte--------->|---| | |
| 5696 |<---------------------- GAP ----------------------->| */ | |
| 5697 src = GAP_END_ADDR - len_byte; | |
| 5698 dst = GPT_ADDR + inserted_byte; | |
| 5699 | |
| 5700 if (encodep) | |
| 5701 result = encode_coding (coding, src, dst, len_byte, 0); | |
| 5702 else | |
| 5703 { | |
| 5704 if (coding->composing != COMPOSITION_DISABLED) | |
| 5705 coding->cmp_data->char_offset = from + inserted; | |
| 5706 result = decode_coding (coding, src, dst, len_byte, 0); | |
| 5707 } | |
| 5708 | |
| 5709 /* The buffer memory is now: | |
| 5710 +--------+-------converted-text----+--+------original-text----+---+ | |
| 5711 |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---| | |
| 5712 |<---------------------- GAP ----------------------->| */ | |
| 5713 | |
| 5714 inserted += coding->produced_char; | |
| 5715 inserted_byte += coding->produced; | |
| 5716 len_byte -= coding->consumed; | |
| 5717 | |
| 5718 if (result == CODING_FINISH_INSUFFICIENT_CMP) | |
| 5719 { | |
| 5720 coding_allocate_composition_data (coding, from + inserted); | |
| 5721 continue; | |
| 5722 } | |
| 5723 | |
| 5724 src += coding->consumed; | |
| 5725 dst += coding->produced; | |
| 5726 | |
| 5727 if (result == CODING_FINISH_NORMAL) | |
| 5728 { | |
| 5729 src += len_byte; | |
| 5730 break; | |
| 5731 } | |
| 5732 if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL) | |
| 5733 { | |
| 5734 unsigned char *pend = dst, *p = pend - inserted_byte; | |
| 5735 Lisp_Object eol_type; | |
| 5736 | |
| 5737 /* Encode LFs back to the original eol format (CR or CRLF). */ | |
| 5738 if (coding->eol_type == CODING_EOL_CR) | |
| 5739 { | |
| 5740 while (p < pend) if (*p++ == '\n') p[-1] = '\r'; | |
| 5741 } | |
| 5742 else | |
| 5743 { | |
| 5744 int count = 0; | |
| 5745 | |
| 5746 while (p < pend) if (*p++ == '\n') count++; | |
| 5747 if (src - dst < count) | |
| 5748 { | |
| 5749 /* We don't have sufficient room for encoding LFs | |
| 5750 back to CRLF. We must record converted and | |
| 5751 not-yet-converted text back to the buffer | |
| 5752 content, enlarge the gap, then record them out of | |
| 5753 the buffer contents again. */ | |
| 5754 int add = len_byte + inserted_byte; | |
| 5755 | |
| 5756 GAP_SIZE -= add; | |
| 5757 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add; | |
| 5758 GPT += inserted_byte; GPT_BYTE += inserted_byte; | |
| 5759 make_gap (count - GAP_SIZE); | |
| 5760 GAP_SIZE += add; | |
| 5761 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add; | |
| 5762 GPT -= inserted_byte; GPT_BYTE -= inserted_byte; | |
| 5763 /* Don't forget to update SRC, DST, and PEND. */ | |
| 5764 src = GAP_END_ADDR - len_byte; | |
| 5765 dst = GPT_ADDR + inserted_byte; | |
| 5766 pend = dst; | |
| 5767 } | |
| 5768 inserted += count; | |
| 5769 inserted_byte += count; | |
| 5770 coding->produced += count; | |
| 5771 p = dst = pend + count; | |
| 5772 while (count) | |
| 5773 { | |
| 5774 *--p = *--pend; | |
| 5775 if (*p == '\n') count--, *--p = '\r'; | |
| 5776 } | |
| 5777 } | |
| 5778 | |
| 5779 /* Suppress eol-format conversion in the further conversion. */ | |
| 5780 coding->eol_type = CODING_EOL_LF; | |
| 5781 | |
| 5782 /* Set the coding system symbol to that for Unix-like EOL. */ | |
| 5783 eol_type = Fget (saved_coding_symbol, Qeol_type); | |
| 5784 if (VECTORP (eol_type) | |
| 5785 && XVECTOR (eol_type)->size == 3 | |
| 5786 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF])) | |
| 5787 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF]; | |
| 5788 else | |
| 5789 coding->symbol = saved_coding_symbol; | |
| 5790 | |
| 5791 continue; | |
| 5792 } | |
| 5793 if (len_byte <= 0) | |
| 5794 { | |
| 5795 if (coding->type != coding_type_ccl | |
| 5796 || coding->mode & CODING_MODE_LAST_BLOCK) | |
| 5797 break; | |
| 5798 coding->mode |= CODING_MODE_LAST_BLOCK; | |
| 5799 continue; | |
| 5800 } | |
| 5801 if (result == CODING_FINISH_INSUFFICIENT_SRC) | |
| 5802 { | |
| 5803 /* The source text ends in invalid codes. Let's just | |
| 5804 make them valid buffer contents, and finish conversion. */ | |
| 5805 if (multibyte_p) | |
| 5806 { | |
| 5807 unsigned char *start = dst; | |
| 5808 | |
| 5809 inserted += len_byte; | |
| 5810 while (len_byte--) | |
| 5811 { | |
| 5812 int c = *src++; | |
| 5813 dst += CHAR_STRING (c, dst); | |
| 5814 } | |
| 5815 | |
| 5816 inserted_byte += dst - start; | |
| 5817 } | |
| 5818 else | |
| 5819 { | |
| 5820 inserted += len_byte; | |
| 5821 inserted_byte += len_byte; | |
| 5822 while (len_byte--) | |
| 5823 *dst++ = *src++; | |
| 5824 } | |
| 5825 break; | |
| 5826 } | |
| 5827 if (result == CODING_FINISH_INTERRUPT) | |
| 5828 { | |
| 5829 /* The conversion procedure was interrupted by a user. */ | |
| 5830 break; | |
| 5831 } | |
| 5832 /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST */ | |
| 5833 if (coding->consumed < 1) | |
| 5834 { | |
| 5835 /* It's quite strange to require more memory without | |
| 5836 consuming any bytes. Perhaps CCL program bug. */ | |
| 5837 break; | |
| 5838 } | |
| 5839 if (first) | |
| 5840 { | |
| 5841 /* We have just done the first batch of conversion which was | |
| 5842 stopped because of insufficient gap. Let's reconsider the | |
| 5843 required gap size (i.e. SRC - DST) now. | |
| 5844 | |
| 5845 We have converted ORIG bytes (== coding->consumed) into | |
| 5846 NEW bytes (coding->produced). To convert the remaining | |
| 5847 LEN bytes, we may need REQUIRE bytes of gap, where: | |
| 5848 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG) | |
| 5849 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG | |
| 5850 Here, we are sure that NEW >= ORIG. */ | |
| 5851 float ratio; | |
| 5852 | |
| 5853 if (coding->produced <= coding->consumed) | |
| 5854 { | |
| 5855 /* This happens because of CCL-based coding system with | |
| 5856 eol-type CRLF. */ | |
| 5857 require = 0; | |
| 5858 } | |
| 5859 else | |
| 5860 { | |
| 5861 ratio = (coding->produced - coding->consumed) / coding->consumed; | |
| 5862 require = len_byte * ratio; | |
| 5863 } | |
| 5864 first = 0; | |
| 5865 } | |
| 5866 if ((src - dst) < (require + 2000)) | |
| 5867 { | |
| 5868 /* See the comment above the previous call of make_gap. */ | |
| 5869 int add = len_byte + inserted_byte; | |
| 5870 | |
| 5871 GAP_SIZE -= add; | |
| 5872 ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add; | |
| 5873 GPT += inserted_byte; GPT_BYTE += inserted_byte; | |
| 5874 make_gap (require + 2000); | |
| 5875 GAP_SIZE += add; | |
| 5876 ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add; | |
| 5877 GPT -= inserted_byte; GPT_BYTE -= inserted_byte; | |
| 5878 } | |
| 5879 } | |
| 5880 if (src - dst > 0) *dst = 0; /* Put an anchor. */ | |
| 5881 | |
| 5882 if (encodep && coding->dst_multibyte) | |
| 5883 { | |
| 5884 /* The output is unibyte. We must convert 8-bit characters to | |
| 5885 multibyte form. */ | |
| 5886 if (inserted_byte * 2 > GAP_SIZE) | |
| 5887 { | |
| 5888 GAP_SIZE -= inserted_byte; | |
| 5889 ZV += inserted_byte; Z += inserted_byte; | |
| 5890 ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte; | |
| 5891 GPT += inserted_byte; GPT_BYTE += inserted_byte; | |
| 5892 make_gap (inserted_byte - GAP_SIZE); | |
| 5893 GAP_SIZE += inserted_byte; | |
| 5894 ZV -= inserted_byte; Z -= inserted_byte; | |
| 5895 ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte; | |
| 5896 GPT -= inserted_byte; GPT_BYTE -= inserted_byte; | |
| 5897 } | |
| 5898 inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte); | |
| 5899 } | |
| 5900 | |
| 5901 /* If we shrank the conversion area, adjust it now. */ | |
| 5902 if (total_skip > 0) | |
| 5903 { | |
| 5904 if (tail_skip > 0) | |
| 5905 safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip); | |
| 5906 inserted += total_skip; inserted_byte += total_skip; | |
| 5907 GAP_SIZE += total_skip; | |
| 5908 GPT -= head_skip; GPT_BYTE -= head_skip; | |
| 5909 ZV -= total_skip; ZV_BYTE -= total_skip; | |
| 5910 Z -= total_skip; Z_BYTE -= total_skip; | |
| 5911 from -= head_skip; from_byte -= head_skip; | |
| 5912 to += tail_skip; to_byte += tail_skip; | |
| 5913 } | |
| 5914 | |
| 5915 prev_Z = Z; | |
| 5916 if (! EQ (current_buffer->undo_list, Qt)) | |
| 5917 adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte); | |
| 5918 else | |
| 5919 adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del, | |
| 5920 inserted, inserted_byte); | |
| 5921 inserted = Z - prev_Z; | |
| 5922 | |
| 5923 if (!encodep && coding->cmp_data && coding->cmp_data->used) | |
| 5924 coding_restore_composition (coding, Fcurrent_buffer ()); | |
| 5925 coding_free_composition_data (coding); | |
| 5926 | |
| 5927 if (! inhibit_pre_post_conversion | |
| 5928 && ! encodep && ! NILP (coding->post_read_conversion)) | |
| 5929 { | |
| 5930 Lisp_Object val; | |
| 5931 Lisp_Object saved_coding_system; | |
| 5932 | |
| 5933 if (from != PT) | |
| 5934 TEMP_SET_PT_BOTH (from, from_byte); | |
| 5935 prev_Z = Z; | |
| 5936 record_unwind_protect (code_convert_region_unwind, | |
| 5937 Vlast_coding_system_used); | |
| 5938 saved_coding_system = Vlast_coding_system_used; | |
| 5939 Vlast_coding_system_used = coding->symbol; | |
| 5940 /* We should not call any more pre-write/post-read-conversion | |
| 5941 functions while this post-read-conversion is running. */ | |
| 5942 inhibit_pre_post_conversion = 1; | |
| 5943 val = call1 (coding->post_read_conversion, make_number (inserted)); | |
| 5944 inhibit_pre_post_conversion = 0; | |
| 5945 coding->symbol = Vlast_coding_system_used; | |
| 5946 Vlast_coding_system_used = saved_coding_system; | |
| 5947 /* Discard the unwind protect. */ | |
| 5948 specpdl_ptr--; | |
| 5949 CHECK_NUMBER (val); | |
| 5950 inserted += Z - prev_Z; | |
| 5951 } | |
| 5952 | |
| 5953 if (orig_point >= from) | |
| 5954 { | |
| 5955 if (orig_point >= from + orig_len) | |
| 5956 orig_point += inserted - orig_len; | |
| 5957 else | |
| 5958 orig_point = from; | |
| 5959 TEMP_SET_PT (orig_point); | |
| 5960 } | |
| 5961 | |
| 5962 if (replace) | |
| 5963 { | |
| 5964 signal_after_change (from, to - from, inserted); | |
| 5965 update_compositions (from, from + inserted, CHECK_BORDER); | |
| 5966 } | |
| 5967 | |
| 5968 { | |
| 5969 coding->consumed = to_byte - from_byte; | |
| 5970 coding->consumed_char = to - from; | |
| 5971 coding->produced = inserted_byte; | |
| 5972 coding->produced_char = inserted; | |
| 5973 } | |
| 5974 | |
| 5975 return 0; | |
| 5976 } | |
| 5977 | 6648 |
| 5978 Lisp_Object | 6649 Lisp_Object |
| 5979 run_pre_post_conversion_on_str (str, coding, encodep) | 6650 preferred_coding_system () |
| 5980 Lisp_Object str; | 6651 { |
| 5981 struct coding_system *coding; | 6652 int id = coding_categories[coding_priorities[0]].id; |
| 5982 int encodep; | 6653 |
| 5983 { | 6654 return CODING_ID_NAME (id); |
| 5984 int count = SPECPDL_INDEX (); | |
| 5985 struct gcpro gcpro1, gcpro2; | |
| 5986 int multibyte = STRING_MULTIBYTE (str); | |
| 5987 Lisp_Object buffer; | |
| 5988 struct buffer *buf; | |
| 5989 Lisp_Object old_deactivate_mark; | |
| 5990 | |
| 5991 record_unwind_protect (Fset_buffer, Fcurrent_buffer ()); | |
| 5992 record_unwind_protect (code_convert_region_unwind, | |
| 5993 Vlast_coding_system_used); | |
| 5994 /* It is not crucial to specbind this. */ | |
| 5995 old_deactivate_mark = Vdeactivate_mark; | |
| 5996 GCPRO2 (str, old_deactivate_mark); | |
| 5997 | |
| 5998 buffer = Fget_buffer_create (build_string (" *code-converting-work*")); | |
| 5999 buf = XBUFFER (buffer); | |
| 6000 | |
| 6001 delete_all_overlays (buf); | |
| 6002 buf->directory = current_buffer->directory; | |
| 6003 buf->read_only = Qnil; | |
| 6004 buf->filename = Qnil; | |
| 6005 buf->undo_list = Qt; | |
| 6006 eassert (buf->overlays_before == NULL); | |
| 6007 eassert (buf->overlays_after == NULL); | |
| 6008 | |
| 6009 set_buffer_internal (buf); | |
| 6010 /* We must insert the contents of STR as is without | |
| 6011 unibyte<->multibyte conversion. For that, we adjust the | |
| 6012 multibyteness of the working buffer to that of STR. */ | |
| 6013 Ferase_buffer (); | |
| 6014 buf->enable_multibyte_characters = multibyte ? Qt : Qnil; | |
| 6015 | |
| 6016 insert_from_string (str, 0, 0, | |
| 6017 SCHARS (str), SBYTES (str), 0); | |
| 6018 UNGCPRO; | |
| 6019 inhibit_pre_post_conversion = 1; | |
| 6020 if (encodep) | |
| 6021 call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z)); | |
| 6022 else | |
| 6023 { | |
| 6024 Vlast_coding_system_used = coding->symbol; | |
| 6025 TEMP_SET_PT_BOTH (BEG, BEG_BYTE); | |
| 6026 call1 (coding->post_read_conversion, make_number (Z - BEG)); | |
| 6027 coding->symbol = Vlast_coding_system_used; | |
| 6028 } | |
| 6029 inhibit_pre_post_conversion = 0; | |
| 6030 Vdeactivate_mark = old_deactivate_mark; | |
| 6031 str = make_buffer_string (BEG, Z, 1); | |
| 6032 return unbind_to (count, str); | |
| 6033 } | |
| 6034 | |
| 6035 Lisp_Object | |
| 6036 decode_coding_string (str, coding, nocopy) | |
| 6037 Lisp_Object str; | |
| 6038 struct coding_system *coding; | |
| 6039 int nocopy; | |
| 6040 { | |
| 6041 int len; | |
| 6042 struct conversion_buffer buf; | |
| 6043 int from, to_byte; | |
| 6044 Lisp_Object saved_coding_symbol; | |
| 6045 int result; | |
| 6046 int require_decoding; | |
| 6047 int shrinked_bytes = 0; | |
| 6048 Lisp_Object newstr; | |
| 6049 int consumed, consumed_char, produced, produced_char; | |
| 6050 | |
| 6051 from = 0; | |
| 6052 to_byte = SBYTES (str); | |
| 6053 | |
| 6054 saved_coding_symbol = coding->symbol; | |
| 6055 coding->src_multibyte = STRING_MULTIBYTE (str); | |
| 6056 coding->dst_multibyte = 1; | |
| 6057 if (CODING_REQUIRE_DETECTION (coding)) | |
| 6058 { | |
| 6059 /* See the comments in code_convert_region. */ | |
| 6060 if (coding->type == coding_type_undecided) | |
| 6061 { | |
| 6062 detect_coding (coding, SDATA (str), to_byte); | |
| 6063 if (coding->type == coding_type_undecided) | |
| 6064 { | |
| 6065 coding->type = coding_type_emacs_mule; | |
| 6066 coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE; | |
| 6067 /* As emacs-mule decoder will handle composition, we | |
| 6068 need this setting to allocate coding->cmp_data | |
| 6069 later. */ | |
| 6070 coding->composing = COMPOSITION_NO; | |
| 6071 } | |
| 6072 } | |
| 6073 if (coding->eol_type == CODING_EOL_UNDECIDED | |
| 6074 && coding->type != coding_type_ccl) | |
| 6075 { | |
| 6076 saved_coding_symbol = coding->symbol; | |
| 6077 detect_eol (coding, SDATA (str), to_byte); | |
| 6078 if (coding->eol_type == CODING_EOL_UNDECIDED) | |
| 6079 coding->eol_type = CODING_EOL_LF; | |
| 6080 /* We had better recover the original eol format if we | |
| 6081 encounter an inconsistent eol format while decoding. */ | |
| 6082 coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL; | |
| 6083 } | |
| 6084 } | |
| 6085 | |
| 6086 if (coding->type == coding_type_no_conversion | |
| 6087 || coding->type == coding_type_raw_text) | |
| 6088 coding->dst_multibyte = 0; | |
| 6089 | |
| 6090 require_decoding = CODING_REQUIRE_DECODING (coding); | |
| 6091 | |
| 6092 if (STRING_MULTIBYTE (str)) | |
| 6093 { | |
| 6094 /* Decoding routines expect the source text to be unibyte. */ | |
| 6095 str = Fstring_as_unibyte (str); | |
| 6096 to_byte = SBYTES (str); | |
| 6097 nocopy = 1; | |
| 6098 coding->src_multibyte = 0; | |
| 6099 } | |
| 6100 | |
| 6101 /* Try to skip the leading and trailing ASCII characters. */ | |
| 6102 if (require_decoding && coding->type != coding_type_ccl) | |
| 6103 { | |
| 6104 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str), | |
| 6105 0); | |
| 6106 if (from == to_byte) | |
| 6107 require_decoding = 0; | |
| 6108 shrinked_bytes = from + (SBYTES (str) - to_byte); | |
| 6109 } | |
| 6110 | |
| 6111 if (!require_decoding | |
| 6112 && !(SYMBOLP (coding->post_read_conversion) | |
| 6113 && !NILP (Ffboundp (coding->post_read_conversion)))) | |
| 6114 { | |
| 6115 coding->consumed = SBYTES (str); | |
| 6116 coding->consumed_char = SCHARS (str); | |
| 6117 if (coding->dst_multibyte) | |
| 6118 { | |
| 6119 str = Fstring_as_multibyte (str); | |
| 6120 nocopy = 1; | |
| 6121 } | |
| 6122 coding->produced = SBYTES (str); | |
| 6123 coding->produced_char = SCHARS (str); | |
| 6124 return (nocopy ? str : Fcopy_sequence (str)); | |
| 6125 } | |
| 6126 | |
| 6127 if (coding->composing != COMPOSITION_DISABLED) | |
| 6128 coding_allocate_composition_data (coding, from); | |
| 6129 len = decoding_buffer_size (coding, to_byte - from); | |
| 6130 allocate_conversion_buffer (buf, len); | |
| 6131 | |
| 6132 consumed = consumed_char = produced = produced_char = 0; | |
| 6133 while (1) | |
| 6134 { | |
| 6135 result = decode_coding (coding, SDATA (str) + from + consumed, | |
| 6136 buf.data + produced, to_byte - from - consumed, | |
| 6137 buf.size - produced); | |
| 6138 consumed += coding->consumed; | |
| 6139 consumed_char += coding->consumed_char; | |
| 6140 produced += coding->produced; | |
| 6141 produced_char += coding->produced_char; | |
| 6142 if (result == CODING_FINISH_NORMAL | |
| 6143 || (result == CODING_FINISH_INSUFFICIENT_SRC | |
| 6144 && coding->consumed == 0)) | |
| 6145 break; | |
| 6146 if (result == CODING_FINISH_INSUFFICIENT_CMP) | |
| 6147 coding_allocate_composition_data (coding, from + produced_char); | |
| 6148 else if (result == CODING_FINISH_INSUFFICIENT_DST) | |
| 6149 extend_conversion_buffer (&buf); | |
| 6150 else if (result == CODING_FINISH_INCONSISTENT_EOL) | |
| 6151 { | |
| 6152 Lisp_Object eol_type; | |
| 6153 | |
| 6154 /* Recover the original EOL format. */ | |
| 6155 if (coding->eol_type == CODING_EOL_CR) | |
| 6156 { | |
| 6157 unsigned char *p; | |
| 6158 for (p = buf.data; p < buf.data + produced; p++) | |
| 6159 if (*p == '\n') *p = '\r'; | |
| 6160 } | |
| 6161 else if (coding->eol_type == CODING_EOL_CRLF) | |
| 6162 { | |
| 6163 int num_eol = 0; | |
| 6164 unsigned char *p0, *p1; | |
| 6165 for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++) | |
| 6166 if (*p0 == '\n') num_eol++; | |
| 6167 if (produced + num_eol >= buf.size) | |
| 6168 extend_conversion_buffer (&buf); | |
| 6169 for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;) | |
| 6170 { | |
| 6171 *--p1 = *--p0; | |
| 6172 if (*p0 == '\n') *--p1 = '\r'; | |
| 6173 } | |
| 6174 produced += num_eol; | |
| 6175 produced_char += num_eol; | |
| 6176 } | |
| 6177 /* Suppress eol-format conversion in the further conversion. */ | |
| 6178 coding->eol_type = CODING_EOL_LF; | |
| 6179 | |
| 6180 /* Set the coding system symbol to that for Unix-like EOL. */ | |
| 6181 eol_type = Fget (saved_coding_symbol, Qeol_type); | |
| 6182 if (VECTORP (eol_type) | |
| 6183 && XVECTOR (eol_type)->size == 3 | |
| 6184 && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF])) | |
| 6185 coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF]; | |
| 6186 else | |
| 6187 coding->symbol = saved_coding_symbol; | |
| 6188 | |
| 6189 | |
| 6190 } | |
| 6191 } | |
| 6192 | |
| 6193 coding->consumed = consumed; | |
| 6194 coding->consumed_char = consumed_char; | |
| 6195 coding->produced = produced; | |
| 6196 coding->produced_char = produced_char; | |
| 6197 | |
| 6198 if (coding->dst_multibyte) | |
| 6199 newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes, | |
| 6200 produced + shrinked_bytes); | |
| 6201 else | |
| 6202 newstr = make_uninit_string (produced + shrinked_bytes); | |
| 6203 if (from > 0) | |
| 6204 STRING_COPYIN (newstr, 0, SDATA (str), from); | |
| 6205 STRING_COPYIN (newstr, from, buf.data, produced); | |
| 6206 if (shrinked_bytes > from) | |
| 6207 STRING_COPYIN (newstr, from + produced, | |
| 6208 SDATA (str) + to_byte, | |
| 6209 shrinked_bytes - from); | |
| 6210 free_conversion_buffer (&buf); | |
| 6211 | |
| 6212 if (coding->cmp_data && coding->cmp_data->used) | |
| 6213 coding_restore_composition (coding, newstr); | |
| 6214 coding_free_composition_data (coding); | |
| 6215 | |
| 6216 if (SYMBOLP (coding->post_read_conversion) | |
| 6217 && !NILP (Ffboundp (coding->post_read_conversion))) | |
| 6218 newstr = run_pre_post_conversion_on_str (newstr, coding, 0); | |
| 6219 | |
| 6220 return newstr; | |
| 6221 } | |
| 6222 | |
| 6223 Lisp_Object | |
| 6224 encode_coding_string (str, coding, nocopy) | |
| 6225 Lisp_Object str; | |
| 6226 struct coding_system *coding; | |
| 6227 int nocopy; | |
| 6228 { | |
| 6229 int len; | |
| 6230 struct conversion_buffer buf; | |
| 6231 int from, to, to_byte; | |
| 6232 int result; | |
| 6233 int shrinked_bytes = 0; | |
| 6234 Lisp_Object newstr; | |
| 6235 int consumed, consumed_char, produced, produced_char; | |
| 6236 | |
| 6237 if (SYMBOLP (coding->pre_write_conversion) | |
| 6238 && !NILP (Ffboundp (coding->pre_write_conversion))) | |
| 6239 str = run_pre_post_conversion_on_str (str, coding, 1); | |
| 6240 | |
| 6241 from = 0; | |
| 6242 to = SCHARS (str); | |
| 6243 to_byte = SBYTES (str); | |
| 6244 | |
| 6245 /* Encoding routines determine the multibyteness of the source text | |
| 6246 by coding->src_multibyte. */ | |
| 6247 coding->src_multibyte = STRING_MULTIBYTE (str); | |
| 6248 coding->dst_multibyte = 0; | |
| 6249 if (! CODING_REQUIRE_ENCODING (coding)) | |
| 6250 { | |
| 6251 coding->consumed = SBYTES (str); | |
| 6252 coding->consumed_char = SCHARS (str); | |
| 6253 if (STRING_MULTIBYTE (str)) | |
| 6254 { | |
| 6255 str = Fstring_as_unibyte (str); | |
| 6256 nocopy = 1; | |
| 6257 } | |
| 6258 coding->produced = SBYTES (str); | |
| 6259 coding->produced_char = SCHARS (str); | |
| 6260 return (nocopy ? str : Fcopy_sequence (str)); | |
| 6261 } | |
| 6262 | |
| 6263 if (coding->composing != COMPOSITION_DISABLED) | |
| 6264 coding_save_composition (coding, from, to, str); | |
| 6265 | |
| 6266 /* Try to skip the leading and trailing ASCII characters. */ | |
| 6267 if (coding->type != coding_type_ccl) | |
| 6268 { | |
| 6269 SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str), | |
| 6270 1); | |
| 6271 if (from == to_byte) | |
| 6272 return (nocopy ? str : Fcopy_sequence (str)); | |
| 6273 shrinked_bytes = from + (SBYTES (str) - to_byte); | |
| 6274 } | |
| 6275 | |
| 6276 len = encoding_buffer_size (coding, to_byte - from); | |
| 6277 allocate_conversion_buffer (buf, len); | |
| 6278 | |
| 6279 consumed = consumed_char = produced = produced_char = 0; | |
| 6280 while (1) | |
| 6281 { | |
| 6282 result = encode_coding (coding, SDATA (str) + from + consumed, | |
| 6283 buf.data + produced, to_byte - from - consumed, | |
| 6284 buf.size - produced); | |
| 6285 consumed += coding->consumed; | |
| 6286 consumed_char += coding->consumed_char; | |
| 6287 produced += coding->produced; | |
| 6288 produced_char += coding->produced_char; | |
| 6289 if (result == CODING_FINISH_NORMAL | |
| 6290 || (result == CODING_FINISH_INSUFFICIENT_SRC | |
| 6291 && coding->consumed == 0)) | |
| 6292 break; | |
| 6293 /* Now result should be CODING_FINISH_INSUFFICIENT_DST. */ | |
| 6294 extend_conversion_buffer (&buf); | |
| 6295 } | |
| 6296 | |
| 6297 coding->consumed = consumed; | |
| 6298 coding->consumed_char = consumed_char; | |
| 6299 coding->produced = produced; | |
| 6300 coding->produced_char = produced_char; | |
| 6301 | |
| 6302 newstr = make_uninit_string (produced + shrinked_bytes); | |
| 6303 if (from > 0) | |
| 6304 STRING_COPYIN (newstr, 0, SDATA (str), from); | |
| 6305 STRING_COPYIN (newstr, from, buf.data, produced); | |
| 6306 if (shrinked_bytes > from) | |
| 6307 STRING_COPYIN (newstr, from + produced, | |
| 6308 SDATA (str) + to_byte, | |
| 6309 shrinked_bytes - from); | |
| 6310 | |
| 6311 free_conversion_buffer (&buf); | |
| 6312 coding_free_composition_data (coding); | |
| 6313 | |
| 6314 return newstr; | |
| 6315 } | 6655 } |
| 6316 | 6656 |
| 6317 | 6657 |
| 6318 #ifdef emacs | 6658 #ifdef emacs |
| 6319 /*** 8. Emacs Lisp library functions ***/ | 6659 /*** 8. Emacs Lisp library functions ***/ |
| 6320 | 6660 |
| 6321 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0, | 6661 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0, |
| 6322 doc: /* Return t if OBJECT is nil or a coding-system. | 6662 doc: /* Return t if OBJECT is nil or a coding-system. |
| 6323 See the documentation of `make-coding-system' for information | 6663 See the documentation of `define-coding-system' for information |
| 6324 about coding-system objects. */) | 6664 about coding-system objects. */) |
| 6325 (obj) | 6665 (obj) |
| 6326 Lisp_Object obj; | 6666 Lisp_Object obj; |
| 6327 { | 6667 { |
| 6328 if (NILP (obj)) | 6668 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil); |
| 6329 return Qt; | |
| 6330 if (!SYMBOLP (obj)) | |
| 6331 return Qnil; | |
| 6332 /* Get coding-spec vector for OBJ. */ | |
| 6333 obj = Fget (obj, Qcoding_system); | |
| 6334 return ((VECTORP (obj) && XVECTOR (obj)->size == 5) | |
| 6335 ? Qt : Qnil); | |
| 6336 } | 6669 } |
| 6337 | 6670 |
| 6338 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system, | 6671 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system, |
| 6339 Sread_non_nil_coding_system, 1, 1, 0, | 6672 Sread_non_nil_coding_system, 1, 1, 0, |
| 6340 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */) | 6673 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */) |
| 6357 (prompt, default_coding_system) | 6690 (prompt, default_coding_system) |
| 6358 Lisp_Object prompt, default_coding_system; | 6691 Lisp_Object prompt, default_coding_system; |
| 6359 { | 6692 { |
| 6360 Lisp_Object val; | 6693 Lisp_Object val; |
| 6361 if (SYMBOLP (default_coding_system)) | 6694 if (SYMBOLP (default_coding_system)) |
| 6362 default_coding_system = SYMBOL_NAME (default_coding_system); | 6695 XSETSTRING (default_coding_system, SYMBOL_NAME (default_coding_system)); |
| 6363 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, | 6696 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, |
| 6364 Qt, Qnil, Qcoding_system_history, | 6697 Qt, Qnil, Qcoding_system_history, |
| 6365 default_coding_system, Qnil); | 6698 default_coding_system, Qnil); |
| 6366 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil)); | 6699 return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil)); |
| 6367 } | 6700 } |
| 6368 | 6701 |
| 6369 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system, | 6702 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system, |
| 6370 1, 1, 0, | 6703 1, 1, 0, |
| 6371 doc: /* Check validity of CODING-SYSTEM. | 6704 doc: /* Check validity of CODING-SYSTEM. |
| 6372 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. | 6705 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */) |
| 6373 It is valid if it is a symbol with a non-nil `coding-system' property. | 6706 (coding_system) |
| 6374 The value of property should be a vector of length 5. */) | |
| 6375 (coding_system) | |
| 6376 Lisp_Object coding_system; | 6707 Lisp_Object coding_system; |
| 6377 { | 6708 { |
| 6378 CHECK_SYMBOL (coding_system); | 6709 CHECK_SYMBOL (coding_system); |
| 6379 if (!NILP (Fcoding_system_p (coding_system))) | 6710 if (!NILP (Fcoding_system_p (coding_system))) |
| 6380 return coding_system; | 6711 return coding_system; |
| 6381 while (1) | 6712 while (1) |
| 6382 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); | 6713 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); |
| 6383 } | 6714 } |
| 6715 | |
| 6384 | 6716 |
| 6717 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If | |
| 6718 HIGHEST is nonzero, return the coding system of the highest | |
| | 6719 priority among the detected coding systems. Otherwise return a |
| 6720 list of detected coding systems sorted by their priorities. If | |
| 6721 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct | |
| | 6722 multibyte form but contain only ASCII and eight-bit chars. |
| 6723 Otherwise, the bytes are raw bytes. | |
| 6724 | |
| 6725 CODING-SYSTEM controls the detection as below: | |
| 6726 | |
| 6727 If it is nil, detect both text-format and eol-format. If the | |
| 6728 text-format part of CODING-SYSTEM is already specified | |
| 6729 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format | |
| 6730 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'), | |
| 6731 detect only text-format. */ | |
| 6732 | |
| 6385 Lisp_Object | 6733 Lisp_Object |
| 6386 detect_coding_system (src, src_bytes, highest, multibytep) | 6734 detect_coding_system (src, src_bytes, highest, multibytep, coding_system) |
| 6387 const unsigned char *src; | 6735 const unsigned char *src; |
| 6388 int src_bytes, highest; | 6736 int src_bytes, highest; |
| 6389 int multibytep; | 6737 int multibytep; |
| 6390 { | 6738 Lisp_Object coding_system; |
| 6391 int coding_mask, eol_type; | 6739 { |
| 6392 Lisp_Object val, tmp; | 6740 const unsigned char *src_end = src + src_bytes; |
| 6393 int dummy; | 6741 Lisp_Object attrs, eol_type; |
| 6394 | 6742 Lisp_Object val; |
| 6395 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep); | 6743 struct coding_system coding; |
| 6396 eol_type = detect_eol_type (src, src_bytes, &dummy); | 6744 int id; |
| 6397 if (eol_type == CODING_EOL_INCONSISTENT) | 6745 struct coding_detection_info detect_info; |
| 6398 eol_type = CODING_EOL_UNDECIDED; | 6746 |
| 6399 | 6747 if (NILP (coding_system)) |
| 6400 if (!coding_mask) | 6748 coding_system = Qundecided; |
| 6401 { | 6749 setup_coding_system (coding_system, &coding); |
| 6402 val = Qundecided; | 6750 attrs = CODING_ID_ATTRS (coding.id); |
| 6403 if (eol_type != CODING_EOL_UNDECIDED) | 6751 eol_type = CODING_ID_EOL_TYPE (coding.id); |
| 6404 { | 6752 coding_system = CODING_ATTR_BASE_NAME (attrs); |
| 6405 Lisp_Object val2; | 6753 |
| 6406 val2 = Fget (Qundecided, Qeol_type); | 6754 coding.source = src; |
| 6407 if (VECTORP (val2)) | 6755 coding.src_bytes = src_bytes; |
| 6408 val = XVECTOR (val2)->contents[eol_type]; | 6756 coding.src_multibyte = multibytep; |
| 6409 } | 6757 coding.consumed = 0; |
| 6410 return (highest ? val : Fcons (val, Qnil)); | 6758 coding.mode |= CODING_MODE_LAST_BLOCK; |
| 6411 } | 6759 |
| 6412 | 6760 detect_info.checked = detect_info.found = detect_info.rejected = 0; |
| 6413 /* At first, gather possible coding systems in VAL. */ | 6761 |
| 6414 val = Qnil; | 6762 /* At first, detect text-format if necessary. */ |
| 6415 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp)) | 6763 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided) |
| 6416 { | 6764 { |
| 6417 Lisp_Object category_val, category_index; | 6765 enum coding_category category; |
| 6418 | 6766 struct coding_system *this; |
| 6419 category_index = Fget (XCAR (tmp), Qcoding_category_index); | 6767 int c, i; |
| 6420 category_val = Fsymbol_value (XCAR (tmp)); | 6768 |
| 6421 if (!NILP (category_val) | 6769 for (; src < src_end; src++) |
| 6422 && NATNUMP (category_index) | 6770 { |
| 6423 && (coding_mask & (1 << XFASTINT (category_index)))) | 6771 c = *src; |
| 6424 { | 6772 if (c & 0x80 |
| 6425 val = Fcons (category_val, val); | 6773 || (c < 0x20 && (c == ISO_CODE_ESC |
| 6426 if (highest) | 6774 || c == ISO_CODE_SI |
| 6775 || c == ISO_CODE_SO))) | |
| 6427 break; | 6776 break; |
| 6428 } | 6777 } |
| 6429 } | 6778 coding.head_ascii = src - coding.source; |
| 6430 if (!highest) | 6779 |
| 6431 val = Fnreverse (val); | 6780 if (src < src_end) |
| 6432 | 6781 for (i = 0; i < coding_category_raw_text; i++) |
| 6433 /* Then, replace the elements with subsidiary coding systems. */ | 6782 { |
| 6434 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp)) | 6783 category = coding_priorities[i]; |
| 6435 { | 6784 this = coding_categories + category; |
| 6436 if (eol_type != CODING_EOL_UNDECIDED | 6785 |
| 6437 && eol_type != CODING_EOL_INCONSISTENT) | 6786 if (this->id < 0) |
| 6438 { | 6787 { |
| 6439 Lisp_Object eol; | 6788 /* No coding system of this category is defined. */ |
| 6440 eol = Fget (XCAR (tmp), Qeol_type); | 6789 detect_info.rejected |= (1 << category); |
| 6441 if (VECTORP (eol)) | 6790 } |
| 6442 XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]); | 6791 else if (category >= coding_category_raw_text) |
| 6443 } | 6792 continue; |
| 6444 } | 6793 else if (detect_info.checked & (1 << category)) |
| 6794 { | |
| 6795 if (highest | |
| 6796 && (detect_info.found & (1 << category))) | |
| 6797 break; | |
| 6798 } | |
| 6799 else | |
| 6800 { | |
| 6801 if ((*(this->detector)) (&coding, &detect_info) | |
| 6802 && highest | |
| 6803 && (detect_info.found & (1 << category))) | |
| 6804 break; | |
| 6805 } | |
| 6806 } | |
| 6807 | |
| 6808 | |
| 6809 if (detect_info.rejected == CATEGORY_MASK_ANY) | |
| 6810 { | |
| 6811 detect_info.found = CATEGORY_MASK_RAW_TEXT; | |
| 6812 id = coding_categories[coding_category_raw_text].id; | |
| 6813 val = Fcons (make_number (id), Qnil); | |
| 6814 } | |
| 6815 else if (! detect_info.rejected && ! detect_info.found) | |
| 6816 { | |
| 6817 detect_info.found = CATEGORY_MASK_ANY; | |
| 6818 id = coding_categories[coding_category_undecided].id; | |
| 6819 val = Fcons (make_number (id), Qnil); | |
| 6820 } | |
| 6821 else if (highest) | |
| 6822 { | |
| 6823 if (detect_info.found) | |
| 6824 { | |
| 6825 detect_info.found = 1 << category; | |
| 6826 val = Fcons (make_number (this->id), Qnil); | |
| 6827 } | |
| 6828 else | |
| 6829 for (i = 0; i < coding_category_raw_text; i++) | |
| 6830 if (! (detect_info.rejected & (1 << coding_priorities[i]))) | |
| 6831 { | |
| 6832 detect_info.found = 1 << coding_priorities[i]; | |
| 6833 id = coding_categories[coding_priorities[i]].id; | |
| 6834 val = Fcons (make_number (id), Qnil); | |
| 6835 break; | |
| 6836 } | |
| 6837 } | |
| 6838 else | |
| 6839 { | |
| 6840 int mask = detect_info.rejected | detect_info.found; | |
| 6841 int found = 0; | |
| 6842 val = Qnil; | |
| 6843 | |
| 6844 for (i = coding_category_raw_text - 1; i >= 0; i--) | |
| 6845 { | |
| 6846 category = coding_priorities[i]; | |
| 6847 if (! (mask & (1 << category))) | |
| 6848 { | |
| 6849 found |= 1 << category; | |
| 6850 id = coding_categories[category].id; | |
| 6851 val = Fcons (make_number (id), val); | |
| 6852 } | |
| 6853 } | |
| 6854 for (i = coding_category_raw_text - 1; i >= 0; i--) | |
| 6855 { | |
| 6856 category = coding_priorities[i]; | |
| 6857 if (detect_info.found & (1 << category)) | |
| 6858 { | |
| 6859 id = coding_categories[category].id; | |
| 6860 val = Fcons (make_number (id), val); | |
| 6861 } | |
| 6862 } | |
| 6863 detect_info.found |= found; | |
| 6864 } | |
| 6865 } | |
| 6866 else | |
| 6867 { | |
| 6868 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); | |
| 6869 val = Fcons (make_number (coding.id), Qnil); | |
| 6870 } | |
| 6871 | |
| 6872 /* Then, detect eol-format if necessary. */ | |
| 6873 { | |
| 6874 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol; | |
| 6875 Lisp_Object tail; | |
| 6876 | |
| 6877 if (VECTORP (eol_type)) | |
| 6878 { | |
| 6879 if (detect_info.found & ~CATEGORY_MASK_UTF_16) | |
| 6880 normal_eol = detect_eol (coding.source, src_bytes, | |
| 6881 coding_category_raw_text); | |
| 6882 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE | |
| 6883 | CATEGORY_MASK_UTF_16_BE_NOSIG)) | |
| 6884 utf_16_be_eol = detect_eol (coding.source, src_bytes, | |
| 6885 coding_category_utf_16_be); | |
| 6886 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE | |
| 6887 | CATEGORY_MASK_UTF_16_LE_NOSIG)) | |
| 6888 utf_16_le_eol = detect_eol (coding.source, src_bytes, | |
| 6889 coding_category_utf_16_le); | |
| 6890 } | |
| 6891 else | |
| 6892 { | |
| 6893 if (EQ (eol_type, Qunix)) | |
| 6894 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF; | |
| 6895 else if (EQ (eol_type, Qdos)) | |
| 6896 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF; | |
| 6897 else | |
| 6898 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR; | |
| 6899 } | |
| 6900 | |
| 6901 for (tail = val; CONSP (tail); tail = XCDR (tail)) | |
| 6902 { | |
| 6903 enum coding_category category; | |
| 6904 int this_eol; | |
| 6905 | |
| 6906 id = XINT (XCAR (tail)); | |
| 6907 attrs = CODING_ID_ATTRS (id); | |
| 6908 category = XINT (CODING_ATTR_CATEGORY (attrs)); | |
| 6909 eol_type = CODING_ID_EOL_TYPE (id); | |
| 6910 if (VECTORP (eol_type)) | |
| 6911 { | |
| 6912 if (category == coding_category_utf_16_be | |
| 6913 || category == coding_category_utf_16_be_nosig) | |
| 6914 this_eol = utf_16_be_eol; | |
| 6915 else if (category == coding_category_utf_16_le | |
| 6916 || category == coding_category_utf_16_le_nosig) | |
| 6917 this_eol = utf_16_le_eol; | |
| 6918 else | |
| 6919 this_eol = normal_eol; | |
| 6920 | |
| 6921 if (this_eol == EOL_SEEN_LF) | |
| 6922 XSETCAR (tail, AREF (eol_type, 0)); | |
| 6923 else if (this_eol == EOL_SEEN_CRLF) | |
| 6924 XSETCAR (tail, AREF (eol_type, 1)); | |
| 6925 else if (this_eol == EOL_SEEN_CR) | |
| 6926 XSETCAR (tail, AREF (eol_type, 2)); | |
| 6927 else | |
| 6928 XSETCAR (tail, CODING_ID_NAME (id)); | |
| 6929 } | |
| 6930 else | |
| 6931 XSETCAR (tail, CODING_ID_NAME (id)); | |
| 6932 } | |
| 6933 } | |
| 6934 | |
| 6445 return (highest ? XCAR (val) : val); | 6935 return (highest ? XCAR (val) : val); |
| 6446 } | 6936 } |
| 6937 | |
| 6447 | 6938 |
| 6448 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, | 6939 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, |
| 6449 2, 3, 0, | 6940 2, 3, 0, |
| 6450 doc: /* Detect how the byte sequence in the region is encoded. | 6941 doc: /* Detect coding system of the text in the region between START and END. |
| 6451 Return a list of possible coding systems used on decoding a byte | 6942 Return a list of possible coding systems ordered by priority. |
| 6452 sequence containing the bytes in the region between START and END when | |
| 6453 the coding system `undecided' is specified. The list is ordered by | |
| 6454 priority decided in the current language environment. | |
| 6455 | 6943 |
| 6456 If only ASCII characters are found, it returns a list of single element | 6944 If only ASCII characters are found, it returns a list of single element |
| 6457 `undecided' or its subsidiary coding system according to a detected | 6945 `undecided' or its subsidiary coding system according to a detected |
| 6458 end-of-line format. | 6946 end-of-line format. |
| 6459 | 6947 |
| 6462 (start, end, highest) | 6950 (start, end, highest) |
| 6463 Lisp_Object start, end, highest; | 6951 Lisp_Object start, end, highest; |
| 6464 { | 6952 { |
| 6465 int from, to; | 6953 int from, to; |
| 6466 int from_byte, to_byte; | 6954 int from_byte, to_byte; |
| 6467 int include_anchor_byte = 0; | |
| 6468 | 6955 |
| 6469 CHECK_NUMBER_COERCE_MARKER (start); | 6956 CHECK_NUMBER_COERCE_MARKER (start); |
| 6470 CHECK_NUMBER_COERCE_MARKER (end); | 6957 CHECK_NUMBER_COERCE_MARKER (end); |
| 6471 | 6958 |
| 6472 validate_region (&start, &end); | 6959 validate_region (&start, &end); |
| 6474 from_byte = CHAR_TO_BYTE (from); | 6961 from_byte = CHAR_TO_BYTE (from); |
| 6475 to_byte = CHAR_TO_BYTE (to); | 6962 to_byte = CHAR_TO_BYTE (to); |
| 6476 | 6963 |
| 6477 if (from < GPT && to >= GPT) | 6964 if (from < GPT && to >= GPT) |
| 6478 move_gap_both (to, to_byte); | 6965 move_gap_both (to, to_byte); |
| 6479 /* If an anchor byte `\0' follows the region, we include it in | 6966 |
| 6480 the detecting source. Then code detectors can handle the trailing | |
| 6481 byte sequence more accurately. | |
| 6482 | |
| 6483 Fix me: This is not a perfect solution. It is better that we | |
| 6484 add one more argument, say LAST_BLOCK, to all detect_coding_XXX. | |
| 6485 */ | |
| 6486 if (to == Z || (to == GPT && GAP_SIZE > 0)) | |
| 6487 include_anchor_byte = 1; | |
| 6488 return detect_coding_system (BYTE_POS_ADDR (from_byte), | 6967 return detect_coding_system (BYTE_POS_ADDR (from_byte), |
| 6489 to_byte - from_byte + include_anchor_byte, | 6968 to_byte - from_byte, |
| 6490 !NILP (highest), | 6969 !NILP (highest), |
| 6491 !NILP (current_buffer | 6970 !NILP (current_buffer |
| 6492 ->enable_multibyte_characters)); | 6971 ->enable_multibyte_characters), |
| 6972 Qnil); | |
| 6493 } | 6973 } |
| 6494 | 6974 |
| 6495 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, | 6975 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, |
| 6496 1, 2, 0, | 6976 1, 2, 0, |
| 6497 doc: /* Detect how the byte sequence in STRING is encoded. | 6977 doc: /* Detect coding system of the text in STRING. |
| 6498 Return a list of possible coding systems used on decoding a byte | 6978 Return a list of possible coding systems ordered by priority. |
| 6499 sequence containing the bytes in STRING when the coding system | |
| 6500 `undecided' is specified. The list is ordered by priority decided in | |
| 6501 the current language environment. | |
| 6502 | 6979 |
| 6503 If only ASCII characters are found, it returns a list of single element | 6980 If only ASCII characters are found, it returns a list of single element |
| 6504 `undecided' or its subsidiary coding system according to a detected | 6981 `undecided' or its subsidiary coding system according to a detected |
| 6505 end-of-line format. | 6982 end-of-line format. |
| 6506 | 6983 |
| 6509 (string, highest) | 6986 (string, highest) |
| 6510 Lisp_Object string, highest; | 6987 Lisp_Object string, highest; |
| 6511 { | 6988 { |
| 6512 CHECK_STRING (string); | 6989 CHECK_STRING (string); |
| 6513 | 6990 |
| 6514 return detect_coding_system (SDATA (string), | 6991 return detect_coding_system (SDATA (string), SBYTES (string), |
| 6515 /* "+ 1" is to include the anchor byte | 6992 !NILP (highest), STRING_MULTIBYTE (string), |
| 6516 `\0'. With this, code detectors can | 6993 Qnil); |
| 6517 handle the trailing bytes more | 6994 } |
| 6518 accurately. */ | 6995 |
| 6519 SBYTES (string) + 1, | 6996 |
| 6520 !NILP (highest), | 6997 static INLINE int |
| 6521 STRING_MULTIBYTE (string)); | 6998 char_encodable_p (c, attrs) |
| 6522 } | 6999 int c; |
| 6523 | 7000 Lisp_Object attrs; |
| 6524 /* Subroutine for Fsafe_coding_systems_region_internal. | 7001 { |
| 6525 | 7002 Lisp_Object tail; |
| 6526 Return a list of coding systems that safely encode the multibyte | 7003 struct charset *charset; |
| 6527 text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of | 7004 |
| 6528 possible coding systems. If it is nil, it means that we have not | 7005 for (tail = CODING_ATTR_CHARSET_LIST (attrs); |
| 6529 yet found any coding systems. | 7006 CONSP (tail); tail = XCDR (tail)) |
| 6530 | 7007 { |
| 6531 WORK_TABLE is a copy of the char-table Vchar_coding_system_table. An | 7008 charset = CHARSET_FROM_ID (XINT (XCAR (tail))); |
| 6532 element of WORK_TABLE is set to t once the element is looked up. | 7009 if (CHAR_CHARSET_P (c, charset)) |
| 6533 | 7010 break; |
| 6534 If a non-ASCII single byte char is found, set | 7011 } |
| 6535 *single_byte_char_found to 1. */ | 7012 return (! NILP (tail)); |
| 6536 | 7013 } |
| 6537 static Lisp_Object | 7014 |
| 6538 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found) | 7015 |
| 6539 unsigned char *p, *pend; | 7016 /* Return a list of coding systems that safely encode the text between |
| 6540 Lisp_Object safe_codings, work_table; | 7017 START and END. If EXCLUDE is non-nil, it is a list of coding |
| 6541 int *single_byte_char_found; | 7018 systems not to check. The returned list doesn't contain any such |
| 6542 { | 7019 coding systems. In any case, if the text contains only ASCII or is |
| 6543 int c, len; | 7020 unibyte, return t. */ |
| 6544 Lisp_Object val, ch; | |
| 6545 Lisp_Object prev, tail; | |
| 6546 | |
| 6547 while (p < pend) | |
| 6548 { | |
| 6549 c = STRING_CHAR_AND_LENGTH (p, pend - p, len); | |
| 6550 p += len; | |
| 6551 if (ASCII_BYTE_P (c)) | |
| 6552 /* We can ignore ASCII characters here. */ | |
| 6553 continue; | |
| 6554 if (SINGLE_BYTE_CHAR_P (c)) | |
| 6555 *single_byte_char_found = 1; | |
| 6556 if (NILP (safe_codings)) | |
| 6557 /* Already all coding systems are excluded. But, we can't | |
| 6558 terminate the loop here because non-ASCII single-byte char | |
| 6559 must be found. */ | |
| 6560 continue; | |
| 6561 /* Check the safe coding systems for C. */ | |
| 6562 ch = make_number (c); | |
| 6563 val = Faref (work_table, ch); | |
| 6564 if (EQ (val, Qt)) | |
| 6565 /* This element was already checked. Ignore it. */ | |
| 6566 continue; | |
| 6567 /* Remember that we checked this element. */ | |
| 6568 Faset (work_table, ch, Qt); | |
| 6569 | |
| 6570 for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail)) | |
| 6571 { | |
| 6572 Lisp_Object elt, translation_table, hash_table, accept_latin_extra; | |
| 6573 int encodable; | |
| 6574 | |
| 6575 elt = XCAR (tail); | |
| 6576 if (CONSP (XCDR (elt))) | |
| 6577 { | |
| 6578 /* This entry has this format now: | |
| 6579 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE | |
| 6580 ACCEPT-LATIN-EXTRA ) */ | |
| 6581 val = XCDR (elt); | |
| 6582 encodable = ! NILP (Faref (XCAR (val), ch)); | |
| 6583 if (! encodable) | |
| 6584 { | |
| 6585 val = XCDR (val); | |
| 6586 translation_table = XCAR (val); | |
| 6587 hash_table = XCAR (XCDR (val)); | |
| 6588 accept_latin_extra = XCAR (XCDR (XCDR (val))); | |
| 6589 } | |
| 6590 } | |
| 6591 else | |
| 6592 { | |
| 6593 /* This entry has this format now: ( CODING . SAFE-CHARS) */ | |
| 6594 encodable = ! NILP (Faref (XCDR (elt), ch)); | |
| 6595 if (! encodable) | |
| 6596 { | |
| 6597 /* Transform the format to: | |
| 6598 ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE | |
| 6599 ACCEPT-LATIN-EXTRA ) */ | |
| 6600 val = Fget (XCAR (elt), Qcoding_system); | |
| 6601 translation_table | |
| 6602 = Fplist_get (AREF (val, 3), | |
| 6603 Qtranslation_table_for_encode); | |
| 6604 if (SYMBOLP (translation_table)) | |
| 6605 translation_table = Fget (translation_table, | |
| 6606 Qtranslation_table); | |
| 6607 hash_table | |
| 6608 = (CHAR_TABLE_P (translation_table) | |
| 6609 ? XCHAR_TABLE (translation_table)->extras[1] | |
| 6610 : Qnil); | |
| 6611 accept_latin_extra | |
| 6612 = ((EQ (AREF (val, 0), make_number (2)) | |
| 6613 && VECTORP (AREF (val, 4))) | |
| 6614 ? AREF (AREF (val, 4), 16) | |
| 6615 : Qnil); | |
| 6616 XSETCAR (tail, list5 (XCAR (elt), XCDR (elt), | |
| 6617 translation_table, hash_table, | |
| 6618 accept_latin_extra)); | |
| 6619 } | |
| 6620 } | |
| 6621 | |
| 6622 if (! encodable | |
| 6623 && ((CHAR_TABLE_P (translation_table) | |
| 6624 && ! NILP (Faref (translation_table, ch))) | |
| 6625 || (HASH_TABLE_P (hash_table) | |
| 6626 && ! NILP (Fgethash (ch, hash_table, Qnil))) | |
| 6627 || (SINGLE_BYTE_CHAR_P (c) | |
| 6628 && ! NILP (accept_latin_extra) | |
| 6629 && VECTORP (Vlatin_extra_code_table) | |
| 6630 && ! NILP (AREF (Vlatin_extra_code_table, c))))) | |
| 6631 encodable = 1; | |
| 6632 if (encodable) | |
| 6633 prev = tail; | |
| 6634 else | |
| 6635 { | |
| 6636 /* Exclude this coding system from SAFE_CODINGS. */ | |
| 6637 if (EQ (tail, safe_codings)) | |
| 6638 safe_codings = XCDR (safe_codings); | |
| 6639 else | |
| 6640 XSETCDR (prev, XCDR (tail)); | |
| 6641 } | |
| 6642 } | |
| 6643 } | |
| 6644 return safe_codings; | |
| 6645 } | |
| 6646 | 7021 |
| 6647 DEFUN ("find-coding-systems-region-internal", | 7022 DEFUN ("find-coding-systems-region-internal", |
| 6648 Ffind_coding_systems_region_internal, | 7023 Ffind_coding_systems_region_internal, |
| 6649 Sfind_coding_systems_region_internal, 2, 2, 0, | 7024 Sfind_coding_systems_region_internal, 2, 3, 0, |
| 6650 doc: /* Internal use only. */) | 7025 doc: /* Internal use only. */) |
| 6651 (start, end) | 7026 (start, end, exclude) |
| 6652 Lisp_Object start, end; | 7027 Lisp_Object start, end, exclude; |
| 6653 { | 7028 { |
| 6654 Lisp_Object work_table, safe_codings; | 7029 Lisp_Object coding_attrs_list, safe_codings; |
| 6655 int non_ascii_p = 0; | 7030 EMACS_INT start_byte, end_byte; |
| 6656 int single_byte_char_found = 0; | 7031 const unsigned char *p, *pbeg, *pend; |
| 6657 const unsigned char *p1, *p1end, *p2, *p2end, *p; | 7032 int c; |
| 7033 Lisp_Object tail, elt; | |
| 6658 | 7034 |
| 6659 if (STRINGP (start)) | 7035 if (STRINGP (start)) |
| 6660 { | 7036 { |
| 6661 if (!STRING_MULTIBYTE (start)) | 7037 if (!STRING_MULTIBYTE (start) |
| 7038 || SCHARS (start) == SBYTES (start)) | |
| 6662 return Qt; | 7039 return Qt; |
| 6663 p1 = SDATA (start), p1end = p1 + SBYTES (start); | 7040 start_byte = 0; |
| 6664 p2 = p2end = p1end; | 7041 end_byte = SBYTES (start); |
| 6665 if (SCHARS (start) != SBYTES (start)) | |
| 6666 non_ascii_p = 1; | |
| 6667 } | 7042 } |
| 6668 else | 7043 else |
| 6669 { | 7044 { |
| 6670 int from, to, stop; | |
| 6671 | |
| 6672 CHECK_NUMBER_COERCE_MARKER (start); | 7045 CHECK_NUMBER_COERCE_MARKER (start); |
| 6673 CHECK_NUMBER_COERCE_MARKER (end); | 7046 CHECK_NUMBER_COERCE_MARKER (end); |
| 6674 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end)) | 7047 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end)) |
| 6675 args_out_of_range (start, end); | 7048 args_out_of_range (start, end); |
| 6676 if (NILP (current_buffer->enable_multibyte_characters)) | 7049 if (NILP (current_buffer->enable_multibyte_characters)) |
| 6677 return Qt; | 7050 return Qt; |
| 6678 from = CHAR_TO_BYTE (XINT (start)); | 7051 start_byte = CHAR_TO_BYTE (XINT (start)); |
| 6679 to = CHAR_TO_BYTE (XINT (end)); | 7052 end_byte = CHAR_TO_BYTE (XINT (end)); |
| 6680 stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to; | 7053 if (XINT (end) - XINT (start) == end_byte - start_byte) |
| 6681 p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from); | 7054 return Qt; |
| 6682 if (stop == to) | 7055 |
| 6683 p2 = p2end = p1end; | 7056 if (XINT (start) < GPT && XINT (end) > GPT) |
| 7057 { | |
| 7058 if ((GPT - XINT (start)) < (XINT (end) - GPT)) | |
| 7059 move_gap_both (XINT (start), start_byte); | |
| 7060 else | |
| 7061 move_gap_both (XINT (end), end_byte); | |
| 7062 } | |
| 7063 } | |
| 7064 | |
| 7065 coding_attrs_list = Qnil; | |
| 7066 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail)) | |
| 7067 if (NILP (exclude) | |
| 7068 || NILP (Fmemq (XCAR (tail), exclude))) | |
| 7069 { | |
| 7070 Lisp_Object attrs; | |
| 7071 | |
| 7072 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0); | |
| 7073 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)) | |
| 7074 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided)) | |
| 7075 coding_attrs_list = Fcons (attrs, coding_attrs_list); | |
| 7076 } | |
| 7077 | |
| 7078 if (STRINGP (start)) | |
| 7079 p = pbeg = SDATA (start); | |
| 7080 else | |
| 7081 p = pbeg = BYTE_POS_ADDR (start_byte); | |
| 7082 pend = p + (end_byte - start_byte); | |
| 7083 | |
| 7084 while (p < pend && ASCII_BYTE_P (*p)) p++; | |
| 7085 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--; | |
| 7086 | |
| 7087 while (p < pend) | |
| 7088 { | |
| 7089 if (ASCII_BYTE_P (*p)) | |
| 7090 p++; | |
| 6684 else | 7091 else |
| 6685 p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop); | 7092 { |
| 6686 if (XINT (end) - XINT (start) != to - from) | 7093 c = STRING_CHAR_ADVANCE (p); |
| 6687 non_ascii_p = 1; | 7094 |
| 6688 } | 7095 charset_map_loaded = 0; |
| 6689 | 7096 for (tail = coding_attrs_list; CONSP (tail);) |
| 6690 if (!non_ascii_p) | 7097 { |
| 6691 { | 7098 elt = XCAR (tail); |
| 6692 /* We are sure that the text contains no multibyte character. | 7099 if (NILP (elt)) |
| 6693 Check if it contains eight-bit-graphic. */ | 7100 tail = XCDR (tail); |
| 6694 p = p1; | 7101 else if (char_encodable_p (c, elt)) |
| 6695 for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++); | 7102 tail = XCDR (tail); |
| 6696 if (p == p1end) | 7103 else if (CONSP (XCDR (tail))) |
| 6697 { | 7104 { |
| 6698 for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++); | 7105 XSETCAR (tail, XCAR (XCDR (tail))); |
| 6699 if (p == p2end) | 7106 XSETCDR (tail, XCDR (XCDR (tail))); |
| 6700 return Qt; | 7107 } |
| 6701 } | 7108 else |
| 6702 } | 7109 { |
| 6703 | 7110 XSETCAR (tail, Qnil); |
| 6704 /* The text contains non-ASCII characters. */ | 7111 tail = XCDR (tail); |
| 6705 | 7112 } |
| 6706 work_table = Fmake_char_table (Qchar_coding_system, Qnil); | 7113 } |
| 6707 safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars)); | 7114 if (charset_map_loaded) |
| 6708 | 7115 { |
| 6709 safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table, | 7116 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg; |
| 6710 &single_byte_char_found); | 7117 |
| 6711 if (p2 < p2end) | 7118 if (STRINGP (start)) |
| 6712 safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table, | 7119 pbeg = SDATA (start); |
| 6713 &single_byte_char_found); | 7120 else |
| 6714 if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars))) | 7121 pbeg = BYTE_POS_ADDR (start_byte); |
| 6715 safe_codings = Qt; | 7122 p = pbeg + p_offset; |
| 6716 else | 7123 pend = pbeg + pend_offset; |
| 6717 { | 7124 } |
| 6718 /* Turn safe_codings to a list of coding systems... */ | 7125 } |
| 6719 Lisp_Object val; | 7126 } |
| 6720 | 7127 |
| 6721 if (single_byte_char_found) | 7128 safe_codings = Qnil; |
| 6722 /* ... and append these for eight-bit chars. */ | 7129 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail)) |
| 6723 val = Fcons (Qraw_text, | 7130 if (! NILP (XCAR (tail))) |
| 6724 Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil))); | 7131 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings); |
| 6725 else | |
| 6726 /* ... and append generic coding systems. */ | |
| 6727 val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars)); | |
| 6728 | |
| 6729 for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings)) | |
| 6730 val = Fcons (XCAR (XCAR (safe_codings)), val); | |
| 6731 safe_codings = val; | |
| 6732 } | |
| 6733 | 7132 |
| 6734 return safe_codings; | 7133 return safe_codings; |
| 6735 } | |
| 6736 | |
| 6737 | |
| 6738 /* Search from position POS for such characters that are unencodable | |
| 6739 according to SAFE_CHARS, and return a list of their positions. P | |
| 6740 points to where in memory the character at POS exists. Limit the | |
| 6741 search at PEND or when N unencodable characters are found. | |
| 6742 | |
| 6743 If SAFE_CHARS is a char table, an element for an unencodable | |
| 6744 character is nil. | |
| 6745 | |
| 6746 If SAFE_CHARS is nil, all non-ASCII characters are unencodable. | |
| 6747 | |
| 6748 Otherwise, SAFE_CHARS is t, and only eight-bit-control and | |
| 6749 eight-bit-graphic characters are unencodable. */ | |
| 6750 | |
| 6751 static Lisp_Object | |
| 6752 unencodable_char_position (safe_chars, pos, p, pend, n) | |
| 6753 Lisp_Object safe_chars; | |
| 6754 int pos; | |
| 6755 unsigned char *p, *pend; | |
| 6756 int n; | |
| 6757 { | |
| 6758 Lisp_Object pos_list; | |
| 6759 | |
| 6760 pos_list = Qnil; | |
| 6761 while (p < pend) | |
| 6762 { | |
| 6763 int len; | |
| 6764 int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len); | |
| 6765 | |
| 6766 if (c >= 128 | |
| 6767 && (CHAR_TABLE_P (safe_chars) | |
| 6768 ? NILP (CHAR_TABLE_REF (safe_chars, c)) | |
| 6769 : (NILP (safe_chars) || c < 256))) | |
| 6770 { | |
| 6771 pos_list = Fcons (make_number (pos), pos_list); | |
| 6772 if (--n <= 0) | |
| 6773 break; | |
| 6774 } | |
| 6775 pos++; | |
| 6776 p += len; | |
| 6777 } | |
| 6778 return Fnreverse (pos_list); | |
| 6779 } | 7134 } |
| 6780 | 7135 |
| 6781 | 7136 |
| 6782 DEFUN ("unencodable-char-position", Funencodable_char_position, | 7137 DEFUN ("unencodable-char-position", Funencodable_char_position, |
| 6783 Sunencodable_char_position, 3, 5, 0, | 7138 Sunencodable_char_position, 3, 5, 0, |
| 6795 to the string. */) | 7150 to the string. */) |
| 6796 (start, end, coding_system, count, string) | 7151 (start, end, coding_system, count, string) |
| 6797 Lisp_Object start, end, coding_system, count, string; | 7152 Lisp_Object start, end, coding_system, count, string; |
| 6798 { | 7153 { |
| 6799 int n; | 7154 int n; |
| 6800 Lisp_Object safe_chars; | |
| 6801 struct coding_system coding; | 7155 struct coding_system coding; |
| 7156 Lisp_Object attrs, charset_list; | |
| 6802 Lisp_Object positions; | 7157 Lisp_Object positions; |
| 6803 int from, to; | 7158 int from, to; |
| 6804 unsigned char *p, *pend; | 7159 const unsigned char *p, *stop, *pend; |
| 7160 int ascii_compatible; | |
| 7161 | |
| 7162 setup_coding_system (Fcheck_coding_system (coding_system), &coding); | |
| 7163 attrs = CODING_ID_ATTRS (coding.id); | |
| 7164 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text)) | |
| 7165 return Qnil; | |
| 7166 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); | |
| 7167 charset_list = CODING_ATTR_CHARSET_LIST (attrs); | |
| 6805 | 7168 |
| 6806 if (NILP (string)) | 7169 if (NILP (string)) |
| 6807 { | 7170 { |
| 6808 validate_region (&start, &end); | 7171 validate_region (&start, &end); |
| 6809 from = XINT (start); | 7172 from = XINT (start); |
| 6810 to = XINT (end); | 7173 to = XINT (end); |
| 6811 if (NILP (current_buffer->enable_multibyte_characters)) | 7174 if (NILP (current_buffer->enable_multibyte_characters) |
| 7175 || (ascii_compatible | |
| 7176 && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from))))) | |
| 6812 return Qnil; | 7177 return Qnil; |
| 6813 p = CHAR_POS_ADDR (from); | 7178 p = CHAR_POS_ADDR (from); |
| 6814 if (to == GPT) | 7179 pend = CHAR_POS_ADDR (to); |
| 6815 pend = GPT_ADDR; | 7180 if (from < GPT && to >= GPT) |
| 7181 stop = GPT_ADDR; | |
| 6816 else | 7182 else |
| 6817 pend = CHAR_POS_ADDR (to); | 7183 stop = pend; |
| 6818 } | 7184 } |
| 6819 else | 7185 else |
| 6820 { | 7186 { |
| 6821 CHECK_STRING (string); | 7187 CHECK_STRING (string); |
| 6822 CHECK_NATNUM (start); | 7188 CHECK_NATNUM (start); |
| 6827 || to > SCHARS (string)) | 7193 || to > SCHARS (string)) |
| 6828 args_out_of_range_3 (string, start, end); | 7194 args_out_of_range_3 (string, start, end); |
| 6829 if (! STRING_MULTIBYTE (string)) | 7195 if (! STRING_MULTIBYTE (string)) |
| 6830 return Qnil; | 7196 return Qnil; |
| 6831 p = SDATA (string) + string_char_to_byte (string, from); | 7197 p = SDATA (string) + string_char_to_byte (string, from); |
| 6832 pend = SDATA (string) + string_char_to_byte (string, to); | 7198 stop = pend = SDATA (string) + string_char_to_byte (string, to); |
| 6833 } | 7199 if (ascii_compatible && (to - from) == (pend - p)) |
| 6834 | 7200 return Qnil; |
| 6835 setup_coding_system (Fcheck_coding_system (coding_system), &coding); | 7201 } |
| 6836 | 7202 |
| 6837 if (NILP (count)) | 7203 if (NILP (count)) |
| 6838 n = 1; | 7204 n = 1; |
| 6839 else | 7205 else |
| 6840 { | 7206 { |
| 6841 CHECK_NATNUM (count); | 7207 CHECK_NATNUM (count); |
| 6842 n = XINT (count); | 7208 n = XINT (count); |
| 6843 } | 7209 } |
| 6844 | 7210 |
| 6845 if (coding.type == coding_type_no_conversion | 7211 positions = Qnil; |
| 6846 || coding.type == coding_type_raw_text) | 7212 while (1) |
| 6847 return Qnil; | 7213 { |
| 6848 | 7214 int c; |
| 6849 if (coding.type == coding_type_undecided) | 7215 |
| 6850 safe_chars = Qnil; | 7216 if (ascii_compatible) |
| 7217 while (p < stop && ASCII_BYTE_P (*p)) | |
| 7218 p++, from++; | |
| 7219 if (p >= stop) | |
| 7220 { | |
| 7221 if (p >= pend) | |
| 7222 break; | |
| 7223 stop = pend; | |
| 7224 p = GAP_END_ADDR; | |
| 7225 } | |
| 7226 | |
| 7227 c = STRING_CHAR_ADVANCE (p); | |
| 7228 if (! (ASCII_CHAR_P (c) && ascii_compatible) | |
| 7229 && ! char_charset (c, charset_list, NULL)) | |
| 7230 { | |
| 7231 positions = Fcons (make_number (from), positions); | |
| 7232 n--; | |
| 7233 if (n == 0) | |
| 7234 break; | |
| 7235 } | |
| 7236 | |
| 7237 from++; | |
| 7238 } | |
| 7239 | |
| 7240 return (NILP (count) ? Fcar (positions) : Fnreverse (positions)); | |
| 7241 } | |
| 7242 | |
| 7243 | |
| 7244 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region, | |
| 7245 Scheck_coding_systems_region, 3, 3, 0, | |
| 7246 doc: /* Check if the region is encodable by coding systems. | |
| 7247 | |
| 7248 START and END are buffer positions specifying the region. | |
| 7249 CODING-SYSTEM-LIST is a list of coding systems to check. | |
| 7250 | |
| 7251 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where | |
| | 7252 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the |
| 7253 whole region, POS0, POS1, ... are buffer positions where non-encodable | |
| 7254 characters are found. | |
| 7255 | |
| 7256 If all coding systems in CODING-SYSTEM-LIST can encode the region, the | |
| 7257 value is nil. | |
| 7258 | |
| 7259 START may be a string. In that case, check if the string is | |
| 7260 encodable, and the value contains indices to the string instead of | |
| 7261 buffer positions. END is ignored. */) | |
| 7262 (start, end, coding_system_list) | |
| 7263 Lisp_Object start, end, coding_system_list; | |
| 7264 { | |
| 7265 Lisp_Object list; | |
| 7266 EMACS_INT start_byte, end_byte; | |
| 7267 int pos; | |
| 7268 const unsigned char *p, *pbeg, *pend; | |
| 7269 int c; | |
| 7270 Lisp_Object tail, elt; | |
| 7271 | |
| 7272 if (STRINGP (start)) | |
| 7273 { | |
| 7274 if (!STRING_MULTIBYTE (start) | |
| 7275 && SCHARS (start) != SBYTES (start)) | |
| 7276 return Qnil; | |
| 7277 start_byte = 0; | |
| 7278 end_byte = SBYTES (start); | |
| 7279 pos = 0; | |
| 7280 } | |
| 6851 else | 7281 else |
| 6852 safe_chars = coding_safe_chars (coding_system); | 7282 { |
| 6853 | 7283 CHECK_NUMBER_COERCE_MARKER (start); |
| 6854 if (STRINGP (string) | 7284 CHECK_NUMBER_COERCE_MARKER (end); |
| 6855 || from >= GPT || to <= GPT) | 7285 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end)) |
| 6856 positions = unencodable_char_position (safe_chars, from, p, pend, n); | 7286 args_out_of_range (start, end); |
| 7287 if (NILP (current_buffer->enable_multibyte_characters)) | |
| 7288 return Qnil; | |
| 7289 start_byte = CHAR_TO_BYTE (XINT (start)); | |
| 7290 end_byte = CHAR_TO_BYTE (XINT (end)); | |
| 7291 if (XINT (end) - XINT (start) == end_byte - start_byte) | |
| 7292 return Qt; | |
| 7293 | |
| 7294 if (XINT (start) < GPT && XINT (end) > GPT) | |
| 7295 { | |
| 7296 if ((GPT - XINT (start)) < (XINT (end) - GPT)) | |
| 7297 move_gap_both (XINT (start), start_byte); | |
| 7298 else | |
| 7299 move_gap_both (XINT (end), end_byte); | |
| 7300 } | |
| 7301 pos = XINT (start); | |
| 7302 } | |
| 7303 | |
| 7304 list = Qnil; | |
| 7305 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail)) | |
| 7306 { | |
| 7307 elt = XCAR (tail); | |
| 7308 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0), | |
| 7309 Qnil)), | |
| 7310 list); | |
| 7311 } | |
| 7312 | |
| 7313 if (STRINGP (start)) | |
| 7314 p = pbeg = SDATA (start); | |
| 6857 else | 7315 else |
| 6858 { | 7316 p = pbeg = BYTE_POS_ADDR (start_byte); |
| 6859 Lisp_Object args[2]; | 7317 pend = p + (end_byte - start_byte); |
| 6860 | 7318 |
| 6861 args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n); | 7319 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++; |
| 6862 n -= XINT (Flength (args[0])); | 7320 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--; |
| 6863 if (n <= 0) | 7321 |
| 6864 positions = args[0]; | 7322 while (p < pend) |
| 7323 { | |
| 7324 if (ASCII_BYTE_P (*p)) | |
| 7325 p++; | |
| 6865 else | 7326 else |
| 6866 { | 7327 { |
| 6867 args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR, | 7328 c = STRING_CHAR_ADVANCE (p); |
| 6868 pend, n); | 7329 |
| 6869 positions = Fappend (2, args); | 7330 charset_map_loaded = 0; |
| 6870 } | 7331 for (tail = list; CONSP (tail); tail = XCDR (tail)) |
| 6871 } | 7332 { |
| 6872 | 7333 elt = XCDR (XCAR (tail)); |
| 6873 return (NILP (count) ? Fcar (positions) : positions); | 7334 if (! char_encodable_p (c, XCAR (elt))) |
| 6874 } | 7335 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt))); |
| 7336 } | |
| 7337 if (charset_map_loaded) | |
| 7338 { | |
| 7339 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg; | |
| 7340 | |
| 7341 if (STRINGP (start)) | |
| 7342 pbeg = SDATA (start); | |
| 7343 else | |
| 7344 pbeg = BYTE_POS_ADDR (start_byte); | |
| 7345 p = pbeg + p_offset; | |
| 7346 pend = pbeg + pend_offset; | |
| 7347 } | |
| 7348 } | |
| 7349 pos++; | |
| 7350 } | |
| 7351 | |
| 7352 tail = list; | |
| 7353 list = Qnil; | |
| 7354 for (; CONSP (tail); tail = XCDR (tail)) | |
| 7355 { | |
| 7356 elt = XCAR (tail); | |
| 7357 if (CONSP (XCDR (XCDR (elt)))) | |
| 7358 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))), | |
| 7359 list); | |
| 7360 } | |
| 7361 | |
| 7362 return list; | |
| 7363 } | |
| 7364 | |
| 6875 | 7365 |
| 6876 | 7366 |
| 6877 Lisp_Object | 7367 Lisp_Object |
| 6878 code_convert_region1 (start, end, coding_system, encodep) | 7368 code_convert_region (start, end, coding_system, dst_object, encodep, norecord) |
| 6879 Lisp_Object start, end, coding_system; | 7369 Lisp_Object start, end, coding_system, dst_object; |
| 6880 int encodep; | 7370 int encodep, norecord; |
| 6881 { | 7371 { |
| 6882 struct coding_system coding; | 7372 struct coding_system coding; |
| 6883 int from, to; | 7373 EMACS_INT from, from_byte, to, to_byte; |
| 7374 Lisp_Object src_object; | |
| 6884 | 7375 |
| 6885 CHECK_NUMBER_COERCE_MARKER (start); | 7376 CHECK_NUMBER_COERCE_MARKER (start); |
| 6886 CHECK_NUMBER_COERCE_MARKER (end); | 7377 CHECK_NUMBER_COERCE_MARKER (end); |
| 6887 CHECK_SYMBOL (coding_system); | 7378 if (NILP (coding_system)) |
| 7379 coding_system = Qno_conversion; | |
| 7380 else | |
| 7381 CHECK_CODING_SYSTEM (coding_system); | |
| 7382 src_object = Fcurrent_buffer (); | |
| 7383 if (NILP (dst_object)) | |
| 7384 dst_object = src_object; | |
| 7385 else if (! EQ (dst_object, Qt)) | |
| 7386 CHECK_BUFFER (dst_object); | |
| 6888 | 7387 |
| 6889 validate_region (&start, &end); | 7388 validate_region (&start, &end); |
| 6890 from = XFASTINT (start); | 7389 from = XFASTINT (start); |
| 7390 from_byte = CHAR_TO_BYTE (from); | |
| 6891 to = XFASTINT (end); | 7391 to = XFASTINT (end); |
| 6892 | 7392 to_byte = CHAR_TO_BYTE (to); |
| 6893 if (NILP (coding_system)) | 7393 |
| 6894 return make_number (to - from); | 7394 setup_coding_system (coding_system, &coding); |
| 6895 | |
| 6896 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) | |
| 6897 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system))); | |
| 6898 | |
| 6899 coding.mode |= CODING_MODE_LAST_BLOCK; | 7395 coding.mode |= CODING_MODE_LAST_BLOCK; |
| 6900 coding.src_multibyte = coding.dst_multibyte | 7396 |
| 6901 = !NILP (current_buffer->enable_multibyte_characters); | 7397 if (encodep) |
| 6902 code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to), | 7398 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte, |
| 6903 &coding, encodep, 1); | 7399 dst_object); |
| 6904 Vlast_coding_system_used = coding.symbol; | 7400 else |
| 6905 return make_number (coding.produced_char); | 7401 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte, |
| 6906 } | 7402 dst_object); |
| 7403 if (! norecord) | |
| 7404 Vlast_coding_system_used = CODING_ID_NAME (coding.id); | |
| 7405 | |
| 7406 if (coding.result != CODING_RESULT_SUCCESS) | |
| 7407 error ("Code conversion error: %d", coding.result); | |
| 7408 | |
| 7409 return (BUFFERP (dst_object) | |
| 7410 ? make_number (coding.produced_char) | |
| 7411 : coding.dst_object); | |
| 7412 } | |
| 7413 | |
| 6907 | 7414 |
| 6908 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region, | 7415 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region, |
| 6909 3, 3, "r\nzCoding system: ", | 7416 3, 4, "r\nzCoding system: ", |
| 6910 doc: /* Decode the current region from the specified coding system. | 7417 doc: /* Decode the current region from the specified coding system. |
| 6911 When called from a program, takes three arguments: | 7418 When called from a program, takes four arguments: |
| 6912 START, END, and CODING-SYSTEM. START and END are buffer positions. | 7419 START, END, CODING-SYSTEM, and DESTINATION. |
| 7420 START and END are buffer positions. | |
| 7421 | |
| 7422 Optional 4th argument DESTINATION specifies where the decoded text goes. | |
| 7423 If nil, the region between START and END is replaced by the decoded text. | |
| 7424 If buffer, the decoded text is inserted in the buffer. | |
| 7425 If t, the decoded text is returned. | |
| 7426 | |
| 6913 This function sets `last-coding-system-used' to the precise coding system | 7427 This function sets `last-coding-system-used' to the precise coding system |
| 6914 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is | 7428 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is |
| 6915 not fully specified.) | 7429 not fully specified.) |
| 6916 It returns the length of the decoded text. */) | 7430 It returns the length of the decoded text. */) |
| 6917 (start, end, coding_system) | 7431 (start, end, coding_system, destination) |
| 6918 Lisp_Object start, end, coding_system; | 7432 Lisp_Object start, end, coding_system, destination; |
| 6919 { | 7433 { |
| 6920 return code_convert_region1 (start, end, coding_system, 0); | 7434 return code_convert_region (start, end, coding_system, destination, 0, 0); |
| 6921 } | 7435 } |
| 6922 | 7436 |
| 6923 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region, | 7437 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region, |
| 6924 3, 3, "r\nzCoding system: ", | 7438 3, 4, "r\nzCoding system: ", |
| 6925 doc: /* Encode the current region into the specified coding system. | 7439 doc: /* Encode the current region by specified coding system. |
| 6926 When called from a program, takes three arguments: | 7440 When called from a program, takes four arguments: |
| 6927 START, END, and CODING-SYSTEM. START and END are buffer positions. | 7441 START, END, CODING-SYSTEM, and DESTINATION. START and END are buffer positions. |
| 7442 | |
| 7443 Optional 4th argument DESTINATION specifies where the encoded text goes. | |
| 7444 If nil, the region between START and END is replaced by the encoded text. | |
| 7445 If buffer, the encoded text is inserted in the buffer. | |
| 7446 If t, the encoded text is returned. | |
| 7447 | |
| 6928 This function sets `last-coding-system-used' to the precise coding system | 7448 This function sets `last-coding-system-used' to the precise coding system |
| 6929 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is | 7449 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is |
| 6930 not fully specified.) | 7450 not fully specified.) |
| 6931 It returns the length of the encoded text. */) | 7451 It returns the length of the encoded text. */) |
| 6932 (start, end, coding_system) | 7452 (start, end, coding_system, destination) |
| 6933 Lisp_Object start, end, coding_system; | 7453 Lisp_Object start, end, coding_system, destination; |
| 6934 { | 7454 { |
| 6935 return code_convert_region1 (start, end, coding_system, 1); | 7455 return code_convert_region (start, end, coding_system, destination, 1, 0); |
| 6936 } | 7456 } |
| 6937 | 7457 |
| 6938 Lisp_Object | 7458 Lisp_Object |
| 6939 code_convert_string1 (string, coding_system, nocopy, encodep) | 7459 code_convert_string (string, coding_system, dst_object, |
| 6940 Lisp_Object string, coding_system, nocopy; | 7460 encodep, nocopy, norecord) |
| 6941 int encodep; | 7461 Lisp_Object string, coding_system, dst_object; |
| 7462 int encodep, nocopy, norecord; | |
| 6942 { | 7463 { |
| 6943 struct coding_system coding; | 7464 struct coding_system coding; |
| 7465 EMACS_INT chars, bytes; | |
| 6944 | 7466 |
| 6945 CHECK_STRING (string); | 7467 CHECK_STRING (string); |
| 6946 CHECK_SYMBOL (coding_system); | |
| 6947 | |
| 6948 if (NILP (coding_system)) | 7468 if (NILP (coding_system)) |
| 6949 return (NILP (nocopy) ? Fcopy_sequence (string) : string); | 7469 { |
| 6950 | 7470 if (! norecord) |
| 6951 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) | 7471 Vlast_coding_system_used = Qno_conversion; |
| 6952 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system))); | 7472 if (NILP (dst_object)) |
| 6953 | 7473 return (nocopy ? Fcopy_sequence (string) : string); |
| 7474 } | |
| 7475 | |
| 7476 if (NILP (coding_system)) | |
| 7477 coding_system = Qno_conversion; | |
| 7478 else | |
| 7479 CHECK_CODING_SYSTEM (coding_system); | |
| 7480 if (NILP (dst_object)) | |
| 7481 dst_object = Qt; | |
| 7482 else if (! EQ (dst_object, Qt)) | |
| 7483 CHECK_BUFFER (dst_object); | |
| 7484 | |
| 7485 setup_coding_system (coding_system, &coding); | |
| 6954 coding.mode |= CODING_MODE_LAST_BLOCK; | 7486 coding.mode |= CODING_MODE_LAST_BLOCK; |
| 6955 string = (encodep | 7487 chars = SCHARS (string); |
| 6956 ? encode_coding_string (string, &coding, !NILP (nocopy)) | 7488 bytes = SBYTES (string); |
| 6957 : decode_coding_string (string, &coding, !NILP (nocopy))); | 7489 if (encodep) |
| 6958 Vlast_coding_system_used = coding.symbol; | 7490 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object); |
| 6959 | 7491 else |
| 6960 return string; | 7492 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object); |
| 6961 } | 7493 if (! norecord) |
| 6962 | 7494 Vlast_coding_system_used = CODING_ID_NAME (coding.id); |
| 6963 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string, | 7495 |
| 6964 2, 3, 0, | 7496 if (coding.result != CODING_RESULT_SUCCESS) |
| 6965 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result. | 7497 error ("Code conversion error: %d", coding.result); |
| 6966 Optional arg NOCOPY non-nil means it is OK to return STRING itself | 7498 |
| 6967 if the decoding operation is trivial. | 7499 return (BUFFERP (dst_object) |
| 6968 This function sets `last-coding-system-used' to the precise coding system | 7500 ? make_number (coding.produced_char) |
| 6969 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is | 7501 : coding.dst_object); |
| 6970 not fully specified.) */) | 7502 } |
| 6971 (string, coding_system, nocopy) | 7503 |
| 6972 Lisp_Object string, coding_system, nocopy; | |
| 6973 { | |
| 6974 return code_convert_string1 (string, coding_system, nocopy, 0); | |
| 6975 } | |
| 6976 | |
| 6977 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string, | |
| 6978 2, 3, 0, | |
| 6979 doc: /* Encode STRING to CODING-SYSTEM, and return the result. | |
| 6980 Optional arg NOCOPY non-nil means it is OK to return STRING itself | |
| 6981 if the encoding operation is trivial. | |
| 6982 This function sets `last-coding-system-used' to the precise coding system | |
| 6983 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is | |
| 6984 not fully specified.) */) | |
| 6985 (string, coding_system, nocopy) | |
| 6986 Lisp_Object string, coding_system, nocopy; | |
| 6987 { | |
| 6988 return code_convert_string1 (string, coding_system, nocopy, 1); | |
| 6989 } | |
| 6990 | 7504 |
| 6991 /* Encode or decode STRING according to CODING_SYSTEM. | 7505 /* Encode or decode STRING according to CODING_SYSTEM. |
| 6992 Do not set Vlast_coding_system_used. | 7506 Do not set Vlast_coding_system_used. |
| 6993 | 7507 |
| 6994 This function is called only from macros DECODE_FILE and | 7508 This function is called only from macros DECODE_FILE and |
| 6997 Lisp_Object | 7511 Lisp_Object |
| 6998 code_convert_string_norecord (string, coding_system, encodep) | 7512 code_convert_string_norecord (string, coding_system, encodep) |
| 6999 Lisp_Object string, coding_system; | 7513 Lisp_Object string, coding_system; |
| 7000 int encodep; | 7514 int encodep; |
| 7001 { | 7515 { |
| 7002 struct coding_system coding; | 7516 return code_convert_string (string, coding_system, Qt, encodep, 0, 1); |
| 7003 | 7517 } |
| 7004 CHECK_STRING (string); | 7518 |
| 7005 CHECK_SYMBOL (coding_system); | 7519 |
| 7006 | 7520 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string, |
| 7007 if (NILP (coding_system)) | 7521 2, 4, 0, |
| 7008 return string; | 7522 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result. |
| 7009 | 7523 |
| 7010 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0) | 7524 Optional third arg NOCOPY non-nil means it is OK to return STRING itself |
| 7011 error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system))); | 7525 if the decoding operation is trivial. |
| 7012 | 7526 |
| 7013 coding.composing = COMPOSITION_DISABLED; | 7527 Optional fourth arg BUFFER non-nil means that the decoded text is |
| 7014 coding.mode |= CODING_MODE_LAST_BLOCK; | 7528 inserted in BUFFER instead of returned as a string. In this case, |
| 7015 return (encodep | 7529 the return value is the length of the decoded text. |
| 7016 ? encode_coding_string (string, &coding, 1) | 7530 |
| 7017 : decode_coding_string (string, &coding, 1)); | 7531 This function sets `last-coding-system-used' to the precise coding system |
| 7018 } | 7532 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is |
| 7533 not fully specified.) */) | |
| 7534 (string, coding_system, nocopy, buffer) | |
| 7535 Lisp_Object string, coding_system, nocopy, buffer; | |
| 7536 { | |
| 7537 return code_convert_string (string, coding_system, buffer, | |
| 7538 0, ! NILP (nocopy), 0); | |
| 7539 } | |
| 7540 | |
| 7541 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string, | |
| 7542 2, 4, 0, | |
| 7543 doc: /* Encode STRING to CODING-SYSTEM, and return the result. | |
| 7544 | |
| 7545 Optional third arg NOCOPY non-nil means it is OK to return STRING | |
| 7546 itself if the encoding operation is trivial. | |
| 7547 | |
| 7548 Optional fourth arg BUFFER non-nil means that the encoded text is | |
| 7549 inserted in BUFFER instead of returned as a string. In this case, | |
| 7550 the return value is the length of the encoded text. | |
| 7551 | |
| 7552 This function sets `last-coding-system-used' to the precise coding system | |
| 7553 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is | |
| 7554 not fully specified.) */) | |
| 7555 (string, coding_system, nocopy, buffer) | |
| 7556 Lisp_Object string, coding_system, nocopy, buffer; | |
| 7557 { | |
| 7558 return code_convert_string (string, coding_system, buffer, | |
| 7559 1, ! NILP (nocopy), 1); | |
| 7560 } | |
| 7561 | |
| 7019 | 7562 |
| 7020 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0, | 7563 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0, |
| 7021 doc: /* Decode a Japanese character which has CODE in shift_jis encoding. | 7564 doc: /* Decode a Japanese character which has CODE in shift_jis encoding. |
| 7022 Return the corresponding character. */) | 7565 Return the corresponding character. */) |
| 7023 (code) | 7566 (code) |
| 7024 Lisp_Object code; | 7567 Lisp_Object code; |
| 7025 { | 7568 { |
| 7026 unsigned char c1, c2, s1, s2; | 7569 Lisp_Object spec, attrs, val; |
| 7027 Lisp_Object val; | 7570 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset; |
| 7028 | 7571 int c; |
| 7029 CHECK_NUMBER (code); | 7572 |
| 7030 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF; | 7573 CHECK_NATNUM (code); |
| 7031 if (s1 == 0) | 7574 c = XFASTINT (code); |
| 7032 { | 7575 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec); |
| 7033 if (s2 < 0x80) | 7576 attrs = AREF (spec, 0); |
| 7034 XSETFASTINT (val, s2); | 7577 |
| 7035 else if (s2 >= 0xA0 || s2 <= 0xDF) | 7578 if (ASCII_BYTE_P (c) |
| 7036 XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0)); | 7579 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) |
| 7037 else | 7580 return code; |
| 7038 error ("Invalid Shift JIS code: %x", XFASTINT (code)); | 7581 |
| 7582 val = CODING_ATTR_CHARSET_LIST (attrs); | |
| 7583 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | |
| 7584 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | |
| 7585 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))); | |
| 7586 | |
| 7587 if (c <= 0x7F) | |
| 7588 charset = charset_roman; | |
| 7589 else if (c >= 0xA0 && c < 0xDF) | |
| 7590 { | |
| 7591 charset = charset_kana; | |
| 7592 c -= 0x80; | |
| 7039 } | 7593 } |
| 7040 else | 7594 else |
| 7041 { | 7595 { |
| 7042 if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF) | 7596 int s1 = c >> 8, s2 = c & 0xFF; |
| 7043 || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)) | 7597 |
| 7044 error ("Invalid Shift JIS code: %x", XFASTINT (code)); | 7598 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF |
| 7045 DECODE_SJIS (s1, s2, c1, c2); | 7599 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC) |
| 7046 XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2)); | 7600 error ("Invalid code: %d", code); |
| 7047 } | 7601 SJIS_TO_JIS (c); |
| 7048 return val; | 7602 charset = charset_kanji; |
| 7049 } | 7603 } |
| 7604 c = DECODE_CHAR (charset, c); | |
| 7605 if (c < 0) | |
| 7606 error ("Invalid code: %d", code); | |
| 7607 return make_number (c); | |
| 7608 } | |
| 7609 | |
| 7050 | 7610 |
| 7051 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0, | 7611 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0, |
| 7052 doc: /* Encode a Japanese character CHAR to shift_jis encoding. | 7612 doc: /* Encode a Japanese character CHAR to shift_jis encoding. |
| 7053 Return the corresponding code in SJIS. */) | 7613 Return the corresponding code in SJIS. */) |
| 7054 (ch) | 7614 (ch) |
| 7055 Lisp_Object ch; | 7615 Lisp_Object ch; |
| 7056 { | 7616 { |
| 7057 int charset, c1, c2, s1, s2; | 7617 Lisp_Object spec, attrs, charset_list; |
| 7058 Lisp_Object val; | 7618 int c; |
| 7059 | 7619 struct charset *charset; |
| 7060 CHECK_NUMBER (ch); | 7620 unsigned code; |
| 7061 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2); | 7621 |
| 7062 if (charset == CHARSET_ASCII) | 7622 CHECK_CHARACTER (ch); |
| 7063 { | 7623 c = XFASTINT (ch); |
| 7064 val = ch; | 7624 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec); |
| 7065 } | 7625 attrs = AREF (spec, 0); |
| 7066 else if (charset == charset_jisx0208 | 7626 |
| 7067 && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F) | 7627 if (ASCII_CHAR_P (c) |
| 7068 { | 7628 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) |
| 7069 ENCODE_SJIS (c1, c2, s1, s2); | 7629 return ch; |
| 7070 XSETFASTINT (val, (s1 << 8) | s2); | 7630 |
| 7071 } | 7631 charset_list = CODING_ATTR_CHARSET_LIST (attrs); |
| 7072 else if (charset == charset_katakana_jisx0201 | 7632 charset = char_charset (c, charset_list, &code); |
| 7073 && c1 > 0x20 && c2 < 0xE0) | 7633 if (code == CHARSET_INVALID_CODE (charset)) |
| 7074 { | 7634 error ("Can't encode by shift_jis encoding: %d", c); |
| 7075 XSETFASTINT (val, c1 | 0x80); | 7635 JIS_TO_SJIS (code); |
| 7076 } | 7636 |
| 7077 else | 7637 return make_number (code); |
| 7078 error ("Can't encode to shift_jis: %d", XFASTINT (ch)); | |
| 7079 return val; | |
| 7080 } | 7638 } |
| 7081 | 7639 |
| 7082 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0, | 7640 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0, |
| 7083 doc: /* Decode a Big5 character which has CODE in BIG5 coding system. | 7641 doc: /* Decode a Big5 character which has CODE in BIG5 coding system. |
| 7084 Return the corresponding character. */) | 7642 Return the corresponding character. */) |
| 7085 (code) | 7643 (code) |
| 7086 Lisp_Object code; | 7644 Lisp_Object code; |
| 7087 { | 7645 { |
| 7088 int charset; | 7646 Lisp_Object spec, attrs, val; |
| 7089 unsigned char b1, b2, c1, c2; | 7647 struct charset *charset_roman, *charset_big5, *charset; |
| 7090 Lisp_Object val; | 7648 int c; |
| 7091 | 7649 |
| 7092 CHECK_NUMBER (code); | 7650 CHECK_NATNUM (code); |
| 7093 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF; | 7651 c = XFASTINT (code); |
| 7094 if (b1 == 0) | 7652 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec); |
| 7095 { | 7653 attrs = AREF (spec, 0); |
| 7096 if (b2 >= 0x80) | 7654 |
| 7097 error ("Invalid BIG5 code: %x", XFASTINT (code)); | 7655 if (ASCII_BYTE_P (c) |
| 7098 val = code; | 7656 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) |
| 7099 } | 7657 return code; |
| 7658 | |
| 7659 val = CODING_ATTR_CHARSET_LIST (attrs); | |
| 7660 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | |
| 7661 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val))); | |
| 7662 | |
| 7663 if (c <= 0x7F) | |
| 7664 charset = charset_roman; | |
| 7100 else | 7665 else |
| 7101 { | 7666 { |
| 7102 if ((b1 < 0xA1 || b1 > 0xFE) | 7667 int b1 = c >> 8, b2 = c & 0x7F; |
| 7103 || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)) | 7668 if (b1 < 0xA1 || b1 > 0xFE |
| 7104 error ("Invalid BIG5 code: %x", XFASTINT (code)); | 7669 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE) |
| 7105 DECODE_BIG5 (b1, b2, charset, c1, c2); | 7670 error ("Invalid code: %d", code); |
| 7106 XSETFASTINT (val, MAKE_CHAR (charset, c1, c2)); | 7671 charset = charset_big5; |
| 7107 } | 7672 } |
| 7108 return val; | 7673 c = DECODE_CHAR (charset, (unsigned )c); |
| 7674 if (c < 0) | |
| 7675 error ("Invalid code: %d", code); | |
| 7676 return make_number (c); | |
| 7109 } | 7677 } |
| 7110 | 7678 |
| 7111 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0, | 7679 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0, |
| 7112 doc: /* Encode the Big5 character CHAR to BIG5 coding system. | 7680 doc: /* Encode the Big5 character CHAR to BIG5 coding system. |
| 7113 Return the corresponding character code in Big5. */) | 7681 Return the corresponding character code in Big5. */) |
| 7114 (ch) | 7682 (ch) |
| 7115 Lisp_Object ch; | 7683 Lisp_Object ch; |
| 7116 { | 7684 { |
| 7117 int charset, c1, c2, b1, b2; | 7685 Lisp_Object spec, attrs, charset_list; |
| 7118 Lisp_Object val; | 7686 struct charset *charset; |
| 7119 | 7687 int c; |
| 7120 CHECK_NUMBER (ch); | 7688 unsigned code; |
| 7121 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2); | 7689 |
| 7122 if (charset == CHARSET_ASCII) | 7690 CHECK_CHARACTER (ch); |
| 7123 { | 7691 c = XFASTINT (ch); |
| 7124 val = ch; | 7692 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec); |
| 7125 } | 7693 attrs = AREF (spec, 0); |
| 7126 else if ((charset == charset_big5_1 | 7694 if (ASCII_CHAR_P (c) |
| 7127 && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec)) | 7695 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) |
| 7128 || (charset == charset_big5_2 | 7696 return ch; |
| 7129 && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2)) | 7697 |
| 7130 { | 7698 charset_list = CODING_ATTR_CHARSET_LIST (attrs); |
| 7131 ENCODE_BIG5 (charset, c1, c2, b1, b2); | 7699 charset = char_charset (c, charset_list, &code); |
| 7132 XSETFASTINT (val, (b1 << 8) | b2); | 7700 if (code == CHARSET_INVALID_CODE (charset)) |
| 7133 } | 7701 error ("Can't encode by Big5 encoding: %d", c); |
| 7134 else | 7702 |
| 7135 error ("Can't encode to Big5: %d", XFASTINT (ch)); | 7703 return make_number (code); |
| 7136 return val; | 7704 } |
| 7137 } | 7705 |
| 7138 | 7706 |
| 7139 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal, | 7707 DEFUN ("set-terminal-coding-system-internal", |
| 7708 Fset_terminal_coding_system_internal, | |
| 7140 Sset_terminal_coding_system_internal, 1, 1, 0, | 7709 Sset_terminal_coding_system_internal, 1, 1, 0, |
| 7141 doc: /* Internal use only. */) | 7710 doc: /* Internal use only. */) |
| 7142 (coding_system) | 7711 (coding_system) |
| 7143 Lisp_Object coding_system; | 7712 Lisp_Object coding_system; |
| 7144 { | 7713 { |
| 7145 CHECK_SYMBOL (coding_system); | 7714 CHECK_SYMBOL (coding_system); |
| 7146 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding); | 7715 setup_coding_system (Fcheck_coding_system (coding_system), |
| 7716 &terminal_coding); | |
| 7717 | |
| 7147 /* We had better not send unsafe characters to terminal. */ | 7718 /* We had better not send unsafe characters to terminal. */ |
| 7148 terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR; | 7719 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING; |
| 7149 /* Character composition should be disabled. */ | 7720 /* Character composition should be disabled. */ |
| 7150 terminal_coding.composing = COMPOSITION_DISABLED; | 7721 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; |
| 7151 /* Error notification should be suppressed. */ | |
| 7152 terminal_coding.suppress_error = 1; | |
| 7153 terminal_coding.src_multibyte = 1; | 7722 terminal_coding.src_multibyte = 1; |
| 7154 terminal_coding.dst_multibyte = 0; | 7723 terminal_coding.dst_multibyte = 0; |
| 7155 return Qnil; | 7724 return Qnil; |
| 7156 } | 7725 } |
| 7157 | 7726 |
| 7158 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal, | 7727 DEFUN ("set-safe-terminal-coding-system-internal", |
| 7728 Fset_safe_terminal_coding_system_internal, | |
| 7159 Sset_safe_terminal_coding_system_internal, 1, 1, 0, | 7729 Sset_safe_terminal_coding_system_internal, 1, 1, 0, |
| 7160 doc: /* Internal use only. */) | 7730 doc: /* Internal use only. */) |
| 7161 (coding_system) | 7731 (coding_system) |
| 7162 Lisp_Object coding_system; | 7732 Lisp_Object coding_system; |
| 7163 { | 7733 { |
| 7164 CHECK_SYMBOL (coding_system); | 7734 CHECK_SYMBOL (coding_system); |
| 7165 setup_coding_system (Fcheck_coding_system (coding_system), | 7735 setup_coding_system (Fcheck_coding_system (coding_system), |
| 7166 &safe_terminal_coding); | 7736 &safe_terminal_coding); |
| 7167 /* Character composition should be disabled. */ | 7737 /* Character composition should be disabled. */ |
| 7168 safe_terminal_coding.composing = COMPOSITION_DISABLED; | 7738 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; |
| 7169 /* Error notification should be suppressed. */ | |
| 7170 terminal_coding.suppress_error = 1; | |
| 7171 safe_terminal_coding.src_multibyte = 1; | 7739 safe_terminal_coding.src_multibyte = 1; |
| 7172 safe_terminal_coding.dst_multibyte = 0; | 7740 safe_terminal_coding.dst_multibyte = 0; |
| 7173 return Qnil; | 7741 return Qnil; |
| 7174 } | 7742 } |
| 7175 | 7743 |
| 7176 DEFUN ("terminal-coding-system", Fterminal_coding_system, | 7744 DEFUN ("terminal-coding-system", |
| 7177 Sterminal_coding_system, 0, 0, 0, | 7745 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0, |
| 7178 doc: /* Return coding system specified for terminal output. */) | 7746 doc: /* Return coding system specified for terminal output. */) |
| 7179 () | 7747 () |
| 7180 { | 7748 { |
| 7181 return terminal_coding.symbol; | 7749 return CODING_ID_NAME (terminal_coding.id); |
| 7182 } | 7750 } |
| 7183 | 7751 |
| 7184 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal, | 7752 DEFUN ("set-keyboard-coding-system-internal", |
| 7753 Fset_keyboard_coding_system_internal, | |
| 7185 Sset_keyboard_coding_system_internal, 1, 1, 0, | 7754 Sset_keyboard_coding_system_internal, 1, 1, 0, |
| 7186 doc: /* Internal use only. */) | 7755 doc: /* Internal use only. */) |
| 7187 (coding_system) | 7756 (coding_system) |
| 7188 Lisp_Object coding_system; | 7757 Lisp_Object coding_system; |
| 7189 { | 7758 { |
| 7190 CHECK_SYMBOL (coding_system); | 7759 CHECK_SYMBOL (coding_system); |
| 7191 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding); | 7760 setup_coding_system (Fcheck_coding_system (coding_system), |
| 7192 /* Character composition should be disabled. */ | 7761 &keyboard_coding); |
| 7193 keyboard_coding.composing = COMPOSITION_DISABLED; | 7762 /* Character composition should be disabled. */ |
| 7763 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; | |
| 7194 return Qnil; | 7764 return Qnil; |
| 7195 } | 7765 } |
| 7196 | 7766 |
| 7197 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system, | 7767 DEFUN ("keyboard-coding-system", |
| 7198 Skeyboard_coding_system, 0, 0, 0, | 7768 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0, |
| 7199 doc: /* Return coding system specified for decoding keyboard input. */) | 7769 doc: /* Return coding system specified for decoding keyboard input. */) |
| 7200 () | 7770 () |
| 7201 { | 7771 { |
| 7202 return keyboard_coding.symbol; | 7772 return CODING_ID_NAME (keyboard_coding.id); |
| 7203 } | 7773 } |
| 7204 | 7774 |
| 7205 | 7775 |
| 7206 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system, | 7776 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system, |
| 7207 Sfind_operation_coding_system, 1, MANY, 0, | 7777 Sfind_operation_coding_system, 1, MANY, 0, |
| 7245 if (nargs < 2) | 7815 if (nargs < 2) |
| 7246 error ("Too few arguments"); | 7816 error ("Too few arguments"); |
| 7247 operation = args[0]; | 7817 operation = args[0]; |
| 7248 if (!SYMBOLP (operation) | 7818 if (!SYMBOLP (operation) |
| 7249 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))) | 7819 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))) |
| 7250 error ("Invalid first argument"); | 7820 error ("Invalid first arguement"); |
| 7251 if (nargs < 1 + XINT (target_idx)) | 7821 if (nargs < 1 + XINT (target_idx)) |
| 7252 error ("Too few arguments for operation: %s", | 7822 error ("Too few arguments for operation: %s", |
| 7253 SDATA (SYMBOL_NAME (operation))); | 7823 SDATA (SYMBOL_NAME (operation))); |
| 7254 /* For write-region, if the 6th argument (i.e. VISIT, the 5th | |
| 7255 argument to write-region) is string, it must be treated as a | |
| 7256 target file name. */ | |
| 7257 if (EQ (operation, Qwrite_region) | |
| 7258 && nargs > 5 | |
| 7259 && STRINGP (args[5])) | |
| 7260 target_idx = make_number (4); | |
| 7261 target = args[XINT (target_idx) + 1]; | 7824 target = args[XINT (target_idx) + 1]; |
| 7262 if (!(STRINGP (target) | 7825 if (!(STRINGP (target) |
| 7263 || (EQ (operation, Qopen_network_stream) && INTEGERP (target)))) | 7826 || (EQ (operation, Qopen_network_stream) && INTEGERP (target)))) |
| 7264 error ("Invalid argument %d", XINT (target_idx) + 1); | 7827 error ("Invalid %dth argument", XINT (target_idx) + 1); |
| 7265 | 7828 |
| 7266 chain = ((EQ (operation, Qinsert_file_contents) | 7829 chain = ((EQ (operation, Qinsert_file_contents) |
| 7267 || EQ (operation, Qwrite_region)) | 7830 || EQ (operation, Qwrite_region)) |
| 7268 ? Vfile_coding_system_alist | 7831 ? Vfile_coding_system_alist |
| 7269 : (EQ (operation, Qopen_network_stream) | 7832 : (EQ (operation, Qopen_network_stream) |
| 7273 return Qnil; | 7836 return Qnil; |
| 7274 | 7837 |
| 7275 for (; CONSP (chain); chain = XCDR (chain)) | 7838 for (; CONSP (chain); chain = XCDR (chain)) |
| 7276 { | 7839 { |
| 7277 Lisp_Object elt; | 7840 Lisp_Object elt; |
| 7841 | |
| 7278 elt = XCAR (chain); | 7842 elt = XCAR (chain); |
| 7279 | |
| 7280 if (CONSP (elt) | 7843 if (CONSP (elt) |
| 7281 && ((STRINGP (target) | 7844 && ((STRINGP (target) |
| 7282 && STRINGP (XCAR (elt)) | 7845 && STRINGP (XCAR (elt)) |
| 7283 && fast_string_match (XCAR (elt), target) >= 0) | 7846 && fast_string_match (XCAR (elt), target) >= 0) |
| 7284 || (INTEGERP (target) && EQ (target, XCAR (elt))))) | 7847 || (INTEGERP (target) && EQ (target, XCAR (elt))))) |
| 7304 } | 7867 } |
| 7305 } | 7868 } |
| 7306 return Qnil; | 7869 return Qnil; |
| 7307 } | 7870 } |
| 7308 | 7871 |
| 7309 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal, | 7872 DEFUN ("set-coding-system-priority", Fset_coding_system_priority, |
| 7310 Supdate_coding_systems_internal, 0, 0, 0, | 7873 Sset_coding_system_priority, 0, MANY, 0, |
| 7311 doc: /* Update internal database for ISO2022 and CCL based coding systems. | 7874 doc: /* Assign higher priority to the coding systems given as arguments. |
| 7312 When values of any coding categories are changed, you must | 7875 If multiple coding systems belong to the same category, |
| 7313 call this function. */) | 7876 all but the first one are ignored. */) |
| 7314 () | 7877 (nargs, args) |
| 7878 int nargs; | |
| 7879 Lisp_Object *args; | |
| 7880 { | |
| 7881 int i, j; | |
| 7882 int changed[coding_category_max]; | |
| 7883 enum coding_category priorities[coding_category_max]; | |
| 7884 | |
| 7885 bzero (changed, sizeof changed); | |
| 7886 | |
| 7887 for (i = j = 0; i < nargs; i++) | |
| 7888 { | |
| 7889 enum coding_category category; | |
| 7890 Lisp_Object spec, attrs; | |
| 7891 | |
| 7892 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec); | |
| 7893 attrs = AREF (spec, 0); | |
| 7894 category = XINT (CODING_ATTR_CATEGORY (attrs)); | |
| 7895 if (changed[category]) | |
| 7896 /* Ignore this coding system because a coding system of the | |
| 7897 same category already had a higher priority. */ | |
| 7898 continue; | |
| 7899 changed[category] = 1; | |
| 7900 priorities[j++] = category; | |
| 7901 if (coding_categories[category].id >= 0 | |
| 7902 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id))) | |
| 7903 setup_coding_system (args[i], &coding_categories[category]); | |
| 7904 Fset (AREF (Vcoding_category_table, category), args[i]); | |
| 7905 } | |
| 7906 | |
| 7907 /* The top J priorities are now decided. Fill the remaining slots | |
| 7908 with the other categories, keeping their original relative order. */ | |
| 7909 | |
| 7910 for (i = j, j = 0; i < coding_category_max; i++, j++) | |
| 7911 { | |
| 7912 while (j < coding_category_max | |
| 7913 && changed[coding_priorities[j]]) | |
| 7914 j++; | |
| 7915 if (j == coding_category_max) | |
| 7916 abort (); | |
| 7917 priorities[i] = coding_priorities[j]; | |
| 7918 } | |
| 7919 | |
| 7920 bcopy (priorities, coding_priorities, sizeof priorities); | |
| 7921 | |
| 7922 /* Update `coding-category-list'. */ | |
| 7923 Vcoding_category_list = Qnil; | |
| 7924 for (i = coding_category_max - 1; i >= 0; i--) | |
| 7925 Vcoding_category_list | |
| 7926 = Fcons (AREF (Vcoding_category_table, priorities[i]), | |
| 7927 Vcoding_category_list); | |
| 7928 | |
| 7929 return Qnil; | |
| 7930 } | |
| 7931 | |
| 7932 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list, | |
| 7933 Scoding_system_priority_list, 0, 1, 0, | |
| 7934 doc: /* Return a list of coding systems ordered by their priorities. | |
| 7935 HIGHESTP non-nil means just return the highest priority one. */) | |
| 7936 (highestp) | |
| 7937 Lisp_Object highestp; | |
| 7315 { | 7938 { |
| 7316 int i; | 7939 int i; |
| 7317 | 7940 Lisp_Object val; |
| 7318 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++) | 7941 |
| 7319 { | 7942 for (i = 0, val = Qnil; i < coding_category_max; i++) |
| 7320 Lisp_Object val; | 7943 { |
| 7321 | 7944 enum coding_category category = coding_priorities[i]; |
| 7322 val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]); | 7945 int id = coding_categories[category].id; |
| 7323 if (!NILP (val)) | 7946 Lisp_Object attrs; |
| 7324 { | 7947 |
| 7325 if (! coding_system_table[i]) | 7948 if (id < 0) |
| 7326 coding_system_table[i] = ((struct coding_system *) | 7949 continue; |
| 7327 xmalloc (sizeof (struct coding_system))); | 7950 attrs = CODING_ID_ATTRS (id); |
| 7328 setup_coding_system (val, coding_system_table[i]); | 7951 if (! NILP (highestp)) |
| 7329 } | 7952 return CODING_ATTR_BASE_NAME (attrs); |
| 7330 else if (coding_system_table[i]) | 7953 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val); |
| 7331 { | 7954 } |
| 7332 xfree (coding_system_table[i]); | 7955 return Fnreverse (val); |
| 7333 coding_system_table[i] = NULL; | 7956 } |
| 7334 } | 7957 |
| 7335 } | 7958 static char *suffixes[] = { "-unix", "-dos", "-mac" }; |
| 7959 | |
| 7960 static Lisp_Object | |
| 7961 make_subsidiaries (base) | |
| 7962 Lisp_Object base; | |
| 7963 { | |
| 7964 Lisp_Object subsidiaries; | |
| 7965 int base_name_len = SBYTES (SYMBOL_NAME (base)); | |
| 7966 char *buf = (char *) alloca (base_name_len + 6); | |
| 7967 int i; | |
| 7968 | |
| 7969 bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len); | |
| 7970 subsidiaries = Fmake_vector (make_number (3), Qnil); | |
| 7971 for (i = 0; i < 3; i++) | |
| 7972 { | |
| 7973 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1); | |
| 7974 ASET (subsidiaries, i, intern (buf)); | |
| 7975 } | |
| 7976 return subsidiaries; | |
| 7977 } | |
| 7978 | |
| 7979 | |
| 7980 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal, | |
| 7981 Sdefine_coding_system_internal, coding_arg_max, MANY, 0, | |
| 7982 doc: /* For internal use only. | |
| 7983 usage: (define-coding-system-internal ...) */) | |
| 7984 (nargs, args) | |
| 7985 int nargs; | |
| 7986 Lisp_Object *args; | |
| 7987 { | |
| 7988 Lisp_Object name; | |
| 7989 Lisp_Object spec_vec; /* [ ATTRS ALIASES EOL_TYPE ] */ | |
| 7990 Lisp_Object attrs; /* Vector of attributes. */ | |
| 7991 Lisp_Object eol_type; | |
| 7992 Lisp_Object aliases; | |
| 7993 Lisp_Object coding_type, charset_list, safe_charsets; | |
| 7994 enum coding_category category; | |
| 7995 Lisp_Object tail, val; | |
| 7996 int max_charset_id = 0; | |
| 7997 int i; | |
| 7998 | |
| 7999 if (nargs < coding_arg_max) | |
| 8000 goto short_args; | |
| 8001 | |
| 8002 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil); | |
| 8003 | |
| 8004 name = args[coding_arg_name]; | |
| 8005 CHECK_SYMBOL (name); | |
| 8006 CODING_ATTR_BASE_NAME (attrs) = name; | |
| 8007 | |
| 8008 val = args[coding_arg_mnemonic]; | |
| 8009 if (! STRINGP (val)) | |
| 8010 CHECK_CHARACTER (val); | |
| 8011 CODING_ATTR_MNEMONIC (attrs) = val; | |
| 8012 | |
| 8013 coding_type = args[coding_arg_coding_type]; | |
| 8014 CHECK_SYMBOL (coding_type); | |
| 8015 CODING_ATTR_TYPE (attrs) = coding_type; | |
| 8016 | |
| 8017 charset_list = args[coding_arg_charset_list]; | |
| 8018 if (SYMBOLP (charset_list)) | |
| 8019 { | |
| 8020 if (EQ (charset_list, Qiso_2022)) | |
| 8021 { | |
| 8022 if (! EQ (coding_type, Qiso_2022)) | |
| 8023 error ("Invalid charset-list"); | |
| 8024 charset_list = Viso_2022_charset_list; | |
| 8025 } | |
| 8026 else if (EQ (charset_list, Qemacs_mule)) | |
| 8027 { | |
| 8028 if (! EQ (coding_type, Qemacs_mule)) | |
| 8029 error ("Invalid charset-list"); | |
| 8030 charset_list = Vemacs_mule_charset_list; | |
| 8031 } | |
| 8032 for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) | |
| 8033 if (max_charset_id < XFASTINT (XCAR (tail))) | |
| 8034 max_charset_id = XFASTINT (XCAR (tail)); | |
| 8035 } | |
| 8036 else | |
| 8037 { | |
| 8038 charset_list = Fcopy_sequence (charset_list); | |
| 8039 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail)) | |
| 8040 { | |
| 8041 struct charset *charset; | |
| 8042 | |
| 8043 val = Fcar (tail); | |
| 8044 CHECK_CHARSET_GET_CHARSET (val, charset); | |
| 8045 if (EQ (coding_type, Qiso_2022) | |
| 8046 ? CHARSET_ISO_FINAL (charset) < 0 | |
| 8047 : EQ (coding_type, Qemacs_mule) | |
| 8048 ? CHARSET_EMACS_MULE_ID (charset) < 0 | |
| 8049 : 0) | |
| 8050 error ("Can't handle charset `%s'", | |
| 8051 SDATA (SYMBOL_NAME (CHARSET_NAME (charset)))); | |
| 8052 | |
| 8053 XSETCAR (tail, make_number (charset->id)); | |
| 8054 if (max_charset_id < charset->id) | |
| 8055 max_charset_id = charset->id; | |
| 8056 } | |
| 8057 } | |
| 8058 CODING_ATTR_CHARSET_LIST (attrs) = charset_list; | |
| 8059 | |
| 8060 safe_charsets = Fmake_string (make_number (max_charset_id + 1), | |
| 8061 make_number (255)); | |
| 8062 for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) | |
| 8063 SSET (safe_charsets, XFASTINT (XCAR (tail)), 0); | |
| 8064 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets; | |
| 8065 | |
| 8066 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p]; | |
| 8067 | |
| 8068 val = args[coding_arg_decode_translation_table]; | |
| 8069 if (! NILP (val)) | |
| 8070 CHECK_CHAR_TABLE (val); | |
| 8071 CODING_ATTR_DECODE_TBL (attrs) = val; | |
| 8072 | |
| 8073 val = args[coding_arg_encode_translation_table]; | |
| 8074 if (! NILP (val)) | |
| 8075 CHECK_CHAR_TABLE (val); | |
| 8076 CODING_ATTR_ENCODE_TBL (attrs) = val; | |
| 8077 | |
| 8078 val = args[coding_arg_post_read_conversion]; | |
| 8079 CHECK_SYMBOL (val); | |
| 8080 CODING_ATTR_POST_READ (attrs) = val; | |
| 8081 | |
| 8082 val = args[coding_arg_pre_write_conversion]; | |
| 8083 CHECK_SYMBOL (val); | |
| 8084 CODING_ATTR_PRE_WRITE (attrs) = val; | |
| 8085 | |
| 8086 val = args[coding_arg_default_char]; | |
| 8087 if (NILP (val)) | |
| 8088 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' '); | |
| 8089 else | |
| 8090 { | |
| 8091 CHECK_CHARACTER (val); | |
| 8092 CODING_ATTR_DEFAULT_CHAR (attrs) = val; | |
| 8093 } | |
| 8094 | |
| 8095 val = args[coding_arg_for_unibyte]; | |
| 8096 CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt; | |
| 8097 | |
| 8098 val = args[coding_arg_plist]; | |
| 8099 CHECK_LIST (val); | |
| 8100 CODING_ATTR_PLIST (attrs) = val; | |
| 8101 | |
| 8102 if (EQ (coding_type, Qcharset)) | |
| 8103 { | |
| 8104 Lisp_Object list; | |
| 8105 /* Generate a lisp vector of 256 elements. Each element is nil, | |
| 8106 integer, or a list of charset IDs. | |
| 8107 | |
| 8108 If Nth element is nil, the byte code N is invalid in this | |
| 8109 coding system. | |
| 8110 | |
| 8111 If Nth element is a number NUM, N is the first byte of a | |
| 8112 charset whose ID is NUM. | |
| 8113 | |
| 8114 If Nth element is a list of charset IDs, N is the first byte | |
| 8115 of one of them. The list is sorted by dimensions of the | |
| 8116 charsets. A charset of smaller dimension comes first. | |
| 8117 */ | |
| 8118 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail)) | |
| 8119 { | |
| 8120 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail))); | |
| 8121 | |
| 8122 if (charset->method == CHARSET_METHOD_SUPERSET) | |
| 8123 { | |
| 8124 val = CHARSET_SUPERSET (charset); | |
| 8125 for (; CONSP (val); val = XCDR (val)) | |
| 8126 list = Fcons (XCAR (XCAR (val)), list); | |
| 8127 } | |
| 8128 else | |
| 8129 list = Fcons (XCAR (tail), list); | |
| 8130 } | |
| 8131 | |
| 8132 val = Fmake_vector (make_number (256), Qnil); | |
| 8133 | |
| 8134 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail)) | |
| 8135 { | |
| 8136 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail))); | |
| 8137 int dim = CHARSET_DIMENSION (charset); | |
| 8138 int idx = (dim - 1) * 4; | |
| 8139 | |
| 8140 if (CHARSET_ASCII_COMPATIBLE_P (charset)) | |
| 8141 CODING_ATTR_ASCII_COMPAT (attrs) = Qt; | |
| 8142 | |
| 8143 for (i = charset->code_space[idx]; | |
| 8144 i <= charset->code_space[idx + 1]; i++) | |
| 8145 { | |
| 8146 Lisp_Object tmp, tmp2; | |
| 8147 int dim2; | |
| 8148 | |
| 8149 tmp = AREF (val, i); | |
| 8150 if (NILP (tmp)) | |
| 8151 tmp = XCAR (tail); | |
| 8152 else if (NUMBERP (tmp)) | |
| 8153 { | |
| 8154 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp))); | |
| 8155 if (dim < dim2) | |
| 8156 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil)); | |
| 8157 else | |
| 8158 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil)); | |
| 8159 } | |
| 8160 else | |
| 8161 { | |
| 8162 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2)) | |
| 8163 { | |
| 8164 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2)))); | |
| 8165 if (dim < dim2) | |
| 8166 break; | |
| 8167 } | |
| 8168 if (NILP (tmp2)) | |
| 8169 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil)); | |
| 8170 else | |
| 8171 { | |
| 8172 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2))); | |
| 8173 XSETCAR (tmp2, XCAR (tail)); | |
| 8174 } | |
| 8175 } | |
| 8176 ASET (val, i, tmp); | |
| 8177 } | |
| 8178 } | |
| 8179 ASET (attrs, coding_attr_charset_valids, val); | |
| 8180 category = coding_category_charset; | |
| 8181 } | |
| 8182 else if (EQ (coding_type, Qccl)) | |
| 8183 { | |
| 8184 Lisp_Object valids; | |
| 8185 | |
| 8186 if (nargs < coding_arg_ccl_max) | |
| 8187 goto short_args; | |
| 8188 | |
| 8189 val = args[coding_arg_ccl_decoder]; | |
| 8190 CHECK_CCL_PROGRAM (val); | |
| 8191 if (VECTORP (val)) | |
| 8192 val = Fcopy_sequence (val); | |
| 8193 ASET (attrs, coding_attr_ccl_decoder, val); | |
| 8194 | |
| 8195 val = args[coding_arg_ccl_encoder]; | |
| 8196 CHECK_CCL_PROGRAM (val); | |
| 8197 if (VECTORP (val)) | |
| 8198 val = Fcopy_sequence (val); | |
| 8199 ASET (attrs, coding_attr_ccl_encoder, val); | |
| 8200 | |
| 8201 val = args[coding_arg_ccl_valids]; | |
| 8202 valids = Fmake_string (make_number (256), make_number (0)); | |
| 8203 for (tail = val; !NILP (tail); tail = Fcdr (tail)) | |
| 8204 { | |
| 8205 int from, to; | |
| 8206 | |
| 8207 val = Fcar (tail); | |
| 8208 if (INTEGERP (val)) | |
| 8209 { | |
| 8210 from = to = XINT (val); | |
| 8211 if (from < 0 || from > 255) | |
| 8212 args_out_of_range_3 (val, make_number (0), make_number (255)); | |
| 8213 } | |
| 8214 else | |
| 8215 { | |
| 8216 CHECK_CONS (val); | |
| 8217 CHECK_NATNUM_CAR (val); | |
| 8218 CHECK_NATNUM_CDR (val); | |
| 8219 from = XINT (XCAR (val)); | |
| 8220 if (from > 255) | |
| 8221 args_out_of_range_3 (XCAR (val), | |
| 8222 make_number (0), make_number (255)); | |
| 8223 to = XINT (XCDR (val)); | |
| 8224 if (to < from || to > 255) | |
| 8225 args_out_of_range_3 (XCDR (val), | |
| 8226 XCAR (val), make_number (255)); | |
| 8227 } | |
| 8228 for (i = from; i <= to; i++) | |
| 8229 SSET (valids, i, 1); | |
| 8230 } | |
| 8231 ASET (attrs, coding_attr_ccl_valids, valids); | |
| 8232 | |
| 8233 category = coding_category_ccl; | |
| 8234 } | |
| 8235 else if (EQ (coding_type, Qutf_16)) | |
| 8236 { | |
| 8237 Lisp_Object bom, endian; | |
| 8238 | |
| 8239 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil; | |
| 8240 | |
| 8241 if (nargs < coding_arg_utf16_max) | |
| 8242 goto short_args; | |
| 8243 | |
| 8244 bom = args[coding_arg_utf16_bom]; | |
| 8245 if (! NILP (bom) && ! EQ (bom, Qt)) | |
| 8246 { | |
| 8247 CHECK_CONS (bom); | |
| 8248 val = XCAR (bom); | |
| 8249 CHECK_CODING_SYSTEM (val); | |
| 8250 val = XCDR (bom); | |
| 8251 CHECK_CODING_SYSTEM (val); | |
| 8252 } | |
| 8253 ASET (attrs, coding_attr_utf_16_bom, bom); | |
| 8254 | |
| 8255 endian = args[coding_arg_utf16_endian]; | |
| 8256 CHECK_SYMBOL (endian); | |
| 8257 if (NILP (endian)) | |
| 8258 endian = Qbig; | |
| 8259 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle)) | |
| 8260 error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian))); | |
| 8261 ASET (attrs, coding_attr_utf_16_endian, endian); | |
| 8262 | |
| 8263 category = (CONSP (bom) | |
| 8264 ? coding_category_utf_16_auto | |
| 8265 : NILP (bom) | |
| 8266 ? (EQ (endian, Qbig) | |
| 8267 ? coding_category_utf_16_be_nosig | |
| 8268 : coding_category_utf_16_le_nosig) | |
| 8269 : (EQ (endian, Qbig) | |
| 8270 ? coding_category_utf_16_be | |
| 8271 : coding_category_utf_16_le)); | |
| 8272 } | |
| 8273 else if (EQ (coding_type, Qiso_2022)) | |
| 8274 { | |
| 8275 Lisp_Object initial, reg_usage, request, flags; | |
| 8276 int i; | |
| 8277 | |
| 8278 if (nargs < coding_arg_iso2022_max) | |
| 8279 goto short_args; | |
| 8280 | |
| 8281 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]); | |
| 8282 CHECK_VECTOR (initial); | |
| 8283 for (i = 0; i < 4; i++) | |
| 8284 { | |
| 8285 val = Faref (initial, make_number (i)); | |
| 8286 if (! NILP (val)) | |
| 8287 { | |
| 8288 struct charset *charset; | |
| 8289 | |
| 8290 CHECK_CHARSET_GET_CHARSET (val, charset); | |
| 8291 ASET (initial, i, make_number (CHARSET_ID (charset))); | |
| 8292 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset)) | |
| 8293 CODING_ATTR_ASCII_COMPAT (attrs) = Qt; | |
| 8294 } | |
| 8295 else | |
| 8296 ASET (initial, i, make_number (-1)); | |
| 8297 } | |
| 8298 | |
| 8299 reg_usage = args[coding_arg_iso2022_reg_usage]; | |
| 8300 CHECK_CONS (reg_usage); | |
| 8301 CHECK_NUMBER_CAR (reg_usage); | |
| 8302 CHECK_NUMBER_CDR (reg_usage); | |
| 8303 | |
| 8304 request = Fcopy_sequence (args[coding_arg_iso2022_request]); | |
| 8305 for (tail = request; ! NILP (tail); tail = Fcdr (tail)) | |
| 8306 { | |
| 8307 int id; | |
| 8308 Lisp_Object tmp; | |
| 8309 | |
| 8310 val = Fcar (tail); | |
| 8311 CHECK_CONS (val); | |
| 8312 tmp = XCAR (val); | |
| 8313 CHECK_CHARSET_GET_ID (tmp, id); | |
| 8314 CHECK_NATNUM_CDR (val); | |
| 8315 if (XINT (XCDR (val)) >= 4) | |
| 8316 error ("Invalid graphic register number: %d", XINT (XCDR (val))); | |
| 8317 XSETCAR (val, make_number (id)); | |
| 8318 } | |
| 8319 | |
| 8320 flags = args[coding_arg_iso2022_flags]; | |
| 8321 CHECK_NATNUM (flags); | |
| 8322 i = XINT (flags); | |
| 8323 if (EQ (args[coding_arg_charset_list], Qiso_2022)) | |
| 8324 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT); | |
| 8325 | |
| 8326 ASET (attrs, coding_attr_iso_initial, initial); | |
| 8327 ASET (attrs, coding_attr_iso_usage, reg_usage); | |
| 8328 ASET (attrs, coding_attr_iso_request, request); | |
| 8329 ASET (attrs, coding_attr_iso_flags, flags); | |
| 8330 setup_iso_safe_charsets (attrs); | |
| 8331 | |
| 8332 if (i & CODING_ISO_FLAG_SEVEN_BITS) | |
| 8333 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT | |
| 8334 | CODING_ISO_FLAG_SINGLE_SHIFT)) | |
| 8335 ? coding_category_iso_7_else | |
| 8336 : EQ (args[coding_arg_charset_list], Qiso_2022) | |
| 8337 ? coding_category_iso_7 | |
| 8338 : coding_category_iso_7_tight); | |
| 8339 else | |
| 8340 { | |
| 8341 int id = XINT (AREF (initial, 1)); | |
| 8342 | |
| 8343 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT) | |
| 8344 || EQ (args[coding_arg_charset_list], Qiso_2022) | |
| 8345 || id < 0) | |
| 8346 ? coding_category_iso_8_else | |
| 8347 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1) | |
| 8348 ? coding_category_iso_8_1 | |
| 8349 : coding_category_iso_8_2); | |
| 8350 } | |
| 8351 if (category != coding_category_iso_8_1 | |
| 8352 && category != coding_category_iso_8_2) | |
| 8353 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil; | |
| 8354 } | |
| 8355 else if (EQ (coding_type, Qemacs_mule)) | |
| 8356 { | |
| 8357 if (EQ (args[coding_arg_charset_list], Qemacs_mule)) | |
| 8358 ASET (attrs, coding_attr_emacs_mule_full, Qt); | |
| 8359 CODING_ATTR_ASCII_COMPAT (attrs) = Qt; | |
| 8360 category = coding_category_emacs_mule; | |
| 8361 } | |
| 8362 else if (EQ (coding_type, Qshift_jis)) | |
| 8363 { | |
| 8364 | |
| 8365 struct charset *charset; | |
| 8366 | |
| 8367 if (XINT (Flength (charset_list)) != 3) | |
| 8368 error ("There should be just three charsets"); | |
| 8369 | |
| 8370 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); | |
| 8371 if (CHARSET_DIMENSION (charset) != 1) | |
| 8372 error ("Dimension of charset %s is not one", | |
| 8373 SDATA (SYMBOL_NAME (CHARSET_NAME (charset)))); | |
| 8374 if (CHARSET_ASCII_COMPATIBLE_P (charset)) | |
| 8375 CODING_ATTR_ASCII_COMPAT (attrs) = Qt; | |
| 8376 | |
| 8377 charset_list = XCDR (charset_list); | |
| 8378 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); | |
| 8379 if (CHARSET_DIMENSION (charset) != 1) | |
| 8380 error ("Dimension of charset %s is not one", | |
| 8381 SDATA (SYMBOL_NAME (CHARSET_NAME (charset)))); | |
| 8382 | |
| 8383 charset_list = XCDR (charset_list); | |
| 8384 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); | |
| 8385 if (CHARSET_DIMENSION (charset) != 2) | |
| 8386 error ("Dimension of charset %s is not two", | |
| 8387 SDATA (SYMBOL_NAME (CHARSET_NAME (charset)))); | |
| 8388 | |
| 8389 category = coding_category_sjis; | |
| 8390 Vsjis_coding_system = name; | |
| 8391 } | |
| 8392 else if (EQ (coding_type, Qbig5)) | |
| 8393 { | |
| 8394 struct charset *charset; | |
| 8395 | |
| 8396 if (XINT (Flength (charset_list)) != 2) | |
| 8397 error ("There should be just two charsets"); | |
| 8398 | |
| 8399 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); | |
| 8400 if (CHARSET_DIMENSION (charset) != 1) | |
| 8401 error ("Dimension of charset %s is not one", | |
| 8402 SDATA (SYMBOL_NAME (CHARSET_NAME (charset)))); | |
| 8403 if (CHARSET_ASCII_COMPATIBLE_P (charset)) | |
| 8404 CODING_ATTR_ASCII_COMPAT (attrs) = Qt; | |
| 8405 | |
| 8406 charset_list = XCDR (charset_list); | |
| 8407 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list))); | |
| 8408 if (CHARSET_DIMENSION (charset) != 2) | |
| 8409 error ("Dimension of charset %s is not two", | |
| 8410 SDATA (SYMBOL_NAME (CHARSET_NAME (charset)))); | |
| 8411 | |
| 8412 category = coding_category_big5; | |
| 8413 Vbig5_coding_system = name; | |
| 8414 } | |
| 8415 else if (EQ (coding_type, Qraw_text)) | |
| 8416 { | |
| 8417 category = coding_category_raw_text; | |
| 8418 CODING_ATTR_ASCII_COMPAT (attrs) = Qt; | |
| 8419 } | |
| 8420 else if (EQ (coding_type, Qutf_8)) | |
| 8421 { | |
| 8422 category = coding_category_utf_8; | |
| 8423 CODING_ATTR_ASCII_COMPAT (attrs) = Qt; | |
| 8424 } | |
| 8425 else if (EQ (coding_type, Qundecided)) | |
| 8426 category = coding_category_undecided; | |
| 8427 else | |
| 8428 error ("Invalid coding system type: %s", | |
| 8429 SDATA (SYMBOL_NAME (coding_type))); | |
| 8430 | |
| 8431 CODING_ATTR_CATEGORY (attrs) = make_number (category); | |
| 8432 CODING_ATTR_PLIST (attrs) | |
| 8433 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category), | |
| 8434 CODING_ATTR_PLIST (attrs))); | |
| 8435 | |
| 8436 eol_type = args[coding_arg_eol_type]; | |
| 8437 if (! NILP (eol_type) | |
| 8438 && ! EQ (eol_type, Qunix) | |
| 8439 && ! EQ (eol_type, Qdos) | |
| 8440 && ! EQ (eol_type, Qmac)) | |
| 8441 error ("Invalid eol-type"); | |
| 8442 | |
| 8443 aliases = Fcons (name, Qnil); | |
| 8444 | |
| 8445 if (NILP (eol_type)) | |
| 8446 { | |
| 8447 eol_type = make_subsidiaries (name); | |
| 8448 for (i = 0; i < 3; i++) | |
| 8449 { | |
| 8450 Lisp_Object this_spec, this_name, this_aliases, this_eol_type; | |
| 8451 | |
| 8452 this_name = AREF (eol_type, i); | |
| 8453 this_aliases = Fcons (this_name, Qnil); | |
| 8454 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac); | |
| 8455 this_spec = Fmake_vector (make_number (3), attrs); | |
| 8456 ASET (this_spec, 1, this_aliases); | |
| 8457 ASET (this_spec, 2, this_eol_type); | |
| 8458 Fputhash (this_name, this_spec, Vcoding_system_hash_table); | |
| 8459 Vcoding_system_list = Fcons (this_name, Vcoding_system_list); | |
| 8460 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil), | |
| 8461 Vcoding_system_alist); | |
| 8462 } | |
| 8463 } | |
| 8464 | |
| 8465 spec_vec = Fmake_vector (make_number (3), attrs); | |
| 8466 ASET (spec_vec, 1, aliases); | |
| 8467 ASET (spec_vec, 2, eol_type); | |
| 8468 | |
| 8469 Fputhash (name, spec_vec, Vcoding_system_hash_table); | |
| 8470 Vcoding_system_list = Fcons (name, Vcoding_system_list); | |
| 8471 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil), | |
| 8472 Vcoding_system_alist); | |
| 8473 | |
| 8474 { | |
| 8475 int id = coding_categories[category].id; | |
| 8476 | |
| 8477 if (id < 0 || EQ (name, CODING_ID_NAME (id))) | |
| 8478 setup_coding_system (name, &coding_categories[category]); | |
| 8479 } | |
| 7336 | 8480 |
| 7337 return Qnil; | 8481 return Qnil; |
| 7338 } | 8482 |
| 7339 | 8483 short_args: |
| 7340 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal, | 8484 return Fsignal (Qwrong_number_of_arguments, |
| 7341 Sset_coding_priority_internal, 0, 0, 0, | 8485 Fcons (intern ("define-coding-system-internal"), |
| 7342 doc: /* Update internal database for the current value of `coding-category-list'. | 8486 make_number (nargs))); |
| 7343 This function is internal use only. */) | 8487 } |
| 7344 () | 8488 |
| 7345 { | 8489 /* Fixme: should this record the alias relationships for |
| 7346 int i = 0, idx; | 8490 diagnostics? Should it update coding-system-list? */ |
| 7347 Lisp_Object val; | 8491 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias, |
| 7348 | 8492 Sdefine_coding_system_alias, 2, 2, 0, |
| 7349 val = Vcoding_category_list; | 8493 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */) |
| 7350 | 8494 (alias, coding_system) |
| 7351 while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX) | 8495 Lisp_Object alias, coding_system; |
| 7352 { | 8496 { |
| 7353 if (! SYMBOLP (XCAR (val))) | 8497 Lisp_Object spec, aliases, eol_type; |
| 7354 break; | 8498 |
| 7355 idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index)); | 8499 CHECK_SYMBOL (alias); |
| 7356 if (idx >= CODING_CATEGORY_IDX_MAX) | 8500 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); |
| 7357 break; | 8501 aliases = AREF (spec, 1); |
| 7358 coding_priorities[i++] = (1 << idx); | 8502 while (!NILP (XCDR (aliases))) |
| 7359 val = XCDR (val); | 8503 aliases = XCDR (aliases); |
| 7360 } | 8504 XSETCDR (aliases, Fcons (alias, Qnil)); |
| 7361 /* If coding-category-list is valid and contains all coding | 8505 |
| 7362 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not, | 8506 eol_type = AREF (spec, 2); |
| 7363 the following code saves Emacs from crashing. */ | 8507 if (VECTORP (eol_type)) |
| 7364 while (i < CODING_CATEGORY_IDX_MAX) | 8508 { |
| 7365 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT; | 8509 Lisp_Object subsidiaries; |
| 8510 int i; | |
| 8511 | |
| 8512 subsidiaries = make_subsidiaries (alias); | |
| 8513 for (i = 0; i < 3; i++) | |
| 8514 Fdefine_coding_system_alias (AREF (subsidiaries, i), | |
| 8515 AREF (eol_type, i)); | |
| 8516 | |
| 8517 ASET (spec, 2, subsidiaries); | |
| 8518 } | |
| 8519 | |
| 8520 Fputhash (alias, spec, Vcoding_system_hash_table); | |
| 8521 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil), | |
| 8522 Vcoding_system_alist); | |
| 7366 | 8523 |
| 7367 return Qnil; | 8524 return Qnil; |
| 7368 } | 8525 } |
| 7369 | 8526 |
| 7370 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal, | 8527 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base, |
| 7371 Sdefine_coding_system_internal, 1, 1, 0, | 8528 1, 1, 0, |
| 7372 doc: /* Register CODING-SYSTEM as a base coding system. | 8529 doc: /* Return the base of CODING-SYSTEM. |
| 7373 This function is internal use only. */) | 8530 Any alias or subsidiary coding system is not a base coding system. */) |
| 8531 (coding_system) | |
| 8532 Lisp_Object coding_system; | |
| 8533 { | |
| 8534 Lisp_Object spec, attrs; | |
| 8535 | |
| 8536 if (NILP (coding_system)) | |
| 8537 return (Qno_conversion); | |
| 8538 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); | |
| 8539 attrs = AREF (spec, 0); | |
| 8540 return CODING_ATTR_BASE_NAME (attrs); | |
| 8541 } | |
| 8542 | |
| 8543 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist, | |
| 8544 1, 1, 0, | |
| 8545 doc: "Return the property list of CODING-SYSTEM.") | |
| 7374 (coding_system) | 8546 (coding_system) |
| 7375 Lisp_Object coding_system; | 8547 Lisp_Object coding_system; |
| 7376 { | 8548 { |
| 7377 Lisp_Object safe_chars, slot; | 8549 Lisp_Object spec, attrs; |
| 7378 | 8550 |
| 7379 if (NILP (Fcheck_coding_system (coding_system))) | 8551 if (NILP (coding_system)) |
| 7380 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); | 8552 coding_system = Qno_conversion; |
| 7381 safe_chars = coding_safe_chars (coding_system); | 8553 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); |
| 7382 if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars)) | 8554 attrs = AREF (spec, 0); |
| 7383 error ("No valid safe-chars property for %s", | 8555 return CODING_ATTR_PLIST (attrs); |
| 7384 SDATA (SYMBOL_NAME (coding_system))); | 8556 } |
| 7385 if (EQ (safe_chars, Qt)) | 8557 |
| 7386 { | 8558 |
| 7387 if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars)))) | 8559 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases, |
| 7388 XSETCAR (Vcoding_system_safe_chars, | 8560 1, 1, 0, |
| 7389 Fcons (coding_system, XCAR (Vcoding_system_safe_chars))); | 8561 doc: /* Return the list of aliases of CODING-SYSTEM. */) |
| 7390 } | 8562 (coding_system) |
| 7391 else | 8563 Lisp_Object coding_system; |
| 7392 { | 8564 { |
| 7393 slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars)); | 8565 Lisp_Object spec; |
| 7394 if (NILP (slot)) | 8566 |
| 7395 XSETCDR (Vcoding_system_safe_chars, | 8567 if (NILP (coding_system)) |
| 7396 nconc2 (XCDR (Vcoding_system_safe_chars), | 8568 coding_system = Qno_conversion; |
| 7397 Fcons (Fcons (coding_system, safe_chars), Qnil))); | 8569 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec); |
| 7398 else | 8570 return AREF (spec, 1); |
| 7399 XSETCDR (slot, safe_chars); | 8571 } |
| 7400 } | 8572 |
| 7401 return Qnil; | 8573 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type, |
| 8574 Scoding_system_eol_type, 1, 1, 0, | |
| 8575 doc: /* Return eol-type of CODING-SYSTEM. | |
| 8576 An eol-type is an integer 0, 1, or 2, or a vector of coding systems. | |
| 8577 | |
| 8578 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF, | |
| 8579 and CR respectively. | |
| 8580 | |
| 8581 A vector value indicates that a format of end-of-line should be | |
| 8582 detected automatically. Nth element of the vector is the subsidiary | |
| 8583 coding system whose eol-type is N. */) | |
| 8584 (coding_system) | |
| 8585 Lisp_Object coding_system; | |
| 8586 { | |
| 8587 Lisp_Object spec, eol_type; | |
| 8588 int n; | |
| 8589 | |
| 8590 if (NILP (coding_system)) | |
| 8591 coding_system = Qno_conversion; | |
| 8592 if (! CODING_SYSTEM_P (coding_system)) | |
| 8593 return Qnil; | |
| 8594 spec = CODING_SYSTEM_SPEC (coding_system); | |
| 8595 eol_type = AREF (spec, 2); | |
| 8596 if (VECTORP (eol_type)) | |
| 8597 return Fcopy_sequence (eol_type); | |
| 8598 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2; | |
| 8599 return make_number (n); | |
| 7402 } | 8600 } |
| 7403 | 8601 |
| 7404 #endif /* emacs */ | 8602 #endif /* emacs */ |
| 7405 | 8603 |
| 7406 | 8604 |
| 7409 void | 8607 void |
| 7410 init_coding_once () | 8608 init_coding_once () |
| 7411 { | 8609 { |
| 7412 int i; | 8610 int i; |
| 7413 | 8611 |
| 7414 /* Emacs' internal format specific initialize routine. */ | 8612 for (i = 0; i < coding_category_max; i++) |
| 7415 for (i = 0; i <= 0x20; i++) | 8613 { |
| 7416 emacs_code_class[i] = EMACS_control_code; | 8614 coding_categories[i].id = -1; |
| 7417 emacs_code_class[0x0A] = EMACS_linefeed_code; | 8615 coding_priorities[i] = i; |
| 7418 emacs_code_class[0x0D] = EMACS_carriage_return_code; | 8616 } |
| 7419 for (i = 0x21 ; i < 0x7F; i++) | |
| 7420 emacs_code_class[i] = EMACS_ascii_code; | |
| 7421 emacs_code_class[0x7F] = EMACS_control_code; | |
| 7422 for (i = 0x80; i < 0xFF; i++) | |
| 7423 emacs_code_class[i] = EMACS_invalid_code; | |
| 7424 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3; | |
| 7425 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3; | |
| 7426 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4; | |
| 7427 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4; | |
| 7428 | 8617 |
| 7429 /* ISO2022 specific initialize routine. */ | 8618 /* ISO2022 specific initialize routine. */ |
| 7430 for (i = 0; i < 0x20; i++) | 8619 for (i = 0; i < 0x20; i++) |
| 7431 iso_code_class[i] = ISO_control_0; | 8620 iso_code_class[i] = ISO_control_0; |
| 7432 for (i = 0x21; i < 0x7F; i++) | 8621 for (i = 0x21; i < 0x7F; i++) |
| 7444 iso_code_class[ISO_CODE_ESC] = ISO_escape; | 8633 iso_code_class[ISO_CODE_ESC] = ISO_escape; |
| 7445 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2; | 8634 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2; |
| 7446 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3; | 8635 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3; |
| 7447 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer; | 8636 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer; |
| 7448 | 8637 |
| 7449 setup_coding_system (Qnil, &keyboard_coding); | |
| 7450 setup_coding_system (Qnil, &terminal_coding); | |
| 7451 setup_coding_system (Qnil, &safe_terminal_coding); | |
| 7452 setup_coding_system (Qnil, &default_buffer_file_coding); | |
| 7453 | |
| 7454 bzero (coding_system_table, sizeof coding_system_table); | |
| 7455 | |
| 7456 bzero (ascii_skip_code, sizeof ascii_skip_code); | |
| 7457 for (i = 0; i < 128; i++) | |
| 7458 ascii_skip_code[i] = 1; | |
| 7459 | |
| 7460 #if defined (MSDOS) || defined (WINDOWSNT) | |
| 7461 system_eol_type = CODING_EOL_CRLF; | |
| 7462 #else | |
| 7463 system_eol_type = CODING_EOL_LF; | |
| 7464 #endif | |
| 7465 | |
| 7466 inhibit_pre_post_conversion = 0; | 8638 inhibit_pre_post_conversion = 0; |
| 8639 | |
| 8640 for (i = 0; i < 256; i++) | |
| 8641 { | |
| 8642 emacs_mule_bytes[i] = 1; | |
| 8643 } | |
| 8644 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3; | |
| 8645 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3; | |
| 8646 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4; | |
| 8647 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4; | |
| 7467 } | 8648 } |
| 7468 | 8649 |
| 7469 #ifdef emacs | 8650 #ifdef emacs |
| 7470 | 8651 |
| 7471 void | 8652 void |
| 7472 syms_of_coding () | 8653 syms_of_coding () |
| 7473 { | 8654 { |
| 7474 Qtarget_idx = intern ("target-idx"); | 8655 staticpro (&Vcoding_system_hash_table); |
| 7475 staticpro (&Qtarget_idx); | 8656 { |
| 7476 | 8657 Lisp_Object args[2]; |
| 7477 Qcoding_system_history = intern ("coding-system-history"); | 8658 args[0] = QCtest; |
| 7478 staticpro (&Qcoding_system_history); | 8659 args[1] = Qeq; |
| 8660 Vcoding_system_hash_table = Fmake_hash_table (2, args); | |
| 8661 } | |
| 8662 | |
| 8663 staticpro (&Vsjis_coding_system); | |
| 8664 Vsjis_coding_system = Qnil; | |
| 8665 | |
| 8666 staticpro (&Vbig5_coding_system); | |
| 8667 Vbig5_coding_system = Qnil; | |
| 8668 | |
| 8669 staticpro (&Vcode_conversion_work_buf_list); | |
| 8670 Vcode_conversion_work_buf_list = Qnil; | |
| 8671 | |
| 8672 staticpro (&Vcode_conversion_reused_work_buf); | |
| 8673 Vcode_conversion_reused_work_buf = Qnil; | |
| 8674 | |
| 8675 DEFSYM (Qcharset, "charset"); | |
| 8676 DEFSYM (Qtarget_idx, "target-idx"); | |
| 8677 DEFSYM (Qcoding_system_history, "coding-system-history"); | |
| 7479 Fset (Qcoding_system_history, Qnil); | 8678 Fset (Qcoding_system_history, Qnil); |
| 7480 | 8679 |
| 7481 /* Target FILENAME is the first argument. */ | 8680 /* Target FILENAME is the first argument. */ |
| 7482 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0)); | 8681 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0)); |
| 7483 /* Target FILENAME is the third argument. */ | 8682 /* Target FILENAME is the third argument. */ |
| 7484 Fput (Qwrite_region, Qtarget_idx, make_number (2)); | 8683 Fput (Qwrite_region, Qtarget_idx, make_number (2)); |
| 7485 | 8684 |
| 7486 Qcall_process = intern ("call-process"); | 8685 DEFSYM (Qcall_process, "call-process"); |
| 7487 staticpro (&Qcall_process); | |
| 7488 /* Target PROGRAM is the first argument. */ | 8686 /* Target PROGRAM is the first argument. */ |
| 7489 Fput (Qcall_process, Qtarget_idx, make_number (0)); | 8687 Fput (Qcall_process, Qtarget_idx, make_number (0)); |
| 7490 | 8688 |
| 7491 Qcall_process_region = intern ("call-process-region"); | 8689 DEFSYM (Qcall_process_region, "call-process-region"); |
| 7492 staticpro (&Qcall_process_region); | |
| 7493 /* Target PROGRAM is the third argument. */ | 8690 /* Target PROGRAM is the third argument. */ |
| 7494 Fput (Qcall_process_region, Qtarget_idx, make_number (2)); | 8691 Fput (Qcall_process_region, Qtarget_idx, make_number (2)); |
| 7495 | 8692 |
| 7496 Qstart_process = intern ("start-process"); | 8693 DEFSYM (Qstart_process, "start-process"); |
| 7497 staticpro (&Qstart_process); | |
| 7498 /* Target PROGRAM is the third argument. */ | 8694 /* Target PROGRAM is the third argument. */ |
| 7499 Fput (Qstart_process, Qtarget_idx, make_number (2)); | 8695 Fput (Qstart_process, Qtarget_idx, make_number (2)); |
| 7500 | 8696 |
| 7501 Qopen_network_stream = intern ("open-network-stream"); | 8697 DEFSYM (Qopen_network_stream, "open-network-stream"); |
| 7502 staticpro (&Qopen_network_stream); | |
| 7503 /* Target SERVICE is the fourth argument. */ | 8698 /* Target SERVICE is the fourth argument. */ |
| 7504 Fput (Qopen_network_stream, Qtarget_idx, make_number (3)); | 8699 Fput (Qopen_network_stream, Qtarget_idx, make_number (3)); |
| 7505 | 8700 |
| 7506 Qcoding_system = intern ("coding-system"); | 8701 DEFSYM (Qcoding_system, "coding-system"); |
| 7507 staticpro (&Qcoding_system); | 8702 DEFSYM (Qcoding_aliases, "coding-aliases"); |
| 7508 | 8703 |
| 7509 Qeol_type = intern ("eol-type"); | 8704 DEFSYM (Qeol_type, "eol-type"); |
| 7510 staticpro (&Qeol_type); | 8705 DEFSYM (Qunix, "unix"); |
| 7511 | 8706 DEFSYM (Qdos, "dos"); |
| 7512 Qbuffer_file_coding_system = intern ("buffer-file-coding-system"); | 8707 |
| 7513 staticpro (&Qbuffer_file_coding_system); | 8708 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system"); |
| 7514 | 8709 DEFSYM (Qpost_read_conversion, "post-read-conversion"); |
| 7515 Qpost_read_conversion = intern ("post-read-conversion"); | 8710 DEFSYM (Qpre_write_conversion, "pre-write-conversion"); |
| 7516 staticpro (&Qpost_read_conversion); | 8711 DEFSYM (Qdefault_char, "default-char"); |
| 7517 | 8712 DEFSYM (Qundecided, "undecided"); |
| 7518 Qpre_write_conversion = intern ("pre-write-conversion"); | 8713 DEFSYM (Qno_conversion, "no-conversion"); |
| 7519 staticpro (&Qpre_write_conversion); | 8714 DEFSYM (Qraw_text, "raw-text"); |
| 7520 | 8715 |
| 7521 Qno_conversion = intern ("no-conversion"); | 8716 DEFSYM (Qiso_2022, "iso-2022"); |
| 7522 staticpro (&Qno_conversion); | 8717 |
| 7523 | 8718 DEFSYM (Qutf_8, "utf-8"); |
| 7524 Qundecided = intern ("undecided"); | 8719 DEFSYM (Qutf_8_emacs, "utf-8-emacs"); |
| 7525 staticpro (&Qundecided); | 8720 |
| 7526 | 8721 DEFSYM (Qutf_16, "utf-16"); |
| 7527 Qcoding_system_p = intern ("coding-system-p"); | 8722 DEFSYM (Qbig, "big"); |
| 7528 staticpro (&Qcoding_system_p); | 8723 DEFSYM (Qlittle, "little"); |
| 7529 | 8724 |
| 7530 Qcoding_system_error = intern ("coding-system-error"); | 8725 DEFSYM (Qshift_jis, "shift-jis"); |
| 7531 staticpro (&Qcoding_system_error); | 8726 DEFSYM (Qbig5, "big5"); |
| 7532 | 8727 |
| 8728 DEFSYM (Qcoding_system_p, "coding-system-p"); | |
| 8729 | |
| 8730 DEFSYM (Qcoding_system_error, "coding-system-error"); | |
| 7533 Fput (Qcoding_system_error, Qerror_conditions, | 8731 Fput (Qcoding_system_error, Qerror_conditions, |
| 7534 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil))); | 8732 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil))); |
| 7535 Fput (Qcoding_system_error, Qerror_message, | 8733 Fput (Qcoding_system_error, Qerror_message, |
| 7536 build_string ("Invalid coding system")); | 8734 build_string ("Invalid coding system")); |
| 7537 | 8735 |
| 7538 Qcoding_category = intern ("coding-category"); | |
| 7539 staticpro (&Qcoding_category); | |
| 7540 Qcoding_category_index = intern ("coding-category-index"); | |
| 7541 staticpro (&Qcoding_category_index); | |
| 7542 | |
| 7543 Vcoding_category_table | |
| 7544 = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil); | |
| 7545 staticpro (&Vcoding_category_table); | |
| 7546 { | |
| 7547 int i; | |
| 7548 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | |
| 7549 { | |
| 7550 XVECTOR (Vcoding_category_table)->contents[i] | |
| 7551 = intern (coding_category_name[i]); | |
| 7552 Fput (XVECTOR (Vcoding_category_table)->contents[i], | |
| 7553 Qcoding_category_index, make_number (i)); | |
| 7554 } | |
| 7555 } | |
| 7556 | |
| 7557 Vcoding_system_safe_chars = Fcons (Qnil, Qnil); | |
| 7558 staticpro (&Vcoding_system_safe_chars); | |
| 7559 | |
| 7560 Qtranslation_table = intern ("translation-table"); | |
| 7561 staticpro (&Qtranslation_table); | |
| 7562 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2)); | |
| 7563 | |
| 7564 Qtranslation_table_id = intern ("translation-table-id"); | |
| 7565 staticpro (&Qtranslation_table_id); | |
| 7566 | |
| 7567 Qtranslation_table_for_decode = intern ("translation-table-for-decode"); | |
| 7568 staticpro (&Qtranslation_table_for_decode); | |
| 7569 | |
| 7570 Qtranslation_table_for_encode = intern ("translation-table-for-encode"); | |
| 7571 staticpro (&Qtranslation_table_for_encode); | |
| 7572 | |
| 7573 Qsafe_chars = intern ("safe-chars"); | |
| 7574 staticpro (&Qsafe_chars); | |
| 7575 | |
| 7576 Qchar_coding_system = intern ("char-coding-system"); | |
| 7577 staticpro (&Qchar_coding_system); | |
| 7578 | |
| 7579 /* Intern this now in case it isn't already done. | 8736 /* Intern this now in case it isn't already done. |
| 7580 Setting this variable twice is harmless. | 8737 Setting this variable twice is harmless. |
| 7581 But don't staticpro it here--that is done in alloc.c. */ | 8738 But don't staticpro it here--that is done in alloc.c. */ |
| 7582 Qchar_table_extra_slots = intern ("char-table-extra-slots"); | 8739 Qchar_table_extra_slots = intern ("char-table-extra-slots"); |
| 7583 Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0)); | 8740 |
| 7584 Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0)); | 8741 DEFSYM (Qtranslation_table, "translation-table"); |
| 7585 | 8742 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1)); |
| 7586 Qvalid_codes = intern ("valid-codes"); | 8743 DEFSYM (Qtranslation_table_id, "translation-table-id"); |
| 7587 staticpro (&Qvalid_codes); | 8744 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode"); |
| 7588 | 8745 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode"); |
| 7589 Qemacs_mule = intern ("emacs-mule"); | 8746 |
| 7590 staticpro (&Qemacs_mule); | 8747 DEFSYM (Qvalid_codes, "valid-codes"); |
| 7591 | 8748 |
| 7592 Qraw_text = intern ("raw-text"); | 8749 DEFSYM (Qemacs_mule, "emacs-mule"); |
| 7593 staticpro (&Qraw_text); | 8750 |
| 7594 | 8751 DEFSYM (QCcategory, ":category"); |
| 7595 Qutf_8 = intern ("utf-8"); | 8752 |
| 7596 staticpro (&Qutf_8); | 8753 Vcoding_category_table |
| 8754 = Fmake_vector (make_number (coding_category_max), Qnil); | |
| 8755 staticpro (&Vcoding_category_table); | |
| 8756 /* The following are targets of code detection. */ | |
| 8757 ASET (Vcoding_category_table, coding_category_iso_7, | |
| 8758 intern ("coding-category-iso-7")); | |
| 8759 ASET (Vcoding_category_table, coding_category_iso_7_tight, | |
| 8760 intern ("coding-category-iso-7-tight")); | |
| 8761 ASET (Vcoding_category_table, coding_category_iso_8_1, | |
| 8762 intern ("coding-category-iso-8-1")); | |
| 8763 ASET (Vcoding_category_table, coding_category_iso_8_2, | |
| 8764 intern ("coding-category-iso-8-2")); | |
| 8765 ASET (Vcoding_category_table, coding_category_iso_7_else, | |
| 8766 intern ("coding-category-iso-7-else")); | |
| 8767 ASET (Vcoding_category_table, coding_category_iso_8_else, | |
| 8768 intern ("coding-category-iso-8-else")); | |
| 8769 ASET (Vcoding_category_table, coding_category_utf_8, | |
| 8770 intern ("coding-category-utf-8")); | |
| 8771 ASET (Vcoding_category_table, coding_category_utf_16_be, | |
| 8772 intern ("coding-category-utf-16-be")); | |
| 8773 ASET (Vcoding_category_table, coding_category_utf_16_auto, | |
| 8774 intern ("coding-category-utf-16-auto")); | |
| 8775 ASET (Vcoding_category_table, coding_category_utf_16_le, | |
| 8776 intern ("coding-category-utf-16-le")); | |
| 8777 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig, | |
| 8778 intern ("coding-category-utf-16-be-nosig")); | |
| 8779 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig, | |
| 8780 intern ("coding-category-utf-16-le-nosig")); | |
| 8781 ASET (Vcoding_category_table, coding_category_charset, | |
| 8782 intern ("coding-category-charset")); | |
| 8783 ASET (Vcoding_category_table, coding_category_sjis, | |
| 8784 intern ("coding-category-sjis")); | |
| 8785 ASET (Vcoding_category_table, coding_category_big5, | |
| 8786 intern ("coding-category-big5")); | |
| 8787 ASET (Vcoding_category_table, coding_category_ccl, | |
| 8788 intern ("coding-category-ccl")); | |
| 8789 ASET (Vcoding_category_table, coding_category_emacs_mule, | |
| 8790 intern ("coding-category-emacs-mule")); | |
| 8791 /* The following are NOT targets of code detection. */ | |
| 8792 ASET (Vcoding_category_table, coding_category_raw_text, | |
| 8793 intern ("coding-category-raw-text")); | |
| 8794 ASET (Vcoding_category_table, coding_category_undecided, | |
| 8795 intern ("coding-category-undecided")); | |
| 7597 | 8796 |
| 7598 defsubr (&Scoding_system_p); | 8797 defsubr (&Scoding_system_p); |
| 7599 defsubr (&Sread_coding_system); | 8798 defsubr (&Sread_coding_system); |
| 7600 defsubr (&Sread_non_nil_coding_system); | 8799 defsubr (&Sread_non_nil_coding_system); |
| 7601 defsubr (&Scheck_coding_system); | 8800 defsubr (&Scheck_coding_system); |
| 7602 defsubr (&Sdetect_coding_region); | 8801 defsubr (&Sdetect_coding_region); |
| 7603 defsubr (&Sdetect_coding_string); | 8802 defsubr (&Sdetect_coding_string); |
| 7604 defsubr (&Sfind_coding_systems_region_internal); | 8803 defsubr (&Sfind_coding_systems_region_internal); |
| 7605 defsubr (&Sunencodable_char_position); | 8804 defsubr (&Sunencodable_char_position); |
| 8805 defsubr (&Scheck_coding_systems_region); | |
| 7606 defsubr (&Sdecode_coding_region); | 8806 defsubr (&Sdecode_coding_region); |
| 7607 defsubr (&Sencode_coding_region); | 8807 defsubr (&Sencode_coding_region); |
| 7608 defsubr (&Sdecode_coding_string); | 8808 defsubr (&Sdecode_coding_string); |
| 7609 defsubr (&Sencode_coding_string); | 8809 defsubr (&Sencode_coding_string); |
| 7610 defsubr (&Sdecode_sjis_char); | 8810 defsubr (&Sdecode_sjis_char); |
| 7615 defsubr (&Sset_safe_terminal_coding_system_internal); | 8815 defsubr (&Sset_safe_terminal_coding_system_internal); |
| 7616 defsubr (&Sterminal_coding_system); | 8816 defsubr (&Sterminal_coding_system); |
| 7617 defsubr (&Sset_keyboard_coding_system_internal); | 8817 defsubr (&Sset_keyboard_coding_system_internal); |
| 7618 defsubr (&Skeyboard_coding_system); | 8818 defsubr (&Skeyboard_coding_system); |
| 7619 defsubr (&Sfind_operation_coding_system); | 8819 defsubr (&Sfind_operation_coding_system); |
| 7620 defsubr (&Supdate_coding_systems_internal); | 8820 defsubr (&Sset_coding_system_priority); |
| 7621 defsubr (&Sset_coding_priority_internal); | |
| 7622 defsubr (&Sdefine_coding_system_internal); | 8821 defsubr (&Sdefine_coding_system_internal); |
| 8822 defsubr (&Sdefine_coding_system_alias); | |
| 8823 defsubr (&Scoding_system_base); | |
| 8824 defsubr (&Scoding_system_plist); | |
| 8825 defsubr (&Scoding_system_aliases); | |
| 8826 defsubr (&Scoding_system_eol_type); | |
| 8827 defsubr (&Scoding_system_priority_list); | |
| 7623 | 8828 |
| 7624 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list, | 8829 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list, |
| 7625 doc: /* List of coding systems. | 8830 doc: /* List of coding systems. |
| 7626 | 8831 |
| 7627 Do not alter the value of this variable manually. This variable should be | 8832 Do not alter the value of this variable manually. This variable should be |
| 7628 updated by the functions `make-coding-system' and | 8833 updated by the functions `define-coding-system' and |
| 7629 `define-coding-system-alias'. */); | 8834 `define-coding-system-alias'. */); |
| 7630 Vcoding_system_list = Qnil; | 8835 Vcoding_system_list = Qnil; |
| 7631 | 8836 |
| 7632 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist, | 8837 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist, |
| 7633 doc: /* Alist of coding system names. | 8838 doc: /* Alist of coding system names. |
| 7648 system bound to the corresponding coding-category is selected. */); | 8853 system bound to the corresponding coding-category is selected. */); |
| 7649 { | 8854 { |
| 7650 int i; | 8855 int i; |
| 7651 | 8856 |
| 7652 Vcoding_category_list = Qnil; | 8857 Vcoding_category_list = Qnil; |
| 7653 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--) | 8858 for (i = coding_category_max - 1; i >= 0; i--) |
| 7654 Vcoding_category_list | 8859 Vcoding_category_list |
| 7655 = Fcons (XVECTOR (Vcoding_category_table)->contents[i], | 8860 = Fcons (XVECTOR (Vcoding_category_table)->contents[i], |
| 7656 Vcoding_category_list); | 8861 Vcoding_category_list); |
| 7657 } | 8862 } |
| 7658 | 8863 |
| 7678 For output to files, if the above procedure does not specify a coding system, | 8883 For output to files, if the above procedure does not specify a coding system, |
| 7679 the value of `buffer-file-coding-system' is used. */); | 8884 the value of `buffer-file-coding-system' is used. */); |
| 7680 Vcoding_system_for_write = Qnil; | 8885 Vcoding_system_for_write = Qnil; |
| 7681 | 8886 |
| 7682 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used, | 8887 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used, |
| 7683 doc: /* Coding system used in the latest file or process I/O. | 8888 doc: /* |
| 7684 Also set by `encode-coding-region', `decode-coding-region', | 8889 Coding system used in the latest file or process I/O. */); |
| 7685 `encode-coding-string' and `decode-coding-string'. */); | |
| 7686 Vlast_coding_system_used = Qnil; | 8890 Vlast_coding_system_used = Qnil; |
| 7687 | 8891 |
| 7688 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion, | 8892 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion, |
| 7689 doc: /* *Non-nil means always inhibit code conversion of end-of-line format. | 8893 doc: /* |
| 8894 *Non-nil means always inhibit code conversion of end-of-line format. | |
| 7690 See info node `Coding Systems' and info node `Text and Binary' concerning | 8895 See info node `Coding Systems' and info node `Text and Binary' concerning |
| 7691 such conversion. */); | 8896 such conversion. */); |
| 7692 inhibit_eol_conversion = 0; | 8897 inhibit_eol_conversion = 0; |
| 7693 | 8898 |
| 7694 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system, | 8899 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system, |
| 7695 doc: /* Non-nil means process buffer inherits coding system of process output. | 8900 doc: /* |
| 8901 Non-nil means process buffer inherits coding system of process output. | |
| 7696 Bind it to t if the process output is to be treated as if it were a file | 8902 Bind it to t if the process output is to be treated as if it were a file |
| 7697 read from some filesystem. */); | 8903 read from some filesystem. */); |
| 7698 inherit_process_coding_system = 0; | 8904 inherit_process_coding_system = 0; |
| 7699 | 8905 |
| 7700 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist, | 8906 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist, |
| 7701 doc: /* Alist to decide a coding system to use for a file I/O operation. | 8907 doc: /* |
| 8908 Alist to decide a coding system to use for a file I/O operation. | |
| 7702 The format is ((PATTERN . VAL) ...), | 8909 The format is ((PATTERN . VAL) ...), |
| 7703 where PATTERN is a regular expression matching a file name, | 8910 where PATTERN is a regular expression matching a file name, |
| 7704 VAL is a coding system, a cons of coding systems, or a function symbol. | 8911 VAL is a coding system, a cons of coding systems, or a function symbol. |
| 7705 If VAL is a coding system, it is used for both decoding and encoding | 8912 If VAL is a coding system, it is used for both decoding and encoding |
| 7706 the file contents. | 8913 the file contents. |
| 7707 If VAL is a cons of coding systems, the car part is used for decoding, | 8914 If VAL is a cons of coding systems, the car part is used for decoding, |
| 7708 and the cdr part is used for encoding. | 8915 and the cdr part is used for encoding. |
| 7709 If VAL is a function symbol, the function must return a coding system | 8916 If VAL is a function symbol, the function must return a coding system |
| 7710 or a cons of coding systems which are used as above. The function gets | 8917 or a cons of coding systems which are used as above. The function gets |
| 7711 the arguments with which `find-operation-coding-system' was called. | 8918 the arguments with which `find-operation-coding-system' was called. |
| 7712 | 8919 |
| 7713 See also the function `find-operation-coding-system' | 8920 See also the function `find-operation-coding-system' |
| 7714 and the variable `auto-coding-alist'. */); | 8921 and the variable `auto-coding-alist'. */); |
| 7715 Vfile_coding_system_alist = Qnil; | 8922 Vfile_coding_system_alist = Qnil; |
| 7716 | 8923 |
| 7717 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist, | 8924 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist, |
| 7718 doc: /* Alist to decide a coding system to use for a process I/O operation. | 8925 doc: /* |
| 8926 Alist to decide a coding system to use for a process I/O operation. | |
| 7719 The format is ((PATTERN . VAL) ...), | 8927 The format is ((PATTERN . VAL) ...), |
| 7720 where PATTERN is a regular expression matching a program name, | 8928 where PATTERN is a regular expression matching a program name, |
| 7721 VAL is a coding system, a cons of coding systems, or a function symbol. | 8929 VAL is a coding system, a cons of coding systems, or a function symbol. |
| 7722 If VAL is a coding system, it is used for both decoding what is received | 8930 If VAL is a coding system, it is used for both decoding what is received |
| 7723 from the program and encoding what is sent to the program. | 8931 from the program and encoding what is sent to the program. |
| 7728 | 8936 |
| 7729 See also the function `find-operation-coding-system'. */); | 8937 See also the function `find-operation-coding-system'. */); |
| 7730 Vprocess_coding_system_alist = Qnil; | 8938 Vprocess_coding_system_alist = Qnil; |
| 7731 | 8939 |
| 7732 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist, | 8940 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist, |
| 7733 doc: /* Alist to decide a coding system to use for a network I/O operation. | 8941 doc: /* |
| 8942 Alist to decide a coding system to use for a network I/O operation. | |
| 7734 The format is ((PATTERN . VAL) ...), | 8943 The format is ((PATTERN . VAL) ...), |
| 7735 where PATTERN is a regular expression matching a network service name | 8944 where PATTERN is a regular expression matching a network service name |
| 7736 or is a port number to connect to, | 8945 or is a port number to connect to, |
| 7737 VAL is a coding system, a cons of coding systems, or a function symbol. | 8946 VAL is a coding system, a cons of coding systems, or a function symbol. |
| 7738 If VAL is a coding system, it is used for both decoding what is received | 8947 If VAL is a coding system, it is used for both decoding what is received |
| 7750 Also used for decoding keyboard input on X Window system. */); | 8959 Also used for decoding keyboard input on X Window system. */); |
| 7751 Vlocale_coding_system = Qnil; | 8960 Vlocale_coding_system = Qnil; |
| 7752 | 8961 |
| 7753 /* The eol mnemonics are reset in startup.el system-dependently. */ | 8962 /* The eol mnemonics are reset in startup.el system-dependently. */ |
| 7754 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix, | 8963 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix, |
| 7755 doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format. */); | 8964 doc: /* |
| 8965 *String displayed in mode line for UNIX-like (LF) end-of-line format. */); | |
| 7756 eol_mnemonic_unix = build_string (":"); | 8966 eol_mnemonic_unix = build_string (":"); |
| 7757 | 8967 |
| 7758 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos, | 8968 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos, |
| 7759 doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format. */); | 8969 doc: /* |
| 8970 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */); | |
| 7760 eol_mnemonic_dos = build_string ("\\"); | 8971 eol_mnemonic_dos = build_string ("\\"); |
| 7761 | 8972 |
| 7762 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac, | 8973 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac, |
| 7763 doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format. */); | 8974 doc: /* |
| 8975 *String displayed in mode line for MAC-like (CR) end-of-line format. */); | |
| 7764 eol_mnemonic_mac = build_string ("/"); | 8976 eol_mnemonic_mac = build_string ("/"); |
| 7765 | 8977 |
| 7766 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided, | 8978 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided, |
| 7767 doc: /* *String displayed in mode line when end-of-line format is not yet determined. */); | 8979 doc: /* |
| 8980 *String displayed in mode line when end-of-line format is not yet determined. */); | |
| 7768 eol_mnemonic_undecided = build_string (":"); | 8981 eol_mnemonic_undecided = build_string (":"); |
| 7769 | 8982 |
| 7770 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation, | 8983 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation, |
| 7771 doc: /* *Non-nil enables character translation while encoding and decoding. */); | 8984 doc: /* |
| 8985 *Non-nil enables character translation while encoding and decoding. */); | |
| 7772 Venable_character_translation = Qt; | 8986 Venable_character_translation = Qt; |
| 7773 | 8987 |
| 7774 DEFVAR_LISP ("standard-translation-table-for-decode", | 8988 DEFVAR_LISP ("standard-translation-table-for-decode", |
| 7775 &Vstandard_translation_table_for_decode, | 8989 &Vstandard_translation_table_for_decode, |
| 7776 doc: /* Table for translating characters while decoding. */); | 8990 doc: /* Table for translating characters while decoding. */); |
| 7779 DEFVAR_LISP ("standard-translation-table-for-encode", | 8993 DEFVAR_LISP ("standard-translation-table-for-encode", |
| 7780 &Vstandard_translation_table_for_encode, | 8994 &Vstandard_translation_table_for_encode, |
| 7781 doc: /* Table for translating characters while encoding. */); | 8995 doc: /* Table for translating characters while encoding. */); |
| 7782 Vstandard_translation_table_for_encode = Qnil; | 8996 Vstandard_translation_table_for_encode = Qnil; |
| 7783 | 8997 |
| 7784 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist, | 8998 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table, |
| 7785 doc: /* Alist of charsets vs revision numbers. | 8999 doc: /* Alist of charsets vs revision numbers. |
| 7786 While encoding, if a charset (car part of an element) is found, | 9000 While encoding, if a charset (car part of an element) is found, |
| 7787 designate it with the escape sequence identifying revision (cdr part of the element). */); | 9001 designate it with the escape sequence identifying revision (cdr part |
| 7788 Vcharset_revision_alist = Qnil; | 9002 of the element). */); |
| 9003 Vcharset_revision_table = Qnil; | |
| 7789 | 9004 |
| 7790 DEFVAR_LISP ("default-process-coding-system", | 9005 DEFVAR_LISP ("default-process-coding-system", |
| 7791 &Vdefault_process_coding_system, | 9006 &Vdefault_process_coding_system, |
| 7792 doc: /* Cons of coding systems used for process I/O by default. | 9007 doc: /* Cons of coding systems used for process I/O by default. |
| 7793 The car part is used for decoding a process output, | 9008 The car part is used for decoding a process output, |
| 7794 the cdr part is used for encoding a text to be sent to a process. */); | 9009 the cdr part is used for encoding a text to be sent to a process. */); |
| 7795 Vdefault_process_coding_system = Qnil; | 9010 Vdefault_process_coding_system = Qnil; |
| 7796 | 9011 |
| 7797 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table, | 9012 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table, |
| 7798 doc: /* Table of extra Latin codes in the range 128..159 (inclusive). | 9013 doc: /* |
| 9014 Table of extra Latin codes in the range 128..159 (inclusive). | |
| 7799 This is a vector of length 256. | 9015 This is a vector of length 256. |
| 7800 If Nth element is non-nil, the existence of code N in a file | 9016 If Nth element is non-nil, the existence of code N in a file |
| 7801 \(or output of subprocess) doesn't prevent it from being detected as | 9017 \(or output of subprocess) doesn't prevent it from being detected as |
| 7802 a coding system of ISO 2022 variant which has a flag | 9018 a coding system of ISO 2022 variant which has a flag |
| 7803 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file | 9019 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file |
| 7805 Only the 128th through 159th elements have a meaning. */); | 9021 Only the 128th through 159th elements have a meaning. */); |
| 7806 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil); | 9022 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil); |
| 7807 | 9023 |
| 7808 DEFVAR_LISP ("select-safe-coding-system-function", | 9024 DEFVAR_LISP ("select-safe-coding-system-function", |
| 7809 &Vselect_safe_coding_system_function, | 9025 &Vselect_safe_coding_system_function, |
| 7810 doc: /* Function to call to select safe coding system for encoding a text. | 9026 doc: /* |
| 9027 Function to call to select safe coding system for encoding a text. | |
| 7811 | 9028 |
| 7812 If set, this function is called to force a user to select a proper | 9029 If set, this function is called to force a user to select a proper |
| 7813 coding system which can encode the text in the case that a default | 9030 coding system which can encode the text in the case that a default |
| 7814 coding system used in each operation can't encode the text. | 9031 coding system used in each operation can't encode the text. |
| 7815 | 9032 |
| 7825 coding_system_require_warning = 0; | 9042 coding_system_require_warning = 0; |
| 7826 | 9043 |
| 7827 | 9044 |
| 7828 DEFVAR_BOOL ("inhibit-iso-escape-detection", | 9045 DEFVAR_BOOL ("inhibit-iso-escape-detection", |
| 7829 &inhibit_iso_escape_detection, | 9046 &inhibit_iso_escape_detection, |
| 7830 doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection. | 9047 doc: /* |
| 9048 If non-nil, Emacs ignores ISO2022's escape sequence on code detection. | |
| 7831 | 9049 |
| 7832 By default, on reading a file, Emacs tries to detect how the text is | 9050 By default, on reading a file, Emacs tries to detect how the text is |
| 7833 encoded. This code detection is sensitive to escape sequences. If | 9051 encoded. This code detection is sensitive to escape sequences. If |
| 7834 the sequence is valid as ISO2022, the code is determined as one of | 9052 the sequence is valid as ISO2022, the code is determined as one of |
| 7835 the ISO2022 encodings, and the file is decoded by the corresponding | 9053 the ISO2022 encodings, and the file is decoded by the corresponding |
| 7855 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input, | 9073 DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input, |
| 7856 doc: /* Char table for translating self-inserting characters. | 9074 doc: /* Char table for translating self-inserting characters. |
| 7857 This is applied to the result of input methods, not their input. See also | 9075 This is applied to the result of input methods, not their input. See also |
| 7858 `keyboard-translate-table'. */); | 9076 `keyboard-translate-table'. */); |
| 7859 Vtranslation_table_for_input = Qnil; | 9077 Vtranslation_table_for_input = Qnil; |
| 9078 | |
| 9079 { | |
| 9080 Lisp_Object args[coding_arg_max]; | |
| 9081 Lisp_Object plist[16]; | |
| 9082 int i; | |
| 9083 | |
| 9084 for (i = 0; i < coding_arg_max; i++) | |
| 9085 args[i] = Qnil; | |
| 9086 | |
| 9087 plist[0] = intern (":name"); | |
| 9088 plist[1] = args[coding_arg_name] = Qno_conversion; | |
| 9089 plist[2] = intern (":mnemonic"); | |
| 9090 plist[3] = args[coding_arg_mnemonic] = make_number ('='); | |
| 9091 plist[4] = intern (":coding-type"); | |
| 9092 plist[5] = args[coding_arg_coding_type] = Qraw_text; | |
| 9093 plist[6] = intern (":ascii-compatible-p"); | |
| 9094 plist[7] = args[coding_arg_ascii_compatible_p] = Qt; | |
| 9095 plist[8] = intern (":default-char"); | |
| 9096 plist[9] = args[coding_arg_default_char] = make_number (0); | |
| 9097 plist[10] = intern (":for-unibyte"); | |
| 9098 plist[11] = args[coding_arg_for_unibyte] = Qt; | |
| 9099 plist[12] = intern (":docstring"); | |
| 9100 plist[13] = build_string ("Do no conversion.\n\ | |
| 9101 \n\ | |
| 9102 When you visit a file with this coding, the file is read into a\n\ | |
| 9103 unibyte buffer as is, thus each byte of a file is treated as a\n\ | |
| 9104 character."); | |
| 9105 plist[14] = intern (":eol-type"); | |
| 9106 plist[15] = args[coding_arg_eol_type] = Qunix; | |
| 9107 args[coding_arg_plist] = Flist (16, plist); | |
| 9108 Fdefine_coding_system_internal (coding_arg_max, args); | |
| 9109 } | |
| 9110 | |
| 9111 setup_coding_system (Qno_conversion, &keyboard_coding); | |
| 9112 setup_coding_system (Qno_conversion, &terminal_coding); | |
| 9113 setup_coding_system (Qno_conversion, &safe_terminal_coding); | |
| 9114 | |
| 9115 { | |
| 9116 int i; | |
| 9117 | |
| 9118 for (i = 0; i < coding_category_max; i++) | |
| 9119 Fset (AREF (Vcoding_category_table, i), Qno_conversion); | |
| 9120 } | |
| 7860 } | 9121 } |
| 7861 | 9122 |
| 7862 char * | 9123 char * |
| 7863 emacs_strerror (error_number) | 9124 emacs_strerror (error_number) |
| 7864 int error_number; | 9125 int error_number; |
| 7878 | 9139 |
| 7879 return str; | 9140 return str; |
| 7880 } | 9141 } |
| 7881 | 9142 |
| 7882 #endif /* emacs */ | 9143 #endif /* emacs */ |
| 7883 |
