|
17052
|
1 /* Coding system handler (conversion, detection, and etc).
|
|
|
2 Ver.1.0.
|
|
|
3 Copyright (C) 1995 Free Software Foundation, Inc.
|
|
|
4 Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
|
|
|
5
|
|
17071
|
6 This file is part of GNU Emacs.
|
|
|
7
|
|
|
8 GNU Emacs is free software; you can redistribute it and/or modify
|
|
|
9 it under the terms of the GNU General Public License as published by
|
|
|
10 the Free Software Foundation; either version 2, or (at your option)
|
|
|
11 any later version.
|
|
|
12
|
|
|
13 GNU Emacs is distributed in the hope that it will be useful,
|
|
|
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
16 GNU General Public License for more details.
|
|
|
17
|
|
|
18 You should have received a copy of the GNU General Public License
|
|
|
19 along with GNU Emacs; see the file COPYING. If not, write to
|
|
|
20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
|
21 Boston, MA 02111-1307, USA. */
|
|
17052
|
22
|
|
|
23 /*** TABLE OF CONTENTS ***
|
|
|
24
|
|
|
25 1. Preamble
|
|
|
26 2. Emacs' internal format handlers
|
|
|
27 3. ISO2022 handlers
|
|
|
28 4. Shift-JIS and BIG5 handlers
|
|
|
29 5. End-of-line handlers
|
|
|
30 6. C library functions
|
|
|
31 7. Emacs Lisp library functions
|
|
|
32 8. Post-amble
|
|
|
33
|
|
|
34 */
|
|
|
35
|
|
|
36 /*** GENERAL NOTE on CODING SYSTEM ***
|
|
|
37
|
|
|
38 Coding system is an encoding mechanism of one or more character
|
|
|
39 sets. Here's a list of coding systems which Emacs can handle. When
|
|
|
40 we say "decode", it means converting some other coding system to
|
|
|
41 Emacs' internal format, and when we say "encode", it means
|
|
|
42 converting Emacs' internal format to some other coding system.
|
|
|
43
|
|
|
44 0. Emacs' internal format
|
|
|
45
|
|
|
46 Emacs itself holds a multi-lingual character in a buffer and a string
|
|
|
47 in a special format. Details are described in the section 2.
|
|
|
48
|
|
|
49 1. ISO2022
|
|
|
50
|
|
|
51 The most famous coding system for multiple character sets. X's
|
|
|
52 Compound Text, various EUCs (Extended Unix Code), and such coding
|
|
|
53 systems used in Internet communication as ISO-2022-JP are all
|
|
|
54 variants of ISO2022. Details are described in the section 3.
|
|
|
55
|
|
|
56 2. SJIS (or Shift-JIS or MS-Kanji-Code)
|
|
|
57
|
|
|
58 A coding system to encode character sets: ASCII, JISX0201, and
|
|
|
59 JISX0208. Widely used for PC's in Japan. Details are described in
|
|
|
60 the section 4.
|
|
|
61
|
|
|
62 3. BIG5
|
|
|
63
|
|
|
64 A coding system to encode character sets: ASCII and Big5. Widely
|
|
|
65 used by Chinese (mainly in Taiwan and Hong Kong). Details are
|
|
|
66 described in the section 4. In this file, when written as "BIG5"
|
|
|
67 (all uppercase), it means the coding system, and when written as
|
|
|
68 "Big5" (capitalized), it means the character set.
|
|
|
69
|
|
|
70 4. Else
|
|
|
71
|
|
|
72 If a user wants to read/write a text encoded in a coding system not
|
|
|
73 listed above, he can supply a decoder and an encoder for it in CCL
|
|
|
74 (Code Conversion Language) programs. Emacs executes the CCL program
|
|
|
75 while reading/writing.
|
|
|
76
|
|
|
77 Emacs represents a coding-system by a Lisp symbol that has a property
|
|
|
78 `coding-system'. But, before actually using the coding-system, the
|
|
|
79 information about it is set in a structure of type `struct
|
|
|
80 coding_system' for rapid processing. See the section 6 for more
|
|
|
81 detail.
|
|
|
82
|
|
|
83 */
|
|
|
84
|
|
|
85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
|
|
|
86
|
|
|
87 How end-of-line of a text is encoded depends on a system. For
|
|
|
88 instance, Unix's format is just one byte of `line-feed' code,
|
|
|
89 whereas DOS's format is two bytes sequence of `carriage-return' and
|
|
|
90 `line-feed' codes. MacOS's format is one byte of `carriage-return'.
|
|
|
91
|
|
|
92 Since how characters in a text are encoded and how end-of-line is
|
|
|
93 encoded are independent, any coding system described above can take
|
|
|
94 any format of end-of-line. So, Emacs has information of format of
|
|
|
95 end-of-line in each coding-system. See the section 6 for more
|
|
|
96 detail.
|
|
|
97
|
|
|
98 */
|
|
|
99
|
|
|
100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
|
|
|
101
|
|
|
102 These functions check if a text between SRC and SRC_END is encoded
|
|
|
103 in the coding system category XXX. Each returns an integer value in
|
|
|
104 which appropriate flag bits for the category XXX is set. The flag
|
|
|
105 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the
|
|
|
106 template of these functions. */
|
|
|
107 #if 0
|
|
|
108 int
|
|
|
109 detect_coding_internal (src, src_end)
|
|
|
110 unsigned char *src, *src_end;
|
|
|
111 {
|
|
|
112 ...
|
|
|
113 }
|
|
|
114 #endif
|
|
|
115
|
|
|
116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
|
|
|
117
|
|
|
118 These functions decode SRC_BYTES length text at SOURCE encoded in
|
|
|
119 CODING to Emacs' internal format. The resulting text goes to a
|
|
|
120 place pointed by DESTINATION, the length of which should not exceed
|
|
|
121 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
|
|
|
122 The return value is the length of the decoded text. Below is a
|
|
|
123 template of these functions. */
|
|
|
124 #if 0
|
|
|
125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
|
126 struct coding_system *coding;
|
|
|
127 unsigned char *source, *destination;
|
|
|
128 int src_bytes, dst_bytes;
|
|
|
129 int *consumed;
|
|
|
130 {
|
|
|
131 ...
|
|
|
132 }
|
|
|
133 #endif
|
|
|
134
|
|
|
135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
|
|
|
136
|
|
|
137 These functions encode SRC_BYTES length text at SOURCE of Emacs
|
|
|
138 internal format to CODING. The resulting text goes to a place
|
|
|
139 pointed by DESTINATION, the length of which should not exceed
|
|
|
140 DST_BYTES. The bytes actually processed is returned as *CONSUMED.
|
|
|
141 The return value is the length of the encoded text. Below is a
|
|
|
142 template of these functions. */
|
|
|
143 #if 0
|
|
|
144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
|
145 struct coding_system *coding;
|
|
|
146 unsigned char *source, *destination;
|
|
|
147 int src_bytes, dst_bytes;
|
|
|
148 int *consumed;
|
|
|
149 {
|
|
|
150 ...
|
|
|
151 }
|
|
|
152 #endif
|
|
|
153
|
|
|
154 /*** COMMONLY USED MACROS ***/
|
|
|
155
|
|
|
156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
|
|
|
157 THREE_MORE_BYTES safely get one, two, and three bytes from the
|
|
|
158 source text respectively. If there are not enough bytes in the
|
|
|
159 source, they jump to `label_end_of_loop'. The caller should set
|
|
|
160 variables `src' and `src_end' to appropriate areas in advance. */
|
|
|
161
|
|
|
/* Fetch the next source byte into C1 and advance `src' past it.  If
   `src' has already reached `src_end', nothing is read and control
   jumps to `label_end_of_loop' instead.  */
#define ONE_MORE_BYTE(c1)               \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_loop;           \
    c1 = *src++;                        \
  } while (0)
|
|
|
169
|
|
|
/* Fetch the next two source bytes into C1 and C2 (in that order) and
   advance `src' past both.  If fewer than two bytes remain before
   `src_end', nothing is consumed and control jumps to
   `label_end_of_loop'.

   The availability test compares the remaining length instead of
   computing `src + 1': when `src' is already at `src_end', that sum
   would be a pointer beyond one-past-the-end, which C does not allow
   to be formed.  */
#define TWO_MORE_BYTES(c1, c2)                  \
  do {                                          \
    if (src_end - src >= 2)                     \
      (c1) = *src++, (c2) = *src++;             \
    else                                        \
      goto label_end_of_loop;                   \
  } while (0)
|
|
|
177
|
|
|
/* Fetch the next three source bytes into C1, C2 and C3 (in that
   order) and advance `src' past them.  If fewer than three bytes
   remain before `src_end', nothing is consumed and control jumps to
   `label_end_of_loop'.

   The availability test compares the remaining length instead of
   computing `src + 2', which could be a pointer beyond
   one-past-the-end of the source when `src' is at or next to
   `src_end'.  */
#define THREE_MORE_BYTES(c1, c2, c3)                    \
  do {                                                  \
    if (src_end - src >= 3)                             \
      (c1) = *src++, (c2) = *src++, (c3) = *src++;      \
    else                                                \
      goto label_end_of_loop;                           \
  } while (0)
|
|
|
185
|
|
|
186 /* The following three macros DECODE_CHARACTER_ASCII,
|
|
|
187 DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
|
|
|
188 the multi-byte form of a character of each class at the place
|
|
|
189 pointed by `dst'. The caller should set the variable `dst' to
|
|
|
190 point to an appropriate area and the variable `coding' to point to
|
|
|
191 the coding-system of the currently decoding text in advance. */
|
|
|
192
|
|
|
/* Store the ASCII character C at `dst' and advance `dst'.  Outside a
   composition C is written as a single byte; while composing
   (COMPOSING_P is true for `coding->composing') it is written as the
   two bytes 0xA0 and C with its high bit set.  */

#define DECODE_CHARACTER_ASCII(c)               \
  do {                                          \
    if (COMPOSING_P (coding->composing))        \
      {                                         \
        *dst++ = 0xA0;                          \
        *dst++ = (c) | 0x80;                    \
      }                                         \
    else                                        \
      {                                         \
        *dst++ = (c);                           \
      }                                         \
  } while (0)
|
|
|
202
|
|
|
/* Store at `dst' the multi-byte form of one DIMENSION1 character whose
   charset is CHARSET and whose position-code is C, advancing `dst'.

   Bytes written, in order:
     - the base leading-code of CHARSET (raised by 0x20 while
       composing, i.e. when COMPOSING_P is true for
       `coding->composing');
     - the extended leading-code of CHARSET, only if it is nonzero;
     - C with its high bit set.

   The extended leading-code is fetched by a separate assignment and
   then tested explicitly, so the test cannot be misread as a
   comparison.  */

#define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
  do {                                                                  \
    unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
    if (COMPOSING_P (coding->composing))                                \
      *dst++ = leading_code + 0x20;                                     \
    else                                                                \
      *dst++ = leading_code;                                            \
    leading_code = CHARSET_LEADING_CODE_EXT (charset);                  \
    if (leading_code != 0)                                              \
      *dst++ = leading_code;                                            \
    *dst++ = (c) | 0x80;                                                \
  } while (0)
|
|
|
217
|
|
|
/* Store at `dst' the multi-byte form of one DIMENSION2 character whose
   charset is CHARSET and whose position-codes are C1 and C2.  The
   leading-code(s) and C1 are written exactly as
   DECODE_CHARACTER_DIMENSION1 writes them; C2 follows with its high
   bit set.  */

#define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
  do {                                                  \
    DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
    *dst++ = (c2) | 0x80;                               \
  } while (0)
|
|
|
226
|
|
|
227
|
|
|
228 /*** 1. Preamble ***/
|
|
|
229
|
|
|
230 #include <stdio.h>
|
|
|
231
|
|
|
232 #ifdef emacs
|
|
|
233
|
|
|
234 #include <config.h>
|
|
|
235 #include "lisp.h"
|
|
|
236 #include "buffer.h"
|
|
|
237 #include "charset.h"
|
|
|
238 #include "ccl.h"
|
|
|
239 #include "coding.h"
|
|
|
240 #include "window.h"
|
|
|
241
|
|
|
242 #else /* not emacs */
|
|
|
243
|
|
|
244 #include "mulelib.h"
|
|
|
245
|
|
|
246 #endif /* not emacs */
|
|
|
247
|
|
|
248 Lisp_Object Qcoding_system, Qeol_type;
|
|
|
249 Lisp_Object Qbuffer_file_coding_system;
|
|
|
250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
|
|
|
251
|
|
|
252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
|
|
|
253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
|
|
|
254 Lisp_Object Qstart_process, Qopen_network_stream;
|
|
|
255 Lisp_Object Qtarget_idx;
|
|
|
256
|
|
|
257 /* Mnemonic character of each format of end-of-line. */
|
|
|
258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
|
|
|
259 /* Mnemonic character to indicate format of end-of-line is not yet
|
|
|
260 decided. */
|
|
|
261 int eol_mnemonic_undecided;
|
|
|
262
|
|
|
#ifdef emacs

Lisp_Object Qcoding_system_vector;
Lisp_Object Qcoding_system_p;
Lisp_Object Qcoding_system_error;

/* Coding-systems are passed between Emacs Lisp programs and the C
   internal routines through the following three variables.  */

/* Coding-system for reading files and receiving data from a process.  */
Lisp_Object Vcoding_system_for_read;

/* Coding-system for writing files and sending data to a process.  */
Lisp_Object Vcoding_system_for_write;

/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* Coding-system that the terminal accepts for display.  */
struct coding_system terminal_coding;

/* Coding-system of what is sent from the terminal keyboard.  */
struct coding_system keyboard_coding;

Lisp_Object Vcoding_system_alist;

#endif /* emacs */
|
|
|
285
|
|
|
286 Lisp_Object Qcoding_category_index;
|
|
|
287
|
|
|
288 /* List of symbols `coding-category-xxx' ordered by priority. */
|
|
|
289 Lisp_Object Vcoding_category_list;
|
|
|
290
|
|
|
291 /* Table of coding-systems currently assigned to each coding-category. */
|
|
|
292 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
|
|
|
293
|
|
|
294 /* Table of names of symbol for each coding-category. */
|
|
|
295 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
|
|
|
296 "coding-category-internal",
|
|
|
297 "coding-category-sjis",
|
|
|
298 "coding-category-iso-7",
|
|
|
299 "coding-category-iso-8-1",
|
|
|
300 "coding-category-iso-8-2",
|
|
|
301 "coding-category-iso-else",
|
|
|
302 "coding-category-big5",
|
|
|
303 "coding-category-binary"
|
|
|
304 };
|
|
|
305
|
|
|
306 /* Alist of charsets vs the alternate charsets. */
|
|
|
307 Lisp_Object Valternate_charset_table;
|
|
|
308
|
|
|
309 /* Alist of charsets vs revision number. */
|
|
|
310 Lisp_Object Vcharset_revision_alist;
|
|
|
311
|
|
|
312
|
|
|
313 /*** 2. Emacs internal format handlers ***/
|
|
|
314
|
|
|
315 /* Emacs' internal format for encoding multiple character sets is a
|
|
|
316 kind of multi-byte encoding, i.e. encoding a character by a sequence
|
|
|
317 of one-byte codes of variable length. ASCII characters and control
|
|
|
318 characters (e.g. `tab', `newline') are represented by one-byte as
|
|
|
319 is. It takes the range 0x00 through 0x7F. The other characters
|
|
|
320 are represented by a sequence of `base leading-code', optional
|
|
|
321 `extended leading-code', and one or two `position-code's. Length
|
|
|
322 of the sequence is decided by the base leading-code. Leading-code
|
|
|
323 takes the range 0x80 through 0x9F, whereas extended leading-code
|
|
|
324 and position-code take the range 0xA0 through 0xFF. See the
|
|
|
325 document of `charset.h' for more detail about leading-code and
|
|
|
326 position-code.
|
|
|
327
|
|
|
328 There's one exception in this rule. Special leading-code
|
|
|
329 `leading-code-composition' denotes that the following several
|
|
|
330 characters should be composed into one character. Leading-codes of
|
|
|
331 components (except for ASCII) are added 0x20. An ASCII character
|
|
|
332 component is represented by a 2-byte sequence of `0xA0' and
|
|
|
333 `ASCII-code + 0x80'. See also the document in `charset.h' for the
|
|
|
334 detail of composite character. Hence, we can summarize the code
|
|
|
335 range as follows:
|
|
|
336
|
|
|
337 --- CODE RANGE of Emacs' internal format ---
|
|
|
338 (character set) (range)
|
|
|
339 ASCII 0x00 .. 0x7F
|
|
|
340 ELSE (1st byte) 0x80 .. 0x9F
|
|
|
341 (rest bytes) 0xA0 .. 0xFF
|
|
|
342 ---------------------------------------------
|
|
|
343
|
|
|
344 */
|
|
|
345
|
|
|
346 enum emacs_code_class_type emacs_code_class[256];
|
|
|
347
|
|
|
/* Consume one byte from `src' and proceed to the next statement only
   if that byte is 0xA0 or above.  If no byte is left before
   `src_end', jump to `label_end_of_switch'; if the byte is below
   0xA0, make the enclosing function return 0.  */
#define CHECK_CODE_RANGE_A0_FF          \
  do {                                  \
    if (src >= src_end)                 \
      goto label_end_of_switch;         \
    if (*src++ < 0xA0)                  \
      return 0;                         \
  } while (0)
|
|
|
357
|
|
|
358 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
|
|
|
359 Check if a text is encoded in Emacs' internal format. If it is,
|
|
|
360 return CODING_CATEGORY_MASK_INTERNAL, else return 0. */
|
|
|
361
|
|
|
362 int
|
|
|
363 detect_coding_internal (src, src_end)
|
|
|
364 unsigned char *src, *src_end;
|
|
|
365 {
|
|
|
366 unsigned char c;
|
|
|
367 int composing = 0;
|
|
|
368
|
|
|
369 while (src < src_end)
|
|
|
370 {
|
|
|
371 c = *src++;
|
|
|
372
|
|
|
373 if (composing)
|
|
|
374 {
|
|
|
375 if (c < 0xA0)
|
|
|
376 composing = 0;
|
|
|
377 else
|
|
|
378 c -= 0x20;
|
|
|
379 }
|
|
|
380
|
|
|
381 switch (emacs_code_class[c])
|
|
|
382 {
|
|
|
383 case EMACS_ascii_code:
|
|
|
384 case EMACS_linefeed_code:
|
|
|
385 break;
|
|
|
386
|
|
|
387 case EMACS_control_code:
|
|
|
388 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
|
|
|
389 return 0;
|
|
|
390 break;
|
|
|
391
|
|
|
392 case EMACS_invalid_code:
|
|
|
393 return 0;
|
|
|
394
|
|
|
395 case EMACS_leading_code_composition: /* c == 0x80 */
|
|
|
396 if (composing)
|
|
|
397 CHECK_CODE_RANGE_A0_FF;
|
|
|
398 else
|
|
|
399 composing = 1;
|
|
|
400 break;
|
|
|
401
|
|
|
402 case EMACS_leading_code_4:
|
|
|
403 CHECK_CODE_RANGE_A0_FF;
|
|
|
404 /* fall down to check it two more times ... */
|
|
|
405
|
|
|
406 case EMACS_leading_code_3:
|
|
|
407 CHECK_CODE_RANGE_A0_FF;
|
|
|
408 /* fall down to check it one more time ... */
|
|
|
409
|
|
|
410 case EMACS_leading_code_2:
|
|
|
411 CHECK_CODE_RANGE_A0_FF;
|
|
|
412 break;
|
|
|
413
|
|
|
414 default:
|
|
|
415 label_end_of_switch:
|
|
|
416 break;
|
|
|
417 }
|
|
|
418 }
|
|
|
419 return CODING_CATEGORY_MASK_INTERNAL;
|
|
|
420 }
|
|
|
421
|
|
|
422
|
|
|
423 /*** 3. ISO2022 handlers ***/
|
|
|
424
|
|
|
425 /* The following note describes the coding system ISO2022 briefly.
|
|
|
426 Since the intention of this note is to help understanding of the
|
|
|
427 programs in this file, some parts are NOT ACCURATE or OVERLY
|
|
|
428 SIMPLIFIED. For the thorough understanding, please refer to the
|
|
|
429 original document of ISO2022.
|
|
|
430
|
|
|
431 ISO2022 provides many mechanisms to encode several character sets
|
|
|
432 in 7-bit and 8-bit environment. If one chooses a 7-bit environment,
|
|
|
433 all text is encoded by codes of less than 128. This may make the
|
|
|
434 encoded text a little bit longer, but the text get more stability
|
|
|
435 to pass through several gateways (some of them split MSB off).
|
|
|
436
|
|
|
437 There are two kinds of character sets: control character set and
|
|
|
438 graphic character set. The former contains control characters such
|
|
|
439 as `newline' and `escape' to provide control functions (control
|
|
|
440 functions are provided also by escape sequence). The latter
|
|
|
441 contains graphic characters such as ' A' and '-'. Emacs recognizes
|
|
|
442 two control character sets and many graphic character sets.
|
|
|
443
|
|
|
444 Graphic character sets are classified into one of the following
|
|
|
445 four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
|
|
|
446 DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
|
|
|
447 bytes (DIMENSION) and the number of characters in one dimension
|
|
|
448 (CHARS) of the set. In addition, each character set is assigned an
|
|
|
449 identification tag (called "final character" and denoted as <F>
|
|
|
450 here after) which is unique in each class. <F> of each character
|
|
|
451 set is decided by ECMA(*) when it is registered in ISO. Code range
|
|
|
452 of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
|
|
|
453
|
|
|
454 Note (*): ECMA = European Computer Manufacturers Association
|
|
|
455
|
|
|
456 Here are examples of graphic character set [NAME(<F>)]:
|
|
|
457 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
|
|
|
458 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
|
|
|
459 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
|
|
|
460 o DIMENSION2_CHARS96 -- none for the moment
|
|
|
461
|
|
|
462 A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
|
|
|
463 C0 [0x00..0x1F] -- control character plane 0
|
|
|
464 GL [0x20..0x7F] -- graphic character plane 0
|
|
|
465 C1 [0x80..0x9F] -- control character plane 1
|
|
|
466 GR [0xA0..0xFF] -- graphic character plane 1
|
|
|
467
|
|
|
468 A control character set is directly designated and invoked to C0 or
|
|
|
469 C1 by an escape sequence. The most common case is that ISO646's
|
|
|
470 control character set is designated/invoked to C0 and ISO6429's
|
|
|
471 control character set is designated/invoked to C1, and usually
|
|
|
472 these designations/invocations are omitted in a coded text. With
|
|
|
473 7-bit environment, only C0 can be used, and a control character for
|
|
|
474 C1 is encoded by an appropriate escape sequence to fit in the
|
|
|
475 environment. All control characters for C1 are defined the
|
|
|
476 corresponding escape sequences.
|
|
|
477
|
|
|
478 A graphic character set is at first designated to one of four
|
|
|
479 graphic registers (G0 through G3), then these graphic registers are
|
|
|
480 invoked to GL or GR. These designations and invocations can be
|
|
|
481 done independently. The most common case is that G0 is invoked to
|
|
|
482 GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
|
|
|
483 these invocations and designations are omitted in a coded text.
|
|
|
484 With 7-bit environment, only GL can be used.
|
|
|
485
|
|
|
486 When a graphic character set of CHARS94 is invoked to GL, code 0x20
|
|
|
487 and 0x7F of GL area work as control characters SPACE and DEL
|
|
|
488 respectively, and code 0xA0 and 0xFF of GR area should not be used.
|
|
|
489
|
|
|
490 There are two ways of invocation: locking-shift and single-shift.
|
|
|
491 With locking-shift, the invocation lasts until the next different
|
|
|
492 invocation, whereas with single-shift, the invocation works only
|
|
|
493 for the following character and doesn't affect locking-shift.
|
|
|
494 Invocations are done by the following control characters or escape
|
|
|
495 sequences.
|
|
|
496
|
|
|
497 ----------------------------------------------------------------------
|
|
|
498 function control char escape sequence description
|
|
|
499 ----------------------------------------------------------------------
|
|
|
500 SI (shift-in) 0x0F none invoke G0 to GL
|
|
|
501 SO (shift-out) 0x0E none invoke G1 to GL
|
|
|
502 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
|
|
|
503 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
|
|
|
504 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 into GL
|
|
|
505 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 into GL
|
|
|
506 ----------------------------------------------------------------------
|
|
|
507 The first four are for locking-shift. Control characters for these
|
|
|
508 functions are defined by macros ISO_CODE_XXX in `coding.h'.
|
|
|
509
|
|
|
510 Designations are done by the following escape sequences.
|
|
|
511 ----------------------------------------------------------------------
|
|
|
512 escape sequence description
|
|
|
513 ----------------------------------------------------------------------
|
|
|
514 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
|
|
|
515 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
|
|
|
516 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
|
|
|
517 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
|
|
|
518 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
|
|
|
519 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
|
|
|
520 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
|
|
|
521 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
|
|
|
522 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
|
|
|
523 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
|
|
|
524 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
|
|
|
525 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
|
|
|
526 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
|
|
|
527 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
|
|
|
528 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
|
|
|
529 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
|
|
|
530 ----------------------------------------------------------------------
|
|
|
531
|
|
|
532 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
|
|
|
533 of dimension 1, chars 94, and final character <F>, and etc.
|
|
|
534
|
|
|
535 Note (*): Although these designations are not allowed in ISO2022,
|
|
|
536 Emacs accepts them on decoding, and produces them on encoding
|
|
|
537 CHARS96 character set in a coding system which is characterized as
|
|
|
538 7-bit environment, non-locking-shift, and non-single-shift.
|
|
|
539
|
|
|
540 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
|
|
|
541 '(' can be omitted. We call this as "short-form" here after.
|
|
|
542
|
|
|
543 Now you may notice that there are a lot of ways for encoding the
|
|
|
544 same multilingual text in ISO2022. Actually, there exist many
|
|
|
545 coding systems such as Compound Text (used in X's inter client
|
|
|
546 communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
|
|
|
547 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
|
|
|
548 localized platforms), and all of these are variants of ISO2022.
|
|
|
549
|
|
|
550 In addition to the above, Emacs handles two more kinds of escape
|
|
|
551 sequences: ISO6429's direction specification and Emacs' private
|
|
|
552 sequence for specifying character composition.
|
|
|
553
|
|
|
554 ISO6429's direction specification takes the following format:
|
|
|
555 o CSI ']' -- end of the current direction
|
|
|
556 o CSI '0' ']' -- end of the current direction
|
|
|
557 o CSI '1' ']' -- start of left-to-right text
|
|
|
558 o CSI '2' ']' -- start of right-to-left text
|
|
|
559 The control character CSI (0x9B: control sequence introducer) is
|
|
|
560 abbreviated to the escape sequence ESC '[' in 7-bit environment.
|
|
|
561
|
|
|
562 Character composition specification takes the following format:
|
|
|
563 o ESC '0' -- start character composition
|
|
|
564 o ESC '1' -- end character composition
|
|
|
565 Since these are not standard escape sequences of any ISO, the use
|
|
|
566 of them for these meaning is restricted to Emacs only. */
|
|
|
567
|
|
|
568 enum iso_code_class_type iso_code_class[256];
|
|
|
569
|
|
|
570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
|
|
|
571 Check if a text is encoded in ISO2022. If it is, returns an
|
|
|
572 integer in which appropriate flag bits any of:
|
|
|
573 CODING_CATEGORY_MASK_ISO_7
|
|
|
574 CODING_CATEGORY_MASK_ISO_8_1
|
|
|
575 CODING_CATEGORY_MASK_ISO_8_2
|
|
|
576 CODING_CATEGORY_MASK_ISO_ELSE
|
|
|
577 are set. If a code which should never appear in ISO2022 is found,
|
|
|
578 returns 0. */
|
|
|
579
|
|
|
580 int
|
|
|
581 detect_coding_iso2022 (src, src_end)
|
|
|
582 unsigned char *src, *src_end;
|
|
|
583 {
|
|
|
584 unsigned char graphic_register[4];
|
|
|
585 unsigned char c, esc_cntl;
|
|
|
586 int mask = (CODING_CATEGORY_MASK_ISO_7
|
|
|
587 | CODING_CATEGORY_MASK_ISO_8_1
|
|
|
588 | CODING_CATEGORY_MASK_ISO_8_2);
|
|
|
589 /* We may look ahead maximum 3 bytes. */
|
|
|
590 unsigned char *adjusted_src_end = src_end - 3;
|
|
|
591 int i;
|
|
|
592
|
|
|
593 for (i = 0; i < 4; i++)
|
|
|
594 graphic_register[i] = CHARSET_ASCII;
|
|
|
595
|
|
|
596 while (src < adjusted_src_end)
|
|
|
597 {
|
|
|
598 c = *src++;
|
|
|
599 switch (c)
|
|
|
600 {
|
|
|
601 case ISO_CODE_ESC:
|
|
|
602 if (src >= adjusted_src_end)
|
|
|
603 break;
|
|
|
604 c = *src++;
|
|
|
605 if (c == '$')
|
|
|
606 {
|
|
|
607 /* Designation of 2-byte character set. */
|
|
|
608 if (src >= adjusted_src_end)
|
|
|
609 break;
|
|
|
610 c = *src++;
|
|
|
611 }
|
|
|
612 if ((c >= ')' && c <= '+') || (c >= '-' && c <= '/'))
|
|
|
613 /* Designation to graphic register 1, 2, or 3. */
|
|
|
614 mask &= ~CODING_CATEGORY_MASK_ISO_7;
|
|
|
615 else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
|
|
|
616 return CODING_CATEGORY_MASK_ISO_ELSE;
|
|
|
617 break;
|
|
|
618
|
|
|
619 case ISO_CODE_SI:
|
|
|
620 case ISO_CODE_SO:
|
|
|
621 return CODING_CATEGORY_MASK_ISO_ELSE;
|
|
|
622
|
|
|
623 case ISO_CODE_CSI:
|
|
|
624 case ISO_CODE_SS2:
|
|
|
625 case ISO_CODE_SS3:
|
|
|
626 mask &= ~CODING_CATEGORY_MASK_ISO_7;
|
|
|
627 break;
|
|
|
628
|
|
|
629 default:
|
|
|
630 if (c < 0x80)
|
|
|
631 break;
|
|
|
632 else if (c < 0xA0)
|
|
|
633 return 0;
|
|
|
634 else
|
|
|
635 {
|
|
|
636 int count = 1;
|
|
|
637
|
|
|
638 mask &= ~CODING_CATEGORY_MASK_ISO_7;
|
|
|
639 while (src < adjusted_src_end && *src >= 0xA0)
|
|
|
640 count++, src++;
|
|
|
641 if (count & 1 && src < adjusted_src_end)
|
|
|
642 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
|
|
|
643 }
|
|
|
644 break;
|
|
|
645 }
|
|
|
646 }
|
|
|
647
|
|
|
648 return mask;
|
|
|
649 }
|
|
|
650
|
|
|
/* Decode one character of charset CHARSET whose first position code
   is C1, writing its internal form at `dst'.  When CHARSET has
   dimension 2, the second position code is first fetched from `src'
   into the enclosing scope's variable `c2' with ONE_MORE_BYTE (so an
   exhausted source jumps to `label_end_of_loop').  A negative CHARSET
   means the text is ill formed; C1 is then copied as is.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2)             \
      ONE_MORE_BYTE (c2);                                               \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        /* Head of a composition: emit the composition leading-code.  */ \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          {                                                             \
            /* Marker telling that composition rules are embedded.  */  \
            *dst++ = 0xFF;                                              \
          }                                                             \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) < 0)                                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
      }                                                                 \
    else if ((charset) == CHARSET_ASCII)                                \
      {                                                                 \
        DECODE_CHARACTER_ASCII (c1);                                    \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION1 (charset, c1);                      \
      }                                                                 \
    else                                                                \
      {                                                                 \
        DECODE_CHARACTER_DIMENSION2 (charset, c1, c2);                  \
      }                                                                 \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      {                                                                 \
        /* Tell the decoder that a composition rule follows.  */        \
        coding->composing = COMPOSING_WITH_RULE_RULE;                   \
      }                                                                 \
  } while (0)
|
|
|
681
|
|
|
/* Record in `coding' that the charset identified by DIMENSION, CHARS
   and FINAL_CHAR is designated to graphic register REG.

   The charset id comes from ISO_CHARSET_TABLE.  If
   `Valternate_charset_table' has an entry for that charset's symbol,
   the alternate charset is used instead.  When `coding->direction' is
   1 and the charset has a reverse charset, the reverse one is
   designated.  Nothing is recorded when the resulting charset id is
   negative.

   The alternate-charset lookup is performed only for a non-negative
   id, so CHARSET_SYMBOL is never applied to a negative (unknown)
   charset.  NOTE(review): assumes a negative id yields no meaningful
   symbol -- confirm against `charset.h'.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);        \
    if (charset >= 0)                                                      \
      {                                                                    \
        Lisp_Object temp                                                   \
          = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table);    \
        if (! NILP (temp))                                                 \
          charset = get_charset_id (XCONS (temp)->cdr);                    \
      }                                                                    \
    if (charset >= 0)                                                      \
      {                                                                    \
        if (coding->direction == 1                                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
      }                                                                    \
  } while (0)
|
|
|
698
|
|
|
699 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
|
|
|
700
|
|
|
701 int
|
|
|
702 decode_coding_iso2022 (coding, source, destination,
|
|
|
703 src_bytes, dst_bytes, consumed)
|
|
|
704 struct coding_system *coding;
|
|
|
705 unsigned char *source, *destination;
|
|
|
706 int src_bytes, dst_bytes;
|
|
|
707 int *consumed;
|
|
|
708 {
|
|
|
709 unsigned char *src = source;
|
|
|
710 unsigned char *src_end = source + src_bytes;
|
|
|
711 unsigned char *dst = destination;
|
|
|
712 unsigned char *dst_end = destination + dst_bytes;
|
|
|
713 /* Since the maximum bytes produced by each loop is 7, we subtract 6
|
|
|
714 from DST_END to assure that overflow checking is necessary only
|
|
|
715 at the head of loop. */
|
|
|
716 unsigned char *adjusted_dst_end = dst_end - 6;
|
|
|
717 int charset;
|
|
|
718 /* Charsets invoked to graphic plane 0 and 1 respectively. */
|
|
|
719 int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
|
|
|
720 int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
|
|
|
721
|
|
|
722 while (src < src_end && dst < adjusted_dst_end)
|
|
|
723 {
|
|
|
724 /* SRC_BASE remembers the start position in source in each loop.
|
|
|
725 The loop will be exited when there's not enough source text
|
|
|
726 to analyze long escape sequence or 2-byte code (within macros
|
|
|
727 ONE_MORE_BYTE or TWO_MORE_BYTES). In that case, SRC is reset
|
|
|
728 to SRC_BASE before exiting. */
|
|
|
729 unsigned char *src_base = src;
|
|
|
730 unsigned char c1 = *src++, c2, cmprule;
|
|
|
731
|
|
|
732 switch (iso_code_class [c1])
|
|
|
733 {
|
|
|
734 case ISO_0x20_or_0x7F:
|
|
|
735 if (!coding->composing
|
|
|
736 && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
|
|
|
737 {
|
|
|
738 /* This is SPACE or DEL. */
|
|
|
739 *dst++ = c1;
|
|
|
740 break;
|
|
|
741 }
|
|
|
742 /* This is a graphic character, we fall down ... */
|
|
|
743
|
|
|
744 case ISO_graphic_plane_0:
|
|
|
745 if (coding->composing == COMPOSING_WITH_RULE_RULE)
|
|
|
746 {
|
|
|
747 /* This is a composition rule. */
|
|
|
748 *dst++ = c1 | 0x80;
|
|
|
749 coding->composing = COMPOSING_WITH_RULE_TAIL;
|
|
|
750 }
|
|
|
751 else
|
|
|
752 DECODE_ISO_CHARACTER (charset0, c1);
|
|
|
753 break;
|
|
|
754
|
|
|
755 case ISO_0xA0_or_0xFF:
|
|
|
756 if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
|
|
|
757 {
|
|
|
758 /* Invalid code. */
|
|
|
759 *dst++ = c1;
|
|
|
760 break;
|
|
|
761 }
|
|
|
762 /* This is a graphic character, we fall down ... */
|
|
|
763
|
|
|
764 case ISO_graphic_plane_1:
|
|
|
765 DECODE_ISO_CHARACTER (charset1, c1);
|
|
|
766 break;
|
|
|
767
|
|
|
768 case ISO_control_code:
|
|
|
769 /* All ISO2022 control characters in this class have the
|
|
|
770 same representation in Emacs internal format. */
|
|
|
771 *dst++ = c1;
|
|
|
772 break;
|
|
|
773
|
|
|
774 case ISO_carriage_return:
|
|
|
775 if (coding->eol_type == CODING_EOL_CR)
|
|
|
776 {
|
|
|
777 *dst++ = '\n';
|
|
|
778 }
|
|
|
779 else if (coding->eol_type == CODING_EOL_CRLF)
|
|
|
780 {
|
|
|
781 ONE_MORE_BYTE (c1);
|
|
|
782 if (c1 == ISO_CODE_LF)
|
|
|
783 *dst++ = '\n';
|
|
|
784 else
|
|
|
785 {
|
|
|
786 src--;
|
|
|
787 *dst++ = c1;
|
|
|
788 }
|
|
|
789 }
|
|
|
790 else
|
|
|
791 {
|
|
|
792 *dst++ = c1;
|
|
|
793 }
|
|
|
794 break;
|
|
|
795
|
|
|
796 case ISO_shift_out:
|
|
|
797 CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
|
|
|
798 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
|
|
|
799 break;
|
|
|
800
|
|
|
801 case ISO_shift_in:
|
|
|
802 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
|
|
|
803 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
|
|
|
804 break;
|
|
|
805
|
|
|
806 case ISO_single_shift_2_7:
|
|
|
807 case ISO_single_shift_2:
|
|
|
808 /* SS2 is handled as an escape sequence of ESC 'N' */
|
|
|
809 c1 = 'N';
|
|
|
810 goto label_escape_sequence;
|
|
|
811
|
|
|
812 case ISO_single_shift_3:
|
|
|
813 /* SS3 is handled as an escape sequence of ESC 'O' */
|
|
|
814 c1 = 'O';
|
|
|
815 goto label_escape_sequence;
|
|
|
816
|
|
|
817 case ISO_control_sequence_introducer:
|
|
|
818 /* CSI is handled as an escape sequence of ESC '[' ... */
|
|
|
819 c1 = '[';
|
|
|
820 goto label_escape_sequence;
|
|
|
821
|
|
|
822 case ISO_escape:
|
|
|
823 ONE_MORE_BYTE (c1);
|
|
|
824 label_escape_sequence:
|
|
|
825 /* Escape sequences handled by Emacs are invocation,
|
|
|
826 designation, direction specification, and character
|
|
|
827 composition specification. */
|
|
|
828 switch (c1)
|
|
|
829 {
|
|
|
830 case '&': /* revision of following character set */
|
|
|
831 ONE_MORE_BYTE (c1);
|
|
|
832 if (!(c1 >= '@' && c1 <= '~'))
|
|
|
833 {
|
|
|
834 goto label_invalid_escape_sequence;
|
|
|
835 }
|
|
|
836 ONE_MORE_BYTE (c1);
|
|
|
837 if (c1 != ISO_CODE_ESC)
|
|
|
838 {
|
|
|
839 goto label_invalid_escape_sequence;
|
|
|
840 }
|
|
|
841 ONE_MORE_BYTE (c1);
|
|
|
842 goto label_escape_sequence;
|
|
|
843
|
|
|
844 case '$': /* designation of 2-byte character set */
|
|
|
845 ONE_MORE_BYTE (c1);
|
|
|
846 if (c1 >= '@' && c1 <= 'B')
|
|
|
847 { /* designation of JISX0208.1978, GB2312.1980,
|
|
|
848 or JISX0208.1980 */
|
|
|
849 DECODE_DESIGNATION (0, 2, 94, c1);
|
|
|
850 }
|
|
|
851 else if (c1 >= 0x28 && c1 <= 0x2B)
|
|
|
852 { /* designation of DIMENSION2_CHARS94 character set */
|
|
|
853 ONE_MORE_BYTE (c2);
|
|
|
854 DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
|
|
|
855 }
|
|
|
856 else if (c1 >= 0x2C && c1 <= 0x2F)
|
|
|
857 { /* designation of DIMENSION2_CHARS96 character set */
|
|
|
858 ONE_MORE_BYTE (c2);
|
|
|
859 DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
|
|
|
860 }
|
|
|
861 else
|
|
|
862 {
|
|
|
863 goto label_invalid_escape_sequence;
|
|
|
864 }
|
|
|
865 break;
|
|
|
866
|
|
|
867 case 'n': /* invocation of locking-shift-2 */
|
|
|
868 CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
|
|
|
869 break;
|
|
|
870
|
|
|
871 case 'o': /* invocation of locking-shift-3 */
|
|
|
872 CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
|
|
|
873 break;
|
|
|
874
|
|
|
875 case 'N': /* invocation of single-shift-2 */
|
|
|
876 ONE_MORE_BYTE (c1);
|
|
|
877 charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
|
|
|
878 DECODE_ISO_CHARACTER (charset, c1);
|
|
|
879 break;
|
|
|
880
|
|
|
881 case 'O': /* invocation of single-shift-3 */
|
|
|
882 ONE_MORE_BYTE (c1);
|
|
|
883 charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
|
|
|
884 DECODE_ISO_CHARACTER (charset, c1);
|
|
|
885 break;
|
|
|
886
|
|
|
887 case '0': /* start composing without embeded rules */
|
|
|
888 coding->composing = COMPOSING_NO_RULE_HEAD;
|
|
|
889 break;
|
|
|
890
|
|
|
891 case '1': /* end composing */
|
|
|
892 coding->composing = COMPOSING_NO;
|
|
|
893 break;
|
|
|
894
|
|
|
895 case '2': /* start composing with embeded rules */
|
|
|
896 coding->composing = COMPOSING_WITH_RULE_HEAD;
|
|
|
897 break;
|
|
|
898
|
|
|
899 case '[': /* specification of direction */
|
|
|
900 /* For the moment, nested direction is not supported.
|
|
|
901 So, the value of `coding->direction' is 0 or 1: 0
|
|
|
902 means left-to-right, 1 means right-to-left. */
|
|
|
903 ONE_MORE_BYTE (c1);
|
|
|
904 switch (c1)
|
|
|
905 {
|
|
|
906 case ']': /* end of the current direction */
|
|
|
907 coding->direction = 0;
|
|
|
908
|
|
|
909 case '0': /* end of the current direction */
|
|
|
910 case '1': /* start of left-to-right direction */
|
|
|
911 ONE_MORE_BYTE (c1);
|
|
|
912 if (c1 == ']')
|
|
|
913 coding->direction = 0;
|
|
|
914 else
|
|
|
915 goto label_invalid_escape_sequence;
|
|
|
916 break;
|
|
|
917
|
|
|
918 case '2': /* start of right-to-left direction */
|
|
|
919 ONE_MORE_BYTE (c1);
|
|
|
920 if (c1 == ']')
|
|
|
921 coding->direction= 1;
|
|
|
922 else
|
|
|
923 goto label_invalid_escape_sequence;
|
|
|
924 break;
|
|
|
925
|
|
|
926 default:
|
|
|
927 goto label_invalid_escape_sequence;
|
|
|
928 }
|
|
|
929 break;
|
|
|
930
|
|
|
931 default:
|
|
|
932 if (c1 >= 0x28 && c1 <= 0x2B)
|
|
|
933 { /* designation of DIMENSION1_CHARS94 character set */
|
|
|
934 ONE_MORE_BYTE (c2);
|
|
|
935 DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
|
|
|
936 }
|
|
|
937 else if (c1 >= 0x2C && c1 <= 0x2F)
|
|
|
938 { /* designation of DIMENSION1_CHARS96 character set */
|
|
|
939 ONE_MORE_BYTE (c2);
|
|
|
940 DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
|
|
|
941 }
|
|
|
942 else
|
|
|
943 {
|
|
|
944 goto label_invalid_escape_sequence;
|
|
|
945 }
|
|
|
946 }
|
|
|
947 /* We must update these variables now. */
|
|
|
948 charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
|
|
|
949 charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
|
|
|
950 break;
|
|
|
951
|
|
|
952 label_invalid_escape_sequence:
|
|
|
953 {
|
|
|
954 int length = src - src_base;
|
|
|
955
|
|
|
956 bcopy (src_base, dst, length);
|
|
|
957 dst += length;
|
|
|
958 }
|
|
|
959 }
|
|
|
960 continue;
|
|
|
961
|
|
|
962 label_end_of_loop:
|
|
|
963 coding->carryover_size = src - src_base;
|
|
|
964 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
|
965 src = src_base;
|
|
|
966 break;
|
|
|
967 }
|
|
|
968
|
|
|
969 /* If this is the last block of the text to be decoded, we had
|
|
|
970 better just flush out all remaining codes in the text although
|
|
|
971 they are not valid characters. */
|
|
|
972 if (coding->last_block)
|
|
|
973 {
|
|
|
974 bcopy (src, dst, src_end - src);
|
|
|
975 dst += (src_end - src);
|
|
|
976 src = src_end;
|
|
|
977 }
|
|
|
978 *consumed = src - source;
|
|
|
979 return dst - destination;
|
|
|
980 }
|
|
|
981
|
|
|
982 /* ISO2022 encoding stuff. */
|
|
|
983
|
|
|
984 /*
|
|
|
985 It is not enough to say just "ISO2022" on encoding, but we have to
|
|
|
986 specify more details. In Emacs, each coding-system of ISO2022
|
|
|
987 variant has the following specifications:
|
|
|
988 1. Initial designation to G0 thru G3.
|
|
|
989 2. Allows short-form designation?
|
|
|
990 3. ASCII should be designated to G0 before control characters?
|
|
|
991 4. ASCII should be designated to G0 at end of line?
|
|
|
992 5. 7-bit environment or 8-bit environment?
|
|
|
993 6. Use locking-shift?
|
|
|
994 7. Use Single-shift?
|
|
|
995 And the following two are only for Japanese:
|
|
|
996 8. Use ASCII in place of JIS0201-1976-Roman?
|
|
|
997 9. Use JISX0208-1983 in place of JISX0208-1978?
|
|
|
998 These specifications are encoded in `coding->flags' as flag bits
|
|
|
999 defined by macros CODING_FLAG_ISO_XXX. See `coding.h' for more
|
|
|
1000 detail.
|
|
|
1001 */
|
|
|
1002
|
|
|
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG, and record CHARSET as the current designation of REG
   in CODING.

   The bytes are stored through DST, which must be an
   `unsigned char *' variable in scope where the macro is used; DST
   is advanced past the bytes written.  In order, they are:

     - `ESC & <revision>', only if CHARSET has an entry in
       Vcharset_revision_alist,
     - ESC,
     - `$', only if CHARSET is not DIMENSION1,
     - an intermediate character selecting REG: one of "()*+" for a
       CHARS94 charset, one of ",-./" for a CHARS96 charset,
     - <final-char> of CHARSET.

   The intermediate character is left out (short-form designation)
   only for a multi-byte CHARS94 charset whose <final-char> is '@',
   'A', or 'B', designated to register 0, when CODING has the flag
   CODING_FLAG_ISO_SHORT_FORM.

   Each argument is evaluated more than once, so arguments must not
   have side effects.  */

#define ENCODE_DESIGNATION(charset, reg, coding) \
  do { \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset); \
    char *intermediate_char_94 = "()*+"; \
    char *intermediate_char_96 = ",-./"; \
    Lisp_Object temp \
      = Fassq (make_number (charset), Vcharset_revision_alist); \
    if (! NILP (temp)) \
      { \
        /* Announce the revision of the charset first.  */ \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '&'; \
        *dst++ = XINT (XCONS (temp)->cdr) + '@'; \
      } \
    *dst++ = ISO_CODE_ESC; \
    if (CHARSET_DIMENSION (charset) == 1) \
      { \
        if (CHARSET_CHARS (charset) == 94) \
          *dst++ = (unsigned char) (intermediate_char_94[reg]); \
        else \
          *dst++ = (unsigned char) (intermediate_char_96[reg]); \
      } \
    else \
      { \
        *dst++ = '$'; \
        if (CHARSET_CHARS (charset) == 94) \
          { \
            /* Skip the intermediate character only when the \
               short form is both allowed and applicable.  */ \
            if (! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM) \
                || (reg) != 0 \
                || final_char < '@' || final_char > 'B') \
              *dst++ = (unsigned char) (intermediate_char_94[reg]); \
          } \
        else \
          *dst++ = (unsigned char) (intermediate_char_96[reg]); \
      } \
    *dst++ = final_char; \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset); \
  } while (0)
|
|
|
1045
|
|
|
/* The following two macros produce the code for an ISO2022
   single-shift function (single-shift-2 and single-shift-3
   respectively) and mark CODING as being in single-shifting state.
   With CODING_FLAG_ISO_SEVEN_BITS the function is written as a
   two-byte escape sequence (ESC 'N' or ESC 'O'); otherwise it is
   written as the single control code ISO_CODE_SS2 or ISO_CODE_SS3.
   Both use the variables `coding' and `dst' of the enclosing scope.  */

#define ENCODE_SINGLE_SHIFT_2 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'N'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3 \
  do { \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3; \
    else \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = 'O'; \
      } \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1; \
  } while (0)
|
|
|
1067
|
|
|
/* The following four macros produce the code for an ISO2022
   locking-shift function and record in CODING which graphic register
   is thereby invoked to graphic plane 0:

     ENCODE_SHIFT_IN         ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT        ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2  ESC 'n'       register 2
     ENCODE_LOCKING_SHIFT_3  ESC 'o'       register 3

   All use the variables `coding' and `dst' of the enclosing scope.  */

#define ENCODE_SHIFT_IN \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI; \
  } while (0)

#define ENCODE_SHIFT_OUT \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'n'; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3 \
  do { \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC; \
    *dst++ = 'o'; \
  } while (0)
|
|
|
1095
|
|
|
/* Produce codes for a DIMENSION1 character of which character set is
   CHARSET and position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The macro loops until the character has been written:
     - if CODING is in single-shifting state, C1 is written with the
       8th bit cleared (7-bit environment) or set (otherwise) and the
       single-shifting state is reset;
     - if CHARSET is invoked to graphic plane 0, C1 is written with
       the 8th bit cleared;
     - if CHARSET is invoked to graphic plane 1, C1 is written with
       the 8th bit set;
     - otherwise encode_invocation_designation is called to make
       CHARSET usable, and the checks are repeated.

   Uses the variables `coding' and `dst' of the enclosing scope.  The
   arguments are evaluated more than once, so they must not have side
   effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          *dst++ = (c1) & 0x7F; \
        else \
          *dst++ = (c1) | 0x80; \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80; \
        break; \
      } \
    else \
      /* Since CHARSET is not yet invoked to any graphic planes, we \
         must invoke it, or, at first, designate it to some graphic \
         register.  Then repeat the loop to actually produce the \
         character.  */ \
      dst = encode_invocation_designation ((charset), coding, dst); \
  } while (1)
|
|
|
1129
|
|
|
/* Produce codes for a DIMENSION2 character of which character set is
   CHARSET and position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The macro loops until the character has been written:
     - if CODING is in single-shifting state, C1 and C2 are written
       with the 8th bit cleared (7-bit environment) or set
       (otherwise) and the single-shifting state is reset;
     - if CHARSET is invoked to graphic plane 0, both bytes are
       written with the 8th bit cleared;
     - if CHARSET is invoked to graphic plane 1, both bytes are
       written with the 8th bit set;
     - otherwise encode_invocation_designation is called to make
       CHARSET usable, and the checks are repeated.

   Uses the variables `coding' and `dst' of the enclosing scope.  The
   arguments are evaluated more than once, so they must not have side
   effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
  do { \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) \
      { \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F; \
        else \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80; \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)) \
      { \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F; \
        break; \
      } \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1)) \
      { \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80; \
        break; \
      } \
    else \
      /* Since CHARSET is not yet invoked to any graphic planes, we \
         must invoke it, or, at first, designate it to some graphic \
         register.  Then repeat the loop to actually produce the \
         character.  */ \
      dst = encode_invocation_designation ((charset), coding, dst); \
  } while (1)
|
|
|
1162
|
|
|
1163 /* Produce designation and invocation codes at a place pointed by DST
|
|
|
1164 to use CHARSET. The element `spec.iso2022' of *CODING is updated.
|
|
|
1165 Return new DST. */
|
|
|
1166
|
|
|
1167 unsigned char *
|
|
|
1168 encode_invocation_designation (charset, coding, dst)
|
|
|
1169 int charset;
|
|
|
1170 struct coding_system *coding;
|
|
|
1171 unsigned char *dst;
|
|
|
1172 {
|
|
|
1173 int reg; /* graphic register number */
|
|
|
1174
|
|
|
1175 /* At first, check designations. */
|
|
|
1176 for (reg = 0; reg < 4; reg++)
|
|
|
1177 if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
|
|
|
1178 break;
|
|
|
1179
|
|
|
1180 if (reg >= 4)
|
|
|
1181 {
|
|
|
1182 /* CHARSET is not yet designated to any graphic registers. */
|
|
|
1183 /* At first check the requested designation. */
|
|
|
1184 reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
|
|
|
1185 if (reg < 0)
|
|
|
1186 /* Since CHARSET requests no special designation, designate to
|
|
|
1187 graphic register 0. */
|
|
|
1188 reg = 0;
|
|
|
1189
|
|
|
1190 ENCODE_DESIGNATION (charset, reg, coding);
|
|
|
1191 }
|
|
|
1192
|
|
|
1193 if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
|
|
|
1194 && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
|
|
|
1195 {
|
|
|
1196 /* Since the graphic register REG is not invoked to any graphic
|
|
|
1197 planes, invoke it to graphic plane 0. */
|
|
|
1198 switch (reg)
|
|
|
1199 {
|
|
|
1200 case 0: /* graphic register 0 */
|
|
|
1201 ENCODE_SHIFT_IN;
|
|
|
1202 break;
|
|
|
1203
|
|
|
1204 case 1: /* graphic register 1 */
|
|
|
1205 ENCODE_SHIFT_OUT;
|
|
|
1206 break;
|
|
|
1207
|
|
|
1208 case 2: /* graphic register 2 */
|
|
|
1209 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
|
|
|
1210 ENCODE_SINGLE_SHIFT_2;
|
|
|
1211 else
|
|
|
1212 ENCODE_LOCKING_SHIFT_2;
|
|
|
1213 break;
|
|
|
1214
|
|
|
1215 case 3: /* graphic register 3 */
|
|
|
1216 if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
|
|
|
1217 ENCODE_SINGLE_SHIFT_3;
|
|
|
1218 else
|
|
|
1219 ENCODE_LOCKING_SHIFT_3;
|
|
|
1220 break;
|
|
|
1221 }
|
|
|
1222 }
|
|
|
1223 return dst;
|
|
|
1224 }
|
|
|
1225
|
|
|
/* The following three macros produce codes for indicating
   composition: start of composition without embedded rules (ESC '0'),
   start of composition with embedded rules (ESC '2'), and end of
   composition (ESC '1').  Each writes two bytes through the variable
   `dst' of the enclosing scope.  Each expansion is a single
   parenthesized expression, so it is safe inside a larger
   expression as well as as a statement.  */
#define ENCODE_COMPOSITION_NO_RULE_START (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END (*dst++ = ISO_CODE_ESC, *dst++ = '1')
|
|
|
1230
|
|
|
/* The following three macros produce codes for indicating direction
   of text.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes CSI: as ESC '[' when
   CODING has CODING_FLAG_ISO_SEVEN_BITS, as the single code
   ISO_CODE_CSI otherwise.  ENCODE_DIRECTION_R2L writes CSI '2' ']'
   and ENCODE_DIRECTION_L2R writes CSI '0' ']'.

   All three are statements (use them followed by a semicolon) and
   use the variables `coding' and `dst' of the enclosing scope.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
  do { \
    /* Test the flag bit; other bits of `flags' may be set too.  */ \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS) \
      { \
        *dst++ = ISO_CODE_ESC; \
        *dst++ = '['; \
      } \
    else \
      *dst++ = ISO_CODE_CSI; \
  } while (0)

#define ENCODE_DIRECTION_R2L \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '2'; \
    *dst++ = ']'; \
  } while (0)

#define ENCODE_DIRECTION_L2R \
  do { \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER; \
    *dst++ = '0'; \
    *dst++ = ']'; \
  } while (0)
|
|
|
1246
|
|
|
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state.

   If graphic plane 0 does not currently have register 0 invoked,
   shift-in is produced.  Then, for each of the four graphic
   registers: when CODING specifies an initial designation and the
   current designation differs from it, the initial charset is
   designated again; when CODING specifies no initial designation
   (negative value), the current designation is forgotten (set to -1)
   but only if EOL is nonzero.

   Uses the variables `coding' and `dst' of the enclosing scope.  */
#define ENCODE_RESET_PLANE_AND_REGISTER(eol) \
  do { \
    int greg = 0; \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0) \
      ENCODE_SHIFT_IN; \
    while (greg < 4) \
      { \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, greg) >= 0) \
          { \
            if (CODING_SPEC_ISO_DESIGNATION (coding, greg) \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, greg)) \
              ENCODE_DESIGNATION \
                (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, greg), \
                 greg, coding); \
          } \
        else if (eol) \
          CODING_SPEC_ISO_DESIGNATION (coding, greg) = -1; \
        greg++; \
      } \
  } while (0)
|
|
|
1266
|
|
|
1267 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
|
|
|
1268
|
|
|
1269 int
|
|
|
1270 encode_coding_iso2022 (coding, source, destination,
|
|
|
1271 src_bytes, dst_bytes, consumed)
|
|
|
1272 struct coding_system *coding;
|
|
|
1273 unsigned char *source, *destination;
|
|
|
1274 int src_bytes, dst_bytes;
|
|
|
1275 int *consumed;
|
|
|
1276 {
|
|
|
1277 unsigned char *src = source;
|
|
|
1278 unsigned char *src_end = source + src_bytes;
|
|
|
1279 unsigned char *dst = destination;
|
|
|
1280 unsigned char *dst_end = destination + dst_bytes;
|
|
|
1281 /* Since the maximum bytes produced by each loop is 6, we subtract 5
|
|
|
1282 from DST_END to assure overflow checking is necessary only at the
|
|
|
1283 head of loop. */
|
|
|
1284 unsigned char *adjusted_dst_end = dst_end - 5;
|
|
|
1285
|
|
|
1286 while (src < src_end && dst < adjusted_dst_end)
|
|
|
1287 {
|
|
|
1288 /* SRC_BASE remembers the start position in source in each loop.
|
|
|
1289 The loop will be exited when there's not enough source text
|
|
|
1290 to analyze multi-byte codes (within macros ONE_MORE_BYTE,
|
|
|
1291 TWO_MORE_BYTES, and THREE_MORE_BYTES). In that case, SRC is
|
|
|
1292 reset to SRC_BASE before exiting. */
|
|
|
1293 unsigned char *src_base = src;
|
|
|
1294 unsigned char c1 = *src++, c2, c3, c4;
|
|
|
1295 int charset;
|
|
|
1296
|
|
|
1297 /* If we are seeing a component of a composite character, we are
|
|
|
1298 seeing a leading-code specially encoded for composition, or a
|
|
|
1299 composition rule if composing with rule. We must set C1
|
|
|
1300 to a normal leading-code or an ASCII code. If we are not at
|
|
|
1301 a composed character, we must reset the composition state. */
|
|
|
1302 if (COMPOSING_P (coding->composing))
|
|
|
1303 {
|
|
|
1304 if (c1 < 0xA0)
|
|
|
1305 {
|
|
|
1306 /* We are not in a composite character any longer. */
|
|
|
1307 coding->composing = COMPOSING_NO;
|
|
|
1308 ENCODE_COMPOSITION_END;
|
|
|
1309 }
|
|
|
1310 else
|
|
|
1311 {
|
|
|
1312 if (coding->composing == COMPOSING_WITH_RULE_RULE)
|
|
|
1313 {
|
|
|
1314 *dst++ = c1 & 0x7F;
|
|
|
1315 coding->composing = COMPOSING_WITH_RULE_HEAD;
|
|
|
1316 continue;
|
|
|
1317 }
|
|
|
1318 else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
|
|
|
1319 coding->composing = COMPOSING_WITH_RULE_RULE;
|
|
|
1320 if (c1 == 0xA0)
|
|
|
1321 {
|
|
|
1322 /* This is an ASCII component. */
|
|
|
1323 ONE_MORE_BYTE (c1);
|
|
|
1324 c1 &= 0x7F;
|
|
|
1325 }
|
|
|
1326 else
|
|
|
1327 /* This is a leading-code of non ASCII component. */
|
|
|
1328 c1 -= 0x20;
|
|
|
1329 }
|
|
|
1330 }
|
|
|
1331
|
|
|
1332 /* Now encode one character. C1 is a control character, an
|
|
|
1333 ASCII character, or a leading-code of multi-byte character. */
|
|
|
1334 switch (emacs_code_class[c1])
|
|
|
1335 {
|
|
|
1336 case EMACS_ascii_code:
|
|
|
1337 ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
|
|
|
1338 break;
|
|
|
1339
|
|
|
1340 case EMACS_control_code:
|
|
|
1341 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
|
|
|
1342 ENCODE_RESET_PLANE_AND_REGISTER (0);
|
|
|
1343 *dst++ = c1;
|
|
|
1344 break;
|
|
|
1345
|
|
|
1346 case EMACS_carriage_return_code:
|
|
|
1347 if (!coding->selective)
|
|
|
1348 {
|
|
|
1349 if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
|
|
|
1350 ENCODE_RESET_PLANE_AND_REGISTER (0);
|
|
|
1351 *dst++ = c1;
|
|
|
1352 break;
|
|
|
1353 }
|
|
|
1354 /* fall down to treat '\r' as '\n' ... */
|
|
|
1355
|
|
|
1356 case EMACS_linefeed_code:
|
|
|
1357 if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
|
|
|
1358 ENCODE_RESET_PLANE_AND_REGISTER (1);
|
|
|
1359 if (coding->eol_type == CODING_EOL_LF
|
|
|
1360 || coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
|
1361 *dst++ = ISO_CODE_LF;
|
|
|
1362 else if (coding->eol_type == CODING_EOL_CRLF)
|
|
|
1363 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
|
|
|
1364 else
|
|
|
1365 *dst++ = ISO_CODE_CR;
|
|
|
1366 break;
|
|
|
1367
|
|
|
1368 case EMACS_leading_code_2:
|
|
|
1369 ONE_MORE_BYTE (c2);
|
|
|
1370 ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
|
|
|
1371 break;
|
|
|
1372
|
|
|
1373 case EMACS_leading_code_3:
|
|
|
1374 TWO_MORE_BYTES (c2, c3);
|
|
|
1375 if (c1 < LEADING_CODE_PRIVATE_11)
|
|
|
1376 ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
|
|
|
1377 else
|
|
|
1378 ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
|
|
|
1379 break;
|
|
|
1380
|
|
|
1381 case EMACS_leading_code_4:
|
|
|
1382 THREE_MORE_BYTES (c2, c3, c4);
|
|
|
1383 ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
|
|
|
1384 break;
|
|
|
1385
|
|
|
1386 case EMACS_leading_code_composition:
|
|
|
1387 ONE_MORE_BYTE (c1);
|
|
|
1388 if (c1 == 0xFF)
|
|
|
1389 {
|
|
|
1390 coding->composing = COMPOSING_WITH_RULE_HEAD;
|
|
|
1391 ENCODE_COMPOSITION_WITH_RULE_START;
|
|
|
1392 }
|
|
|
1393 else
|
|
|
1394 {
|
|
|
1395 /* Rewind one byte because it is a character code of
|
|
|
1396 composition elements. */
|
|
|
1397 src--;
|
|
|
1398 coding->composing = COMPOSING_NO_RULE_HEAD;
|
|
|
1399 ENCODE_COMPOSITION_NO_RULE_START;
|
|
|
1400 }
|
|
|
1401 break;
|
|
|
1402
|
|
|
1403 case EMACS_invalid_code:
|
|
|
1404 *dst++ = c1;
|
|
|
1405 break;
|
|
|
1406 }
|
|
|
1407 continue;
|
|
|
1408 label_end_of_loop:
|
|
|
1409 coding->carryover_size = src - src_base;
|
|
|
1410 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
|
1411 src = src_base;
|
|
|
1412 break;
|
|
|
1413 }
|
|
|
1414
|
|
|
1415 /* If this is the last block of the text to be encoded, we must
|
|
|
1416 reset the state of graphic planes and registers to initial one.
|
|
|
1417 In addition, we had better just flush out all remaining codes in
|
|
|
1418 the text although they are not valid characters. */
|
|
|
1419 if (coding->last_block)
|
|
|
1420 {
|
|
|
1421 ENCODE_RESET_PLANE_AND_REGISTER (1);
|
|
|
1422 bcopy(src, dst, src_end - src);
|
|
|
1423 dst += (src_end - src);
|
|
|
1424 src = src_end;
|
|
|
1425 }
|
|
|
1426 *consumed = src - source;
|
|
|
1427 return dst - destination;
|
|
|
1428 }
|
|
|
1429
|
|
|
1430
|
|
|
1431 /*** 4. SJIS and BIG5 handlers ***/
|
|
|
1432
|
|
|
1433 /* Although SJIS and BIG5 are not ISO coding systems, they are used
|
|
|
1434 quite widely. So, for the moment, Emacs supports them in the bare
|
|
|
1435 C code. But, in the future, they may be supported only by CCL. */
|
|
|
1436
|
|
|
1437 /* SJIS is a coding system encoding three character sets: ASCII, right
|
|
|
1438 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
|
|
|
1439 as is. A character of charset katakana-jisx0201 is encoded by
|
|
|
1440 "position-code + 0x80". A character of charset japanese-jisx0208
|
|
|
1441 is encoded in 2-byte but two position-codes are divided and shifted
|
|
|
1442 so that it fit in the range below.
|
|
|
1443
|
|
|
1444 --- CODE RANGE of SJIS ---
|
|
|
1445 (character set) (range)
|
|
|
1446 ASCII 0x00 .. 0x7F
|
|
|
1447 KATAKANA-JISX0201 0xA0 .. 0xDF
|
|
|
1448 JISX0208 (1st byte) 0x80 .. 0x9F and 0xE0 .. 0xFF
|
|
|
1449 (2nd byte) 0x40 .. 0xFF
|
|
|
1450 -------------------------------
|
|
|
1451
|
|
|
1452 */
|
|
|
1453
|
|
|
1454 /* BIG5 is a coding system encoding two character sets: ASCII and
|
|
|
1455 Big5. An ASCII character is encoded as is. Big5 is a two-byte
|
|
|
1456 character set and is encoded in two-byte.
|
|
|
1457
|
|
|
1458 --- CODE RANGE of BIG5 ---
|
|
|
1459 (character set) (range)
|
|
|
1460 ASCII 0x00 .. 0x7F
|
|
|
1461 Big5 (1st byte) 0xA1 .. 0xFE
|
|
|
1462 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
|
|
|
1463 --------------------------
|
|
|
1464
|
|
|
1465 Since the number of characters in Big5 is larger than maximum
|
|
|
1466 characters in Emacs' charset (96x96), it can't be handled as one
|
|
|
1467 charset. So, in Emacs, Big5 is divided into two: `charset-big5-1'
|
|
|
1468 and `charset-big5-2'. Both are DIMENSION2 and CHARS94. The former
|
|
|
1469 contains frequently used characters and the latter contains less
|
|
|
1470 frequently used characters. */
|
|
|
1471
|
|
|
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   DECODE_BIG5 reads B1 and B2 and stores into CHARSET, C1, and C2;
   ENCODE_BIG5 reads CHARSET, C1, and C2 and stores into B1 and B2.
   Input arguments are evaluated more than once, so they must not
   have side effects; output arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2) \
  do { \
    /* Serial number of the character among all Big5 characters; \
       the subtrahend closes the gap between the two ranges of the \
       2nd byte.  */ \
    unsigned int temp \
      = ((b1) - 0xA1) * BIG5_SAME_ROW \
        + (b2) - ((b2) < 0x7F ? 0x40 : 0x62); \
    if ((b1) < 0xC9) \
      (charset) = charset_big5_1; \
    else \
      { \
        (charset) = charset_big5_2; \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW; \
      } \
    (c1) = temp / (0xFF - 0xA1) + 0x21; \
    (c2) = temp % (0xFF - 0xA1) + 0x21; \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2) \
  do { \
    unsigned int temp \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21); \
    if ((charset) == charset_big5_2) \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1); \
    (b1) = temp / BIG5_SAME_ROW + 0xA1; \
    (b2) = temp % BIG5_SAME_ROW; \
    /* Map the offset within the row back onto the two ranges of \
       the 2nd byte.  */ \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62; \
  } while (0)
|
|
|
1504
|
|
|
1505 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
|
|
|
1506 Check if a text is encoded in SJIS. If it is, return
|
|
|
1507 CODING_CATEGORY_MASK_SJIS, else return 0. */
|
|
|
1508
|
|
|
1509 int
|
|
|
1510 detect_coding_sjis (src, src_end)
|
|
|
1511 unsigned char *src, *src_end;
|
|
|
1512 {
|
|
|
1513 unsigned char c;
|
|
|
1514
|
|
|
1515 while (src < src_end)
|
|
|
1516 {
|
|
|
1517 c = *src++;
|
|
|
1518 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
|
|
|
1519 return 0;
|
|
|
1520 if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
|
|
|
1521 {
|
|
|
1522 if (src < src_end && *src++ < 0x40)
|
|
|
1523 return 0;
|
|
|
1524 }
|
|
|
1525 }
|
|
|
1526 return CODING_CATEGORY_MASK_SJIS;
|
|
|
1527 }
|
|
|
1528
|
|
|
1529 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
|
|
|
1530 Check if a text is encoded in BIG5. If it is, return
|
|
|
1531 CODING_CATEGORY_MASK_BIG5, else return 0. */
|
|
|
1532
|
|
|
1533 int
|
|
|
1534 detect_coding_big5 (src, src_end)
|
|
|
1535 unsigned char *src, *src_end;
|
|
|
1536 {
|
|
|
1537 unsigned char c;
|
|
|
1538
|
|
|
1539 while (src < src_end)
|
|
|
1540 {
|
|
|
1541 c = *src++;
|
|
|
1542 if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
|
|
|
1543 return 0;
|
|
|
1544 if (c >= 0xA1)
|
|
|
1545 {
|
|
|
1546 if (src >= src_end)
|
|
|
1547 break;
|
|
|
1548 c = *src++;
|
|
|
1549 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
|
|
|
1550 return 0;
|
|
|
1551 }
|
|
|
1552 }
|
|
|
1553 return CODING_CATEGORY_MASK_BIG5;
|
|
|
1554 }
|
|
|
1555
|
|
|
1556 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
|
|
|
1557 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
|
|
|
1558
|
|
|
1559 int
|
|
|
1560 decode_coding_sjis_big5 (coding, source, destination,
|
|
|
1561 src_bytes, dst_bytes, consumed, sjis_p)
|
|
|
1562 struct coding_system *coding;
|
|
|
1563 unsigned char *source, *destination;
|
|
|
1564 int src_bytes, dst_bytes;
|
|
|
1565 int *consumed;
|
|
|
1566 int sjis_p;
|
|
|
1567 {
|
|
|
1568 unsigned char *src = source;
|
|
|
1569 unsigned char *src_end = source + src_bytes;
|
|
|
1570 unsigned char *dst = destination;
|
|
|
1571 unsigned char *dst_end = destination + dst_bytes;
|
|
|
1572 /* Since the maximum bytes produced by each loop is 4, we subtract 3
|
|
|
1573 from DST_END to assure overflow checking is necessary only at the
|
|
|
1574 head of loop. */
|
|
|
1575 unsigned char *adjusted_dst_end = dst_end - 3;
|
|
|
1576
|
|
|
1577 while (src < src_end && dst < adjusted_dst_end)
|
|
|
1578 {
|
|
|
1579 /* SRC_BASE remembers the start position in source in each loop.
|
|
|
1580 The loop will be exited when there's not enough source text
|
|
|
1581 to analyze two-byte character (within macro ONE_MORE_BYTE).
|
|
|
1582 In that case, SRC is reset to SRC_BASE before exiting. */
|
|
|
1583 unsigned char *src_base = src;
|
|
|
1584 unsigned char c1 = *src++, c2, c3, c4;
|
|
|
1585
|
|
|
1586 if (c1 == '\r')
|
|
|
1587 {
|
|
|
1588 if (coding->eol_type == CODING_EOL_CRLF)
|
|
|
1589 {
|
|
|
1590 ONE_MORE_BYTE (c2);
|
|
|
1591 if (c2 == '\n')
|
|
|
1592 *dst++ = c2;
|
|
|
1593 else
|
|
|
1594 /* To process C2 again, SRC is subtracted by 1. */
|
|
|
1595 *dst++ = c1, src--;
|
|
|
1596 }
|
|
|
1597 else
|
|
|
1598 *dst++ = c1;
|
|
|
1599 }
|
|
|
1600 else if (c1 < 0x80)
|
|
|
1601 *dst++ = c1;
|
|
|
1602 else if (c1 < 0xA0 || c1 >= 0xE0)
|
|
|
1603 {
|
|
|
1604 /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
|
|
|
1605 if (sjis_p)
|
|
|
1606 {
|
|
|
1607 ONE_MORE_BYTE (c2);
|
|
|
1608 DECODE_SJIS (c1, c2, c3, c4);
|
|
|
1609 DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
|
|
|
1610 }
|
|
|
1611 else if (c1 >= 0xE0 && c1 < 0xFF)
|
|
|
1612 {
|
|
|
1613 int charset;
|
|
|
1614
|
|
|
1615 ONE_MORE_BYTE (c2);
|
|
|
1616 DECODE_BIG5 (c1, c2, charset, c3, c4);
|
|
|
1617 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
|
|
|
1618 }
|
|
|
1619 else /* Invalid code */
|
|
|
1620 *dst++ = c1;
|
|
|
1621 }
|
|
|
1622 else
|
|
|
1623 {
|
|
|
1624 /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
|
|
|
1625 if (sjis_p)
|
|
|
1626 DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
|
|
|
1627 else
|
|
|
1628 {
|
|
|
1629 int charset;
|
|
|
1630
|
|
|
1631 ONE_MORE_BYTE (c2);
|
|
|
1632 DECODE_BIG5 (c1, c2, charset, c3, c4);
|
|
|
1633 DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
|
|
|
1634 }
|
|
|
1635 }
|
|
|
1636 continue;
|
|
|
1637
|
|
|
1638 label_end_of_loop:
|
|
|
1639 coding->carryover_size = src - src_base;
|
|
|
1640 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
|
1641 src = src_base;
|
|
|
1642 break;
|
|
|
1643 }
|
|
|
1644
|
|
|
1645 *consumed = src - source;
|
|
|
1646 return dst - destination;
|
|
|
1647 }
|
|
|
1648
|
|
|
1649 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
|
|
|
1650 This function can encode `charset_ascii', `charset_katakana_jisx0201',
|
|
|
1651 `charset_jisx0208', `charset_big5_1', and `charset_big5-2'. We are
|
|
|
1652 sure that all these charsets are registered as official charset
|
|
|
1653 (i.e. do not have extended leading-codes). Characters of other
|
|
|
1654 charsets are produced without any encoding. If SJIS_P is 1, encode
|
|
|
1655 SJIS text, else encode BIG5 text. */
|
|
|
1656
|
|
|
1657 int
|
|
|
1658 encode_coding_sjis_big5 (coding, source, destination,
|
|
|
1659 src_bytes, dst_bytes, consumed, sjis_p)
|
|
|
1660 struct coding_system *coding;
|
|
|
1661 unsigned char *source, *destination;
|
|
|
1662 int src_bytes, dst_bytes;
|
|
|
1663 int *consumed;
|
|
|
1664 int sjis_p;
|
|
|
1665 {
|
|
|
1666 unsigned char *src = source;
|
|
|
1667 unsigned char *src_end = source + src_bytes;
|
|
|
1668 unsigned char *dst = destination;
|
|
|
1669 unsigned char *dst_end = destination + dst_bytes;
|
|
|
1670 /* Since the maximum bytes produced by each loop is 2, we subtract 1
|
|
|
1671 from DST_END to assure overflow checking is necessary only at the
|
|
|
1672 head of loop. */
|
|
|
1673 unsigned char *adjusted_dst_end = dst_end - 1;
|
|
|
1674
|
|
|
1675 while (src < src_end && dst < adjusted_dst_end)
|
|
|
1676 {
|
|
|
1677 /* SRC_BASE remembers the start position in source in each loop.
|
|
|
1678 The loop will be exited when there's not enough source text
|
|
|
1679 to analyze multi-byte codes (within macros ONE_MORE_BYTE and
|
|
|
1680 TWO_MORE_BYTES). In that case, SRC is reset to SRC_BASE
|
|
|
1681 before exiting. */
|
|
|
1682 unsigned char *src_base = src;
|
|
|
1683 unsigned char c1 = *src++, c2, c3, c4;
|
|
|
1684
|
|
|
1685 if (coding->composing)
|
|
|
1686 {
|
|
|
1687 if (c1 == 0xA0)
|
|
|
1688 {
|
|
|
1689 ONE_MORE_BYTE (c1);
|
|
|
1690 c1 &= 0x7F;
|
|
|
1691 }
|
|
|
1692 else if (c1 >= 0xA0)
|
|
|
1693 c1 -= 0x20;
|
|
|
1694 else
|
|
|
1695 coding->composing = 0;
|
|
|
1696 }
|
|
|
1697
|
|
|
1698 switch (emacs_code_class[c1])
|
|
|
1699 {
|
|
|
1700 case EMACS_ascii_code:
|
|
|
1701 case EMACS_control_code:
|
|
|
1702 *dst++ = c1;
|
|
|
1703 break;
|
|
|
1704
|
|
|
1705 case EMACS_carriage_return_code:
|
|
|
1706 if (!coding->selective)
|
|
|
1707 {
|
|
|
1708 *dst++ = c1;
|
|
|
1709 break;
|
|
|
1710 }
|
|
|
1711 /* fall down to treat '\r' as '\n' ... */
|
|
|
1712
|
|
|
1713 case EMACS_linefeed_code:
|
|
|
1714 if (coding->eol_type == CODING_EOL_LF
|
|
|
1715 || coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
|
1716 *dst++ = '\n';
|
|
|
1717 else if (coding->eol_type == CODING_EOL_CRLF)
|
|
|
1718 *dst++ = '\r', *dst++ = '\n';
|
|
|
1719 else
|
|
|
1720 *dst++ = '\r';
|
|
|
1721 break;
|
|
|
1722
|
|
|
1723 case EMACS_leading_code_2:
|
|
|
1724 ONE_MORE_BYTE (c2);
|
|
|
1725 if (sjis_p && c1 == charset_katakana_jisx0201)
|
|
|
1726 *dst++ = c2;
|
|
|
1727 else
|
|
|
1728 *dst++ = c1, *dst++ = c2;
|
|
|
1729 break;
|
|
|
1730
|
|
|
1731 case EMACS_leading_code_3:
|
|
|
1732 TWO_MORE_BYTES (c2, c3);
|
|
|
1733 c2 &= 0x7F, c3 &= 0x7F;
|
|
|
1734 if (sjis_p && c1 == charset_jisx0208)
|
|
|
1735 {
|
|
|
1736 unsigned char s1, s2;
|
|
|
1737
|
|
|
1738 ENCODE_SJIS (c2, c3, s1, s2);
|
|
|
1739 *dst++ = s1, *dst++ = s2;
|
|
|
1740 }
|
|
|
1741 else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
|
|
|
1742 {
|
|
|
1743 unsigned char b1, b2;
|
|
|
1744
|
|
|
1745 ENCODE_BIG5 (c1, c2, c3, b1, b2);
|
|
|
1746 *dst++ = b1, *dst++ = b2;
|
|
|
1747 }
|
|
|
1748 else
|
|
|
1749 *dst++ = c1, *dst++ = c2, *dst++ = c3;
|
|
|
1750 break;
|
|
|
1751
|
|
|
1752 case EMACS_leading_code_4:
|
|
|
1753 THREE_MORE_BYTES (c2, c3, c4);
|
|
|
1754 *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
|
|
|
1755 break;
|
|
|
1756
|
|
|
1757 case EMACS_leading_code_composition:
|
|
|
1758 coding->composing = 1;
|
|
|
1759 break;
|
|
|
1760
|
|
|
1761 default: /* i.e. case EMACS_invalid_code: */
|
|
|
1762 *dst++ = c1;
|
|
|
1763 }
|
|
|
1764 continue;
|
|
|
1765
|
|
|
1766 label_end_of_loop:
|
|
|
1767 coding->carryover_size = src - src_base;
|
|
|
1768 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
|
1769 src = src_base;
|
|
|
1770 break;
|
|
|
1771 }
|
|
|
1772
|
|
|
1773 *consumed = src - source;
|
|
|
1774 return dst - destination;
|
|
|
1775 }
|
|
|
1776
|
|
|
1777
|
|
|
1778 /*** 5. End-of-line handlers ***/
|
|
|
1779
|
|
|
1780 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
|
|
|
1781 This function is called only when `coding->eol_type' is
|
|
|
1782 CODING_EOL_CRLF or CODING_EOL_CR. */
|
|
|
1783
|
|
|
1784 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
|
1785 struct coding_system *coding;
|
|
|
1786 unsigned char *source, *destination;
|
|
|
1787 int src_bytes, dst_bytes;
|
|
|
1788 int *consumed;
|
|
|
1789 {
|
|
|
1790 unsigned char *src = source;
|
|
|
1791 unsigned char *src_end = source + src_bytes;
|
|
|
1792 unsigned char *dst = destination;
|
|
|
1793 unsigned char *dst_end = destination + dst_bytes;
|
|
|
1794 int produced;
|
|
|
1795
|
|
|
1796 switch (coding->eol_type)
|
|
|
1797 {
|
|
|
1798 case CODING_EOL_CRLF:
|
|
|
1799 {
|
|
|
1800 /* Since the maximum bytes produced by each loop is 2, we
|
|
|
1801 subtract 1 from DST_END to assure overflow checking is
|
|
|
1802 necessary only at the head of loop. */
|
|
|
1803 unsigned char *adjusted_dst_end = dst_end - 1;
|
|
|
1804
|
|
|
1805 while (src < src_end && dst < adjusted_dst_end)
|
|
|
1806 {
|
|
|
1807 unsigned char *src_base = src;
|
|
|
1808 unsigned char c = *src++;
|
|
|
1809 if (c == '\r')
|
|
|
1810 {
|
|
|
1811 ONE_MORE_BYTE (c);
|
|
|
1812 if (c != '\n')
|
|
|
1813 *dst++ = '\r';
|
|
|
1814
|
|
|
1815 }
|
|
|
1816 else
|
|
|
1817 *dst++ = c;
|
|
|
1818 continue;
|
|
|
1819
|
|
|
1820 label_end_of_loop:
|
|
|
1821 coding->carryover_size = src - src_base;
|
|
|
1822 bcopy (src_base, coding->carryover, coding->carryover_size);
|
|
|
1823 src = src_base;
|
|
|
1824 break;
|
|
|
1825 }
|
|
|
1826 *consumed = src - source;
|
|
|
1827 produced = dst - destination;
|
|
|
1828 break;
|
|
|
1829 }
|
|
|
1830
|
|
|
1831 case CODING_EOL_CR:
|
|
|
1832 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
|
1833 bcopy (source, destination, produced);
|
|
|
1834 dst_end = destination + produced;
|
|
|
1835 while (dst < dst_end)
|
|
|
1836 if (*dst++ == '\r') dst[-1] = '\n';
|
|
|
1837 *consumed = produced;
|
|
|
1838 break;
|
|
|
1839
|
|
|
1840 default: /* i.e. case: CODING_EOL_LF */
|
|
|
1841 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
|
1842 bcopy (source, destination, produced);
|
|
|
1843 *consumed = produced;
|
|
|
1844 break;
|
|
|
1845 }
|
|
|
1846
|
|
|
1847 return produced;
|
|
|
1848 }
|
|
|
1849
|
|
|
1850 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". Encode
|
|
|
1851 format of end-of-line according to `coding->eol_type'. If
|
|
|
1852 `coding->selective' is 1, code '\r' in source text also means
|
|
|
1853 end-of-line. */
|
|
|
1854
|
|
|
1855 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
|
1856 struct coding_system *coding;
|
|
|
1857 unsigned char *source, *destination;
|
|
|
1858 int src_bytes, dst_bytes;
|
|
|
1859 int *consumed;
|
|
|
1860 {
|
|
|
1861 unsigned char *src = source;
|
|
|
1862 unsigned char *dst = destination;
|
|
|
1863 int produced;
|
|
|
1864
|
|
|
1865 if (src_bytes <= 0)
|
|
|
1866 return 0;
|
|
|
1867
|
|
|
1868 switch (coding->eol_type)
|
|
|
1869 {
|
|
|
1870 case CODING_EOL_LF:
|
|
|
1871 case CODING_EOL_AUTOMATIC:
|
|
|
1872 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
|
1873 bcopy (source, destination, produced);
|
|
|
1874 if (coding->selective)
|
|
|
1875 {
|
|
|
1876 int i = produced;
|
|
|
1877 while (i--)
|
|
|
1878 if (*dst++ == '\r') dst[-1] = '\n';
|
|
|
1879 }
|
|
|
1880 *consumed = produced;
|
|
|
1881
|
|
|
1882 case CODING_EOL_CRLF:
|
|
|
1883 {
|
|
|
1884 unsigned char c;
|
|
|
1885 unsigned char *src_end = source + src_bytes;
|
|
|
1886 unsigned char *dst_end = destination + dst_bytes;
|
|
|
1887 /* Since the maximum bytes produced by each loop is 2, we
|
|
|
1888 subtract 1 from DST_END to assure overflow checking is
|
|
|
1889 necessary only at the head of loop. */
|
|
|
1890 unsigned char *adjusted_dst_end = dst_end - 1;
|
|
|
1891
|
|
|
1892 while (src < src_end && dst < adjusted_dst_end)
|
|
|
1893 {
|
|
|
1894 c = *src++;
|
|
|
1895 if (c == '\n' || (c == '\r' && coding->selective))
|
|
|
1896 *dst++ = '\r', *dst++ = '\n';
|
|
|
1897 else
|
|
|
1898 *dst++ = c;
|
|
|
1899 }
|
|
|
1900 produced = dst - destination;
|
|
|
1901 *consumed = src - source;
|
|
|
1902 break;
|
|
|
1903 }
|
|
|
1904
|
|
|
1905 default: /* i.e. case CODING_EOL_CR: */
|
|
|
1906 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
|
1907 bcopy (source, destination, produced);
|
|
|
1908 {
|
|
|
1909 int i = produced;
|
|
|
1910 while (i--)
|
|
|
1911 if (*dst++ == '\n') dst[-1] = '\r';
|
|
|
1912 }
|
|
|
1913 *consumed = produced;
|
|
|
1914 }
|
|
|
1915
|
|
|
1916 return produced;
|
|
|
1917 }
|
|
|
1918
|
|
|
1919
|
|
|
1920 /*** 6. C library functions ***/
|
|
|
1921
|
|
|
1922 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
|
|
|
1923 has a property `coding-system'. The value of this property is a
|
|
|
1924 vector of length 5 (called as coding-vector). Among elements of
|
|
|
1925 this vector, the first (element[0]) and the fifth (element[4])
|
|
|
1926 carry important information for decoding/encoding. Before
|
|
|
1927 decoding/encoding, this information should be set in fields of a
|
|
|
1928 structure of type `coding_system'.
|
|
|
1929
|
|
|
1930 A value of property `coding-system' can be a symbol of another
|
|
|
1931 subsidiary coding-system. In that case, Emacs gets coding-vector
|
|
|
1932 from that symbol.
|
|
|
1933
|
|
|
1934 `element[0]' contains information to be set in `coding->type'. The
|
|
|
1935 value and its meaning is as follows:
|
|
|
1936
|
|
|
1937 0 -- coding_type_internal
|
|
|
1938 1 -- coding_type_sjis
|
|
|
1939 2 -- coding_type_iso2022
|
|
|
1940 3 -- coding_type_big5
|
|
|
1941 4 -- coding_type_ccl
|
|
|
1942 nil -- coding_type_no_conversion
|
|
|
1943 t -- coding_type_automatic
|
|
|
1944
|
|
|
1945 `element[4]' contains information to be set in `coding->flags' and
|
|
|
1946 `coding->spec'. The meaning varies by `coding->type'.
|
|
|
1947
|
|
|
1948 If `coding->type' is `coding_type_iso2022', element[4] is a vector
|
|
|
1949 of length 32 (of which the first 13 sub-elements are used now).
|
|
|
1950 Meanings of these sub-elements are:
|
|
|
1951
|
|
|
1952 sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
|
|
|
1953 If the value is an integer of valid charset, the charset is
|
|
|
1954 assumed to be designated to graphic register N initially.
|
|
|
1955
|
|
|
1956 If the value is minus, it is a minus value of charset which
|
|
|
1957 reserves graphic register N, which means that the charset is
|
|
|
1958 not designated initially but should be designated to graphic
|
|
|
1959 register N just before encoding a character in that charset.
|
|
|
1960
|
|
|
1961 If the value is nil, graphic register N is never used on
|
|
|
1962 encoding.
|
|
|
1963
|
|
|
1964 sub-element[N] where N is 4 through 12: to be set in `coding->flags'
|
|
|
1965 Each value takes t or nil. See the section ISO2022 of
|
|
|
1966 `coding.h' for more information.
|
|
|
1967
|
|
|
1968 If `coding->type' is `coding_type_big5', element[4] is t to denote
|
|
|
1969 BIG5-ETen or nil to denote BIG5-HKU.
|
|
|
1970
|
|
|
1971 If `coding->type' takes the other value, element[4] is ignored.
|
|
|
1972
|
|
|
1973 Emacs Lisp's coding system also carries information about format of
|
|
|
1974 end-of-line in a value of property `eol-type'. If the value is
|
|
|
1975 integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
|
|
|
1976 means CODING_EOL_CR. If it is not integer, it should be a vector
|
|
|
1977 of subsidiary coding systems of which property `eol-type' has one
|
|
|
1978 of above values.
|
|
|
1979
|
|
|
1980 */
|
|
|
1981
|
|
|
1982 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
|
|
|
1983 and set it in CODING. If CODING_SYSTEM_SYMBOL is invalid, CODING
|
|
|
1984 is setup so that no conversion is necessary and return -1, else
|
|
|
1985 return 0. */
|
|
|
1986
|
|
|
1987 int
|
|
|
1988 setup_coding_system (coding_system_symbol, coding)
|
|
|
1989 Lisp_Object coding_system_symbol;
|
|
|
1990 struct coding_system *coding;
|
|
|
1991 {
|
|
|
1992 Lisp_Object coding_system_vector = Qnil;
|
|
|
1993 Lisp_Object type, eol_type;
|
|
|
1994
|
|
|
1995 /* At first, set several fields default values. */
|
|
|
1996 coding->require_flushing = 0;
|
|
|
1997 coding->last_block = 0;
|
|
|
1998 coding->selective = 0;
|
|
|
1999 coding->composing = 0;
|
|
|
2000 coding->direction = 0;
|
|
|
2001 coding->carryover_size = 0;
|
|
|
2002 coding->symbol = Qnil;
|
|
|
2003 coding->post_read_conversion = coding->pre_write_conversion = Qnil;
|
|
|
2004
|
|
|
2005 /* Get value of property `coding-system'. If it is a Lisp symbol
|
|
|
2006 pointing another coding system, fetch its property until we get a
|
|
|
2007 vector. */
|
|
|
2008 while (!NILP (coding_system_symbol))
|
|
|
2009 {
|
|
|
2010 coding->symbol = coding_system_symbol;
|
|
|
2011 if (NILP (coding->post_read_conversion))
|
|
|
2012 coding->post_read_conversion = Fget (coding_system_symbol,
|
|
|
2013 Qpost_read_conversion);
|
|
|
2014 if (NILP (coding->pre_write_conversion))
|
|
|
2015 coding->pre_write_conversion = Fget (coding_system_symbol,
|
|
|
2016 Qpre_write_conversion);
|
|
|
2017
|
|
|
2018 coding_system_vector = Fget (coding_system_symbol, Qcoding_system);
|
|
|
2019 if (VECTORP (coding_system_vector))
|
|
|
2020 break;
|
|
|
2021 coding_system_symbol = coding_system_vector;
|
|
|
2022 }
|
|
|
2023 Vlast_coding_system_used = coding->symbol;
|
|
|
2024
|
|
|
2025 if (!VECTORP (coding_system_vector)
|
|
|
2026 || XVECTOR (coding_system_vector)->size != 5)
|
|
|
2027 goto label_invalid_coding_system;
|
|
|
2028
|
|
|
2029 /* Get value of property `eol-type' by searching from the root
|
|
|
2030 coding-system. */
|
|
|
2031 coding_system_symbol = coding->symbol;
|
|
|
2032 eol_type = Qnil;
|
|
|
2033 while (SYMBOLP (coding_system_symbol) && !NILP (coding_system_symbol))
|
|
|
2034 {
|
|
|
2035 eol_type = Fget (coding_system_symbol, Qeol_type);
|
|
|
2036 if (!NILP (eol_type))
|
|
|
2037 break;
|
|
|
2038 coding_system_symbol = Fget (coding_system_symbol, Qcoding_system);
|
|
|
2039 }
|
|
|
2040
|
|
|
2041 if (VECTORP (eol_type))
|
|
|
2042 coding->eol_type = CODING_EOL_AUTOMATIC;
|
|
|
2043 else if (XFASTINT (eol_type) == 1)
|
|
|
2044 coding->eol_type = CODING_EOL_CRLF;
|
|
|
2045 else if (XFASTINT (eol_type) == 2)
|
|
|
2046 coding->eol_type = CODING_EOL_CR;
|
|
|
2047 else
|
|
|
2048 coding->eol_type = CODING_EOL_LF;
|
|
|
2049
|
|
|
2050 type = XVECTOR (coding_system_vector)->contents[0];
|
|
|
2051 switch (XFASTINT (type))
|
|
|
2052 {
|
|
|
2053 case 0:
|
|
|
2054 coding->type = coding_type_internal;
|
|
|
2055 break;
|
|
|
2056
|
|
|
2057 case 1:
|
|
|
2058 coding->type = coding_type_sjis;
|
|
|
2059 break;
|
|
|
2060
|
|
|
2061 case 2:
|
|
|
2062 coding->type = coding_type_iso2022;
|
|
|
2063 {
|
|
|
2064 Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
|
|
|
2065 Lisp_Object *flags;
|
|
|
2066 int i, charset, default_reg_bits = 0;
|
|
|
2067
|
|
|
2068 if (!VECTORP (val) || XVECTOR (val)->size != 32)
|
|
|
2069 goto label_invalid_coding_system;
|
|
|
2070
|
|
|
2071 flags = XVECTOR (val)->contents;
|
|
|
2072 coding->flags
|
|
|
2073 = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
|
|
|
2074 | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
|
|
|
2075 | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
|
|
|
2076 | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
|
|
|
2077 | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
|
|
|
2078 | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
|
|
|
2079 | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
|
|
|
2080 | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
|
|
|
2081 | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION));
|
|
|
2082
|
|
|
2083 /* Invoke graphic register 0 to plane 0. */
|
|
|
2084 CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
|
|
|
2085 /* Invoke graphic register 1 to plane 1 if we can use full 8-bit. */
|
|
|
2086 CODING_SPEC_ISO_INVOCATION (coding, 1)
|
|
|
2087 = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
|
|
|
2088 /* Not single shifting at first. */
|
|
|
2089 CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
|
|
|
2090
|
|
|
2091 /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
|
|
|
2092 FLAGS[REG] can be one of below:
|
|
|
2093 integer CHARSET: CHARSET occupies register I,
|
|
|
2094 t: designate nothing to REG initially, but can be used
|
|
|
2095 by any charsets,
|
|
|
2096 list of integer, nil, or t: designate the first
|
|
|
2097 element (if integer) to REG initially, the remaining
|
|
|
2098 elements (if integer) is designated to REG on request,
|
|
|
2099 if an element is t, REG can be used by any charset,
|
|
|
2100 nil: REG is never used. */
|
|
|
2101 for (charset = 0; charset < MAX_CHARSET; charset++)
|
|
|
2102 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
|
|
|
2103 for (i = 0; i < 4; i++)
|
|
|
2104 {
|
|
|
2105 if (INTEGERP (flags[i])
|
|
|
2106 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
|
|
|
2107 {
|
|
|
2108 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
|
|
|
2109 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
|
|
|
2110 }
|
|
|
2111 else if (EQ (flags[i], Qt))
|
|
|
2112 {
|
|
|
2113 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
|
|
|
2114 default_reg_bits |= 1 << i;
|
|
|
2115 }
|
|
|
2116 else if (CONSP (flags[i]))
|
|
|
2117 {
|
|
|
2118 Lisp_Object tail = flags[i];
|
|
|
2119
|
|
|
2120 if (INTEGERP (XCONS (tail)->car)
|
|
|
2121 && (charset = XINT (XCONS (tail)->car),
|
|
|
2122 CHARSET_VALID_P (charset)))
|
|
|
2123 {
|
|
|
2124 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
|
|
|
2125 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
|
|
|
2126 }
|
|
|
2127 else
|
|
|
2128 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
|
|
|
2129 tail = XCONS (tail)->cdr;
|
|
|
2130 while (CONSP (tail))
|
|
|
2131 {
|
|
|
2132 if (INTEGERP (XCONS (tail)->car)
|
|
|
2133 && (charset = XINT (XCONS (tail)->car),
|
|
|
2134 CHARSET_VALID_P (charset)))
|
|
|
2135 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
|
|
|
2136 = i;
|
|
|
2137 else if (EQ (XCONS (tail)->car, Qt))
|
|
|
2138 default_reg_bits |= 1 << i;
|
|
|
2139 tail = XCONS (tail)->cdr;
|
|
|
2140 }
|
|
|
2141 }
|
|
|
2142 else
|
|
|
2143 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
|
|
|
2144
|
|
|
2145 CODING_SPEC_ISO_DESIGNATION (coding, i)
|
|
|
2146 = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
|
|
|
2147 }
|
|
|
2148
|
|
|
2149 if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
|
|
|
2150 {
|
|
|
2151 /* REG 1 can be used only by locking shift in 7-bit env. */
|
|
|
2152 if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
|
|
|
2153 default_reg_bits &= ~2;
|
|
|
2154 if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
|
|
|
2155 /* Without any shifting, only REG 0 and 1 can be used. */
|
|
|
2156 default_reg_bits &= 3;
|
|
|
2157 }
|
|
|
2158
|
|
|
2159 for (charset = 0; charset < MAX_CHARSET; charset++)
|
|
|
2160 if (CHARSET_VALID_P (charset)
|
|
|
2161 && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
|
|
|
2162 {
|
|
|
2163 /* We have not yet decided where to designate CHARSET. */
|
|
|
2164 int reg_bits = default_reg_bits;
|
|
|
2165
|
|
|
2166 if (CHARSET_CHARS (charset) == 96)
|
|
|
2167 /* A charset of CHARS96 can't be designated to REG 0. */
|
|
|
2168 reg_bits &= ~1;
|
|
|
2169
|
|
|
2170 if (reg_bits)
|
|
|
2171 /* There exist some default graphic register. */
|
|
|
2172 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
|
|
|
2173 = (reg_bits & 1
|
|
|
2174 ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
|
|
|
2175 else
|
|
|
2176 /* We anyway have to designate CHARSET to somewhere. */
|
|
|
2177 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
|
|
|
2178 = (CHARSET_CHARS (charset) == 94
|
|
|
2179 ? 0
|
|
|
2180 : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
|
|
|
2181 || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
|
|
|
2182 ? 1
|
|
|
2183 : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
|
|
|
2184 ? 2 : 0)));
|
|
|
2185 }
|
|
|
2186 }
|
|
|
2187 coding->require_flushing = 1;
|
|
|
2188 break;
|
|
|
2189
|
|
|
2190 case 3:
|
|
|
2191 coding->type = coding_type_big5;
|
|
|
2192 coding->flags
|
|
|
2193 = (NILP (XVECTOR (coding_system_vector)->contents[4])
|
|
|
2194 ? CODING_FLAG_BIG5_HKU
|
|
|
2195 : CODING_FLAG_BIG5_ETEN);
|
|
|
2196 break;
|
|
|
2197
|
|
|
2198 case 4:
|
|
|
2199 coding->type = coding_type_ccl;
|
|
|
2200 {
|
|
|
2201 Lisp_Object val = XVECTOR (coding_system_vector)->contents[4];
|
|
|
2202 if (CONSP (val)
|
|
|
2203 && VECTORP (XCONS (val)->car)
|
|
|
2204 && VECTORP (XCONS (val)->cdr))
|
|
|
2205 {
|
|
|
2206 setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
|
|
|
2207 setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
|
|
|
2208 }
|
|
|
2209 else
|
|
|
2210 goto label_invalid_coding_system;
|
|
|
2211 }
|
|
|
2212 coding->require_flushing = 1;
|
|
|
2213 break;
|
|
|
2214
|
|
|
2215 default:
|
|
|
2216 if (EQ (type, Qt))
|
|
|
2217 coding->type = coding_type_automatic;
|
|
|
2218 else
|
|
|
2219 coding->type = coding_type_no_conversion;
|
|
|
2220 break;
|
|
|
2221 }
|
|
|
2222 return 0;
|
|
|
2223
|
|
|
2224 label_invalid_coding_system:
|
|
|
2225 coding->type = coding_type_no_conversion;
|
|
|
2226 return -1;
|
|
|
2227 }
|
|
|
2228
|
|
|
2229 /* Emacs has a mechanism to automatically detect a coding system if it
|
|
|
2230 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
|
|
|
2231 it's impossible to distinguish some coding systems accurately
|
|
|
2232 because they use the same range of codes. So, at first, coding
|
|
|
2233 systems are categorized into 8, those are:
|
|
|
2234
|
|
|
2235 o coding-category-internal
|
|
|
2236
|
|
|
2237 The category for a coding system which has the same code range
|
|
|
2238 as Emacs' internal format. Assigned the coding-system (Lisp
|
|
|
2239 symbol) `coding-system-internal' by default.
|
|
|
2240
|
|
|
2241 o coding-category-sjis
|
|
|
2242
|
|
|
2243 The category for a coding system which has the same code range
|
|
|
2244 as SJIS. Assigned the coding-system (Lisp
|
|
|
2245 symbol) `coding-system-sjis' by default.
|
|
|
2246
|
|
|
2247 o coding-category-iso-7
|
|
|
2248
|
|
|
2249 The category for a coding system which has the same code range
|
|
|
2250 as ISO2022 of 7-bit environment. Assigned the coding-system
|
|
|
2251 (Lisp symbol) `coding-system-junet' by default.
|
|
|
2252
|
|
|
2253 o coding-category-iso-8-1
|
|
|
2254
|
|
|
2255 The category for a coding system which has the same code range
|
|
|
2256 as ISO2022 of 8-bit environment and graphic plane 1 used only
|
|
|
2257 for DIMENSION1 charset. Assigned the coding-system (Lisp
|
|
|
2258 symbol) `coding-system-ctext' by default.
|
|
|
2259
|
|
|
2260 o coding-category-iso-8-2
|
|
|
2261
|
|
|
2262 The category for a coding system which has the same code range
|
|
|
2263 as ISO2022 of 8-bit environment and graphic plane 1 used only
|
|
|
2264 for DIMENSION2 charset. Assigned the coding-system (Lisp
|
|
|
2265 symbol) `coding-system-euc-japan' by default.
|
|
|
2266
|
|
|
2267 o coding-category-iso-else
|
|
|
2268
|
|
|
2269 The category for a coding system which has the same code range
|
|
|
2270 as ISO2022 but does not belong to any of the above three
|
|
|
2271 categories. Assigned the coding-system (Lisp symbol)
|
|
|
2272 `coding-system-iso-2022-ss2-7' by default.
|
|
|
2273
|
|
|
2274 o coding-category-big5
|
|
|
2275
|
|
|
2276 The category for a coding system which has the same code range
|
|
|
2277 as BIG5. Assigned the coding-system (Lisp symbol)
|
|
|
2278 `coding-system-big5' by default.
|
|
|
2279
|
|
|
2280 o coding-category-binary
|
|
|
2281
|
|
|
2282 The category for a coding system not categorized in any of the
|
|
|
2283 above. Assigned the coding-system (Lisp symbol)
|
|
|
2284 `coding-system-noconv' by default.
|
|
|
2285
|
|
|
2286 Each of them is a Lisp symbol and the value is an actual
|
|
|
2287 `coding-system's (this is also a Lisp symbol) assigned by a user.
|
|
|
2288 What Emacs does actually is to detect a category of coding system.
|
|
|
2289 Then, it uses a `coding-system' assigned to it. If Emacs can't
|
|
|
2290 decide only one possible category, it selects a category of the
|
|
|
2291 highest priority. Priorities of categories are also specified by a
|
|
|
2292 user in a Lisp variable `coding-category-list'.
|
|
|
2293
|
|
|
2294 */
|
|
|
2295
|
|
|
2296 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
|
|
|
2297 If it detects possible coding systems, return an integer in which
|
|
|
2298 appropriate flag bits are set. Flag bits are defined by macros
|
|
|
2299 CODING_CATEGORY_MASK_XXX in `coding.h'. */
|
|
|
2300
|
|
|
2301 int
|
|
|
2302 detect_coding_mask (src, src_bytes)
|
|
|
2303 unsigned char *src;
|
|
|
2304 int src_bytes;
|
|
|
2305 {
|
|
|
2306 register unsigned char c;
|
|
|
2307 unsigned char *src_end = src + src_bytes;
|
|
|
2308 int mask;
|
|
|
2309
|
|
|
2310 /* At first, skip all ASCII characters and control characters except
|
|
|
2311 for three ISO2022 specific control characters. */
|
|
|
2312 while (src < src_end)
|
|
|
2313 {
|
|
|
2314 c = *src;
|
|
|
2315 if (c >= 0x80
|
|
|
2316 || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
|
|
|
2317 break;
|
|
|
2318 src++;
|
|
|
2319 }
|
|
|
2320
|
|
|
2321 if (src >= src_end)
|
|
|
2322 /* We found nothing other than ASCII. There's nothing to do. */
|
|
|
2323 return CODING_CATEGORY_MASK_ANY;
|
|
|
2324
|
|
|
2325 /* The text seems to be encoded in some multilingual coding system.
|
|
|
2326 Now, try to find in which coding system the text is encoded. */
|
|
|
2327 if (c < 0x80)
|
|
|
2328 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
|
|
|
2329 /* C is an ISO2022 specific control code of C0. */
|
|
|
2330 mask = detect_coding_iso2022 (src, src_end);
|
|
|
2331
|
|
|
2332 else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
|
|
|
2333 /* C is an ISO2022 specific control code of C1,
|
|
|
2334 or the first byte of SJIS's 2-byte character code,
|
|
|
2335 or a leading code of Emacs. */
|
|
|
2336 mask = (detect_coding_iso2022 (src, src_end)
|
|
|
2337 | detect_coding_sjis (src, src_end)
|
|
|
2338 | detect_coding_internal (src, src_end));
|
|
|
2339
|
|
|
2340 else if (c < 0xA0)
|
|
|
2341 /* C is the first byte of SJIS character code,
|
|
|
2342 or a leading-code of Emacs. */
|
|
|
2343 mask = (detect_coding_sjis (src, src_end)
|
|
|
2344 | detect_coding_internal (src, src_end));
|
|
|
2345
|
|
|
2346 else
|
|
|
2347 /* C is a character of ISO2022 in graphic plane right,
|
|
|
2348 or a SJIS's 1-byte character code (i.e. JISX0201),
|
|
|
2349 or the first byte of BIG5's 2-byte code. */
|
|
|
2350 mask = (detect_coding_iso2022 (src, src_end)
|
|
|
2351 | detect_coding_sjis (src, src_end)
|
|
|
2352 | detect_coding_big5 (src, src_end));
|
|
|
2353
|
|
|
2354 return mask;
|
|
|
2355 }
|
|
|
2356
|
|
|
2357 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
|
|
|
2358 The information of the detected coding system is set in CODING. */
|
|
|
2359
|
|
|
2360 void
|
|
|
2361 detect_coding (coding, src, src_bytes)
|
|
|
2362 struct coding_system *coding;
|
|
|
2363 unsigned char *src;
|
|
|
2364 int src_bytes;
|
|
|
2365 {
|
|
|
2366 int mask = detect_coding_mask (src, src_bytes);
|
|
|
2367 int idx;
|
|
|
2368
|
|
|
2369 if (mask == CODING_CATEGORY_MASK_ANY)
|
|
|
2370 /* We found nothing other than ASCII. There's nothing to do. */
|
|
|
2371 return;
|
|
|
2372
|
|
|
2373 if (!mask)
|
|
|
2374 /* The source text seems to be encoded in unknown coding system.
|
|
|
2375 Emacs regards the category of such a kind of coding system as
|
|
|
2376 `coding-category-binary'. We assume that a user has assigned
|
|
|
2377 an appropriate coding system for a `coding-category-binary'. */
|
|
|
2378 idx = CODING_CATEGORY_IDX_BINARY;
|
|
|
2379 else
|
|
|
2380 {
|
|
|
2381 /* We found some plausible coding systems. Let's use a coding
|
|
|
2382 system of the highest priority. */
|
|
|
2383 Lisp_Object val = Vcoding_category_list;
|
|
|
2384
|
|
|
2385 if (CONSP (val))
|
|
|
2386 while (!NILP (val))
|
|
|
2387 {
|
|
|
2388 idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
|
|
|
2389 if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
|
|
|
2390 break;
|
|
|
2391 val = XCONS (val)->cdr;
|
|
|
2392 }
|
|
|
2393 else
|
|
|
2394 val = Qnil;
|
|
|
2395
|
|
|
2396 if (NILP (val))
|
|
|
2397 {
|
|
|
2398 /* For unknown reason, `Vcoding_category_list' contains none
|
|
|
2399 of found categories. Let's use any of them. */
|
|
|
2400 for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
|
|
|
2401 if (mask & (1 << idx))
|
|
|
2402 break;
|
|
|
2403 }
|
|
|
2404 }
|
|
|
2405 setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
|
|
|
2406 }
|
|
|
2407
|
|
|
2408 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
|
|
|
2409 is encoded. Return one of CODING_EOL_LF, CODING_EOL_CRLF,
|
|
|
2410 CODING_EOL_CR, and CODING_EOL_AUTOMATIC. */
|
|
|
2411
|
|
|
2412 int
|
|
|
2413 detect_eol_type (src, src_bytes)
|
|
|
2414 unsigned char *src;
|
|
|
2415 int src_bytes;
|
|
|
2416 {
|
|
|
2417 unsigned char *src_end = src + src_bytes;
|
|
|
2418 unsigned char c;
|
|
|
2419
|
|
|
2420 while (src < src_end)
|
|
|
2421 {
|
|
|
2422 c = *src++;
|
|
|
2423 if (c == '\n')
|
|
|
2424 return CODING_EOL_LF;
|
|
|
2425 else if (c == '\r')
|
|
|
2426 {
|
|
|
2427 if (src < src_end && *src == '\n')
|
|
|
2428 return CODING_EOL_CRLF;
|
|
|
2429 else
|
|
|
2430 return CODING_EOL_CR;
|
|
|
2431 }
|
|
|
2432 }
|
|
|
2433 return CODING_EOL_AUTOMATIC;
|
|
|
2434 }
|
|
|
2435
|
|
|
2436 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
|
|
|
2437 is encoded. If it detects an appropriate format of end-of-line, it
|
|
|
2438 sets the information in *CODING. */
|
|
|
2439
|
|
|
2440 void
|
|
|
2441 detect_eol (coding, src, src_bytes)
|
|
|
2442 struct coding_system *coding;
|
|
|
2443 unsigned char *src;
|
|
|
2444 int src_bytes;
|
|
|
2445 {
|
|
|
2446 Lisp_Object val;
|
|
|
2447 int eol_type = detect_eol_type (src, src_bytes);
|
|
|
2448
|
|
|
2449 if (eol_type == CODING_EOL_AUTOMATIC)
|
|
|
2450 /* We found no end-of-line in the source text. */
|
|
|
2451 return;
|
|
|
2452
|
|
|
2453 val = Fget (coding->symbol, Qeol_type);
|
|
|
2454 if (VECTORP (val) && XVECTOR (val)->size == 3)
|
|
|
2455 setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
|
|
|
2456 }
|
|
|
2457
|
|
|
2458 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions". Before
|
|
|
2459 decoding, it may detect coding system and format of end-of-line if
|
|
|
2460 those are not yet decided. */
|
|
|
2461
|
|
|
2462 int
|
|
|
2463 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
|
2464 struct coding_system *coding;
|
|
|
2465 unsigned char *source, *destination;
|
|
|
2466 int src_bytes, dst_bytes;
|
|
|
2467 int *consumed;
|
|
|
2468 {
|
|
|
2469 int produced;
|
|
|
2470
|
|
|
2471 if (src_bytes <= 0)
|
|
|
2472 {
|
|
|
2473 *consumed = 0;
|
|
|
2474 return 0;
|
|
|
2475 }
|
|
|
2476
|
|
|
2477 if (coding->type == coding_type_automatic)
|
|
|
2478 detect_coding (coding, source, src_bytes);
|
|
|
2479
|
|
|
2480 if (coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
|
2481 detect_eol (coding, source, src_bytes);
|
|
|
2482
|
|
|
2483 coding->carryover_size = 0;
|
|
|
2484 switch (coding->type)
|
|
|
2485 {
|
|
|
2486 case coding_type_no_conversion:
|
|
|
2487 label_no_conversion:
|
|
|
2488 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
|
2489 bcopy (source, destination, produced);
|
|
|
2490 *consumed = produced;
|
|
|
2491 break;
|
|
|
2492
|
|
|
2493 case coding_type_internal:
|
|
|
2494 case coding_type_automatic:
|
|
|
2495 if (coding->eol_type == CODING_EOL_LF
|
|
|
2496 || coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
|
2497 goto label_no_conversion;
|
|
|
2498 produced = decode_eol (coding, source, destination,
|
|
|
2499 src_bytes, dst_bytes, consumed);
|
|
|
2500 break;
|
|
|
2501
|
|
|
2502 case coding_type_sjis:
|
|
|
2503 produced = decode_coding_sjis_big5 (coding, source, destination,
|
|
|
2504 src_bytes, dst_bytes, consumed,
|
|
|
2505 1);
|
|
|
2506 break;
|
|
|
2507
|
|
|
2508 case coding_type_iso2022:
|
|
|
2509 produced = decode_coding_iso2022 (coding, source, destination,
|
|
|
2510 src_bytes, dst_bytes, consumed);
|
|
|
2511 break;
|
|
|
2512
|
|
|
2513 case coding_type_big5:
|
|
|
2514 produced = decode_coding_sjis_big5 (coding, source, destination,
|
|
|
2515 src_bytes, dst_bytes, consumed,
|
|
|
2516 0);
|
|
|
2517 break;
|
|
|
2518
|
|
|
2519 case coding_type_ccl:
|
|
|
2520 produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
|
|
|
2521 src_bytes, dst_bytes, consumed);
|
|
|
2522 break;
|
|
|
2523 }
|
|
|
2524
|
|
|
2525 return produced;
|
|
|
2526 }
|
|
|
2527
|
|
|
2528 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions". */
|
|
|
2529
|
|
|
2530 int
|
|
|
2531 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
|
|
|
2532 struct coding_system *coding;
|
|
|
2533 unsigned char *source, *destination;
|
|
|
2534 int src_bytes, dst_bytes;
|
|
|
2535 int *consumed;
|
|
|
2536 {
|
|
|
2537 int produced;
|
|
|
2538
|
|
|
2539 coding->carryover_size = 0;
|
|
|
2540 switch (coding->type)
|
|
|
2541 {
|
|
|
2542 case coding_type_no_conversion:
|
|
|
2543 label_no_conversion:
|
|
|
2544 produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
|
|
|
2545 if (produced > 0)
|
|
|
2546 {
|
|
|
2547 bcopy (source, destination, produced);
|
|
|
2548 if (coding->selective)
|
|
|
2549 {
|
|
|
2550 unsigned char *p = destination, *pend = destination + produced;
|
|
|
2551 while (p < pend)
|
|
|
2552 if (*p++ = '\015') p[-1] = '\n';
|
|
|
2553 }
|
|
|
2554 }
|
|
|
2555 *consumed = produced;
|
|
|
2556 break;
|
|
|
2557
|
|
|
2558 case coding_type_internal:
|
|
|
2559 case coding_type_automatic:
|
|
|
2560 if (coding->eol_type == CODING_EOL_LF
|
|
|
2561 || coding->eol_type == CODING_EOL_AUTOMATIC)
|
|
|
2562 goto label_no_conversion;
|
|
|
2563 produced = encode_eol (coding, source, destination,
|
|
|
2564 src_bytes, dst_bytes, consumed);
|
|
|
2565 break;
|
|
|
2566
|
|
|
2567 case coding_type_sjis:
|
|
|
2568 produced = encode_coding_sjis_big5 (coding, source, destination,
|
|
|
2569 src_bytes, dst_bytes, consumed,
|
|
|
2570 1);
|
|
|
2571 break;
|
|
|
2572
|
|
|
2573 case coding_type_iso2022:
|
|
|
2574 produced = encode_coding_iso2022 (coding, source, destination,
|
|
|
2575 src_bytes, dst_bytes, consumed);
|
|
|
2576 break;
|
|
|
2577
|
|
|
2578 case coding_type_big5:
|
|
|
2579 produced = encode_coding_sjis_big5 (coding, source, destination,
|
|
|
2580 src_bytes, dst_bytes, consumed,
|
|
|
2581 0);
|
|
|
2582 break;
|
|
|
2583
|
|
|
2584 case coding_type_ccl:
|
|
|
2585 produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
|
|
|
2586 src_bytes, dst_bytes, consumed);
|
|
|
2587 break;
|
|
|
2588 }
|
|
|
2589
|
|
|
2590 return produced;
|
|
|
2591 }
|
|
|
2592
|
|
|
2593 #define CONVERSION_BUFFER_EXTRA_ROOM 256
|
|
|
2594
|
|
|
2595 /* Return maximum size (bytes) of a buffer enough for decoding
|
|
|
2596 SRC_BYTES of text encoded in CODING. */
|
|
|
2597
|
|
|
2598 int
|
|
|
2599 decoding_buffer_size (coding, src_bytes)
|
|
|
2600 struct coding_system *coding;
|
|
|
2601 int src_bytes;
|
|
|
2602 {
|
|
|
2603 int magnification;
|
|
|
2604
|
|
|
2605 if (coding->type == coding_type_iso2022)
|
|
|
2606 magnification = 3;
|
|
|
2607 else if (coding->type == coding_type_ccl)
|
|
|
2608 magnification = coding->spec.ccl.decoder.buf_magnification;
|
|
|
2609 else
|
|
|
2610 magnification = 2;
|
|
|
2611
|
|
|
2612 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
|
|
|
2613 }
|
|
|
2614
|
|
|
2615 /* Return maximum size (bytes) of a buffer enough for encoding
|
|
|
2616 SRC_BYTES of text to CODING. */
|
|
|
2617
|
|
|
2618 int
|
|
|
2619 encoding_buffer_size (coding, src_bytes)
|
|
|
2620 struct coding_system *coding;
|
|
|
2621 int src_bytes;
|
|
|
2622 {
|
|
|
2623 int magnification;
|
|
|
2624
|
|
|
2625 if (coding->type == coding_type_ccl)
|
|
|
2626 magnification = coding->spec.ccl.encoder.buf_magnification;
|
|
|
2627 else
|
|
|
2628 magnification = 3;
|
|
|
2629
|
|
|
2630 return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
|
|
|
2631 }
|
|
|
2632
|
|
|
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Work buffer handed out by get_conversion_buffer, and its current
   size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a work buffer of at least SIZE bytes to be used
   for encoding or decoding.  The buffer is shared between calls.
   When it is too small, a larger area is allocated with xmalloc and
   the old one is released with xfree; the old contents are not
   copied.  The new size is found by doubling the current size until
   it reaches SIZE.

   NOTE(review): this comment used to promise a NULL return when
   memory runs out, but the value of xmalloc is never checked here;
   presumably xmalloc deals with exhaustion itself -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling zero never makes progress, so start from the minimum
         size when nothing has been allocated yet.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size)
        real_size *= 2;

      /* Allocate first, so that the old buffer is still in place
         while xmalloc runs.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
|
|
|
2661
|
|
|
2662
|
|
|
2663 #ifdef emacs
|
|
|
2664 /*** 7. Emacs Lisp library functions ***/
|
|
|
2665
|
|
|
2666 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
|
|
|
2667 1, 1, 0,
|
|
|
2668 "Return coding-vector of CODING-SYSTEM.\n\
|
|
|
2669 If CODING-SYSTEM is not a valid coding-system, return nil.")
|
|
|
2670 (obj)
|
|
|
2671 Lisp_Object obj;
|
|
|
2672 {
|
|
|
2673 while (SYMBOLP (obj) && !NILP (obj))
|
|
|
2674 obj = Fget (obj, Qcoding_system);
|
|
|
2675 return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
|
|
|
2676 ? Qnil : obj);
|
|
|
2677 }
|
|
|
2678
|
|
|
2679 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
|
|
|
2680 "Return t if OBJECT is nil or a coding-system.\n\
|
|
|
2681 See document of make-coding-system for coding-system object.")
|
|
|
2682 (obj)
|
|
|
2683 Lisp_Object obj;
|
|
|
2684 {
|
|
|
2685 return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
|
|
|
2686 }
|
|
|
2687
|
|
|
2688 DEFUN ("read-non-nil-coding-system",
|
|
|
2689 Fread_non_nil_coding_system, Sread_non_nil_coding_system, 1, 1, 0,
|
|
|
2690 "Read a coding-system from the minibuffer, prompting with string PROMPT.")
|
|
|
2691 (prompt)
|
|
|
2692 Lisp_Object prompt;
|
|
|
2693 {
|
|
|
2694 return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
|
|
|
2695 Qt, Qnil, Qnil),
|
|
|
2696 Qnil);
|
|
|
2697 }
|
|
|
2698
|
|
|
2699 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
|
|
|
2700 "Read a coding-system or nil from the minibuffer, prompting with string PROMPT.")
|
|
|
2701 (prompt)
|
|
|
2702 Lisp_Object prompt;
|
|
|
2703 {
|
|
|
2704 return Fintern (Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
|
|
|
2705 Qt, Qnil, Qnil),
|
|
|
2706 Qnil);
|
|
|
2707 }
|
|
|
2708
|
|
|
2709 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
|
|
|
2710 1, 1, 0,
|
|
|
2711 "Check validity of CODING-SYSTEM.\n\
|
|
|
2712 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
|
|
|
2713 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
|
|
|
2714 The value of property should be a vector of length 5.")
|
|
|
2715 (coding_system)
|
|
|
2716 Lisp_Object coding_system;
|
|
|
2717 {
|
|
|
2718 CHECK_SYMBOL (coding_system, 0);
|
|
|
2719 if (!NILP (Fcoding_system_p (coding_system)))
|
|
|
2720 return coding_system;
|
|
|
2721 while (1)
|
|
|
2722 Fsignal (Qcoding_system_error, coding_system);
|
|
|
2723 }
|
|
|
2724
|
|
|
2725 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
|
|
|
2726 2, 2, 0,
|
|
|
2727 "Detect coding-system of the text in the region between START and END.\n\
|
|
|
2728 Return a list of possible coding-systems ordered by priority.\n\
|
|
|
2729 If only ASCII characters are found, it returns `coding-system-automatic'\n\
|
|
|
2730 or its subsidiary coding-system according to a detected end-of-line format.")
|
|
|
2731 (b, e)
|
|
|
2732 Lisp_Object b, e;
|
|
|
2733 {
|
|
|
2734 int coding_mask, eol_type;
|
|
|
2735 Lisp_Object val;
|
|
|
2736 int beg, end;
|
|
|
2737
|
|
|
2738 validate_region (&b, &e);
|
|
|
2739 beg = XINT (b), end = XINT (e);
|
|
|
2740 if (beg < GPT && end >= GPT) move_gap (end);
|
|
|
2741
|
|
|
2742 coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
|
|
|
2743 eol_type = detect_eol_type (POS_ADDR (beg), end - beg);
|
|
|
2744
|
|
|
2745 if (coding_mask == CODING_CATEGORY_MASK_ANY)
|
|
|
2746 {
|
|
|
2747 val = intern ("coding-system-automatic");
|
|
|
2748 if (eol_type != CODING_EOL_AUTOMATIC)
|
|
|
2749 {
|
|
|
2750 Lisp_Object val2 = Fget (val, Qeol_type);
|
|
|
2751 if (VECTORP (val2))
|
|
|
2752 val = XVECTOR (val2)->contents[eol_type];
|
|
|
2753 }
|
|
|
2754 }
|
|
|
2755 else
|
|
|
2756 {
|
|
|
2757 Lisp_Object val2;
|
|
|
2758
|
|
|
2759 /* At first, gather possible coding-systems in VAL in a reverse
|
|
|
2760 order. */
|
|
|
2761 val = Qnil;
|
|
|
2762 for (val2 = Vcoding_category_list;
|
|
|
2763 !NILP (val2);
|
|
|
2764 val2 = XCONS (val2)->cdr)
|
|
|
2765 {
|
|
|
2766 int idx
|
|
|
2767 = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
|
|
|
2768 if (coding_mask & (1 << idx))
|
|
|
2769 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
|
|
|
2770 }
|
|
|
2771
|
|
|
2772 /* Then, change the order of the list, while getting subsidiary
|
|
|
2773 coding-systems. */
|
|
|
2774 val2 = val;
|
|
|
2775 val = Qnil;
|
|
|
2776 for (; !NILP (val2); val2 = XCONS (val2)->cdr)
|
|
|
2777 {
|
|
|
2778 if (eol_type == CODING_EOL_AUTOMATIC)
|
|
|
2779 val = Fcons (XCONS (val2)->car, val);
|
|
|
2780 else
|
|
|
2781 {
|
|
|
2782 Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
|
|
|
2783 if (VECTORP (val3))
|
|
|
2784 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
|
|
|
2785 else
|
|
|
2786 val = Fcons (XCONS (val2)->car, val);
|
|
|
2787 }
|
|
|
2788 }
|
|
|
2789 }
|
|
|
2790
|
|
|
2791 return val;
|
|
|
2792 }
|
|
|
2793
|
|
|
2794 /* Scan text in the region between *BEGP and *ENDP, skip characters
|
|
|
2795 which we never have to encode to (iff ENCODEP is 1) or decode from
|
|
|
2796 coding system CODING at the head and tail, then set BEGP and ENDP
|
|
|
2797 to the addresses of start and end of the text we actually convert. */
|
|
|
2798
|
|
|
2799 void
|
|
|
2800 shrink_conversion_area (begp, endp, coding, encodep)
|
|
|
2801 unsigned char **begp, **endp;
|
|
|
2802 struct coding_system *coding;
|
|
|
2803 int encodep;
|
|
|
2804 {
|
|
|
2805 register unsigned char *beg_addr = *begp, *end_addr = *endp;
|
|
|
2806
|
|
|
2807 if (coding->eol_type != CODING_EOL_LF
|
|
|
2808 && coding->eol_type != CODING_EOL_AUTOMATIC)
|
|
|
2809 /* Since we anyway have to convert end-of-line format, it is not
|
|
|
2810 worth skipping at most 100 bytes or so. */
|
|
|
2811 return;
|
|
|
2812
|
|
|
2813 if (encodep) /* for encoding */
|
|
|
2814 {
|
|
|
2815 switch (coding->type)
|
|
|
2816 {
|
|
|
2817 case coding_type_no_conversion:
|
|
|
2818 case coding_type_internal:
|
|
|
2819 case coding_type_automatic:
|
|
|
2820 /* We need no conversion. */
|
|
|
2821 *begp = *endp;
|
|
|
2822 return;
|
|
|
2823 case coding_type_ccl:
|
|
|
2824 /* We can't skip any data. */
|
|
|
2825 return;
|
|
|
2826 default:
|
|
|
2827 /* We can skip all ASCII characters at the head and tail. */
|
|
|
2828 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
|
|
|
2829 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
|
|
|
2830 break;
|
|
|
2831 }
|
|
|
2832 }
|
|
|
2833 else /* for decoding */
|
|
|
2834 {
|
|
|
2835 switch (coding->type)
|
|
|
2836 {
|
|
|
2837 case coding_type_no_conversion:
|
|
|
2838 /* We need no conversion. */
|
|
|
2839 *begp = *endp;
|
|
|
2840 return;
|
|
|
2841 case coding_type_internal:
|
|
|
2842 if (coding->eol_type == CODING_EOL_LF)
|
|
|
2843 {
|
|
|
2844 /* We need no conversion. */
|
|
|
2845 *begp = *endp;
|
|
|
2846 return;
|
|
|
2847 }
|
|
|
2848 /* We can skip all but carriage-return. */
|
|
|
2849 while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
|
|
|
2850 while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
|
|
|
2851 break;
|
|
|
2852 case coding_type_sjis:
|
|
|
2853 case coding_type_big5:
|
|
|
2854 /* We can skip all ASCII characters at the head. */
|
|
|
2855 while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
|
|
|
2856 /* We can skip all ASCII characters at the tail except for
|
|
|
2857 the second byte of SJIS or BIG5 code. */
|
|
|
2858 while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
|
|
|
2859 if (end_addr != *endp)
|
|
|
2860 end_addr++;
|
|
|
2861 break;
|
|
|
2862 case coding_type_ccl:
|
|
|
2863 /* We can't skip any data. */
|
|
|
2864 return;
|
|
|
2865 default: /* i.e. case coding_type_iso2022: */
|
|
|
2866 {
|
|
|
2867 unsigned char c;
|
|
|
2868
|
|
|
2869 /* We can skip all ASCII characters except for a few
|
|
|
2870 control codes at the head. */
|
|
|
2871 while (beg_addr < end_addr && (c = *beg_addr) < 0x80
|
|
|
2872 && c != ISO_CODE_CR && c != ISO_CODE_SO
|
|
|
2873 && c != ISO_CODE_SI && c != ISO_CODE_ESC)
|
|
|
2874 beg_addr++;
|
|
|
2875 }
|
|
|
2876 break;
|
|
|
2877 }
|
|
|
2878 }
|
|
|
2879 *begp = beg_addr;
|
|
|
2880 *endp = end_addr;
|
|
|
2881 return;
|
|
|
2882 }
|
|
|
2883
|
|
|
2884 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
|
|
|
2885 text between B and E. B and E are buffer position. */
|
|
|
2886
|
|
|
2887 Lisp_Object
|
|
|
2888 code_convert_region (b, e, coding, encodep)
|
|
|
2889 Lisp_Object b, e;
|
|
|
2890 struct coding_system *coding;
|
|
|
2891 int encodep;
|
|
|
2892 {
|
|
|
2893 int beg, end, len, consumed, produced;
|
|
|
2894 char *buf;
|
|
|
2895 unsigned char *begp, *endp;
|
|
|
2896 int pos = PT;
|
|
|
2897
|
|
|
2898 validate_region (&b, &e);
|
|
|
2899 beg = XINT (b), end = XINT (e);
|
|
|
2900 if (beg < GPT && end >= GPT)
|
|
|
2901 move_gap (end);
|
|
|
2902
|
|
|
2903 if (encodep && !NILP (coding->pre_write_conversion))
|
|
|
2904 {
|
|
|
2905 /* We must call a pre-conversion function which may put a new
|
|
|
2906 text to be converted in a new buffer. */
|
|
|
2907 struct buffer *old = current_buffer, *new;
|
|
|
2908
|
|
|
2909 TEMP_SET_PT (beg);
|
|
|
2910 call2 (coding->pre_write_conversion, b, e);
|
|
|
2911 if (old != current_buffer)
|
|
|
2912 {
|
|
|
2913 /* Replace the original text by the text just generated. */
|
|
|
2914 len = ZV - BEGV;
|
|
|
2915 new = current_buffer;
|
|
|
2916 set_buffer_internal (old);
|
|
|
2917 del_range (beg, end);
|
|
|
2918 insert_from_buffer (new, 1, len, 0);
|
|
|
2919 end = beg + len;
|
|
|
2920 }
|
|
|
2921 }
|
|
|
2922
|
|
|
2923 /* We may be able to shrink the conversion region. */
|
|
|
2924 begp = POS_ADDR (beg); endp = begp + (end - beg);
|
|
|
2925 shrink_conversion_area (&begp, &endp, coding, encodep);
|
|
|
2926
|
|
|
2927 if (begp == endp)
|
|
|
2928 /* We need no conversion. */
|
|
|
2929 len = end - beg;
|
|
|
2930 else
|
|
|
2931 {
|
|
|
2932 beg += begp - POS_ADDR (beg);
|
|
|
2933 end = beg + (endp - begp);
|
|
|
2934
|
|
|
2935 if (encodep)
|
|
|
2936 len = encoding_buffer_size (coding, end - beg);
|
|
|
2937 else
|
|
|
2938 len = decoding_buffer_size (coding, end - beg);
|
|
|
2939 buf = get_conversion_buffer (len);
|
|
|
2940
|
|
|
2941 coding->last_block = 1;
|
|
|
2942 produced = (encodep
|
|
|
2943 ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
|
|
|
2944 &consumed)
|
|
|
2945 : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
|
|
|
2946 &consumed));
|
|
|
2947
|
|
|
2948 len = produced + (beg - XINT (b)) + (XINT (e) - end);
|
|
|
2949
|
|
|
2950 TEMP_SET_PT (beg);
|
|
|
2951 insert (buf, produced);
|
|
|
2952 del_range (PT, PT + end - beg);
|
|
|
2953 if (pos >= end)
|
|
|
2954 pos = PT + (pos - end);
|
|
|
2955 else if (pos > beg)
|
|
|
2956 pos = beg;
|
|
|
2957 TEMP_SET_PT (pos);
|
|
|
2958 }
|
|
|
2959
|
|
|
2960 if (!encodep && !NILP (coding->post_read_conversion))
|
|
|
2961 {
|
|
|
2962 /* We must call a post-conversion function which may alter
|
|
|
2963 the text just converted. */
|
|
|
2964 Lisp_Object insval;
|
|
|
2965
|
|
|
2966 beg = XINT (b);
|
|
|
2967 TEMP_SET_PT (beg);
|
|
|
2968 insval = call1 (coding->post_read_conversion, make_number (len));
|
|
|
2969 CHECK_NUMBER (insval, 0);
|
|
|
2970 len = XINT (insval);
|
|
|
2971 }
|
|
|
2972
|
|
|
2973 return make_number (len);
|
|
|
2974 }
|
|
|
2975
|
|
|
2976 Lisp_Object
|
|
|
2977 code_convert_string (str, coding, encodep)
|
|
|
2978 Lisp_Object str;
|
|
|
2979 struct coding_system *coding;
|
|
|
2980 int encodep;
|
|
|
2981 {
|
|
|
2982 int len, consumed, produced;
|
|
|
2983 char *buf;
|
|
|
2984 unsigned char *begp, *endp;
|
|
|
2985 int head_skip, tail_skip;
|
|
|
2986 struct gcpro gcpro1;
|
|
|
2987
|
|
|
2988 if (encodep && !NILP (coding->pre_write_conversion)
|
|
|
2989 || !encodep && !NILP (coding->post_read_conversion))
|
|
|
2990 {
|
|
|
2991 /* Since we have to call Lisp functions which assume target text
|
|
|
2992 is in a buffer, after setting a temporary buffer, call
|
|
|
2993 code_convert_region. */
|
|
|
2994 int count = specpdl_ptr - specpdl;
|
|
|
2995 int len = XSTRING (str)->size;
|
|
|
2996 Lisp_Object result;
|
|
|
2997 struct buffer *old = current_buffer;
|
|
|
2998
|
|
|
2999 record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
|
|
|
3000 temp_output_buffer_setup (" *code-converting-work*");
|
|
|
3001 set_buffer_internal (XBUFFER (Vstandard_output));
|
|
|
3002 insert_from_string (str, 0, len, 0);
|
|
|
3003 code_convert_region (make_number (BEGV), make_number (ZV),
|
|
|
3004 coding, encodep);
|
|
|
3005 result = make_buffer_string (BEGV, ZV, 0);
|
|
|
3006 set_buffer_internal (old);
|
|
|
3007 return unbind_to (count, result);
|
|
|
3008 }
|
|
|
3009
|
|
|
3010 /* We may be able to shrink the conversion region. */
|
|
|
3011 begp = XSTRING (str)->data;
|
|
|
3012 endp = begp + XSTRING (str)->size;
|
|
|
3013 shrink_conversion_area (&begp, &endp, coding, encodep);
|
|
|
3014
|
|
|
3015 if (begp == endp)
|
|
|
3016 /* We need no conversion. */
|
|
|
3017 return str;
|
|
|
3018
|
|
|
3019 head_skip = begp - XSTRING (str)->data;
|
|
|
3020 tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
|
|
|
3021
|
|
|
3022 GCPRO1 (str);
|
|
|
3023
|
|
|
3024 if (encodep)
|
|
|
3025 len = encoding_buffer_size (coding, endp - begp);
|
|
|
3026 else
|
|
|
3027 len = decoding_buffer_size (coding, endp - begp);
|
|
|
3028 buf = get_conversion_buffer (len + head_skip + tail_skip);
|
|
|
3029
|
|
|
3030 bcopy (XSTRING (str)->data, buf, head_skip);
|
|
|
3031 coding->last_block = 1;
|
|
|
3032 produced = (encodep
|
|
|
3033 ? encode_coding (coding, XSTRING (str)->data + head_skip,
|
|
|
3034 buf + head_skip, endp - begp, len, &consumed)
|
|
|
3035 : decode_coding (coding, XSTRING (str)->data + head_skip,
|
|
|
3036 buf + head_skip, endp - begp, len, &consumed));
|
|
|
3037 bcopy (XSTRING (str)->data + head_skip + (endp - begp),
|
|
|
3038 buf + head_skip + produced,
|
|
|
3039 tail_skip);
|
|
|
3040
|
|
|
3041 UNGCPRO;
|
|
|
3042
|
|
|
3043 return make_string (buf, head_skip + produced + tail_skip);
|
|
|
3044 }
|
|
|
3045
|
|
|
3046 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
|
|
|
3047 3, 3, 0,
|
|
|
3048 "Decode the text between START and END which is encoded in CODING-SYSTEM.\n\
|
|
|
3049 Return length of decoded text.")
|
|
|
3050 (b, e, coding_system)
|
|
|
3051 Lisp_Object b, e, coding_system;
|
|
|
3052 {
|
|
|
3053 struct coding_system coding;
|
|
|
3054
|
|
|
3055 CHECK_NUMBER_COERCE_MARKER (b, 0);
|
|
|
3056 CHECK_NUMBER_COERCE_MARKER (e, 1);
|
|
|
3057 CHECK_SYMBOL (coding_system, 2);
|
|
|
3058
|
|
|
3059 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
|
|
|
3060 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
|
|
|
3061
|
|
|
3062 return code_convert_region (b, e, &coding, 0);
|
|
|
3063 }
|
|
|
3064
|
|
|
3065 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
|
|
|
3066 3, 3, 0,
|
|
|
3067 "Encode the text between START and END to CODING-SYSTEM.\n\
|
|
|
3068 Return length of encoded text.")
|
|
|
3069 (b, e, coding_system)
|
|
|
3070 Lisp_Object b, e, coding_system;
|
|
|
3071 {
|
|
|
3072 struct coding_system coding;
|
|
|
3073
|
|
|
3074 CHECK_NUMBER_COERCE_MARKER (b, 0);
|
|
|
3075 CHECK_NUMBER_COERCE_MARKER (e, 1);
|
|
|
3076 CHECK_SYMBOL (coding_system, 2);
|
|
|
3077
|
|
|
3078 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
|
|
|
3079 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
|
|
|
3080
|
|
|
3081 return code_convert_region (b, e, &coding, 1);
|
|
|
3082 }
|
|
|
3083
|
|
|
3084 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
|
|
|
3085 2, 2, 0,
|
|
|
3086 "Decode STRING which is encoded in CODING-SYSTEM, and return the result.")
|
|
|
3087 (string, coding_system)
|
|
|
3088 Lisp_Object string, coding_system;
|
|
|
3089 {
|
|
|
3090 struct coding_system coding;
|
|
|
3091
|
|
|
3092 CHECK_STRING (string, 0);
|
|
|
3093 CHECK_SYMBOL (coding_system, 1);
|
|
|
3094
|
|
|
3095 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
|
|
|
3096 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
|
|
|
3097
|
|
|
3098 return code_convert_string (string, &coding, 0);
|
|
|
3099 }
|
|
|
3100
|
|
|
3101 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
|
|
|
3102 2, 2, 0,
|
|
|
3103 "Encode STRING to CODING-SYSTEM, and return the result.")
|
|
|
3104 (string, coding_system)
|
|
|
3105 Lisp_Object string, coding_system;
|
|
|
3106 {
|
|
|
3107 struct coding_system coding;
|
|
|
3108
|
|
|
3109 CHECK_STRING (string, 0);
|
|
|
3110 CHECK_SYMBOL (coding_system, 1);
|
|
|
3111
|
|
|
3112 if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
|
|
|
3113 error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
|
|
|
3114
|
|
|
3115 return code_convert_string (string, &coding, 1);
|
|
|
3116 }
|
|
|
3117
|
|
|
3118 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
|
|
|
3119 "Decode a JISX0208 character of SJIS coding-system-sjis.\n\
|
|
|
3120 CODE is the character code in SJIS.\n\
|
|
|
3121 Return the corresponding character.")
|
|
|
3122 (code)
|
|
|
3123 Lisp_Object code;
|
|
|
3124 {
|
|
|
3125 unsigned char c1, c2, s1, s2;
|
|
|
3126 Lisp_Object val;
|
|
|
3127
|
|
|
3128 CHECK_NUMBER (code, 0);
|
|
|
3129 s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
|
|
|
3130 DECODE_SJIS (s1, s2, c1, c2);
|
|
|
3131 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
|
|
|
3132 return val;
|
|
|
3133 }
|
|
|
3134
|
|
|
3135 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
|
|
|
3136 "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
|
|
|
3137 Return the corresponding character code in SJIS.")
|
|
|
3138 (ch)
|
|
|
3139 Lisp_Object ch;
|
|
|
3140 {
|
|
|
3141 int charset;
|
|
|
3142 unsigned char c1, c2, s1, s2;
|
|
|
3143 Lisp_Object val;
|
|
|
3144
|
|
|
3145 CHECK_NUMBER (ch, 0);
|
|
|
3146 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
|
|
|
3147 if (charset == charset_jisx0208)
|
|
|
3148 {
|
|
|
3149 ENCODE_SJIS (c1, c2, s1, s2);
|
|
|
3150 XSETFASTINT (val, ((int)s1 << 8) | s2);
|
|
|
3151 }
|
|
|
3152 else
|
|
|
3153 XSETFASTINT (val, 0);
|
|
|
3154 return val;
|
|
|
3155 }
|
|
|
3156
|
|
|
3157 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
|
|
|
3158 "Decode a Big5 character CODE of BIG5 coding-system.\n\
|
|
|
3159 CODE is the character code in BIG5.\n\
|
|
|
3160 Return the corresponding character.")
|
|
|
3161 (code)
|
|
|
3162 Lisp_Object code;
|
|
|
3163 {
|
|
|
3164 int charset;
|
|
|
3165 unsigned char b1, b2, c1, c2;
|
|
|
3166 Lisp_Object val;
|
|
|
3167
|
|
|
3168 CHECK_NUMBER (code, 0);
|
|
|
3169 b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
|
|
|
3170 DECODE_BIG5 (b1, b2, charset, c1, c2);
|
|
|
3171 XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
|
|
|
3172 return val;
|
|
|
3173 }
|
|
|
3174
|
|
|
3175 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
|
|
|
3176 "Encode the Big5 character CHAR to BIG5 coding-system.\n\
|
|
|
3177 Return the corresponding character code in Big5.")
|
|
|
3178 (ch)
|
|
|
3179 Lisp_Object ch;
|
|
|
3180 {
|
|
|
3181 int charset;
|
|
|
3182 unsigned char c1, c2, b1, b2;
|
|
|
3183 Lisp_Object val;
|
|
|
3184
|
|
|
3185 CHECK_NUMBER (ch, 0);
|
|
|
3186 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
|
|
|
3187 if (charset == charset_big5_1 || charset == charset_big5_2)
|
|
|
3188 {
|
|
|
3189 ENCODE_BIG5 (charset, c1, c2, b1, b2);
|
|
|
3190 XSETFASTINT (val, ((int)b1 << 8) | b2);
|
|
|
3191 }
|
|
|
3192 else
|
|
|
3193 XSETFASTINT (val, 0);
|
|
|
3194 return val;
|
|
|
3195 }
|
|
|
3196
|
|
|
3197 DEFUN ("set-terminal-coding-system",
|
|
|
3198 Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
|
|
|
3199 "zCoding-system for terminal display: ",
|
|
|
3200 "Set coding-system of your terminal to CODING-SYSTEM.\n\
|
|
|
3201 All outputs to terminal are encoded to this coding-system.")
|
|
|
3202 (coding_system)
|
|
|
3203 Lisp_Object coding_system;
|
|
|
3204 {
|
|
|
3205 CHECK_SYMBOL (coding_system, 0);
|
|
|
3206 setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
|
|
|
3207 update_mode_lines++;
|
|
|
3208 if (!NILP (Finteractive_p ()))
|
|
|
3209 Fredraw_display ();
|
|
|
3210 return Qnil;
|
|
|
3211 }
|
|
|
3212
|
|
|
3213 DEFUN ("terminal-coding-system",
|
|
|
3214 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
|
|
|
3215 "Return coding-system of your terminal.")
|
|
|
3216 ()
|
|
|
3217 {
|
|
|
3218 return terminal_coding.symbol;
|
|
|
3219 }
|
|
|
3220
|
|
|
3221 DEFUN ("set-keyboard-coding-system",
|
|
|
3222 Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
|
|
|
3223 "zCoding-system for keyboard input: ",
|
|
|
3224 "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
|
|
|
3225 All inputs from terminal are decoded from this coding-system.")
|
|
|
3226 (coding_system)
|
|
|
3227 Lisp_Object coding_system;
|
|
|
3228 {
|
|
|
3229 CHECK_SYMBOL (coding_system, 0);
|
|
|
3230 setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
|
|
|
3231 return Qnil;
|
|
|
3232 }
|
|
|
3233
|
|
|
3234 DEFUN ("keyboard-coding-system",
|
|
|
3235 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
|
|
|
3236 "Return coding-system of what is sent from terminal keyboard.")
|
|
|
3237 ()
|
|
|
3238 {
|
|
|
3239 return keyboard_coding.symbol;
|
|
|
3240 }
|
|
|
3241
|
|
|
3242
|
|
|
3243 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
|
|
|
3244 1, MANY, 0,
|
|
|
3245 "Return a cons of coding systems for I/O primitive OPERATION.\n\
|
|
|
3246 Remaining arguments are for OPERATION.\n\
|
|
|
3247 OPERATION is one of the following Emacs I/O primitives:\n\
|
|
|
3248 For file I/O, insert-file-contents or write-region.\n\
|
|
|
3249 For process I/O, call-process, call-process-region, or start-process.\n\
|
|
|
3250 For network I/O, open-network-stream.\n\
|
|
|
3251 For each OPERATION, TARGET is selected from the arguments as below:\n\
|
|
|
3252 For file I/O, TARGET is a file name.\n\
|
|
|
3253 For process I/O, TARGET is a process name.\n\
|
|
|
3254 For network I/O, TARGET is a service name or a port number\n\
|
|
|
3255 \n\
|
|
|
3256 The return value is a cons of coding systems for decoding and encoding\n\
|
|
|
3257 registered in nested alist `coding-system-alist' (which see) at a slot\n\
|
|
|
3258 corresponding to OPERATION and TARGET.
|
|
|
3259 If a function symbol is at the slot, return a result of the function call.\n\
|
|
|
3260 The function is called with one argument, a list of all the arguments.")
|
|
|
3261 (nargs, args)
|
|
|
3262 int nargs;
|
|
|
3263 Lisp_Object *args;
|
|
|
3264 {
|
|
|
3265 Lisp_Object operation, target_idx, target, val;
|
|
|
3266 register Lisp_Object chain;
|
|
|
3267
|
|
|
3268 if (nargs < 2)
|
|
|
3269 error ("Too few arguments");
|
|
|
3270 operation = args[0];
|
|
|
3271 if (!SYMBOLP (operation)
|
|
|
3272 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
|
|
|
3273 error ("Invalid first arguement");
|
|
|
3274 if (nargs < 1 + XINT (target_idx))
|
|
|
3275 error ("Too few arguments for operation: %s",
|
|
|
3276 XSYMBOL (operation)->name->data);
|
|
|
3277 target = args[XINT (target_idx) + 1];
|
|
|
3278 if (!(STRINGP (target)
|
|
|
3279 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
|
|
|
3280 error ("Invalid %dth argument", XINT (target_idx) + 1);
|
|
|
3281
|
|
|
3282 chain = Fassq (operation, Vcoding_system_alist);
|
|
|
3283 if (NILP (chain))
|
|
|
3284 return Qnil;
|
|
|
3285
|
|
|
3286 for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
|
|
|
3287 {
|
|
|
3288 Lisp_Object elt = XCONS (chain)->car;
|
|
|
3289
|
|
|
3290 if (CONSP (elt)
|
|
|
3291 && ((STRINGP (target)
|
|
|
3292 && STRINGP (XCONS (elt)->car)
|
|
|
3293 && fast_string_match (XCONS (elt)->car, target) >= 0)
|
|
|
3294 || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
|
|
|
3295 return (CONSP (val = XCONS (elt)->cdr)
|
|
|
3296 ? val
|
|
|
3297 : ((SYMBOLP (val) && Fboundp (val)
|
|
|
3298 ? call2 (val, Flist (nargs, args))
|
|
|
3299 : Qnil)));
|
|
|
3300 }
|
|
|
3301 return Qnil;
|
|
|
3302 }
|
|
|
3303
|
|
|
3304 #endif /* emacs */
|
|
|
3305
|
|
|
3306
|
|
|
3307 /*** 8. Post-amble ***/
|
|
|
3308
|
|
|
3309 init_coding_once ()
|
|
|
3310 {
|
|
|
3311 int i;
|
|
|
3312
|
|
|
3313 /* Emacs internal format specific initialize routine. */
|
|
|
3314 for (i = 0; i <= 0x20; i++)
|
|
|
3315 emacs_code_class[i] = EMACS_control_code;
|
|
|
3316 emacs_code_class[0x0A] = EMACS_linefeed_code;
|
|
|
3317 emacs_code_class[0x0D] = EMACS_carriage_return_code;
|
|
|
3318 for (i = 0x21 ; i < 0x7F; i++)
|
|
|
3319 emacs_code_class[i] = EMACS_ascii_code;
|
|
|
3320 emacs_code_class[0x7F] = EMACS_control_code;
|
|
|
3321 emacs_code_class[0x80] = EMACS_leading_code_composition;
|
|
|
3322 for (i = 0x81; i < 0xFF; i++)
|
|
|
3323 emacs_code_class[i] = EMACS_invalid_code;
|
|
|
3324 emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
|
|
|
3325 emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
|
|
|
3326 emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
|
|
|
3327 emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
|
|
|
3328
|
|
|
3329 /* ISO2022 specific initialize routine. */
|
|
|
3330 for (i = 0; i < 0x20; i++)
|
|
|
3331 iso_code_class[i] = ISO_control_code;
|
|
|
3332 for (i = 0x21; i < 0x7F; i++)
|
|
|
3333 iso_code_class[i] = ISO_graphic_plane_0;
|
|
|
3334 for (i = 0x80; i < 0xA0; i++)
|
|
|
3335 iso_code_class[i] = ISO_control_code;
|
|
|
3336 for (i = 0xA1; i < 0xFF; i++)
|
|
|
3337 iso_code_class[i] = ISO_graphic_plane_1;
|
|
|
3338 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
|
|
|
3339 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
|
|
|
3340 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
|
|
|
3341 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
|
|
|
3342 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
|
|
|
3343 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
|
|
|
3344 iso_code_class[ISO_CODE_ESC] = ISO_escape;
|
|
|
3345 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
|
|
|
3346 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
|
|
|
3347 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
|
|
|
3348
|
|
|
3349 Qcoding_system = intern ("coding-system");
|
|
|
3350 staticpro (&Qcoding_system);
|
|
|
3351
|
|
|
3352 Qeol_type = intern ("eol-type");
|
|
|
3353 staticpro (&Qeol_type);
|
|
|
3354
|
|
|
3355 Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
|
|
|
3356 staticpro (&Qbuffer_file_coding_system);
|
|
|
3357
|
|
|
3358 Qpost_read_conversion = intern ("post-read-conversion");
|
|
|
3359 staticpro (&Qpost_read_conversion);
|
|
|
3360
|
|
|
3361 Qpre_write_conversion = intern ("pre-write-conversion");
|
|
|
3362 staticpro (&Qpre_write_conversion);
|
|
|
3363
|
|
|
3364 Qcoding_system_vector = intern ("coding-system-vector");
|
|
|
3365 staticpro (&Qcoding_system_vector);
|
|
|
3366
|
|
|
3367 Qcoding_system_p = intern ("coding-system-p");
|
|
|
3368 staticpro (&Qcoding_system_p);
|
|
|
3369
|
|
|
3370 Qcoding_system_error = intern ("coding-system-error");
|
|
|
3371 staticpro (&Qcoding_system_error);
|
|
|
3372
|
|
|
3373 Fput (Qcoding_system_error, Qerror_conditions,
|
|
|
3374 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
|
|
|
3375 Fput (Qcoding_system_error, Qerror_message,
|
|
|
3376 build_string ("Coding-system error"));
|
|
|
3377
|
|
|
3378 Qcoding_category_index = intern ("coding-category-index");
|
|
|
3379 staticpro (&Qcoding_category_index);
|
|
|
3380
|
|
|
3381 {
|
|
|
3382 int i;
|
|
|
3383 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
|
|
|
3384 {
|
|
|
3385 coding_category_table[i] = intern (coding_category_name[i]);
|
|
|
3386 staticpro (&coding_category_table[i]);
|
|
|
3387 Fput (coding_category_table[i], Qcoding_category_index,
|
|
|
3388 make_number (i));
|
|
|
3389 }
|
|
|
3390 }
|
|
|
3391
|
|
|
3392 conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
|
|
|
3393 conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
|
|
|
3394
|
|
|
3395 setup_coding_system (Qnil, &keyboard_coding);
|
|
|
3396 setup_coding_system (Qnil, &terminal_coding);
|
|
|
3397 }
|
|
|
3398
|
|
|
3399 #ifdef emacs
|
|
|
3400
|
|
|
3401 syms_of_coding ()
|
|
|
3402 {
|
|
|
3403 Qtarget_idx = intern ("target-idx");
|
|
|
3404 staticpro (&Qtarget_idx);
|
|
|
3405
|
|
|
3406 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
|
|
|
3407 Fput (Qwrite_region, Qtarget_idx, make_number (2));
|
|
|
3408
|
|
|
3409 Qcall_process = intern ("call-process");
|
|
|
3410 staticpro (&Qcall_process);
|
|
|
3411 Fput (Qcall_process, Qtarget_idx, make_number (0));
|
|
|
3412
|
|
|
3413 Qcall_process_region = intern ("call-process-region");
|
|
|
3414 staticpro (&Qcall_process_region);
|
|
|
3415 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
|
|
|
3416
|
|
|
3417 Qstart_process = intern ("start-process");
|
|
|
3418 staticpro (&Qstart_process);
|
|
|
3419 Fput (Qstart_process, Qtarget_idx, make_number (2));
|
|
|
3420
|
|
|
3421 Qopen_network_stream = intern ("open-network-stream");
|
|
|
3422 staticpro (&Qopen_network_stream);
|
|
|
3423 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
|
|
|
3424
|
|
|
3425 defsubr (&Scoding_system_vector);
|
|
|
3426 defsubr (&Scoding_system_p);
|
|
|
3427 defsubr (&Sread_coding_system);
|
|
|
3428 defsubr (&Sread_non_nil_coding_system);
|
|
|
3429 defsubr (&Scheck_coding_system);
|
|
|
3430 defsubr (&Sdetect_coding_region);
|
|
|
3431 defsubr (&Sdecode_coding_region);
|
|
|
3432 defsubr (&Sencode_coding_region);
|
|
|
3433 defsubr (&Sdecode_coding_string);
|
|
|
3434 defsubr (&Sencode_coding_string);
|
|
|
3435 defsubr (&Sdecode_sjis_char);
|
|
|
3436 defsubr (&Sencode_sjis_char);
|
|
|
3437 defsubr (&Sdecode_big5_char);
|
|
|
3438 defsubr (&Sencode_big5_char);
|
|
|
3439 defsubr (&Sset_terminal_coding_system);
|
|
|
3440 defsubr (&Sterminal_coding_system);
|
|
|
3441 defsubr (&Sset_keyboard_coding_system);
|
|
|
3442 defsubr (&Skeyboard_coding_system);
|
|
|
3443 defsubr (&Sfind_coding_system);
|
|
|
3444
|
|
|
3445 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
|
|
|
3446 "List of coding-categories (symbols) ordered by priority.");
|
|
|
3447 {
|
|
|
3448 int i;
|
|
|
3449
|
|
|
3450 Vcoding_category_list = Qnil;
|
|
|
3451 for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
|
|
|
3452 Vcoding_category_list
|
|
|
3453 = Fcons (coding_category_table[i], Vcoding_category_list);
|
|
|
3454 }
|
|
|
3455
|
|
|
3456 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
|
|
|
3457 "A variable of internal use only.\n\
|
|
|
3458 If the value is a coding system, it is used for decoding on read operation.\n\
|
|
|
3459 If not, an appropriate element in `coding-system-alist' (which see) is used.");
|
|
|
3460 Vcoding_system_for_read = Qnil;
|
|
|
3461
|
|
|
3462 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
|
|
|
3463 "A variable of internal use only.\n\
|
|
|
3464 If the value is a coding system, it is used for encoding on write operation.\n\
|
|
|
3465 If not, an appropriate element in `coding-system-alist' (which see) is used.");
|
|
|
3466 Vcoding_system_for_write = Qnil;
|
|
|
3467
|
|
|
3468 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
|
|
|
3469 "Coding-system used in the latest file or process I/O.");
|
|
|
3470 Vlast_coding_system_used = Qnil;
|
|
|
3471
|
|
|
3472 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
|
|
|
3473 "Nested alist to decide a coding system for a specific I/O operation.\n\
|
|
|
3474 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
|
|
|
3475
|
|
|
3476 OPERATION is one of the following Emacs I/O primitives:\n\
|
|
|
3477 For file I/O, insert-file-contents and write-region.\n\
|
|
|
3478 For process I/O, call-process, call-process-region, and start-process.\n\
|
|
|
3479 For network I/O, open-network-stream.\n\
|
|
|
3480 In addition, for process I/O, `process-argument' can be specified for\n\
|
|
|
3481 encoding arguments of the process.\n\
|
|
|
3482 \n\
|
|
|
3483 REGEXP is a regular expression matching a target of OPERATION, where\n\
|
|
|
3484 target is a file name for file I/O operations, a process name for\n\
|
|
|
3485 process I/O operations, or a service name for network I/O\n\
|
|
|
3486 operations. REGEXP might be a port number for network I/O operation.\n\
|
|
|
3487 \n\
|
|
|
3488 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
|
|
|
3489 character code on OPERATION, or a function symbol returning the cons.\n\
|
|
|
3490 See the documentation of `find-coding-system' for more detail.");
|
|
|
3491 Vcoding_system_alist = Qnil;
|
|
|
3492
|
|
|
3493 DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
|
|
|
3494 "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
|
|
|
3495 eol_mnemonic_unix = '.';
|
|
|
3496
|
|
|
3497 DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
|
|
|
3498 "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
|
|
|
3499 eol_mnemonic_dos = ':';
|
|
|
3500
|
|
|
3501 DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
|
|
|
3502 "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
|
|
|
3503 eol_mnemonic_mac = '\'';
|
|
|
3504
|
|
|
3505 DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
|
|
|
3506 "Mnemonic character indicating end-of-line format is not yet decided.");
|
|
|
3507 eol_mnemonic_undecided = '-';
|
|
|
3508
|
|
|
3509 DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
|
|
|
3510 "Alist of charsets vs the alternate charsets.\n\
|
|
|
3511 While decoding, if a charset (car part of an element) is found,\n\
|
|
|
3512 decode it as the alternate charset (cdr part of the element).");
|
|
|
3513 Valternate_charset_table = Qnil;
|
|
|
3514
|
|
|
3515 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
|
|
|
3516 "Alist of charsets vs revision numbers.\n\
|
|
|
3517 While encoding, if a charset (car part of an element) is found,\n\
|
|
|
3518 designate it with the escape sequence identifing revision (cdr part of the element).");
|
|
|
3519 Vcharset_revision_alist = Qnil;
|
|
|
3520 }
|
|
|
3521
|
|
|
3522 #endif /* emacs */
|