comparison src/coding.c @ 110997:b8fde5ef9e14

Merge changes from emacs-23 branch.
author Juanma Barranquero <lekktu@gmail.com>
date Thu, 14 Oct 2010 16:32:27 +0200
parents bec49af30c2f b87d8337c695
children 6788b08ca420
comparison
equal deleted inserted replaced
110996:e65b79c36e50 110997:b8fde5ef9e14
164 int found = 0; 164 int found = 0;
165 ...; 165 ...;
166 166
167 while (1) 167 while (1)
168 { 168 {
169 /* Get one byte from the source. If the souce is exausted, jump 169 /* Get one byte from the source. If the source is exhausted, jump
170 to no_more_source:. */ 170 to no_more_source:. */
171 ONE_MORE_BYTE (c); 171 ONE_MORE_BYTE (c);
172 172
173 if (! __C_conforms_to_XXX___ (c)) 173 if (! __C_conforms_to_XXX___ (c))
174 break; 174 break;
178 /* The byte sequence is invalid for XXX. */ 178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX; 179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0; 180 return 0;
181 181
182 no_more_source: 182 no_more_source:
183 /* The source exausted successfully. */ 183 /* The source exhausted successfully. */
184 detect_info->found |= found; 184 detect_info->found |= found;
185 return 1; 185 return 1;
186 } 186 }
187 #endif 187 #endif
188 188
528 528
529 /* If set, designation sequence should be placed at beginning of line 529 /* If set, designation sequence should be placed at beginning of line
530 on output. */ 530 on output. */
531 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400 531 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
532 532
533 /* If set, do not encode unsafe charactes on output. */ 533 /* If set, do not encode unsafe characters on output. */
534 #define CODING_ISO_FLAG_SAFE 0x0800 534 #define CODING_ISO_FLAG_SAFE 0x0800
535 535
536 /* If set, extra latin codes (128..159) are accepted as a valid code 536 /* If set, extra latin codes (128..159) are accepted as a valid code
537 on input. */ 537 on input. */
538 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000 538 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
684 /* List of symbols `coding-category-xxx' ordered by priority. This 684 /* List of symbols `coding-category-xxx' ordered by priority. This
685 variable is exposed to Emacs Lisp. */ 685 variable is exposed to Emacs Lisp. */
686 static Lisp_Object Vcoding_category_list; 686 static Lisp_Object Vcoding_category_list;
687 687
688 /* Table of coding categories (Lisp symbols). This variable is for 688 /* Table of coding categories (Lisp symbols). This variable is for
689 internal use oly. */ 689 internal use only. */
690 static Lisp_Object Vcoding_category_table; 690 static Lisp_Object Vcoding_category_table;
691 691
692 /* Table of coding-categories ordered by priority. */ 692 /* Table of coding-categories ordered by priority. */
693 static enum coding_category coding_priorities[coding_category_max]; 693 static enum coding_category coding_priorities[coding_category_max];
694 694
816 produced_chars++; \ 816 produced_chars++; \
817 *dst++ = (c); \ 817 *dst++ = (c); \
818 } while (0) 818 } while (0)
819 819
820 820
821 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */ 821 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2. */
822 822
823 #define EMIT_TWO_ASCII_BYTES(c1, c2) \ 823 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
824 do { \ 824 do { \
825 produced_chars += 2; \ 825 produced_chars += 2; \
826 *dst++ = (c1), *dst++ = (c2); \ 826 *dst++ = (c1), *dst++ = (c2); \
1225 old-style emacs-mule encoding, or 0 for the other kind of 1225 old-style emacs-mule encoding, or 0 for the other kind of
1226 composition. 1226 composition.
1227 1227
1228 METHOD is one of enum composition_method. 1228 METHOD is one of enum composition_method.
1229 1229
1230 Optionnal COMPOSITION-COMPONENTS are characters and composition 1230 Optional COMPOSITION-COMPONENTS are characters and composition
1231 rules. 1231 rules.
1232 1232
1233 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID 1233 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1234 follows. 1234 follows.
1235 1235
1930 BYTES is 0xA0 plus a byte length of this composition data, 1930 BYTES is 0xA0 plus a byte length of this composition data,
1931 1931
1932 CHARS is 0xA0 plus a number of characters composed by this 1932 CHARS is 0xA0 plus a number of characters composed by this
1933 data, 1933 data,
1934 1934
1935 COMPONENTs are characters of multibye form or composition 1935 COMPONENTs are characters of multibyte form or composition
1936 rules encoded by two bytes of ASCII codes. 1936 rules encoded by two bytes of ASCII codes.
1937 1937
1938 In addition, for backward compatibility, the following formats are 1938 In addition, for backward compatibility, the following formats are
1939 also recognized as composition data on decoding. 1939 also recognized as composition data on decoding.
1940 1940
2426 { 2426 {
2427 const unsigned char *src = coding->source + coding->consumed; 2427 const unsigned char *src = coding->source + coding->consumed;
2428 const unsigned char *src_end = coding->source + coding->src_bytes; 2428 const unsigned char *src_end = coding->source + coding->src_bytes;
2429 const unsigned char *src_base; 2429 const unsigned char *src_base;
2430 int *charbuf = coding->charbuf + coding->charbuf_used; 2430 int *charbuf = coding->charbuf + coding->charbuf_used;
2431 /* We may produce two annocations (charset and composition) in one 2431 /* We may produce two annotations (charset and composition) in one
2432 loop and one more charset annocation at the end. */ 2432 loop and one more charset annotation at the end. */
2433 int *charbuf_end 2433 int *charbuf_end
2434 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3); 2434 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2435 int consumed_chars = 0, consumed_chars_base; 2435 int consumed_chars = 0, consumed_chars_base;
2436 int multibytep = coding->src_multibyte; 2436 int multibytep = coding->src_multibyte;
2437 Lisp_Object attrs, charset_list; 2437 Lisp_Object attrs, charset_list;
2503 { 2503 {
2504 int nchars, nbytes; 2504 int nchars, nbytes;
2505 /* emacs_mule_char can load a charset map from a file, which 2505 /* emacs_mule_char can load a charset map from a file, which
2506 allocates a large structure and might cause buffer text 2506 allocates a large structure and might cause buffer text
2507 to be relocated as a result. Thus, we need to remember the 2507 to be relocated as a result. Thus, we need to remember the
2508 original pointer to buffer text, and fixup all related 2508 original pointer to buffer text, and fix up all related
2509 pointers after the call. */ 2509 pointers after the call. */
2510 const unsigned char *orig = coding->source; 2510 const unsigned char *orig = coding->source;
2511 EMACS_INT offset; 2511 EMACS_INT offset;
2512 2512
2513 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id, 2513 c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2530 consumed_chars = consumed_chars_base + nchars; 2530 consumed_chars = consumed_chars_base + nchars;
2531 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR) 2531 if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2532 cmp_status->ncomps -= nchars; 2532 cmp_status->ncomps -= nchars;
2533 } 2533 }
2534 2534
2535 /* Now if C >= 0, we found a normally encoded characer, if C < 2535 /* Now if C >= 0, we found a normally encoded character, if C <
2536 0, we found an old-style composition component character or 2536 0, we found an old-style composition component character or
2537 rule. */ 2537 rule. */
2538 2538
2539 if (cmp_status->state == COMPOSING_NO) 2539 if (cmp_status->state == COMPOSING_NO)
2540 { 2540 {
3041 ASET (attrs, coding_attr_safe_charsets, safe_charsets); 3041 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3042 } 3042 }
3043 3043
3044 3044
3045 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 3045 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3046 Check if a text is encoded in one of ISO-2022 based codig systems. 3046 Check if a text is encoded in one of ISO-2022 based coding systems.
3047 If it is, return 1, else return 0. */ 3047 If it is, return 1, else return 0. */
3048 3048
3049 static int 3049 static int
3050 detect_coding_iso_2022 (struct coding_system *coding, 3050 detect_coding_iso_2022 (struct coding_system *coding,
3051 struct coding_detection_info *detect_info) 3051 struct coding_detection_info *detect_info)
3450 } 3450 }
3451 cmp_status->state = COMPOSING_NO; 3451 cmp_status->state = COMPOSING_NO;
3452 return new_chars; 3452 return new_chars;
3453 } 3453 }
3454 3454
3455 /* If characers are under composition, finish the composition. */ 3455 /* If characters are under composition, finish the composition. */
3456 #define MAYBE_FINISH_COMPOSITION() \ 3456 #define MAYBE_FINISH_COMPOSITION() \
3457 do { \ 3457 do { \
3458 if (cmp_status->state != COMPOSING_NO) \ 3458 if (cmp_status->state != COMPOSING_NO) \
3459 char_offset += finish_composition (charbuf, cmp_status); \ 3459 char_offset += finish_composition (charbuf, cmp_status); \
3460 } while (0) 3460 } while (0)
3556 { 3556 {
3557 const unsigned char *src = coding->source + coding->consumed; 3557 const unsigned char *src = coding->source + coding->consumed;
3558 const unsigned char *src_end = coding->source + coding->src_bytes; 3558 const unsigned char *src_end = coding->source + coding->src_bytes;
3559 const unsigned char *src_base; 3559 const unsigned char *src_base;
3560 int *charbuf = coding->charbuf + coding->charbuf_used; 3560 int *charbuf = coding->charbuf + coding->charbuf_used;
3561 /* We may produce two annocations (charset and composition) in one 3561 /* We may produce two annotations (charset and composition) in one
3562 loop and one more charset annocation at the end. */ 3562 loop and one more charset annotation at the end. */
3563 int *charbuf_end 3563 int *charbuf_end
3564 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3); 3564 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3565 int consumed_chars = 0, consumed_chars_base; 3565 int consumed_chars = 0, consumed_chars_base;
3566 int multibytep = coding->src_multibyte; 3566 int multibytep = coding->src_multibyte;
3567 /* Charsets invoked to graphic plane 0 and 1 respectively. */ 3567 /* Charsets invoked to graphic plane 0 and 1 respectively. */
3859 case '[': /* specification of direction */ 3859 case '[': /* specification of direction */
3860 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)) 3860 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3861 goto invalid_code; 3861 goto invalid_code;
3862 /* For the moment, nested direction is not supported. 3862 /* For the moment, nested direction is not supported.
3863 So, `coding->mode & CODING_MODE_DIRECTION' zero means 3863 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3864 left-to-right, and nozero means right-to-left. */ 3864 left-to-right, and nonzero means right-to-left. */
3865 ONE_MORE_BYTE (c1); 3865 ONE_MORE_BYTE (c1);
3866 switch (c1) 3866 switch (c1)
3867 { 3867 {
3868 case ']': /* end of the current direction */ 3868 case ']': /* end of the current direction */
3869 coding->mode &= ~CODING_MODE_DIRECTION; 3869 coding->mode &= ~CODING_MODE_DIRECTION;
4764 { 4764 {
4765 const unsigned char *src = coding->source + coding->consumed; 4765 const unsigned char *src = coding->source + coding->consumed;
4766 const unsigned char *src_end = coding->source + coding->src_bytes; 4766 const unsigned char *src_end = coding->source + coding->src_bytes;
4767 const unsigned char *src_base; 4767 const unsigned char *src_base;
4768 int *charbuf = coding->charbuf + coding->charbuf_used; 4768 int *charbuf = coding->charbuf + coding->charbuf_used;
4769 /* We may produce one charset annocation in one loop and one more at 4769 /* We may produce one charset annotation in one loop and one more at
4770 the end. */ 4770 the end. */
4771 int *charbuf_end 4771 int *charbuf_end
4772 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2); 4772 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4773 int consumed_chars = 0, consumed_chars_base; 4773 int consumed_chars = 0, consumed_chars_base;
4774 int multibytep = coding->src_multibyte; 4774 int multibytep = coding->src_multibyte;
4882 { 4882 {
4883 const unsigned char *src = coding->source + coding->consumed; 4883 const unsigned char *src = coding->source + coding->consumed;
4884 const unsigned char *src_end = coding->source + coding->src_bytes; 4884 const unsigned char *src_end = coding->source + coding->src_bytes;
4885 const unsigned char *src_base; 4885 const unsigned char *src_base;
4886 int *charbuf = coding->charbuf + coding->charbuf_used; 4886 int *charbuf = coding->charbuf + coding->charbuf_used;
4887 /* We may produce one charset annocation in one loop and one more at 4887 /* We may produce one charset annotation in one loop and one more at
4888 the end. */ 4888 the end. */
4889 int *charbuf_end 4889 int *charbuf_end
4890 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2); 4890 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4891 int consumed_chars = 0, consumed_chars_base; 4891 int consumed_chars = 0, consumed_chars_base;
4892 int multibytep = coding->src_multibyte; 4892 int multibytep = coding->src_multibyte;
5539 { 5539 {
5540 const unsigned char *src = coding->source + coding->consumed; 5540 const unsigned char *src = coding->source + coding->consumed;
5541 const unsigned char *src_end = coding->source + coding->src_bytes; 5541 const unsigned char *src_end = coding->source + coding->src_bytes;
5542 const unsigned char *src_base; 5542 const unsigned char *src_base;
5543 int *charbuf = coding->charbuf + coding->charbuf_used; 5543 int *charbuf = coding->charbuf + coding->charbuf_used;
5544 /* We may produce one charset annocation in one loop and one more at 5544 /* We may produce one charset annotation in one loop and one more at
5545 the end. */ 5545 the end. */
5546 int *charbuf_end 5546 int *charbuf_end
5547 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2); 5547 = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5548 int consumed_chars = 0, consumed_chars_base; 5548 int consumed_chars = 0, consumed_chars_base;
5549 int multibytep = coding->src_multibyte; 5549 int multibytep = coding->src_multibyte;
6142 symbol) `japanese-iso-8bit' by default. 6142 symbol) `japanese-iso-8bit' by default.
6143 6143
6144 o coding-category-iso-7-else 6144 o coding-category-iso-7-else
6145 6145
6146 The category for a coding system which has the same code range 6146 The category for a coding system which has the same code range
6147 as ISO2022 of 7-bit environemnt but uses locking shift or 6147 as ISO2022 of 7-bit environment but uses locking shift or
6148 single shift functions. Assigned the coding-system (Lisp 6148 single shift functions. Assigned the coding-system (Lisp
6149 symbol) `iso-2022-7bit-lock' by default. 6149 symbol) `iso-2022-7bit-lock' by default.
6150 6150
6151 o coding-category-iso-8-else 6151 o coding-category-iso-8-else
6152 6152
6153 The category for a coding system which has the same code range 6153 The category for a coding system which has the same code range
6154 as ISO2022 of 8-bit environemnt but uses locking shift or 6154 as ISO2022 of 8-bit environment but uses locking shift or
6155 single shift functions. Assigned the coding-system (Lisp 6155 single shift functions. Assigned the coding-system (Lisp
6156 symbol) `iso-2022-8bit-ss2' by default. 6156 symbol) `iso-2022-8bit-ss2' by default.
6157 6157
6158 o coding-category-big5 6158 o coding-category-big5
6159 6159
7546 7546
7547 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */ 7547 /* 1 iff Vcode_conversion_reused_workbuf is already in use. */
7548 static int reused_workbuf_in_use; 7548 static int reused_workbuf_in_use;
7549 7549
7550 7550
7551 /* Return a working buffer of code convesion. MULTIBYTE specifies the 7551 /* Return a working buffer of code conversion. MULTIBYTE specifies the
7552 multibyteness of the returned buffer. */ 7552 multibyteness of the returned buffer. */
7553 7553
7554 static Lisp_Object 7554 static Lisp_Object
7555 make_conversion_work_buffer (int multibyte) 7555 make_conversion_work_buffer (int multibyte)
7556 { 7556 {
8198 } 8198 }
8199 8199
8200 8200
8201 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If 8201 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
8202 HIGHEST is nonzero, return the coding system of the highest 8202 HIGHEST is nonzero, return the coding system of the highest
8203 priority among the detected coding systems. Otherwize return a 8203 priority among the detected coding systems. Otherwise return a
8204 list of detected coding systems sorted by their priorities. If 8204 list of detected coding systems sorted by their priorities. If
8205 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct 8205 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8206 multibyte form but contain only ASCII and eight-bit chars. 8206 multibyte form but contain only ASCII and eight-bit chars.
8207 Otherwise, the bytes are raw bytes. 8207 Otherwise, the bytes are raw bytes.
8208 8208
9300 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1)); 9300 struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9301 CHECK_SYMBOL (coding_system); 9301 CHECK_SYMBOL (coding_system);
9302 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding); 9302 setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9303 /* We had better not send unsafe characters to terminal. */ 9303 /* We had better not send unsafe characters to terminal. */
9304 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING; 9304 terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9305 /* Characer composition should be disabled. */ 9305 /* Character composition should be disabled. */
9306 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; 9306 terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9307 terminal_coding->src_multibyte = 1; 9307 terminal_coding->src_multibyte = 1;
9308 terminal_coding->dst_multibyte = 0; 9308 terminal_coding->dst_multibyte = 0;
9309 return Qnil; 9309 return Qnil;
9310 } 9310 }
9316 (Lisp_Object coding_system) 9316 (Lisp_Object coding_system)
9317 { 9317 {
9318 CHECK_SYMBOL (coding_system); 9318 CHECK_SYMBOL (coding_system);
9319 setup_coding_system (Fcheck_coding_system (coding_system), 9319 setup_coding_system (Fcheck_coding_system (coding_system),
9320 &safe_terminal_coding); 9320 &safe_terminal_coding);
9321 /* Characer composition should be disabled. */ 9321 /* Character composition should be disabled. */
9322 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; 9322 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9323 safe_terminal_coding.src_multibyte = 1; 9323 safe_terminal_coding.src_multibyte = 1;
9324 safe_terminal_coding.dst_multibyte = 0; 9324 safe_terminal_coding.dst_multibyte = 0;
9325 return Qnil; 9325 return Qnil;
9326 } 9326 }
9350 if (NILP (coding_system)) 9350 if (NILP (coding_system))
9351 coding_system = Qno_conversion; 9351 coding_system = Qno_conversion;
9352 else 9352 else
9353 Fcheck_coding_system (coding_system); 9353 Fcheck_coding_system (coding_system);
9354 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t)); 9354 setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9355 /* Characer composition should be disabled. */ 9355 /* Character composition should be disabled. */
9356 TERMINAL_KEYBOARD_CODING (t)->common_flags 9356 TERMINAL_KEYBOARD_CODING (t)->common_flags
9357 &= ~CODING_ANNOTATE_COMPOSITION_MASK; 9357 &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9358 return Qnil; 9358 return Qnil;
9359 } 9359 }
9360 9360
9718 If Nth element is a number NUM, N is the first byte of a 9718 If Nth element is a number NUM, N is the first byte of a
9719 charset whose ID is NUM. 9719 charset whose ID is NUM.
9720 9720
9721 If Nth element is a list of charset IDs, N is the first byte 9721 If Nth element is a list of charset IDs, N is the first byte
9722 of one of them. The list is sorted by dimensions of the 9722 of one of them. The list is sorted by dimensions of the
9723 charsets. A charset of smaller dimension comes firtst. */ 9723 charsets. A charset of smaller dimension comes first. */
9724 val = Fmake_vector (make_number (256), Qnil); 9724 val = Fmake_vector (make_number (256), Qnil);
9725 9725
9726 for (tail = charset_list; CONSP (tail); tail = XCDR (tail)) 9726 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9727 { 9727 {
9728 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail))); 9728 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));