emacs: src/coding.c comparison

comparison src/coding.c @ 88438:3a34b722dd71

(encode_coding_utf_8): Initialize produced_chars to 0. (decode_coding_utf_16): Fix converting high and low bytes to code-point. (encode_coding_utf_16): Substitute coding->default_char for non-Unicode characters. (decode_coding): Don't call record_insert here. (setup_coding_system): Initialize `surrogate' of coding->spec.utf_16 to 0. (EMIT_ONE_BYTE): Fix for multibyte case.

author	Kenichi Handa <handa@m17n.org>
date	Fri, 08 Mar 2002 00:19:39 +0000
parents	6418a272b97e
children	6b86cf30a0b9

comparison

equal deleted inserted replaced

-:ffec838ec027
+:3a34b722dd71
 /*** 0. General comments ***
 CODING SYSTEM
-Coding system is an encoding mechanism of one or more character
+Coding system is an object for an encoding mechanism that contains
-sets.  Here's a list of coding system types supported by Emacs.
+information about how to convert byte sequences to character
-When we say "decode", it means converting a text encoded by some
+sequences and vice versa.  When we say "decode", it means converting
-coding system into Emacs' internal format (emacs-utf-8), and when we
+a byte sequence of a specific coding system into a character
-say "encode", it means converting a text of emacs-utf-8 to some
+sequence that is represented by Emacs' internal coding system
-other coding system.
+`emacs-utf-8', and when we say "encode", it means converting a
+character sequence of emacs-utf-8 to a byte sequence of a specific
-Emacs represents a coding system by a Lisp symbol.  Each symbol is a
+coding system.
-key to the hash table Vcharset_hash_table.  This hash table
-associates the symbol to the corresponding detailed specifications.
+In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
+the C level, a coding system is represented by a vector of attributes
-Before using a coding system for decoding and encoding, we setup a
+stored in the hash table Vcharset_hash_table.  The conversion from a
-structure of type `struct coding_system'.  This structure keeps
+coding system symbol to attributes vector is done by looking up
-various information about a specific code conversion (e.g.  the
+Vcharset_hash_table by the symbol.
-location of source and destination data).
+Coding systems are classified into the following types depending on
-Coding systems are classified into the following types by how to
+the mechanism of encoding.  Here's a brief description of each type.
-represent a character in a byte sequence.  Here's a brief description
-about type.
-o Emacs' internal format (emacs-utf-8)
-The extended UTF-8 which allows eight-bit raw bytes mixed with
-character codes.  Emacs holds characters in buffers and strings by
-this format.
 o UTF-8
 o UTF-16
 `carriage-return'.
 Since text characters encoding and end-of-line encoding are
 independent, any coding system described above can take any format
 of end-of-line (except for no-conversion).
+STRUCT CODING_SYSTEM
+Before using a coding system for code conversion (i.e. decoding and
+encoding), we set up a structure of type `struct coding_system'.
+This structure keeps various information about a specific code
+conversion (e.g.  the location of source and destination data).
 */
 /* COMMON MACROS */
 } while (0)
 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
-#define EMIT_TWO_BYTES(c1, c2)			\
+#define EMIT_TWO_BYTES(c1, c2)		\
-do {						\
+do {					\
-produced_chars += 2;			\
+produced_chars += 2;		\
-if (multibytep)				\
+if (multibytep)			\
-{						\
+{					\
-	CHAR_STRING_ADVANCE ((int) (c1), dst);	\
+	int ch;				\
-	CHAR_STRING_ADVANCE ((int) (c2), dst);	\
+					\
-}						\
+	ch = (c1);			\
-else					\
+	if (ch >= 0x80)			\
-{						\
+	  ch = BYTE8_TO_CHAR (ch);	\
-	*dst++ = (c1);				\
+	CHAR_STRING_ADVANCE (ch, dst);	\
-	*dst++ = (c2);				\
+	ch = (c2);			\
-}						\
+	if (ch >= 0x80)			\
+	  ch = BYTE8_TO_CHAR (ch);	\
+	CHAR_STRING_ADVANCE (ch, dst);	\
+}					\
+else				\
+{					\
+	*dst++ = (c1);			\
+	*dst++ = (c2);			\
+}					\
 } while (0)
 #define EMIT_THREE_BYTES(c1, c2, c3)	\
 do {					\
 {
 if (coding->src_pos < 0)
 	coding->source = GAP_END_ADDR + coding->src_pos_byte;
 else
 	{
-	  if (coding->src_pos < GPT
+	  struct buffer *buf = XBUFFER (coding->src_object);
-	      && coding->src_pos + coding->src_chars >= GPT)
+	  EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
-	    move_gap_both (coding->src_pos, coding->src_pos_byte);
+	  EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
-	  coding->source = BYTE_POS_ADDR (coding->src_pos_byte);
+	  unsigned char *beg_addr = BUF_BEG_ADDR (buf);
+	  coding->source = beg_addr + coding->src_pos_byte - 1;
+	  if (coding->src_pos_byte >= gpt_byte)
+	    coding->source += BUF_GAP_SIZE (buf);
 	}
 }
 else if (STRINGP (coding->src_object))
 {
 coding->source = (XSTRING (coding->src_object)->data
 int multibytep = coding->dst_multibyte;
 int *charbuf = coding->charbuf;
 int *charbuf_end = charbuf + coding->charbuf_used;
 unsigned char *dst = coding->destination + coding->produced;
 unsigned char *dst_end = coding->destination + coding->dst_bytes;
-int produced_chars;
+int produced_chars = 0;
 int c;
 if (multibytep)
 {
 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
 int c, c1, c2;
 src_base = src;
 ONE_MORE_BYTE (c1);
 ONE_MORE_BYTE (c2);
-c = (c1 << 16) | c2;
+c = (c1 << 8) | c2;
 if (bom == utf_16_with_bom)
 	{
 	  if (endian == utf_16_big_endian
 	      ? c != 0xFFFE : c != 0xFEFF)
 	    {
 	break;
 ONE_MORE_BYTE (c1);
 ONE_MORE_BYTE (c2);
 c = (endian == utf_16_big_endian
-	   ? ((c1 << 16) | c2) : ((c2 << 16) | c1));
+	   ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
 if (surrogate)
 	{
 	  if (! UTF_16_LOW_SURROGATE_P (c))
 	    {
 	      if (endian == utf_16_big_endian)
 while (charbuf < charbuf_end)
 {
 ASSURE_DESTINATION (safe_room);
 c = *charbuf++;
-if (c >= 0x110000)
+if (c >= MAX_UNICODE_CHAR)
-	c = 0xFFFF;
+	c = coding->default_char;
 if (c < 0x10000)
 	{
 	  if (big_endian)
 	    EMIT_TWO_BYTES (c >> 8, c & 0xFF);
 				    : EQ (val, Qt) ? utf_16_with_bom
 				    : utf_16_without_bom);
 val = AREF (attrs, coding_attr_utf_16_endian);
 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
 				       : utf_16_little_endian);
+CODING_UTF_16_SURROGATE (coding) = 0;
 coding->detector = detect_coding_utf_16;
 coding->decoder = decode_coding_utf_16;
 coding->encoder = encode_coding_utf_16;
 coding->common_flags
 	|= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
 	  coding->carryover_bytes = nbytes;
 	  while (nbytes-- > 0)
 	    *p++ = *src++;
 	}
 coding->consumed = coding->src_bytes;
-}
-if (BUFFERP (coding->dst_object))
-{
-record_insert (coding->dst_pos, coding->produced_char);
 }
 return coding->result;
 }

Mercurial > emacs

comparison src/coding.c @ 88438:3a34b722dd71