Mercurial > emacs
annotate lisp/international/utf-8.el @ 37678:ebec0594dece
(compile-files): Redirect output of chmod to
/dev/null.
| author | Gerd Moellmann <gerd@gnu.org> |
|---|---|
| date | Fri, 11 May 2001 10:53:56 +0000 |
| parents | b095952a8678 |
| children | 88389fa9b713 |
| rev | line source |
|---|---|
| 35542 | 1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support |
| 2 | |
| 3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. | |
| 4 ;; Licensed to the Free Software Foundation. | |
| 5 | |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
6 ;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> |
| 36243 | 7 ;; Keywords: multilingual, Unicode, UTF-8, i18n |
| 35542 | 8 |
| 9 ;; This file is part of GNU Emacs. | |
| 10 | |
| 11 ;; GNU Emacs is free software; you can redistribute it and/or modify | |
| 12 ;; it under the terms of the GNU General Public License as published by | |
| 13 ;; the Free Software Foundation; either version 2, or (at your option) | |
| 14 ;; any later version. | |
| 15 | |
| 16 ;; GNU Emacs is distributed in the hope that it will be useful, | |
| 17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 19 ;; GNU General Public License for more details. | |
| 20 | |
| 21 ;; You should have received a copy of the GNU General Public License | |
| 22 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
| 23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
| 24 ;; Boston, MA 02111-1307, USA. | |
| 25 | |
| 26 ;;; Commentary: | |
| 27 | |
| 28 ;; The coding-system `mule-utf-8' supports encoding/decoding of the | |
| 36243 | 29 ;; following character sets to and from UTF-8: |
| 35542 | 30 ;; |
| 31 ;; ascii | |
| 32 ;; eight-bit-control | |
| 33 ;; latin-iso8859-1 | |
| 34 ;; mule-unicode-0100-24ff | |
| 35 ;; mule-unicode-2500-33ff | |
| 36 ;; mule-unicode-e000-ffff | |
| 37 ;; | |
| 38 ;; Characters of other character sets cannot be encoded with | |
| 36243 | 39 ;; mule-utf-8. Note that the mule-unicode charsets currently lack |
| 40 ;; case and syntax information, so things like `downcase' will only | |
| 41 ;; work for characters from ASCII and Latin-1. | |
| 35542 | 42 ;; |
| 36243 | 43 ;; On decoding, Unicode characters that do not fit into the above |
| 44 ;; character sets are handled as `eight-bit-control' or | |
| 45 ;; `eight-bit-graphic' characters to retain the information about the | |
| 46 ;; original byte sequence. | |
| 47 | |
| 48 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: | |
| 35542 | 49 |
| 50 ;; scalar | utf-8 | |
| 51 ;; value | 1st byte | 2nd byte | 3rd byte | |
| 52 ;; --------------------+-----------+-----------+---------- | |
| 53 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | | |
| 54 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | | |
| 55 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx | |
| 56 | |
| 57 ;;; Code: | |
| 58 | |
| 59 (define-ccl-program ccl-decode-mule-utf-8 | |
| 60 ;; | |
| 61 ;; charset | bytes in utf-8 | bytes in emacs | |
| 62 ;; -----------------------+----------------+--------------- | |
| 63 ;; ascii | 1 | 1 | |
| 64 ;; -----------------------+----------------+--------------- | |
| 65 ;; eight-bit-control | 2 | 2 | |
| 66 ;; latin-iso8859-1 | 2 | 2 | |
| 67 ;; -----------------------+----------------+--------------- | |
| 68 ;; mule-unicode-0100-24ff | 2 | 4 | |
| 69 ;; (< 0800) | | | |
| 70 ;; -----------------------+----------------+--------------- | |
| 71 ;; mule-unicode-0100-24ff | 3 | 4 | |
| 72 ;; (>= 0800) | | | |
| 73 ;; mule-unicode-2500-33ff | 3 | 4 | |
| 74 ;; mule-unicode-e000-ffff | 3 | 4 | |
| 75 ;; | |
| 76 ;; Thus magnification factor is two. | |
| 77 ;; | |
| 78 `(2 | |
| 79 ((loop | |
| 80 (read r0) | |
| 81 | |
| 82 ;; 1byte encoding, i.e., ascii | |
| 83 (if (r0 < #x80) | |
| 84 (write r0) | |
| 85 | |
| 86 ;; 2byte encoding | |
| 87 (if (r0 < #xe0) | |
| 88 ((read r1) | |
| 89 (r0 &= #x1f) | |
| 90 (r0 <<= 6) | |
| 91 (r1 &= #x3f) | |
| 92 (r1 += r0) | |
| 93 ;; now r1 holds scalar value | |
| 94 | |
| 95 ;; eight-bit-control | |
| 96 (if (r1 < 160) | |
| 97 ((r0 = ,(charset-id 'eight-bit-control)) | |
| 98 (write-multibyte-character r0 r1)) | |
| 99 | |
| 100 ;; latin-iso8859-1 | |
| 101 (if (r1 < 256) | |
| 102 ((r0 = ,(charset-id 'latin-iso8859-1)) | |
| 103 (r1 -= 128) | |
| 104 (write-multibyte-character r0 r1)) | |
| 105 | |
| 106 ;; mule-unicode-0100-24ff (< 0800) | |
| 107 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
| 108 (r1 -= #x0100) | |
| 109 (r2 = (((r1 / 96) + 32) << 7)) | |
| 110 (r1 %= 96) | |
| 111 (r1 += (r2 + 32)) | |
| 112 (write-multibyte-character r0 r1))))) | |
| 113 | |
| 114 ;; 3byte encoding | |
| 115 (if (r0 < #xf0) | |
| 116 ((read r1 r2) | |
| 117 (r3 = ((r0 & #x0f) << 12)) | |
| 118 (r3 += ((r1 & #x3f) << 6)) | |
| 119 (r3 += (r2 & #x3f)) | |
| 120 ;; now r3 holds scalar value | |
| 121 | |
| 122 ;; mule-unicode-0100-24ff (>= 0800) | |
| 123 (if (r3 < #x2500) | |
| 124 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) | |
| 125 (r3 -= #x0100) | |
| 126 (r3 //= 96) | |
| 127 (r1 = (r7 + 32)) | |
| 128 (r1 += ((r3 + 32) << 7)) | |
| 129 (write-multibyte-character r0 r1)) | |
| 130 | |
| 131 ;; mule-unicode-2500-33ff | |
| 132 (if (r3 < #x3400) | |
| 133 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) | |
| 134 (r3 -= #x2500) | |
| 135 (r3 //= 96) | |
| 136 (r1 = (r7 + 32)) | |
| 137 (r1 += ((r3 + 32) << 7)) | |
| 138 (write-multibyte-character r0 r1)) | |
| 139 | |
| 140 ;; U+3400 .. U+DFFF | |
| 141 ;; keep those bytes as eight-bit-{control|graphic} | |
| 142 (if (r3 < #xe000) | |
| 36522 | 143 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic |
| 35542 | 144 (r3 = ,(charset-id 'eight-bit-graphic)) |
| 145 (write-multibyte-character r3 r0) | |
| 146 (if (r1 < #xa0) | |
| 147 (r3 = ,(charset-id 'eight-bit-control))) | |
| 148 (write-multibyte-character r3 r1) | |
| 149 (if (r2 < #xa0) | |
| 150 (r3 = ,(charset-id 'eight-bit-control)) | |
| 151 (r3 = ,(charset-id 'eight-bit-graphic))) | |
| 152 (write-multibyte-character r3 r2)) | |
| 153 | |
| 154 ;; mule-unicode-e000-ffff | |
| 155 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) | |
| 156 (r3 -= #xe000) | |
| 157 (r3 //= 96) | |
| 158 (r1 = (r7 + 32)) | |
| 159 (r1 += ((r3 + 32) << 7)) | |
| 160 (write-multibyte-character r0 r1)))))) | |
| 161 | |
| 162 ;; 4byte encoding | |
| 163 ;; keep those bytes as eight-bit-{control|graphic} | |
| 164 ((read r1 r2 r3) | |
| 165 ;; r0 >= #xf0, thus eight-bit-graphic | |
| 166 (r4 = ,(charset-id 'eight-bit-graphic)) | |
| 167 (write-multibyte-character r4 r0) | |
| 168 (if (r1 < #xa0) | |
| 169 (r4 = ,(charset-id 'eight-bit-control))) | |
| 170 (write-multibyte-character r4 r1) | |
| 171 (if (r2 < #xa0) | |
| 172 (r4 = ,(charset-id 'eight-bit-control)) | |
| 173 (r4 = ,(charset-id 'eight-bit-graphic))) | |
| 174 (write-multibyte-character r4 r2) | |
| 175 (if (r3 < #xa0) | |
| 176 (r4 = ,(charset-id 'eight-bit-control)) | |
| 177 (r4 = ,(charset-id 'eight-bit-graphic))) | |
| 178 (write-multibyte-character r4 r3))))) | |
| 179 | |
| 180 (repeat)))) | |
| 181 | |
| 36243 | 182 "CCL program to decode UTF-8. |
| 36465 | 183 Basic decoding is done into the charsets ascii, latin-iso8859-1 and |
| 184 mule-unicode-*. Encodings of un-representable Unicode characters are | |
| 185 decoded as-is into eight-bit-control and eight-bit-graphic | |
| 186 characters.") | |
| 35542 | 187 |
| 188 (define-ccl-program ccl-encode-mule-utf-8 | |
| 189 `(1 | |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
190 ((r5 = -1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
191 (loop |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
192 (if (r5 < 0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
193 ((r1 = -1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
194 (read-multibyte-character r0 r1)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
195 (;; We have already done read-multibyte-character. |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
196 (r0 = r5) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
197 (r1 = r6) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
198 (r5 = -1))) |
| 35542 | 199 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
200 (if (r0 == ,(charset-id 'ascii)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
201 (write r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
202 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
203 (if (r0 == ,(charset-id 'latin-iso8859-1)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
204 ;; r1 scalar utf-8 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
205 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
206 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
207 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
208 ((r0 = (((r1 & #x40) >> 6) | #xc2)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
209 (r1 &= #x3f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
210 (r1 |= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
211 (write r0 r1)) |
| 35542 | 212 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
213 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
214 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
215 ;; #x3f80 == (0011 1111 1000 0000)b |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
216 (r1 &= #x7f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
217 (r1 += (r0 + 224)) ; 224 == -32 + #x0100 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
218 ;; now r1 holds scalar value |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
219 (if (r1 < #x0800) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
220 ;; 2byte encoding |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
221 ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
222 ;; #x07c0 == (0000 0111 1100 0000)b |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
223 (r1 &= #x3f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
224 (r1 |= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
225 (write r0 r1)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
226 ;; 3byte encoding |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
227 ((r0 = (((r1 & #xf000) >> 12) | #xe0)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
228 (r2 = ((r1 & #x3f) | #x80)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
229 (r1 &= #x0fc0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
230 (r1 >>= 6) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
231 (r1 |= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
232 (write r0 r1 r2)))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
233 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
234 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
235 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
236 (r1 &= #x7f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
237 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
238 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
| 35542 | 239 (r2 = ((r1 & #x3f) | #x80)) |
| 240 (r1 &= #x0fc0) | |
| 241 (r1 >>= 6) | |
| 242 (r1 |= #x80) | |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
243 (write r0 r1 r2)) |
| 35542 | 244 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
245 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
246 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
247 (r1 &= #x7f) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
248 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
249 (r0 = (((r1 & #xf000) >> 12) | #xe0)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
250 (r2 = ((r1 & #x3f) | #x80)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
251 (r1 &= #x0fc0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
252 (r1 >>= 6) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
253 (r1 |= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
254 (write r0 r1 r2)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
255 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
256 (if (r0 == ,(charset-id 'eight-bit-control)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
257 ;; r1 scalar utf-8 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
258 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
259 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
260 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
261 ((write #xc2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
262 (write r1)) |
| 35542 | 263 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
264 (if (r0 == ,(charset-id 'eight-bit-graphic)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
265 ;; r1 scalar utf-8 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
266 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
267 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
268 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111 |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
269 ((write r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
270 (r1 = -1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
271 (read-multibyte-character r0 r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
272 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
273 (if (r0 != ,(charset-id 'eight-bit-control)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
274 ((r5 = r0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
275 (r6 = r1)))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
276 (if (r5 < 0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
277 ((read-multibyte-character r0 r2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
278 (if (r0 != ,(charset-id 'eight-bit-graphic)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
279 (if (r0 != ,(charset-id 'eight-bit-control)) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
280 ((r5 = r0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
281 (r6 = r2)))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
282 (if (r5 < 0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
283 (write r1 r2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
284 (if (r1 < #xa0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
285 (write r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
286 ((write #xc2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
287 (write r1))))))) |
| 35542 | 288 |
|
37097
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
289 ;; Unsupported character. |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
290 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
291 ((write #xef) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
292 (write #xbf) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
293 (write #xbd))))))))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
294 (repeat))) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
295 (if (r1 >= #xa0) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
296 (write r1) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
297 (if (r1 >= #x80) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
298 ((write #xc2) |
|
b095952a8678
(ccl-encode-mule-utf-8): Fix handling of eight-bit-control chars.
Kenichi Handa <handa@m17n.org>
parents:
36522
diff
changeset
|
299 (write r1))))) |
| 35542 | 300 |
| 36243 | 301 "CCL program to encode into UTF-8. |
| 302 Only characters from the charsets ascii, eight-bit-control, | |
| 36465 | 303 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized. |
| 304 Others are encoded as U+FFFD.") | |
| 35542 | 305 |
| 306 (make-coding-system | |
| 307 'mule-utf-8 4 ?u | |
| 308 "UTF-8 encoding for Emacs-supported Unicode characters. | |
| 36243 | 309 The supported Emacs character sets are: |
| 35542 | 310 ascii |
| 311 eight-bit-control | |
| 312 eight-bit-graphic | |
| 313 latin-iso8859-1 | |
| 314 mule-unicode-0100-24ff | |
| 315 mule-unicode-2500-33ff | |
| 316 mule-unicode-e000-ffff | |
| 317 | |
| 36243 | 318 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF |
| 319 are decoded into sequences of eight-bit-control and eight-bit-graphic | |
| 320 characters to preserve their byte sequences. Emacs characters out of | |
| 321 these ranges are encoded into U+FFFD. | |
| 322 | |
| 323 Note that, currently, characters in the mule-unicode charsets have no | |
| 324 syntax and case information. Thus, for instance, upper- and | |
| 325 lower-casing commands won't work with them." | |
| 35542 | 326 |
| 327 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) | |
| 328 '((safe-charsets | |
| 329 ascii | |
| 330 eight-bit-control | |
| 331 eight-bit-graphic | |
| 332 latin-iso8859-1 | |
| 333 mule-unicode-0100-24ff | |
| 334 mule-unicode-2500-33ff | |
| 335 mule-unicode-e000-ffff) | |
|
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
336 (mime-charset . utf-8) |
|
36423
aa776838b660
(mule-utf-8): Set coding-category property to coding-category-utf-8.
Kenichi Handa <handa@m17n.org>
parents:
36371
diff
changeset
|
337 (coding-category . coding-category-utf-8) |
|
36371
f6bb3ed752b4
(mule-utf-8): Set correct value for valid-codes property.
Kenichi Handa <handa@m17n.org>
parents:
36243
diff
changeset
|
338 (valid-codes (0 . 255)))) |
| 35542 | 339 |
| 340 (define-coding-system-alias 'utf-8 'mule-utf-8) |
