Skip to content
Snippets Groups Projects
Commit 6ac77d96 authored by rtoy's avatar rtoy
Browse files

Character name and properties support, from Paul Foley, slightly

modified by Raymond Toy.

Use 19f/boot-2009-03-cross-unicode-<arch> for the cross compile
script.  Use 19f/boot-2009-03-unicode-char for the bootstrap file to
initialize the unicode character structures.

bootfiles/19e/boot-2008-05-cross-unicode-common.lisp:
o Just add new slots to fd-stream here, to make it easier to bootstrap
  the utf16-extfmts code, and to select the clobber-it restart
  automatically.
o Build the initial unicode properties

bootfiles/19f/boot-2009-03-cross-unicode-ppc.lisp:
bootfiles/19f/boot-2009-03-cross-unicode-sparc.lisp:
bootfiles/19f/boot-2009-03-cross-unicode-x86.lisp:
o New scripts for cross-compiling.  Basically just calls the
  original ones in the 19e directory.

bootfiles/19f/boot-2009-03-unicode-char.lisp:
o Bootstrap file to load up the full unicode properties.

i18n/UnicodeData.txt:
o UnicodeData.txt, obtained from
  unicode.org/Public/UNIDATA/UnicodeData.txt, 2009-03-24.

code/fd-stream.lisp:
o Add new slots to fd-stream, needed by utf16-extfmts branch.

code/char.lisp:
o Define structure to hold unicode properties of each character and
  new hash table to hold the properties.
o Update CHAR-NAME and NAME-CHAR to give the character names and code,
  respectively.
o Update GRAPHIC-CHAR-P, ALPHA-CHAR-P, UPPER-CASE-P, LOWER-CASE-P,
  BOTH-CASE-P, ALPHANUMERICP, EQUAL-CHAR-CODE, CHAR-UPCASE, and
  CHAR-DOWNCASE to handle unicode characters.
o Add function REBUILD-UNICODE-DATA to update the unicode structures
  from UnicodeData.txt.
parent 951c95f7
No related branches found
Tags unicode-utf16-char-support-2009-03-25
No related merge requests found
......@@ -41,6 +41,76 @@
(t 0 32 old-vm:simple-vector-type)))
)
(handler-bind ((error #'(lambda (c)
(declare (ignore c))
(invoke-restart 'kernel::clobber-it))))
(defstruct (fd-stream
(:print-function %print-fd-stream)
(:constructor %make-fd-stream)
(:include file-stream
(misc #'fd-stream-misc-routine)))
(name nil) ; The name of this stream
(file nil) ; The file this stream is for
;;
;; The backup file namestring for the old file, for :if-exists :rename or
;; :rename-and-delete.
(original nil :type (or simple-string null))
(delete-original nil) ; for :if-exists :rename-and-delete
;;
;;; Number of bytes per element.
(element-size 1 :type index)
(element-type 'base-char) ; The type of element being transfered.
(fd -1 :type fixnum) ; The file descriptor
;;
;; Controls when the output buffer is flushed.
(buffering :full :type (member :full :line :none))
;;
;; Character position if known.
(char-pos nil :type (or index null))
;;
;; T if input is waiting on FD. :EOF if we hit EOF.
(listen nil :type (member nil t :eof))
;;
;; The input buffer.
(unread nil)
(ibuf-sap nil :type (or system-area-pointer null))
(ibuf-length nil :type (or index null))
(ibuf-head 0 :type index)
(ibuf-tail 0 :type index)
;; The output buffer.
(obuf-sap nil :type (or system-area-pointer null))
(obuf-length nil :type (or index null))
(obuf-tail 0 :type index)
;; Output flushed, but not written due to non-blocking io.
(output-later nil)
(handler nil)
;;
;; Timeout specified for this stream, or NIL if none.
(timeout nil :type (or index null))
;;
;; Pathname of the file this stream is opened to (returned by PATHNAME.)
(pathname nil :type (or pathname null))
;;
;; External formats
;; @@ I want to use :default here, but keyword pkg isn't set up yet at boot
;; so initialize to NIL and fix it in SET-ROUTINES
#+unicode
(external-format nil :type (or null keyword cons))
#+unicode
(oc-state nil)
#+unicode
(co-state nil)
#+unicode
(last-char-read-size 0 :type index)))
(defun %print-fd-stream (fd-stream stream depth)
(declare (ignore depth) (stream stream))
(format stream "#<Stream for ~A>"
(fd-stream-name fd-stream)))
;; Dump a character of a string to a fasl file in the byte correct
;; order.
(defun dump-string-char (code file)
......@@ -333,3 +403,41 @@
(dump-unsigned-32 f-vers res))
res))
(in-package "LISP")
(defvar *unicode-data* (make-hash-table :test 'equal :size 26674))
(defvar *assigned-codepoints-bitmap* (make-array 65536 :element-type 'bit))
(when (< (hash-table-count *unicode-data*) 20000)
(dolist (range '((#x0000 . #x001F) (#x007F . #x009F) (#x3400 . #x4DB5)
(#x4E00 . #x9FBB) (#xAC00 . #xD7A3) (#xE000 . #xF8FF)
(#xDB80 . #xDBFF)))
(loop for i from (car range) to (cdr range) do
(setf (aref *assigned-codepoints-bitmap* i) 1)))
(with-open-file (s "target:i18n/UnicodeData.txt")
(format t "~&;; Loading UnicodeData.txt~%")
(flet ((cat (x) (dpb (position (char x 0) "CLMNPSZ") (byte 3 4)
(position (char x 1) "cdefiklmnopstu")))
(num (x) (if (string= x "") nil x))
(chr (x) (if (string= x "") nil
(let ((n (parse-integer x :radix 16)))
(and (< n char-code-limit) (code-char n))))))
(loop for line = (read-line s nil) while line do
(let* ((split (loop for i = 0 then (1+ j)
as j = (position #\; line :start i)
collect (subseq line i j) while j))
(code (parse-integer (first split) :radix 16)))
(unless (or (>= code char-code-limit)
(char= (char (second split) 0) #\<))
(setf (aref *assigned-codepoints-bitmap* code) 1)
(let ((x (vector (code-char code)
(nth 1 split)
(cat (nth 2 split))
(num (nth 6 split))
(num (nth 7 split))
(num (nth 8 split))
(chr (nth 12 split))
(chr (nth 13 split))
(chr (nth 14 split)))))
(setf (gethash (code-char code) *unicode-data*) x)
(setf (gethash (second split) *unicode-data*) x))))))))
(format t "*unicode-data* size = ~D~%" (hash-table-count *unicode-data*))
(load "target:bootfiles/19e/boot-2008-05-cross-unicode-ppc")
(load "target:bootfiles/19e/boot-2008-05-cross-unicode-sparc")
(load "target:bootfiles/19e/boot-2008-05-cross-unicode-x86")
(in-package "LISP")
;; Rebuild the *unicode-data* with the correct stuff.
(rebuild-unicode-data)
......@@ -5,7 +5,7 @@
;;; Carnegie Mellon University, and has been placed in the public domain.
;;;
(ext:file-comment
"$Header: /Volumes/share2/src/cmucl/cvs2git/cvsroot/src/code/char.lisp,v 1.15.18.4 2008/09/03 16:34:30 rtoy Exp $")
"$Header: /Volumes/share2/src/cmucl/cvs2git/cvsroot/src/code/char.lisp,v 1.15.18.5 2009/03/25 19:32:53 rtoy Exp $")
;;;
;;; **********************************************************************
;;;
......@@ -98,6 +98,32 @@
(#x20 ("Space" "SP" "SPC"))
(#x7f ("Rubout" "Delete" "DEL")))))
#+unicode
(defstruct (unicode (:type vector))
character name category num1 num2 num3 upper lower title)
;; Note: *unicode-data* is initializes with itself. So how does it
;; get initialized to begin with? A bootstrap files needs to be run
;; and rebuild-unicode-data will fill *unicode-data* with appropriate
;; values. Likewise for *assigned-codepoints-bitmap*.
#+unicode
(defvar *unicode-data* #.*unicode-data*)
#+unicode
(defun unicode-data (thing)
(gethash thing *unicode-data*))
#+unicode
(defvar *assigned-codepoints-bitmap* #.*assigned-codepoints-bitmap*)
(defun codepoint-assigned-p (codepoint)
#-unicode (declare (ignore codepoint))
#-unicode t
#+unicode
(= 1 (aref (the (simple-bit-vector #.char-code-limit)
*assigned-codepoints-bitmap*)
codepoint)))
;;;; Accessor functions:
......@@ -150,11 +176,12 @@
name
#-unicode nil
#+unicode
;; FIXME:
;; return the Unicode name of the character,
;; Return the Unicode name of the character,
;; or U+xxxx if it doesn't have a name
(format nil "U+~4,'0X" (char-code char)))))
(let ((data (unicode-data char)))
(if data
(nstring-capitalize (substitute #\_ #\Space (unicode-name data)))
(format nil "U+~4,'0X" (char-code char)))))))
(defun name-char (name)
"Given an argument acceptable to string, name-char returns a character
......@@ -164,9 +191,12 @@
(or (cdr (assoc (string name) char-name-alist :test #'string-equal))
#-unicode nil
#+unicode
;; FIXME:
;; See if it's a valid Unicode character name
nil)))
(let ((data (unicode-data
(nsubstitute #\Space #\_ (string-upcase name)))))
(if data
(unicode-character data)
nil)))))
......@@ -194,35 +224,52 @@
returns ()."
(declare (character char))
(and (typep char 'base-char)
#-(and unicode (not unicode-bootstrap))
(< 31
(char-code (the base-char char))
127)))
127)
#+(and unicode (not unicode-bootstrap))
(let ((data (unicode-data char)))
(and data (> (unicode-category data) 16)))))
(defun alpha-char-p (char)
"The argument must be a character object. Alpha-char-p returns T if the
argument is an alphabetic character, A-Z or a-z; otherwise ()."
(declare (character char))
#-(and unicode (not unicode-bootstrap))
(let ((m (char-code char)))
(or (< 64 m 91) (< 96 m 123))))
(or (< 64 m 91) (< 96 m 123)))
#+(and unicode (not unicode-bootstrap))
(let ((data (unicode-data char)))
(and data (= (ldb (byte 3 4) (unicode-category data)) 1))))
(defun upper-case-p (char)
"The argument must be a character object; upper-case-p returns T if the
argument is an upper-case character, () otherwise."
(declare (character char))
#-(and unicode (not unicode-bootstrap))
(< 64
(char-code char)
91))
91)
#+(and unicode (not unicode-bootstrap))
(let ((data (unicode-data char)))
(and data (= (unicode-category data) #x1d))))
(defun lower-case-p (char)
"The argument must be a character object; lower-case-p returns T if the
argument is a lower-case character, () otherwise."
(declare (character char))
#-(and unicode (not unicode-bootstrap))
(< 96
(char-code char)
123))
123)
#+(and unicode (not unicode-bootstrap))
(let ((data (unicode-data char)))
(and data (= (unicode-category data) #x16))))
(defun both-case-p (char)
......@@ -230,8 +277,14 @@
argument is an alphabetic character and if the character exists in
both upper and lower case. For ASCII, this is the same as Alpha-char-p."
(declare (character char))
#-(and unicode (not unicode-bootstrap))
(let ((m (char-code char)))
(or (< 64 m 91) (< 96 m 123))))
(or (< 64 m 91) (< 96 m 123)))
#+(and unicode (not unicode-bootstrap))
(let ((data (unicode-data char)))
(and data (or (= (unicode-category data) #x16)
(= (unicode-category data) #x1d)))))
(defun digit-char-p (char &optional (radix 10.))
......@@ -258,8 +311,14 @@
"Given a character-object argument, alphanumericp returns T if the
argument is either numeric or alphabetic."
(declare (character char))
#-(and unicode (not unicode-bootstrap))
(let ((m (char-code char)))
(or (< 47 m 58) (< 64 m 91) (< 96 m 123))))
(or (< 47 m 58) (< 64 m 91) (< 96 m 123)))
#+(and unicode (not unicode-bootstrap))
(or (< 47 (char-code char) 58)
(let ((data (unicode-data char)))
(and data (or (= (unicode-category data) #x16)
(= (unicode-category data) #x1D))))))
(defun char= (character &rest more-characters)
......@@ -324,10 +383,16 @@
;;; Equal-Char-Code is used by the following functions as a version of char-int
;;; which loses case info.
#-(and unicode (not unicode-bootstrap))
(defmacro equal-char-code (character)
`(let ((ch (char-code ,character)))
(if (< 96 ch 123) (- ch 32) ch)))
#+(and unicode (not unicode-bootstrap))
(defmacro equal-char-code (character)
`(let* ((char ,character)
(data (unicode-data char)))
(char-code (or (and data (unicode-upper data)) char))))
(defun char-equal (character &rest more-characters)
......@@ -405,16 +470,29 @@
(defun char-upcase (char)
"Returns CHAR converted to upper-case if that is possible."
(declare (character char))
#-(and unicode (not unicode-bootstrap))
(if (lower-case-p char)
(code-char (- (char-code char) 32))
char))
char)
#+(and unicode (not unicode-bootstrap))
(let ((data (unicode-data char)))
(if data
(or (unicode-upper data) char)
char)))
(defun char-downcase (char)
"Returns CHAR converted to lower-case if that is possible."
(declare (character char))
#-(and unicode (not unicode-bootstrap))
(if (upper-case-p char)
(code-char (+ (char-code char) 32))
char))
char)
#+(and unicode (not unicode-bootstrap))
(let ((data (unicode-data char)))
(if data
(or (unicode-lower data) char)
char)))
(defun digit-char (weight &optional (radix 10))
"All arguments must be integers. Returns a character object that
......@@ -424,3 +502,40 @@
(and (typep weight 'fixnum)
(>= weight 0) (< weight radix) (< weight 36)
(code-char (if (< weight 10) (+ 48 weight) (+ 55 weight)))))
;; Rebuild *unicode-data* and *assigned-codepoints-bitmap*
#+unicode
(defun rebuild-unicode-data ()
(when (< (hash-table-count *unicode-data*) 20000)
(dolist (range '((#x0000 . #x001F) (#x007F . #x009F) (#x3400 . #x4DB5)
(#x4E00 . #x9FBB) (#xAC00 . #xD7A3) (#xE000 . #xF8FF)
(#xDB80 . #xDBFF)))
(loop for i from (car range) to (cdr range) do
(setf (aref *assigned-codepoints-bitmap* i) 1)))
(with-open-file (s "target:i18n/UnicodeData.txt")
(flet ((cat (x) (dpb (position (char x 0) "CLMNPSZ") (byte 3 4)
(position (char x 1) "cdefiklmnopstu")))
(num (x) (if (string= x "") nil x))
(chr (x) (if (string= x "") nil
(let ((n (parse-integer x :radix 16)))
(and (< n char-code-limit) (code-char n))))))
(loop for line = (read-line s nil) while line do
(let* ((split (loop for i = 0 then (1+ j)
as j = (position #\; line :start i)
collect (subseq line i j) while j))
(code (parse-integer (first split) :radix 16)))
(unless (or (>= code char-code-limit)
(char= (char (second split) 0) #\<))
(setf (aref *assigned-codepoints-bitmap* code) 1)
(let ((x (vector (code-char code)
(nth 1 split)
(cat (nth 2 split))
(num (nth 6 split))
(num (nth 7 split))
(num (nth 8 split))
(chr (nth 12 split))
(chr (nth 13 split))
(chr (nth 14 split)))))
(setf (gethash (code-char code) *unicode-data*) x)
(setf (gethash (second split) *unicode-data*) x)))))))))
......@@ -5,7 +5,7 @@
;;; Carnegie Mellon University, and has been placed in the public domain.
;;;
(ext:file-comment
"$Header: /Volumes/share2/src/cmucl/cvs2git/cvsroot/src/code/fd-stream.lisp,v 1.85.4.1 2008/05/14 16:12:04 rtoy Exp $")
"$Header: /Volumes/share2/src/cmucl/cvs2git/cvsroot/src/code/fd-stream.lisp,v 1.85.4.2 2009/03/25 19:32:53 rtoy Exp $")
;;;
;;; **********************************************************************
;;;
......@@ -193,7 +193,20 @@
(timeout nil :type (or index null))
;;
;; Pathname of the file this stream is opened to (returned by PATHNAME.)
(pathname nil :type (or pathname null)))
(pathname nil :type (or pathname null))
;;
;; External format support
;;
;; @@ I want to use :default here, but keyword pkg isn't set up yet at boot
;; so initialize to NIL and fix it in SET-ROUTINES
#+unicode
(external-format nil :type (or null keyword cons))
#+unicode
(oc-state nil)
#+unicode
(co-state nil)
#+unicode
(last-char-read-size 0 :type index))
(defun %print-fd-stream (fd-stream stream depth)
(declare (ignore depth) (stream stream))
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment