;;; emacsdream.scm ;; Copyright (C) 2009, 2010, 2011 Thien-Thi Nguyen ;; ;; This file is part of ttn-do, released under the terms of the ;; GNU General Public License as published by the Free Software ;; Foundation; either version 3, or (at your option) any later ;; version. There is NO WARRANTY. See file COPYING for details. ;;; Code: (define-module (ttn-do zzz emacsdream) #:export (make-unsigned-byte-stream utf8-reader valid-ucs? valid-ucs2?) #:use-module ((ice-9 optargs-kw) #:select (lambda* define* let-optional* let-keywords*)) #:use-module ((ice-9 streams) #:select (port->stream make-stream stream-car stream-cdr stream-null? stream-map vector->stream list->stream)) #:use-module ((ttn-do zzz personally) #:select (accumulator))) ;; Is it an "unsigned byte stream"? (define ubs? (make-object-property)) (define (recognizable-as-ubs x) (set! (ubs? x) #t) x) ;; Return a stream delivering unsigned bytes from @var{source}. ;; @var{source} can be a port; a string; a list, vector or uniform ;; vector of unsigned bytes (integers in the range [0,255]), or a ;; pre-existing unsigned byte stream object. For a list or vector, ;; the elements may also be Scheme characters, which are automatically ;; converted via @code{char->integer}. This conversion also occurs ;; automatically if @var{source} is a string or a port. For a uniform ;; vector (which must have prototype @code{#\nul}), elements are taken ;; modulo 256 i.e., @code{#y(-1 2 -42)} yields the three integers 255, ;; 2 and 214. ;; ;; Normally, @var{source} elements are delivered without validation. ;; If @code{#:check @var{symbol}} is specified, however, if an element ;; is not an integer in the range [0,255], throw @var{symbol} with the ;; element as arg. ;; (define* (make-unsigned-byte-stream source #:key (check #f)) (define (ubs<-char-stream s) (stream-map char->integer s)) (define (from-port port) (ubs<-char-stream (port->stream port read-char))) (let ((s (cond ((port? source) (from-port source)) ((string? source) (call-with-input-string source from-port)) ((pair? source) ((if (char? (car source)) ubs<-char-stream identity) (list->stream source))) ((vector? source) ((if (or (zero? (vector-length source)) (not (char? (vector-ref source 0)))) identity ubs<-char-stream) (vector->stream source))) ((and (uniform-vector? source) (eq? #\nul (array-prototype source))) (let ((len (uniform-vector-length source))) (make-stream (lambda (i) (or (= len i) (cons (modulo (uniform-vector-ref source i) 256) (1+ i)))) 0))) ((ubs? source) source) (else (error "bad source:" source))))) (recognizable-as-ubs (if check (stream-map (lambda (ub) (or (and (integer? ub) (<= 0 ub 255)) (throw check ub)) ub) s) s)))) ;; Return a procedure that internally parses the UTF8-encoded input ;; @var{from}, which may be a string, a port, a list of characters, a ;; list of unsigned bytes (integers in the range [0,255]), or the ;; ``extracted ubs'' from a previous call to @code{utf8-reader} (see ;; below). ;; ;; Optional arg INIT-POS is a pair @code{(@var{byte} . @var{char})}, ;; defaulting to (0 . 0), that specifies the stream's initial position. ;; ;; If the returned procedure @var{p} is called with no arguments, ;; it reads the next UTF8 encoded character (one or more bytes), ;; updates the stream position, updates the reused-storage pair: ;; @example ;; (BYTES-CONSUMED . UVAL) ;; @end example ;; and returns it. Both @var{byte-count} and @var{uval} are ;; integers. If there is a problem, @var{p} throws @code{invalid-utf8} ;; with the list of problematic bytes. Note that decoding does not ;; perform UCS-specific checks (see @code{valid-ucs?} et al for that). ;; On the other hand, if the stream is empty, @var{p} returns @code{#f}. ;; ;; Internally, for speed, storage for stream state is allocated once ;; and subsequently reused. To access this state, @var{p} can also be ;; called with @var{command} @var{args}@dots{}, where @var{command} is ;; one of: ;; ;; @table @code ;; @item #:raw-bytes ;; This returns the vector (length six) used to temporarily store the ;; bytes taken from the stream. Some elements may be unspecified, ;; and some elements may represent bytes from previous calls. On ;; @code{invalid-utf8} error, the proper subset is also thrown. ;; ;; @item #:posbox ;; This returns the pair @code{(@var{byte} . @var{char})}, where ;; both @var{byte} and @var{char} are counters of bytes and characters ;; parsed, respectively. ;; ;; @item #:rvbox ;; This returns the pair @code{(@var{bytes-consumed} . @var{uval})}, ;; the same one (updated and) returned by calling @var{p} with no ;; arguments. ;; ;; @item #:ubs ;; This returns the unsigned-byte stream object constructed from ;; @var{from}, marked in a way that it is recognizable if passed to ;; another call to @var{utf8-reader}. ;; @end table ;; ;; Aside from directly manipulating the state (which is not ;; recommended practice, by the way), two additional commands ;; provide common abstractions. ;; ;; @table @code ;; @item #:skip! @var{n} [@var{acc?}] ;; This advances the stream by @var{n} bytes or until the stream is ;; empty, whichever happens first. The value is of the form: ;; @example ;; ((BEFORE . DIFF) [BYTE...]) ;; @end example ;; where @var{before} is the byte-position before skipping, @var{diff} ;; is how many bytes were skipped (less than or equal to @var{n}), and ;; the rest the list of bytes skipped, if @var{acc?} is specified and ;; non-@code{#f}. ;; ;; @item #:sync! ;; This advances the stream (if necessary) until the ``current byte'' ;; looks like the beginning of a valid UTF8-encoded character, or until ;; the stream is empty, whichever happens first, and returns a form ;; similar to that of the @code{#:skip!} command. ;; @end table ;; ;;-args: (- 1 0) ;; (define (utf8-reader from . init-pos) (let ((posbox (if (null? init-pos) (cons 0 0) (cons (caar init-pos) (cdar init-pos)))) (rvbox (cons #f #f)) (raw-bytes (make-vector 6)) (ubs (make-unsigned-byte-stream from #:check 'invalid-utf8))) (define (advance!) (set-car! posbox (1+ (car posbox))) (set! ubs (stream-cdr ubs))) (define (ignored bef acc) (acons bef (- (car posbox) bef) (acc))) (define (skip! n . acc?) (let ((acc? (and (not (null? acc?)) (car acc?))) (acc (accumulator)) (bef (car posbox))) (let loop ((n n)) (cond ((or (zero? n) (stream-null? ubs)) (ignored bef acc)) (else (and acc? (acc (stream-car ubs))) (advance!) (loop (1- n))))))) (define (sync!) (let ((acc (accumulator)) (bef (car posbox))) (let loop () (cond ((or (stream-null? ubs) (not (= 2 (bit-extract (stream-car ubs) 6 8)))) (ignored bef acc)) (else (acc (stream-car ubs)) (advance!) (loop)))))) (define (next-ubyte!) (and (not (stream-null? ubs)) (let ((ub (stream-car ubs))) (advance!) ub))) (define (bad through) (apply throw 'invalid-utf8 (list-head (vector->list raw-bytes) through))) (define (got a d) (set-cdr! posbox (1+ (cdr posbox))) (set-car! rvbox a) (set-cdr! rvbox d) rvbox) (define (grok! one) (vector-set! raw-bytes 0 one) (and (or (memq one '(#xC0 #xC1)) (<= #xF5 one #xFF)) (bad 1)) (if (> 128 one) ;; ASCII: Done! (got 1 one) ;; Process more bytes, taking ‘one’ as the control byte ;; (which has some data bits in the lower part as well). (let ((need #f) ; how many valid in ‘more’ (uval 0)) (define (xlo n stop) (bit-extract n 0 stop)) (define (xlo6 n) (xlo n 6)) ;; Determine which additional bytes are to be read. ;; Valid results are in the range [2,6]. (or (logbit? 6 one) (bad 1)) (set! need (do ((i 5 (1- i))) ((if (zero? i) (bad 1) (not (logbit? i one))) (- 7 i)))) ;; Start decoded value with data bits from control byte. (set! uval (xlo one (- 7 need))) ;; Read, validate and extract data bits from additional bytes. (do ((i 1 (1+ i))) ((= i need)) (let ((b (next-ubyte!))) (vector-set! raw-bytes i b) (or (and b (= #b10 (bit-extract b 6 8))) (bad i)) (set! uval (logior (ash uval 6) (xlo6 b))))) ;; Done! (got need uval)))) (lambda command (if (null? command) (and=> (next-ubyte!) grok!) (case (car command) ((#:raw-bytes) raw-bytes) ((#:posbox) posbox) ((#:rvbox) rvbox) ((#:ubs) (recognizable-as-ubs ubs)) ((#:skip!) (apply skip! (cdr command))) ((#:sync!) (sync!)) (else (error "bad command:" command))))))) ;; Return @code{#t} iff @var{uval} is a UCS value that is neither a ;; UTF-16 surrogate (in the range [#xD800,#xDFFF]) nor a non-character ;; (one of #xFFFE, #xFFFF). ;; (define (valid-ucs? uval) (not (or ;; UTF-16 surrogates: [#xD800,#xDFFF] (<= #xD8 (ash uval -8) #xDF) ;; Non characters: #xFFFE, #xFFFF (memq uval '(#xFFFE #xFFFF))))) ;; Return @code{#t} iff @var{uval} is a UCS value that fits in two bytes. ;; This uses @code{valid-ucs?}. ;; (define (valid-ucs2? uval) (and (valid-ucs? uval) (> 65536 uval))) ;;; emacsdream.scm ends here