;;; utf8proc->s7 tests

(load "libutf8proc.scm")

(when (defined? '*libutf8proc*)
  
  (with-let *libutf8proc*

    ;; --------------------------------
    ;; these are from the libutf8proc test directory
    
    (define (print-property c)
      (format *stderr* "  category = ~S~% charwidth = ~D~%~A~%"
	      (utf8proc_category_string c)
	      (utf8proc_charwidth c)
	      (utf8proc_get_property c)))
    
    (do ((c 1 (+ c 1)))
	((= c #x110000))
      (let ((l (utf8proc_tolower c))
	    (u (utf8proc_toupper c)))
	(unless (or (= l c)
		    (utf8proc_codepoint_valid l))
	  (format *stderr* "~X: invalid tolower~%" c))
	(unless (or (= u c)
		    (utf8proc_codepoint_valid u))
	  (format *stderr* "~X: invalid toupper~%" c))
	))
    
    (do ((c 0 (+ c 1)))
	((or (= c #xd800)
	     (and (not (utf8proc_codepoint_valid c))
		  (not (format *stderr* "~X: codepoint invalid~%" c))))))
    
    (do ((c #xd800 (+ c 1)))
	((or (= c #xe000)
	     (and (utf8proc_codepoint_valid c)
		  (not (format *stderr* "~X: codepoint valid?~%" c))))))
    
    (do ((c #xe000 (+ c 1)))
	((or (= c #x110000)
	     (and (not (utf8proc_codepoint_valid c))
		  (not (format *stderr* "~X: codepoint invalid~%" c))))))
    
    (do ((c #x110000 (+ c 1)))
	((or (= c #x110010)
	     (and (utf8proc_codepoint_valid c)
		  (not (format *stderr* "~X: codepoint valid?~%" c))))))
    
    ;; (print-property #xbb)
    
    (do ((c 1 (+ c 1)))
	((= c #x110000))
      (let ((cat ((utf8proc_get_property c) 'category))
	    (w (utf8proc_charwidth c)))
	(if (and (or (= cat UTF8PROC_CATEGORY_MN) (= cat UTF8PROC_CATEGORY_ME))
		 (positive? w))
	    (format *stderr* "nonzero width ~D for combining char ~X~%" w c))
	(if (and (zero? w)
		 (or (and (>= cat UTF8PROC_CATEGORY_LU) (<= cat UTF8PROC_CATEGORY_LO))
		     (and (>= cat UTF8PROC_CATEGORY_ND) (<= cat UTF8PROC_CATEGORY_SC))
		     (and (>= cat UTF8PROC_CATEGORY_SO) (<= cat UTF8PROC_CATEGORY_ZS))))
	    (format *stderr* "zero width for symbol-like char ~X~%" c))))
    ;; --------------------------------

    (define s '("élan ‘quote’")) ; example from Norman Gray
    (display s)       ; ("élan â\x80;\x98;quoteâ\x80;\x99;") -- this is due to write's slashify_table choices: now displays ("élan ‘quote’")
    (newline)
    (display (car s)) ; élan ‘quote’
    (newline)
    
    (define b (string->byte-vector (car s)))
    (format #t "~{~X ~}" b) ;c3 a9 6c 61 6e 20 e2 80 98 71 75 6f 74 65 e2 80 99 
    (newline)
    
    
    (define p (utf8proc_map (car s) UTF8PROC_NULLTERM)) ; is this doing anything useful (besides error checking)?
    (display (car p))       ; élan ‘quote’
    (newline)
    
    (define p1 (utf8proc_map "(\"élan ‘quote’\")" UTF8PROC_NULLTERM))
    (display (car p1))      ; ("élan ‘quote’")
    (newline)
    
    (define b1 (string->byte-vector (car p1)))
    (format #t "~{~X ~}" b1) ;28 22 c3 a9 6c 61 6e 20 e2 80 98 71 75 6f 74 65 e2 80 99 22 29
    (newline)
    
    (define s1 (with-output-to-string (lambda () (display s))))
    (display s1) (newline)  ; ("élan ‘quote’")
    (define p2 (utf8proc_map s1 UTF8PROC_NULLTERM))
    (if (integer? (cdr p2))
	(display (utf8proc_errmsg (cdr p2))) ; "Invalid UTF-8 string" or "unknown error" -- what is the problem here?
	(display (car p2)))
    (newline)
    
    (let ((len (cdr p1))
	  (p1c (copy (car p1))))
      (do ((n (utf8proc_iterate p1c len) (utf8proc_iterate p1c len)))
	  ((<= (car n) 0)) ; (cdr n) is the codepoint as an integer
	(display (substring p1c 0 (car n))) (display #\space) ; ( " é l a n   ‘ q u o t e ’ " )
	(set! p1c (substring p1c (car n)))
	(set! len (- len (car n))))
      (newline))
    
    (let ((e1 (utf8proc_encode_char #x00E9))) ; unicode code-point to utf-8 -> (cons utf-8-string length-thereof)
      (format #t "#x~{~X~}" (string->byte-vector (car e1))) ; #xc3a9
      (newline)
      (display (car e1))      ; é
      (newline))

    (let ((e1 (utf8proc_encode_char #x018b)))
      (format #t "#x~{~X~}" (string->byte-vector (car e1)))
      (newline)
      (display (car e1))      ; latin cap D with top bar
      (newline))

    (let ((e1 (utf8proc_encode_char #x0238)))
      (format #t "#x~{~X~}" (string->byte-vector (car e1)))
      (newline)
      (display (car e1))      ; latin small db digraph
      (newline))

    (let ((e1 (utf8proc_encode_char #x1e00)))
      (format #t "#x~{~X~}" (string->byte-vector (car e1)))
      (newline)
      (display (car e1))      ; latin cap A ring below
      (newline))

    (display (string->symbol "élan ‘quote’"))
    (newline)
    (display (symbol->string (symbol "élan ‘quote’")))
    (newline)
    
  ))