diff --git a/src/Makefile.inc b/src/Makefile.inc index 6f8cf3e1..cf208f4e 100644 --- a/src/Makefile.inc +++ b/src/Makefile.inc @@ -1,7 +1,7 @@ # Unix configuration Makefile for Moscow ML -*- mode: makefile -*- # Where to install stuff -PREFIX=/usr/local +PREFIX=/home/igrant/local # BINDIR contains true executable files, such as scripts # LIBDIR contains bytecode files (such as mosmlcmp and library units), and .dll/.so for dynlibs. @@ -37,7 +37,7 @@ BASELIBS=-lm # This works with most systems, including MacOS X with XCode installed: -CC=gcc +CC=egcc # CC=gcc -mmacosx-version-min=10.7 # for building OS X package # CC=/usr/sepp/bin/gcc # Solaris at KVL @@ -68,8 +68,8 @@ UNAME_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') ifeq ($(UNAME_S),Linux) CPP=cpp -P -traditional -Dunix -Umsdos -Wno-invalid-pp-token STRIP=strip -S - LD=gcc -rdynamic -Wl,-rpath,$(LIBDIR) - DYNLD=gcc -shared + LD=$(CC) -rdynamic -Wl,-rpath,$(LIBDIR) + DYNLD=$(CC) -shared endif ifeq ($(UNAME_S),Darwin) # For MacOS X, use the same as Linux except DYNDL CPP=cpp -P -traditional -Dunix -Umsdos -Wno-invalid-pp-token @@ -92,8 +92,8 @@ ifeq ($(UNAME_S),OpenBSD) ADDRUNLIBS= CPP=cpp -P -traditional -Dunix -Umsdos -Wno-invalid-pp-token STRIP=strip -S - LD=gcc -rdynamic -Wl,-rpath,$(LIBDIR) - DYNLD=gcc -shared + LD=$(CC) -rdynamic -Wl,-rpath,$(LIBDIR) + DYNLD=$(CC) -shared endif ifeq ($(UNAME_S),Custom) # Your configuration here diff --git a/src/compiler/Config.mlp b/src/compiler/Config.mlp index 28aee35a..7b4ccde6 100644 --- a/src/compiler/Config.mlp +++ b/src/compiler/Config.mlp @@ -53,15 +53,15 @@ val pervasiveOpenedUnits = ["General"]; val fulllib = ["Option", "List", "ListPair", "Strbase", "Char", "String", "StringCvt", "TextIO", "BasicIO", "Vector", - "Array", "VectorSlice", "ArraySlice", "Misc", "Substring", - "Bool", "Int", "Real", "Math", + "Array", "VectorSlice", "ArraySlice", "Misc", + "Substring", "Bool", "Int", "Real", "Math", "Word", "Word8", "Word8Vector", "Word8Array", "Word8VectorSlice", "Word8ArraySlice", "Byte", "BinIO", "CharVector", "CharArray", "CharVectorSlice", "CharArraySlice", "Time", "Timer", "Date", "Path", "FileSys", "Process", "OS", - "Mosml", "PP", "CommandLine"] + "Mosml", "UTF8", "PP", "CommandLine"] val preloadedUnitSets = [ ("default", ["Option", "List", "Strbase", "Char", "String", diff --git a/src/compiler/Lexer.lex b/src/compiler/Lexer.lex index 972cf741..031933c2 100644 --- a/src/compiler/Lexer.lex +++ b/src/compiler/Lexer.lex @@ -3,7 +3,9 @@ open Fnlib Memory Config Mixture Const Parser; (* For Quote/Antiquote --- object language embedding. *) -val quotation = ref false +val quotation = ref false + +val utf8 = ref false datatype lexingMode = NORMALlm @@ -119,9 +121,17 @@ fun store_string_char c = incr string_index end +fun store_string_chars [] = () + | store_string_chars (c::cs) = (store_string_char c; store_string_chars cs) + +fun store_string s = store_string_chars (String.explode s) + +fun extracta slc = CharArraySlice.vector(CharArraySlice.slice slc) +fun extractv slc = CharVectorSlice.vector(CharVectorSlice.slice slc) + fun get_stored_string() = - let open CharArraySlice - val s = vector(slice(!string_buff, 0, SOME (!string_index))) + let open CharArray + val s = extracta(!string_buff, 0, SOME (!string_index)) in string_buff := initial_string_buffer; s @@ -207,6 +217,28 @@ fun scanString scan lexbuf = setLexStartPos lexbuf (!savedLexemeStart - getLexAbsPos lexbuf) ) +fun hexval c = + if #"0" <= c andalso c <= #"9" then Char.ord c - 48 + else (Char.ord c - 55) mod 32; + +fun UTF8StringOfUCSEscapeSequence lexbuf i = + let + val s = getLexeme lexbuf + val sl = String.size s + fun skipPrefix n = + let val c = String.sub (s,n) + in if not (c = #"u" orelse c = #"U" orelse c = #"+") then n else skipPrefix (n+1) + end + fun hexCharsToWord n = + let fun iter acc n = + if n < sl + then iter (acc * 0x10 + (hexval (String.sub(s,n)))) (n + 1) + else acc + in Word.fromInt (iter 0 n) + end + in store_string (UTF8.UCStoUTF8String (hexCharsToWord (skipPrefix 1))) + end; + } rule Token = parse @@ -277,7 +309,7 @@ and TokenN = parse { scanString String lexbuf; let val s = get_stored_string() in if size s <> 1 then - lexError "ill-formed character constant" lexbuf + lexError "ill-formed (possibly multi-byte encoded) character constant" lexbuf else (); CHAR (CharVector.sub(s, 0)) end } @@ -375,15 +407,19 @@ and String = parse store_string_char(Char.chr code); String lexbuf end } - | "\\u" [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`] - [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`] - { let val code = charCodeOfHexadecimal lexbuf 1 in - if code >= 256 then - skipString "character code is too large" SkipString lexbuf - else (); - store_string_char(Char.chr code); - String lexbuf - end } + | `\\` [`u``U`] + [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`] + [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`] + { UTF8StringOfUCSEscapeSequence lexbuf 1 + handle UTF8.BadUTF8 s => skipString s SkipString lexbuf; + String lexbuf } + | `\\` [`u``U`] `+` + [`0`-`9``a`-`f``A`-`F`]? [`0`-`9``a`-`f``A`-`F`]? + [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`] + [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`] + { UTF8StringOfUCSEscapeSequence lexbuf 1 + handle UTF8.BadUTF8 s => skipString s SkipString lexbuf; + String lexbuf } | `\\` { skipString "ill-formed escape sequence" SkipString lexbuf } | (eof | `\^Z`) @@ -392,9 +428,24 @@ and String = parse { skipString "newline not permitted in string" SkipString lexbuf } | [`\^A`-`\^Z` `\127` `\255`] { skipString "invalid character in string" SkipString lexbuf } - | _ - { (store_string_char(getLexemeChar lexbuf 0); - String lexbuf) } + | "" { UTF8Char lexbuf; + String lexbuf } + +and UTF8Char = parse + [`\^@`-`\127`] { store_string_char(getLexemeChar lexbuf 0) } + | ( [`\194`-`\223`] [`\128`-`\191`] + | `\224` [`\160`-`\191`] [`\128`-`\191`] + | [`\225`-`\236`] [`\128`-`\191`] [`\128`-`\191`] + | `\237` [`\128`-`\159`] [`\128`-`\191`] + | [`\238``\239`] [`\128`-`\191`] [`\128`-`\191`] + | `\240` [`\144`-`\191`] [`\128`-`\191`] [`\128`-`\191`] + | [`\241`-`\243`] [`\128`-`\191`] [`\128`-`\191`] [`\128`-`\191`] + | `\244` [`\128`-`\143`] [`\128`-`\191`] [`\128`-`\191`] + ) { store_string (getLexeme lexbuf) } + | _ { if !utf8 + then lexError "ill-formed UTF8 character code" lexbuf + else store_string (getLexeme lexbuf) + } and SkipString = parse `"` diff --git a/src/compiler/Lexer.sig b/src/compiler/Lexer.sig index 6e580092..20f3a0da 100644 --- a/src/compiler/Lexer.sig +++ b/src/compiler/Lexer.sig @@ -1,3 +1,4 @@ val quotation : bool ref; +val utf8 : bool ref; val resetLexerState : unit -> unit; val Token : Lexing.lexbuf -> Parser.token; diff --git a/src/compiler/Makefile b/src/compiler/Makefile index de9a4d41..4381aee5 100644 --- a/src/compiler/Makefile +++ b/src/compiler/Makefile @@ -47,7 +47,7 @@ T_LIBOBJS= \ StringCvt.uo Word.uo Word8.uo Word8Vector.uo CharVector.uo \ Word8Array.uo CharArray.uo Obj.uo Nonstdio.uo \ Substring.uo Path.uo Time.uo OS.uo FileSys.uo \ - Lexing.uo Parsing.uo PP.uo + Lexing.uo Parsing.uo UTF8.uo PP.uo T_OBJS= \ Predef.uo Prim_c.uo Symtable.uo Patch.uo Tr_const.uo \ diff --git a/src/compiler/Smltop.sml b/src/compiler/Smltop.sml index 5179d213..9f586537 100644 --- a/src/compiler/Smltop.sml +++ b/src/compiler/Smltop.sml @@ -267,6 +267,8 @@ val smltop_con_basis = ("loadPath",{ qualid={qual="Meta", id=["loadPath"]}, info=VARname REGULARo}), ("quotation", { qualid={qual="Meta", id=["quotation"]}, info=VARname REGULARo}), + ("utf8", + { qualid={qual="Meta", id=["utf8"]}, info=VARname REGULARo}), ("valuepoly", { qualid={qual="Meta", id=["valuepoly"]}, info=VARname REGULARo}), ("printVal", { qualid={qual="Meta", id=["printVal"]},info=VARname OVL1TXXo}), @@ -300,6 +302,7 @@ val smltop_VE = ("quietdec", trivial_scheme(type_ref type_bool)), ("loadPath", trivial_scheme(type_ref (type_list type_string))), ("quotation", trivial_scheme(type_ref type_bool)), + ("utf8", trivial_scheme(type_ref type_bool)), ("valuepoly", trivial_scheme(type_ref type_bool)), ("printVal", sc_bogus), ("printDepth", trivial_scheme(type_ref type_int)), @@ -337,6 +340,7 @@ fun resetSMLTopDynEnv() = ("quietdec", repr Exec_phr.quietdec), ("loadPath", repr Mixture.load_path), ("quotation", repr Lexer.quotation), + ("utf8", repr Lexer.utf8), ("valuepoly", repr Mixture.value_polymorphism), ("printVal", repr evalPrint), ("printDepth", repr printDepth), diff --git a/src/dynlibs/Makefile b/src/dynlibs/Makefile index 603d4124..e353d7e1 100644 --- a/src/dynlibs/Makefile +++ b/src/dynlibs/Makefile @@ -1,53 +1,54 @@ +MAKE=gmake all: - cd interface; make - cd intinf; make - cd mgd; make - cd mgdbm; make - cd mmysql; make - cd mpq; make - cd mregex; make - cd msocket; make - cd munix; make + cd interface; $(MAKE) + cd intinf; $(MAKE) + cd mgd; $(MAKE) + cd mgdbm; $(MAKE) + cd mmysql; $(MAKE) + cd mpq; $(MAKE) + cd mregex; $(MAKE) + cd msocket; $(MAKE) + cd munix; $(MAKE) install: - cd intinf; make install - cd mgd; make install - cd mgdbm; make install - cd mmysql; make install - cd mpq; make install - cd mregex; make install - cd msocket; make install - cd munix; make install + cd intinf; $(MAKE) install + cd mgd; $(MAKE) install + cd mgdbm; $(MAKE) install + cd mmysql; $(MAKE) install + cd mpq; $(MAKE) install + cd mregex; $(MAKE) install + cd msocket; $(MAKE) install + cd munix; $(MAKE) install uninstall: - cd intinf; make uninstall - cd mgd; make uninstall - cd mgdbm; make uninstall - cd mmysql; make uninstall - cd mpq; make uninstall - cd mregex; make uninstall - cd msocket; make uninstall - cd munix; make uninstall + cd intinf; $(MAKE) uninstall + cd mgd; $(MAKE) uninstall + cd mgdbm; $(MAKE) uninstall + cd mmysql; $(MAKE) uninstall + cd mpq; $(MAKE) uninstall + cd mregex; $(MAKE) uninstall + cd msocket; $(MAKE) uninstall + cd munix; $(MAKE) uninstall test: - cd interface; make test - cd intinf; make test - cd mgd; make test - cd mgdbm; make test - cd mmysql; make test - cd mpq; make test - cd mregex; make test - cd munix; make test + cd interface; $(MAKE) test + cd intinf; $(MAKE) test + cd mgd; $(MAKE) test + cd mgdbm; $(MAKE) test + cd mmysql; $(MAKE) test + cd mpq; $(MAKE) test + cd mregex; $(MAKE) test + cd munix; $(MAKE) test clean: - cd crypt; make clean - cd interface; make clean - cd intinf; make clean - cd mgd; make clean - cd mgdbm; make clean - cd mmysql; make clean - cd mpq; make clean - cd mregex; make clean - cd msocket; make clean - cd munix; make clean + cd crypt; $(MAKE) clean + cd interface; $(MAKE) clean + cd intinf; $(MAKE) clean + cd mgd; $(MAKE) clean + cd mgdbm; $(MAKE) clean + cd mmysql; $(MAKE) clean + cd mpq; $(MAKE) clean + cd mregex; $(MAKE) clean + cd msocket; $(MAKE) clean + cd munix; $(MAKE) clean diff --git a/src/dynlibs/intinf/Makefile b/src/dynlibs/intinf/Makefile index d8beaf02..4c943172 100644 --- a/src/dynlibs/intinf/Makefile +++ b/src/dynlibs/intinf/Makefile @@ -33,7 +33,7 @@ intinf.o: intinf.c $(CC) $(CFLAGS) -c -o intinf.o intinf.c libmgmp.so: intinf.o - $(DYNLD) -o libmgmp.so intinf.o -L$(GMPLIBDIR) -lgmp -lc + $(DYNLD) -o libmgmp.so intinf.o -L$(GMPLIBDIR) -lgmp test: echo $(CURDIR) diff --git a/src/dynlibs/munix/Makefile b/src/dynlibs/munix/Makefile index f4eadad3..17a14e84 100644 --- a/src/dynlibs/munix/Makefile +++ b/src/dynlibs/munix/Makefile @@ -15,7 +15,7 @@ munix.o: munix.c $(CC) $(CFLAGS) -c -o munix.o munix.c libmunix.so: munix.o - $(DYNLD) -o libmunix.so munix.o -lc + $(DYNLD) -o libmunix.so munix.o install: $(INSTALL_DATA) libmunix.so $(DESTDIR)$(LIBDIR) diff --git a/src/mosmlcmp b/src/mosmlcmp index 79d093df..b9043df9 100644 Binary files a/src/mosmlcmp and b/src/mosmlcmp differ diff --git a/src/mosmllex b/src/mosmllex index 763125e7..df62a52f 100644 Binary files a/src/mosmllex and b/src/mosmllex differ diff --git a/src/mosmllib/Makefile b/src/mosmllib/Makefile index f058deef..56d6cade 100644 --- a/src/mosmllib/Makefile +++ b/src/mosmllib/Makefile @@ -26,7 +26,7 @@ all: Array.uo Array2.uo ArraySlice.uo Arraysort.uo \ TextIO.uo Time.uo Timer.uo \ Unix.uo Vector.uo VectorSlice.uo \ Weak.uo Word.uo Word8.uo Word8Array.uo Word8ArraySlice.uo \ - Word8Vector.uo Word8VectorSlice.uo + Word8Vector.uo Word8VectorSlice.uo UTF8.uo # Make with the current compiler current: @@ -80,6 +80,8 @@ PackRealLittle.uo: PackRealLittle.ui Word8Array.ui Word8ArraySlice.ui \ TextIO.ui: StringCvt.ui Char.ui Msp.uo: Msp.ui String.ui StringCvt.ui List.ui Option.ui Vector.ui TextIO.ui \ Int.ui Mosmlcgi.ui Char.ui +UTF8.uo: UTF8.ui String.ui Word.ui CharVector.ui StringCvt.ui Word8.ui \ + Int.ui Word8Vector.ui Char.ui AppleScript.uo: AppleScript.ui Regex.uo: Regex.ui Word.ui Dynlib.ui List.ui Vector.ui Substring.ui Time.uo: Time.ui Real.ui StringCvt.ui Char.ui @@ -159,12 +161,13 @@ Array2.ui: Vector.ui ArraySlice.ui: Vector.ui Array.ui VectorSlice.ui Int.uo: Int.ui String.ui StringCvt.ui Char.ui Signal.ui: Word.ui +UTF8.ui: Word.ui String.ui Char.ui Buffer.uo: Buffer.ui String.ui Substring.ui PackRealBig.ui: Word8Array.ui Word8Vector.ui Dynlib.uo: Dynlib.ui Dynarray.uo: Dynarray.ui Array.ui Word8Vector.ui: Word8.ui -PP.uo: PP.ui String.ui List.ui Vector.ui Array.ui TextIO.ui +PP.uo: PP.ui String.ui UTF8.ui List.ui Vector.ui Array.ui TextIO.ui Word8ArraySlice.ui: Word8Array.ui Word8.ui Word8Vector.ui \ Word8VectorSlice.ui Parsing.uo: Parsing.ui Lexing.ui Vector.ui Obj.uo diff --git a/src/mosmllib/PP.sig b/src/mosmllib/PP.sig index d8f0f0ce..7e05152d 100644 --- a/src/mosmllib/PP.sig +++ b/src/mosmllib/PP.sig @@ -8,6 +8,8 @@ datatype break_style = CONSISTENT | INCONSISTENT +val utf8 : bool ref + val mk_ppstream : ppconsumer -> ppstream val dest_ppstream : ppstream -> ppconsumer val add_break : ppstream -> int * int -> unit diff --git a/src/mosmllib/PP.sml b/src/mosmllib/PP.sml index ec0aef4d..5df8fe55 100644 --- a/src/mosmllib/PP.sml +++ b/src/mosmllib/PP.sml @@ -19,6 +19,8 @@ exception QUEUE_FULL exception QUEUE_EMPTY exception REQUESTED_QUEUE_SIZE_TOO_SMALL +val utf8 = ref true + local fun ++ i n = (i + 1) mod n fun -- i n = (i - 1) mod n @@ -557,7 +559,9 @@ fun add_string (pps : ppstream) s = end else () - val slen = String.size s + val slen = if !utf8 + then UTF8.size_ s + else String.size s val S_token = S{String = s, Length = slen} in if (delim_stack_is_empty the_delim_stack) diff --git a/src/mosmllib/UTF8.sig b/src/mosmllib/UTF8.sig new file mode 100644 index 00000000..184bcbb9 --- /dev/null +++ b/src/mosmllib/UTF8.sig @@ -0,0 +1,19 @@ +signature UTF8 = sig + +exception BadUTF8 of string + +type ('a, 'b) reader = 'b -> ('a * 'b) option + +val scanUTF8 : (char, 'a) reader -> (string, 'a) reader +val scanUCS : (char, 'a) reader -> (word, 'a) reader +val UCStoUTF8String : word -> string +val UCSfromUTF8String : string -> word option +val size : string -> int +val size_ : string -> int +val padLeft : char -> int -> string -> string +val padRight : char -> int -> string -> string +val UTF8fromUTF8string : string -> string option + +val scanUTF8Transition : (char, 'a) reader -> (string, 'a) reader + +end diff --git a/src/mosmllib/UTF8.sml b/src/mosmllib/UTF8.sml new file mode 100644 index 00000000..f1fef71c --- /dev/null +++ b/src/mosmllib/UTF8.sml @@ -0,0 +1,236 @@ +structure UTF8 :> UTF8 = struct + +type ('a, 'b) reader = 'b -> ('a * 'b) option; + +exception BadUTF8 of string; + +(* A lookup table: The n'th entry gives the number of octets that + follow in a well-formed UTF8 encoding beginning with the octet n + + 0x7F, or 4 if the value is a continuation octet, or 0 if the value + is not a valid octet of a UTF representation. These conditions are + from Table 9.3 of the Third edition of the UCS standard ISO/IEC + 10646:2009 *) + +fun countf n' = + let val n = n' + 0x80 + val r = if n > 0xF4 then 0 + else if n > 0xEF then 3 + else if n > 0xDF then 2 + else if n > 0xC1 then 1 + else if n > 0xBF then 0 + else 4 + in Word8.fromInt r + end; + +(* A lookup table: The n'th entry gives the 4 high bits of each of the + lower and upper bounds of a valid UTF8 continuation octet (i.e. the + 2nd octet of a multi-octet UTF8 representation) beginning with the + octet n + 0x7F. These values are taken from Table 9.3 of the Third + edition of the UCS standard ISO/IEC 10646:2009 *) + +fun validf n' = + let val n = n' + 0x80 + val r = + if n = 0xE0 then 0xAB + else if n = 0xED then 0x89 + else if n = 0xF0 then 0x9F + else if n = 0xF4 then 0x88 + else 0x8B + in Word8.fromInt r + end; + +val count = Word8Vector.tabulate (0x80,countf); +val valid = Word8Vector.tabulate (0x80,validf); + +fun size s = + let val len = String.size s + fun errmsg n cn = ("Invalid UTF8 sequence at octet "^(Int.toString n)^ + " (code 0x"^(Int.fmt StringCvt.HEX cn)^")") + fun iter sz n = + if n < len + then let val cn = Char.ord (CharVector.sub (s,n)) + in if cn < 0x80 + then iter (sz+1) (n+1) + else case Word8.toInt (Word8Vector.sub (count, cn - 0x80)) + of 1 => iter (sz+1) (n+2) + | 2 => iter (sz+1) (n+3) + | 3 => iter (sz+1) (n+4) + | _ => raise BadUTF8 (errmsg n cn) + end + else sz + in iter 0 0 + end + +fun size_ s = size s handle BadUTF8 _ => String.size s + +local + fun pad c n s = + let val len = size s + in CharVector.tabulate (Int.max(n - len, 0), fn _ => c) + end +in + fun padLeft c n s = (pad c n s)^s + fun padRight c n s = s^(pad c n s) +end + +type state = {stateno : int, seqno : int, fcno : int, chars : string, ucsno : Word.word}; + +(* A state machine for decoding UTF8 octet sequences. *) + +datatype transition = Process of Char.char -> transition + | Value of string * Word.word + | Error of string; + +fun transition state = + let fun continue (state : state) c n = + let val ucsno' = Word.orb(Word.<<(#ucsno state,0w6), Word.andb(Word.fromInt n,0wx3F)) + val chars' = (#chars state)^(String.str c) + in if #seqno state = 1 + then Value (chars', ucsno') + else Process (transition {stateno = 3, seqno = (#seqno state) - 1, + fcno = n, chars = chars', ucsno = ucsno'}) + end + fun newstate c = + let val n = Char.ord c + in case #stateno state + of 1 => if n < 0x80 + then Value (String.str c, Word.fromInt n) + else let val sl = Word8.toInt (Word8Vector.sub (count, n - 0x80)) + in if sl < 1 orelse sl > 3 + then Error "Invalid UTF8 initial octet" + else Process (transition + {stateno = 2, seqno = sl, fcno = n, + chars = String.str c, + ucsno = Word.andb(Word.fromInt n, + Word.<<(0w1,0w6-Word.fromInt sl)-0w1)}) + end + | 2 => let val r = (Word8Vector.sub (valid, #fcno state - 0x80)) + val l = Word8.toInt (Word8.andb(r, 0wxF0)) + val h = Word8.toInt (Word8.orb(Word8.<<(Word8.andb(r, 0wx0F), 0w4),0wx0F)) + in if n < l orelse n > h + then Error "Invalid UTF8 continuation octet" + else continue state c n + end + | 3 => if n < 0x80 orelse n > 0xBF + then Error "Invalid UTF8 trailing octet" + else continue state c n + | _ => raise Fail "Internal error: UTF8.transition: invalid state" + end + in newstate + end; + +fun scanUTF8UCS rvf (getc : (char, 'a) reader) = + let val start = Process (transition {stateno = 1, seqno = 0, fcno = 0, chars = "", ucsno = 0wx000000}); + fun recur st (css : 'a) = + (case st + of Process f => + (case getc css + of NONE => NONE + | SOME(c,css) => recur (f c) css) + | Value rv => SOME (rvf rv,css) + | Error s => raise (BadUTF8 s)) + in fn css => recur start css + end; + +val scanUCS = scanUTF8UCS (fn (_,w) => w); + +val scanUTF8Transition = scanUTF8UCS (fn (s,_) => s); + +val UCSfromUTF8String = + StringCvt.scanString scanUCS; + +fun UCStoUTF8String cp = + let + fun storeString cp = + let + fun store_bits acc n cp = + let val mask = Word8.<<(0w1,n)-0w1 + val topbits = Word8.~>>(0wx80,0w6-n) + val c = topbits + Word8.andb(Word8.fromLargeWord cp,mask) + in ((String.str (Char.chr (Word8.toInt c)))^acc, (Word.>>(cp,n))) + end + fun continue n (acc,cp) = + if n < 0w6 + then store_bits acc n cp + else continue (n - 0w6) (store_bits acc 0w6 cp) + in + if cp > 0wx10FFFF then raise BadUTF8 "Invalid UCS scalar value (too large)" + else if cp >= 0wxD800 andalso cp <= 0wxDFFF then + raise BadUTF8 "Invalid UCS value (surrogate code point range)" + else if cp > 0wx00FFFF then continue 0w21 ("",cp) + else if cp > 0wx0007FF then continue 0w16 ("",cp) + else if cp > 0wx00007F then continue 0w11 ("",cp) + else (String.str(Char.chr (Word.toInt cp)),0w0) + end + val (cs,_) = storeString cp + in + cs + end; + +(* A UTF8 character scanner suitable for plugging into a String.string + or TextIO.instream scanning function such as scanString or + scanStream in the StringCvt and TextIO structures of the Standard + ML basis library. Alternatively, this function can be applied to + Substring.getc to produce a UTF8 reader for values of type + Substring.substring. *) + +fun scanUTF8 (getc : (char, 'a) reader) = + (fn (cs) => + let val c = getc cs + in + case c of + NONE => NONE + | SOME (c1,cs) => + let val n = Char.ord c1 + in if n < 0x80 then SOME(String.str c1,cs) + else let open Word8Vector + val sl = Word8.toInt (sub (count,n - 0x80)) + val r = sub (valid, n - 0x80) + in if sl < 1 orelse sl > 3 then raise BadUTF8 "Invalid UTF8 continuation octet." + else let val c = getc cs + in case c + of NONE => NONE + | SOME (c2,cs) => + let val chrs = (String.str c1)^(String.str c2) + val n = Char.ord c2 + val l = Word8.toInt (Word8.andb(r, 0wxF0)) + val h = Word8.toInt (Word8.orb(Word8.<<(Word8.andb(r, 0wx0F), + 0w4),0wx0F)) + in if n < l orelse n > h then + raise BadUTF8 "Invalid UTF8 continuation octet." + else if sl = 1 then SOME(chrs, cs) + else let val c = getc cs + in case c + of NONE => NONE + | SOME (c3,cs) => + let val chrs = chrs^(String.str c3) + val n = Char.ord c3 + in if n < 0x80 orelse n > 0xBF + then raise BadUTF8 + "Invalid UTF8 continuation octet." + else if sl = 2 then SOME(chrs, cs) + else let val c = getc cs + in case c + of NONE => NONE + | SOME (c4,cs) => + let val chrs = + chrs^(String.str c4) + val n = Char.ord c4 + in if n < 0x80 + orelse n > 0xBF + then raise BadUTF8 + "Invalid UTF8 continuation octet." + else SOME(chrs,cs) + end + end + end + end + end + end + end + end + end); + +val UTF8fromUTF8string = StringCvt.scanString scanUTF8; + +end (* struct *) diff --git a/src/mosmllib/test/Makefile b/src/mosmllib/test/Makefile index cf6ec4b1..a706ea1a 100644 --- a/src/mosmllib/test/Makefile +++ b/src/mosmllib/test/Makefile @@ -51,4 +51,4 @@ clean: rm -f testlink testcycl testbadl rm -f testrun.ui testrun.uo rm -f noinput.pipe - cd callback; make clean + cd callback; $(MAKE) clean diff --git a/src/mosmllnk b/src/mosmllnk index b2d9065a..189d1479 100644 Binary files a/src/mosmllnk and b/src/mosmllnk differ diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 483de050..58bd5457 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -115,8 +115,8 @@ jumptbl.h : instruct.h depend : prims.c opnames.h jumptbl.h mv Makefile Makefile.bak (sed -n -e '1,/^### DO NOT DELETE THIS LINE/p' Makefile.bak; \ - gcc -MM -I.. $(OPTS) *.c; \ - gcc -MM -I.. $(OPTS) -DDEBUG *.c | sed -e 's/\.o/.d.o/') > Makefile + $(CC) -MM -I.. $(OPTS) *.c; \ + $(CC) -MM -I.. $(OPTS) -DDEBUG *.c | sed -e 's/\.o/.d.o/') > Makefile rm Makefile.bak ### EVERYTHING THAT GOES BEYOND THIS COMMENT WILL BE ERASED WITHOUT WARNING diff --git a/src/test/Makefile b/src/test/Makefile index ab59ce8f..dfb35e66 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -2,6 +2,7 @@ # This works with bash MOSML=mosml +MAKE=gmake all: rm -f result @@ -24,7 +25,7 @@ cleaned: current: rm -f result - make all MOSML=../camlrunm\ ../compiler/mosmltop\ -stdlib\ ../mosmllib + $(MAKE) all MOSML=../camlrunm\ ../compiler/mosmltop\ -stdlib\ ../mosmllib clean: rm -f result