(* $Header: /net/thrasher/thrasher3/users/black/hol/Disser/RCS/c2holLex.sml,v 1.6 1996/12/06 21:33:36 black Exp $ *)
(* *created  "Mon Nov 18 14:51:31 1996" *by "Paul E. Black" *)
(* *modified "Fri Nov 22 19:40:11 1996" *by "Paul E. Black" *)

(*---------------------------------------------------------------------------
    Lexical analyser functions for C to HOL90 AST

    Exports:
      datatype tokenType   - the tokens produced by the scanner
      signature KEYWORD    - shape of the lexer configuration
      structure Keyword    - the configuration used for the C subset
      scan                 - string -> tokenType list
 ---------------------------------------------------------------------------*)

(**** code adapted from Chapter 9 of
      ML for the Working Programmer, 2nd edition
      by Lawrence C. Paulson, Computer Laboratory, University of Cambridge.
      (Cambridge University Press, 1996)
 ****)

(* The type of tokens.
     String    - body of a string constant, without the enclosing quotes;
                 escape sequences are kept verbatim (backslash included)
     CharConst - body of a character constant, without the enclosing quotes
     Id        - identifier
     Integer   - run of decimal digits, kept as text
     Key       - keyword or operator
     ErrUnk    - lexical error or unrecognised character; carries a message
                 or the offending character *)
datatype tokenType =
    String of string
  | CharConst of string
  | Id of string
  | Integer of string
  | Key of string
  | ErrUnk of string;

(* Input to LexicalFUN *)
signature KEYWORD =
sig
  val alphastart: char list   (* chars which begin identifiers or keywords *)
  and alphabet:   char list   (* chars making up identifiers or keywords *)
  and whitespace: char list   (* blanks, tabs, etc. *)
  and specials:   char list   (* chars which begin operators *)
  and keywords:   string list
  and operators:  string list
end;

(* Lexer configuration for the supported subset of C.

   The operator scanner grows an operator one character at a time and stops
   at the first extension that is not itself in `operators`, so every proper
   prefix of a multi-character operator must also be listed (true of the
   list below).

   NOTE(review): the keyword list contains "cont", whereas C spells the
   keyword "continue"; as written, "continue" is scanned as an identifier.
   Left unchanged because the consumer of these tokens is not visible here;
   confirm which spelling it expects.

   NOTE(review): ":" is not among the specials or operators, so it is
   reported as ErrUnk ":" even though "case" and "default" are keywords;
   confirm this is intended. *)
structure Keyword =
struct
  val alphastart =
        explode "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  and alphabet =
        explode "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
  and keywords =
        [ "break", "case", "char", "cont", "default", "do", "else", "enum",
          "for", "if", "int", "return", "struct", "switch", "typedef",
          "void", "while"]
  and whitespace = explode " \t\n"
  and specials   = explode "(){}[],.;&|+-*/=~^!<>#"
  and operators =
        [ ",", ".", ";", "&", "->", "&&", "||", "|", "~", "^", "=", "==",
          "!=", "<=", ">", ">=", "<", "!", "--", "++", "{", "}", "(", ")",
          "[", "]", "+", "-", "*", "/", "<<", ">>", "#"]
end;

(* the lexical scan function *)
local

  (* list membership by equality; private, so the scanner depends only on
     the top-level environment *)
  fun member _ [] = false
    | member x (y :: ys) = (x = y) orelse member x ys;

  (* get the next character from the text to parse *)
  fun getChar [] = NONE
    | getChar (next :: rest) = SOME (next, rest);

  (* Scan the longest operator which is in the list of operators.
       strsofar - characters consumed so far
       toksofar - longest operator recognised so far, if any
       text     - remaining input
     Returns (longest operator option, input remaining after it). *)
  fun symbolic (strsofar, toksofar, text) =
    case getChar text of
        NONE => (toksofar, text)
      | SOME (c, rest) =>
          let
            val newstr = strsofar ^ (str c)
          in
            if member newstr Keyword.operators then
              symbolic (newstr, SOME newstr, rest)
            else
              (* adding the next character is (no longer) an op *)
              (toksofar, text)
          end;

  (* test cases:
       symbolic ("", NONE, explode "!=a");
         (SOME "!=",[#"a"])
       symbolic ("", NONE, explode "+++a");
         (SOME "++",[#"+",#"a"])
   *)

  local
    (* distinguish keywords from identifiers *)
    fun alphaKey a =
      if member a Keyword.keywords then Key a else Id a
  in
    (* Scan the longest text which is an identifier.
       strsofar holds the characters already accepted; the result is the
       keyword/identifier token and the input remaining after it. *)
    fun alphanumeric (strsofar, text) =
      case getChar text of
          NONE => (alphaKey strsofar, text)
        | SOME (c, rest) =>
            if member c Keyword.alphabet then
              alphanumeric (strsofar ^ (str c), rest)
            else
              (alphaKey strsofar, text)
  end;

  (* test cases:
       alphanumeric ("", explode "3downlo6ad_now !=a");
         (Id "3downlo6ad_now",[#" ",#"!",#"=",#"a"])
       alphanumeric ("", explode "~do!=a");
         (Id "",[#"~",#"d",#"o",#"!",#"=",#"a"])
       alphanumeric ("", explode "for(i=3");
         (Key "for",[#"(",#"i",#"=",#"3"])
       alphanumeric ("", explode "forteen(i=3");
         (Id "forteen",[#"(",#"i",#"=",#"3"])
   *)

  (* Scan the remainder of a string constant; the opening quote has already
     been consumed by the caller.  An escaped character is copied together
     with its backslash and never terminates the string. *)
  (* SKIMP: does not handle embedded newlines, etc. *)
  fun stringConst (strsofar, text) =
    case getChar text of
        NONE =>
          (* premature end of text *)
          (ErrUnk ("premature end of string: " ^ strsofar), text)
      | SOME (c, rest) =>
          if c = #"\\" then
            (* "escaped" character *)
            (case getChar rest of
                 NONE =>
                   (* premature end of text.  Hand back the input AFTER the
                      dangling backslash (as charConst does); handing back
                      `text` would make the caller scan the backslash again
                      and report a second, spurious error. *)
                   (ErrUnk ("premature end of escape in string: " ^ strsofar),
                    rest)
               | SOME (cc, rr) =>
                   stringConst (strsofar ^ "\\" ^ (str cc), rr))
          else if c = #"\"" then
            (* end of string *)
            (String strsofar, rest)
          else
            (* some other character *)
            stringConst (strsofar ^ (str c), rest);

  (* test cases:
       stringConst ("", explode "3do\\nw\\\"nlo\"a");
         (String "3do\\nw\\\"nlo",[#"a"])
       stringConst ("", explode "~do!=a");
         (ErrUnk "premature end of string: ~do!=a",[])
       stringConst ("", explode "for\\");
         (ErrUnk "premature end of escape in string: for",[])
       stringConst ("", explode "\"fox");
         (String "",[#"f",#"o",#"x"])
   *)

  (* Scan the remainder of a character constant; the opening quote has
     already been consumed by the caller.  The number of characters between
     the quotes is not checked. *)
  fun charConst (strsofar, text) =
    case getChar text of
        NONE =>
          (* premature end of text *)
          (ErrUnk ("premature end of char constant: " ^ strsofar), text)
      | SOME (c, rest) =>
          if c = #"\\" then
            (* "escaped" character *)
            (case getChar rest of
                 NONE =>
                   (* premature end of text *)
                   (ErrUnk ("premature end of escape in char: " ^ strsofar),
                    rest)
               | SOME (cc, rr) =>
                   charConst (strsofar ^ "\\" ^ (str cc), rr))
          else if c = #"'" then
            (* end of character constant *)
            (CharConst strsofar, rest)
          else
            (* some other character *)
            charConst (strsofar ^ (str c), rest);

  (* test cases:
       charConst ("", explode " 'a");
         (CharConst " ",[#"a"])
       charConst ("", explode "~do!=a");
         (ErrUnk "premature end of char constant: ~do!=a",[])
       charConst ("", explode "for\\");
         (ErrUnk "premature end of escape in char: for",[])
       charConst ("", explode "\\''b");
         (CharConst "\\'",[#"b"])
   *)

  (* is the character one of the decimal digits 0 through 9? *)
  local
    val ord0 = ord #"0"
    val ord9 = ord #"9"
  in
    fun isDigit c =
      let
        val ordc = ord c
      in
        ord0 <= ordc andalso ordc <= ord9
      end
  end;

  (* scan a number.  right now, just an integer *)
  fun numberConst (strsofar, text) =
    case getChar text of
        NONE =>
          (* end of text.  Don't bother checking if strsofar is empty,
             since something else is sure to get a syntax error *)
          (Integer strsofar, text)
      | SOME (c, rest) =>
          if isDigit c then
            numberConst (strsofar ^ (str c), rest)
          else if strsofar = "" then
            (ErrUnk "number expected", text)
          else
            (Integer strsofar, text);

  (* test cases:
       numberConst ("", explode " a");
         (ErrUnk "number expected",[#" ",#"a"])
       numberConst ("", explode "350");
         (Integer "350", [])
       numberConst ("", explode "2-37");
         (Integer "2", [#"-",#"3",#"7"])
   *)

  (* Scan a list of characters into a list of tokens.
       toks - tokens found so far, most recent first
       ss   - remaining input
     The first character of each token selects the sub-scanner.  Characters
     that fit no category are passed along as ErrUnk tokens rather than
     stopping the scan. *)
  fun scanning (toks, ss) =
    case getChar ss of
        NONE => rev toks    (* end of input *)
      | SOME (c, rest) =>
          if member c Keyword.alphastart then
            (* identifier or keyword *)
            let
              val (tok, ss2) = alphanumeric (str c, rest)
            in
              scanning (tok :: toks, ss2)
            end
          else if member c Keyword.specials then
            (* operator *)
            let
              val (opttok, ss2) = symbolic ("", NONE, ss)
            in
              case opttok of
                  NONE     => scanning (ErrUnk ("bad op: " ^ str c) :: toks, rest)
                | SOME tok => scanning (Key tok :: toks, ss2)
            end
          else if member c Keyword.whitespace then
            scanning (toks, rest)
          else if isDigit c then
            (* number *)
            let
              val (tok, ss2) = numberConst ("", ss)
            in
              scanning (tok :: toks, ss2)
            end
          else if c = #"\"" then
            (* string constant *)
            let
              val (tok, ss2) = stringConst ("", rest)
            in
              scanning (tok :: toks, ss2)
            end
          else if c = #"'" then
            (* character constant *)
            let
              val (tok, ss2) = charConst ("", rest)
            in
              scanning (tok :: toks, ss2)
            end
          else
            (* unknown, pass it along *)
            scanning (ErrUnk (str c) :: toks, rest);

in

  (* convert a string into a list of tokens, in source order *)
  fun scan a = scanning ([], explode a);

end;

(* end of $Source: /net/thrasher/thrasher3/users/black/hol/Disser/RCS/c2holLex.sml,v $ *)