(* -*- caml -*- *)
(* $Id: lexer.mll,v 1.1 2004/12/07 17:30:41 sjost Exp $ *)

{
 open Parser
 open Support


type buildfun = info -> Parser.token
let (symbolTable : (string,buildfun) Hashtbl.t) = Hashtbl.create 42

let keywords = (* All keywords are lower-case only *)
     [	("unit",	fun fi -> Parser.UNIT fi);	
	("diamond",	fun fi -> Parser.DIAMOND fi);	
	("bool",	fun fi -> Parser.BOOL fi);
	("int",		fun fi -> Parser.INTEGER fi);
	("float",	fun fi -> Parser.FLOAT fi);	(* new *)
(*	("list",        fun fi -> Parser.LIST fi);      (* NOT BUILT-IN in CAMELOT! *) *)
	("array",	fun fi -> Parser.ARRAY fi);	(* new *)
	("char",	fun fi -> Parser.CHAR fi);	(* new *)
	("string",	fun fi -> Parser.STRING fi);	(* new *)

	("type",	fun fi -> Parser.TYPE fi);	(* new *)
	("val",       	fun fi -> Parser.VAL fi);	(* new *)
	("richval",    	fun fi -> Parser.ANNVAL fi);	(* new *)
	("of",       	fun fi -> Parser.OF fi);	(* new *)

	("true",	fun fi -> Parser.TRUEVAL fi);	
	("false",	fun fi -> Parser.FALSEVAL fi);

	("not",		fun fi -> Parser.NOT fi);	(* new *)
	("fst",		fun fi -> Parser.FST fi);	(* new *)
	("snd",		fun fi -> Parser.SND fi);	(* new *)
	("and",		fun fi -> Parser.LAND fi);	(* Not in CAMELOT. Also as Symbol "&" defined *)
	("or",		fun fi -> Parser.LOR fi);	(* new *)
	("andalso",	fun fi -> Parser.ANDALSO fi);	(* new *)
	("orelse",	fun fi -> Parser.ORELSE fi);	(* new *)
        ("mod",	        fun fi -> Parser.MOD fi);	(* Newly introduced to CAMELOT *)

	("begin",	fun fi -> Parser.BEGIN fi);     (* kwxm: new Camelot syntax *)
        ("end",		fun fi -> Parser.END fi);       (* kwxm: new Camelot syntax *)
	("if",		fun fi -> Parser.IF fi);
	("then", 	fun fi -> Parser.THEN fi);
	("else",     	fun fi -> Parser.ELSE fi);
	("match",    	fun fi -> Parser.MATCH fi);
	("match\'",    	fun fi -> Parser.MATCHPR fi);
      	("endmatch",   	fun fi -> Parser.MATCHEND fi);
      	("with",   	fun fi -> Parser.WITH fi);
       	("let",		fun fi -> Parser.LET fi);
       	("rec",		fun fi -> Parser.REC fi);	(* new *)
	("in",		fun fi -> Parser.IN fi);
	("where",       fun fi -> Parser.WHERE fi);     (* new *)
       	("fun",		fun fi -> Parser.FUN fi);	(* newer *)
]

(* These are all KEYWORDS in Camelot, but I think this is unnatural. We parse them like any other identifier
   and assume they are automatically included in the signature...	
	("int_of_float",	fun fi -> Parser.INT_OF_FLOAT fi);	
	("float_of_int",	fun fi -> Parser.FLOAT_OF_INT fi);	
	("char_of_int",		fun fi -> Parser.CHAR_OF_INT fi);	
	("int_of_char",		fun fi -> Parser.INT_OF_CHAR fi);	
	("float_of_string",	fun fi -> Parser.FLOAT_OF_STRING fi);	
	("string_of_float",	fun fi -> Parser.STRING_OF_FLOAT fi);	
	("int_of_string",	fun fi -> Parser.INT_OF_STRING fi);	
	("string_of_int",	fun fi -> Parser.STRING_OF_INT fi);
	("print_int",		fun fi -> Parser.PRINT_INT fi);		
	("print_float",		fun fi -> Parser.PRINT_FLOAT fi);	
	("print_char",		fun fi -> Parser.PRINT_CHAR fi);	
	("print_string",	fun fi -> Parser.PRINT_STRING fi);	
	("print_newline",	fun fi -> Parser.PRINT_NEWLINE fi);	
	("array_head",		fun fi -> Parser.ARRAY_HEAD fi);	
*)


let _ = 
   List.iter (fun (kwd,(tokfun:buildfun)) -> Hashtbl.add symbolTable kwd tokfun) keywords

let keyword_or_identifier fi str =
  try (Hashtbl.find symbolTable (String.lowercase str)) fi  
	with Not_found -> IDENTIFIER {i=fi; v=str}

let keyword_or_constructor fi str =
  try (Hashtbl.find symbolTable (String.lowercase str)) fi  
	with Not_found -> CONSTRUCTOR {i=fi; v=str} 

let lineno = ref 1
and depth = ref 0
and start = ref 0 
and filename = ref ""
and startLex = ref unknown

let setFilename s = filename := s

let newline lexbuf = incr lineno; start := (Lexing.lexeme_start lexbuf)

let info lexbuf =
  create (!filename) (!lineno) ((Lexing.lexeme_start lexbuf - !start)-1) (* -1 Fr emacs-kompatibilitt: der zhlt von 0 an *)

let text = Lexing.lexeme
	
let extractLineno yytext offset =
  int_of_string (String.sub yytext offset (String.length yytext - offset))
} 
 
let comment_start    = "(*"
let comment_end      = "*)"
let comment_line     = "//"
let alphaupper       = ['A'-'Z']
let alphalower       = ['a'-'z']
let extensional      = ['?' '#' '.' '_' '$' '~' '' '' '' '']
let num		     = ['0'-'9' ]
let alpha	     = alphaupper | alphalower
let alphalowerext    = alphalower | extensional 
let alphaupperext    = alphaupper | extensional 
let alphaext         = alphaupper | alphalower | num | extensional | '\'' (* A leading prime marks a typevariable, therefore alphalowerext/alphaupperext shall not contain a prime *)
let alphanum         = alpha | num 
let alphanumunder    = alphanum | ['_']
let alphanumunderdot = alphanumunder | ['.']
let keysymchar       = ['+' '-' '*' '/' '(' ')' '=' '>' '<' ';' ':' ',']
let whitespace	     = [' ' '\009' '\012'] (* Note that newlines are not accounted for! *)

rule token = parse 
  eof				{ EOF (info lexbuf) }
| '\n' whitespace* ("and" | "AND" | "And") { newline lexbuf; AND (info lexbuf)} (* dies ist ein let-and *)
| whitespace+			{ token lexbuf }
| '\n'                          { newline lexbuf; token lexbuf } 
| comment_end			{ errAt (info lexbuf) "Unmatched end of comment" }
| comment_start num+ comment_end (* Intended Diamond-Sizes may be hidden given in comments like "(*3*)" *)
                                { let inf = info lexbuf in
				  let fstr = text lexbuf in
				 (* let _ = print_string ("\n###"^fstr^"@") in *)
				  let len = String.length fstr in
                                  let str = (String.sub fstr 2 (len-4)) in
				 (* let _ = print_string ("@"^str^"@") in *)
				  let num = try int_of_string str 
				  with Failure s -> errAt inf "Diamond-Comment unrecognizeable."
				  in
				 (* let _ = print_string ("@"^(string_of_int num)^"###") in *)
				  DIASIZE {i=inf; v=num} }
| comment_start (*[^ '0' - '9']*)   { depth:=1; startLex := info lexbuf; 
				  comment lexbuf; token lexbuf }
| comment_line	                { commentline lexbuf; token lexbuf }
| '\'' [^ '\'' '\n' ]	'\'' 	{ CHARVAL {i=info lexbuf; v=(String.get (text lexbuf) 1)} }
| '"'  [^ '"' '\n' ]*   '"'     { let fstr = (text lexbuf) in (* Is always escaped - unescaping is not really possible here! *)
                                  let len  = String.length fstr in
                                  let str  = (String.sub fstr 1 (len-2)) in
				  STRINGVAL {i=info lexbuf; v=str} }
| '@'                           { AT (info lexbuf) }
| '!'                           { BANG (info lexbuf) }
| '$'                           { DOLLAR (info lexbuf) }
| '#'                           { SHARP (info lexbuf) }
| ''                           { POUND (info lexbuf) }
| ''                           { PARAGRAPH (info lexbuf) }
(* | '_'				{ USCORE (info lexbuf) } *)
(* Commented out by kwxm:  concflicted with keyword_or_identifier *)
| '='				{ EQUALS (info lexbuf) }
| '<'				{ LESS (info lexbuf) }
| '>'				{ GREATER (info lexbuf) }
| "<>"                          { DIAMOND (info lexbuf) }
| "<="				{ LTEQ (info lexbuf) }
| ">="				{ GTEQ (info lexbuf)}
| "->"				{ ARROW (info lexbuf)}
| "(|"	  	                { LBANA    (info lexbuf)} 
| "|)"	  	                { RBANA    (info lexbuf)} 
| "()"	  	                { UNITVAL    (info lexbuf)} 
| '(' whitespace+ ')'		{ EMPTYPAREN (info lexbuf)} (* the possible whitespace simplifies the parser *)
| '('				{ LPAREN (info lexbuf)}
| ')'				{ RPAREN (info lexbuf)}
| '{'				{ LBRACE (info lexbuf)}
| '}'				{ RBRACE (info lexbuf)}
| '['				{ LBRAK (info lexbuf)}
| ']'				{ RBRAK (info lexbuf)}
| '.'				{ DOT (info lexbuf)}
| ','				{ COMMA (info lexbuf)}
| ':'				{ COLON (info lexbuf)}
| ';'				{ SEMICOLON (info lexbuf)}
| '&'				{ LAND (info lexbuf)} (* also as keyword "and" defined *)
| "&&"				{ LAND (info lexbuf)} (* also as keyword "and" defined *)
| "||"				{ LOR  (info lexbuf)} (* also as keyword "or" defined *)
| "|endmatch"                   { MATCHEND (info lexbuf)}  (* helps emacs to align the endmatch keyword, which is unknown in caml-mode - we still keep the keyword "endmatch" which leads to the same token *)
| "| endmatch"                  { MATCHEND (info lexbuf)}  (* helps emacs to align the endmatch keyword, which is unknown in caml-mode - we still keep the keyword "endmatch" which leads to the same token *)
| '|'				{ BAR (info lexbuf)}
| '^'				{ CARET (info lexbuf)}
| '+'				{ PLUS (info lexbuf)}
| '-'				{ MINUS (info lexbuf)}
| '*'				{ TIMES (info lexbuf)}
| '/'				{ DIVIDE (info lexbuf)}
| "+."				{ FPLUS (info lexbuf)}
| "-."				{ FMINUS (info lexbuf)}
| "*."				{ FTIMES (info lexbuf)}
| "/."				{ FDIVIDE (info lexbuf)}
| '\'' alphaext*                { TYPEVAR {i=info lexbuf; v= text lexbuf} }
| alphalowerext alphaext* 	{ keyword_or_identifier (info lexbuf) (text lexbuf) }
| alphaupperext alphaext* 	{ keyword_or_constructor (info lexbuf) (text lexbuf) }
| num+				{ INTVAL {i=info lexbuf; v=int_of_string(text lexbuf)} }
| num+ ('.' num+)? (('e'|'E') '-'? num+ )?	
                                { FLOATVAL {i=info lexbuf; 
					    v=try 
					        float_of_string(text lexbuf)
                                              with 
				                Failure("float_of_string") -> (errAt (info lexbuf) ("Unidentifiable float value:"^(text lexbuf)))
				} }
|  _                            { errAt (info lexbuf) "Illegal character" }


and comment = parse
| comment_end	{ depth:=!depth-1;
                  if !depth>0 then comment lexbuf }
| comment_start	{ depth:=!depth+1; comment lexbuf }
| '\n'		{ newline lexbuf; comment lexbuf }  (* ??? Fehlte im Original! Warum? *)
| eof           { errAt (info lexbuf) "Lexing Error: File ends with a runaway comment. Probably a '*)' missing." }
| _             { comment lexbuf }

and commentline = parse
| '\n'		{ newline lexbuf }			
| _		{ commentline lexbuf}







