{-
    Kaya - My favourite toy language.
    Copyright (C) 2004, 2005 Edwin Brady

    This file is distributed under the terms of the GNU General
    Public Licence. See COPYING for licence.
-}

module Lexer where

import Char
import Language
import Debug.Trace

data ParseOpts = ParseOpts {
		    libdirs :: [FilePath]
			   }

type LineNumber = Int
type P a = String -> String -> [FilePath] -> Name -> LineNumber -> Result a

getLibdirs :: P [FilePath]
getLibdirs = \s fn ld mod l -> Success ld

getLineNo :: P LineNumber
getLineNo = \s fn ld mod l -> Success l

getFileName :: P String
getFileName = \s fn ld mod l -> Success fn

getContent :: P String
getContent = \s fn ld mod l -> Success s

getModuleName :: P Name
getModuleName = \s fn ld mod l -> Success mod

thenP :: P a -> (a -> P b) -> P b
m `thenP` k = \s fn ld mod l ->
   case m s fn ld mod l of 
       Success a -> k a s fn ld mod l
       Failure e f ln -> Failure e f ln

returnP :: a -> P a
returnP a = \s fn ld mod l -> Success a

failP :: String -> P a
failP err = \s fn ld mod l -> Failure err fn l

catchP :: P a -> (String -> P a) -> P a
catchP m k = \s fn ld mod l ->
   case m s fn ld mod l of
      Success a -> Success a
      Failure e f ln -> k e s fn ld mod l

data Token = TokenInt Int
	   | TokenReal Double
           | TokenName Name
           | TokenString String
	   | TokenBool Bool
	   | TokenChar Char
	   | TokenMetaVar Int
           | TokenEq
           | TokenPlus
           | TokenMinus
           | TokenTimes
           | TokenDiv
           | TokenPower
           | TokenMod
	   | TokenAnd
	   | TokenOr
	   | TokenAndBool
	   | TokenOrBool
	   | TokenXOR
	   | TokenShLeft
	   | TokenShRight
	   | TokenInc
	   | TokenDec
	   | TokenIncBy
	   | TokenDecBy
	   | TokenMultBy
	   | TokenDivBy
           | TokenIntEq
           | TokenIntNE
           | TokenOB
           | TokenCB
           | TokenOCB
           | TokenCCB
           | TokenOSB
           | TokenCSB
           | TokenLT
           | TokenGT
           | TokenLE
           | TokenGE
	   | TokenNot
           | TokenAssign
	   | TokenColon
	   | TokenAt
	   | TokenAtBracket
	   | TokenBacktick
	   | TokenSemiColon
	   | TokenComma
	   | TokenHash
	   | TokenDot
	   | TokenDots
	   | TokenColons
--	   | TokenSize
	   | TokenData
	   | TokenAbstract
	   | TokenType
	   | TokenVar
	   | TokenIntType
	   | TokenCharType
	   | TokenBoolType
	   | TokenRealType
	   | TokenStringType
	   | TokenFileType
	   | TokenPointerType
	   | TokenVoidType
	   | TokenReturn
	   | TokenWhile
	   | TokenDo
	   | TokenRepeat
	   | TokenFor
	   | TokenBreak
	   | TokenPass
	   | TokenTry
	   | TokenCatch
	   | TokenFinally
	   | TokenThrow
	   | TokenException
	   | TokenTo
	   | TokenIn
	   | TokenCase
	   | TokenOf
	   | TokenIf
	   | TokenElse
	   | TokenEnd
	   | TokenPrint
--	   | TokenInputStr
--	   | TokenInputNum
	   | TokenArrow
	   | TokenInclude
	   | TokenImport
	   | TokenModule
	   | TokenProgram
	   | TokenWebapp
	   | TokenShebang
	   | TokenExtern
	   | TokenExtData
	   | TokenImported
	   | TokenDocstring
	   | TokenLink
	   | TokenVM
	   | TokenGlobal
	   | TokenPublic
	   | TokenPrivate
	   | TokenDefault
	   | TokenLambda
	   | TokenDatacon
	   | TokenDatatype
	   | TokenCInclude
	   | TokenForeign
	   | TokenApp
	   | TokenFNid
	   | TokenEOF
   deriving (Show, Eq)

lexer :: (Token -> P a) -> P a
lexer cont [] = cont TokenEOF []
lexer cont ('\n':cs) = \fn ld mod line -> lexer cont cs fn ld mod (line+1)
lexer cont (c:cs) 
      | isSpace c = \fn ld mod line -> lexer cont cs fn ld mod line
      | isAlpha c = lexVar cont (c:cs)
      | isDigit c = lexNum cont (c:cs)
      | c == '_' = lexVar cont (c:cs)
lexer cont ('$':c:cs) | isDigit c = lexMetaVar cont (c:cs)
lexer cont ('/':'*':cs) = lexerEatComment 0 cont cs
lexer cont ('/':'/':cs) = lexerEatToNewline cont cs
lexer cont ('%':cs) = lexSpecial cont cs
lexer cont ('"':cs) = lexString cont cs
lexer cont ('\'':cs) = lexChar cont cs
lexer cont ('+':'+':cs) = cont TokenInc cs
lexer cont ('-':'-':cs) = cont TokenDec cs
lexer cont ('+':'=':cs) = cont TokenIncBy cs
lexer cont ('-':'=':cs) = cont TokenDecBy cs
lexer cont ('*':'=':cs) = cont TokenMultBy cs
lexer cont ('/':'=':cs) = cont TokenDivBy cs
lexer cont ('+':cs) = cont TokenPlus cs
lexer cont ('-':'>':cs) = cont TokenArrow cs
lexer cont ('-':cs) = cont TokenMinus cs
--lexer cont (':':'=':cs) = cont TokenAssign cs
lexer cont ('=':'=':cs) = cont TokenIntEq cs
lexer cont ('!':'=':cs) = cont TokenIntNE cs
lexer cont ('!':cs) = cont TokenNot cs
lexer cont ('=':cs) = cont TokenEq cs
lexer cont ('*':'*':cs) = cont TokenPower cs
lexer cont ('*':cs) = cont TokenTimes cs
lexer cont ('@':'(':cs) = cont TokenAtBracket cs
lexer cont ('@':cs) = cont TokenAt cs
lexer cont ('/':cs) = cont TokenDiv cs
lexer cont ('&':'&':cs) = cont TokenAndBool cs
lexer cont ('|':'|':cs) = cont TokenOrBool cs
lexer cont ('&':cs) = cont TokenAnd cs
lexer cont ('|':cs) = cont TokenOr cs
lexer cont ('^':cs) = cont TokenXOR cs
lexer cont ('(':cs) = cont TokenOB cs
lexer cont (')':cs) = cont TokenCB cs
lexer cont ('{':cs) = cont TokenOCB cs
lexer cont ('}':cs) = cont TokenCCB cs
lexer cont ('[':cs) = cont TokenOSB cs
lexer cont (']':cs) = cont TokenCSB cs
lexer cont ('<':'<':cs) = cont TokenShLeft cs
lexer cont ('>':'>':cs) = cont TokenShRight cs
lexer cont ('<':'=':cs) = cont TokenLE cs
lexer cont ('>':'=':cs) = cont TokenGE cs
lexer cont ('<':cs) = cont TokenLT cs
lexer cont ('>':cs) = cont TokenGT cs
lexer cont (':':':':cs) = cont TokenColons cs
--lexer cont (':':cs) = cont TokenColon cs
lexer cont (';':cs) = cont TokenSemiColon cs
lexer cont (',':cs) = cont TokenComma cs
--lexer cont ('#':cs) = cont TokenHash cs
lexer cont ('.':'.':cs) = cont TokenDots cs
lexer cont ('.':cs) = cont TokenDot cs
lexer cont ('`':cs) = cont TokenBacktick cs
lexer cont ('#':'!':cs) = cont TokenShebang (stripToNL cs)
   where stripToNL ('\n':cs) = cs
	 stripToNL (x:cs) = stripToNL cs
lexer cont (c:cs) = lexError c cs

lexError c s l = failP (show l ++ ": Unrecognised token '" ++ [c] ++ "'\n") s l

lexerEatComment nls cont ('*':'/':cs) 
    = \fn ld mod line -> lexer cont cs fn ld mod (line+nls)
lexerEatComment nls cont ('\n':cs) = lexerEatComment (nls+1) cont cs
lexerEatComment nls cont (c:cs) = lexerEatComment nls cont cs
lexerEatToNewline cont ('\n':cs) 
   = \fn ld mod line -> lexer cont cs fn ld mod (line+1)
lexerEatToNewline cont (c:cs) = lexerEatToNewline cont cs

lexNum cont cs = cont tok rest
  where (num,rest,isreal) = readNum cs
	tok | isreal = TokenReal (read num)
	    | otherwise = TokenInt (read num)

lexMetaVar cont cs = cont (TokenMetaVar (read num)) rest
      where (num,rest) = span isDigit cs

readNum :: String -> (String,String,Bool)
readNum x = rn' False "" x
  where rn' dot acc [] = (acc,[],dot)
	rn' False acc ('.':xs) | head xs /= '.' = rn' True (acc++".") xs
	rn' dot acc (x:xs) | isDigit x = rn' dot (acc++[x]) xs
	rn' dot acc ('e':'+':xs) = rn' True (acc++"e+") xs
	rn' dot acc ('e':'-':xs) = rn' True (acc++"e-") xs
	rn' dot acc ('e':xs) = rn' True (acc++"e") xs
	rn' dot acc xs = (acc,xs,dot)

lexString cont cs = 
   \fn ld mod line ->
   case getstr cs of
      (str,rest,nls) -> cont (TokenString str) rest fn ld mod (nls+line)

lexChar cont cs = 
   \fn ld mod line ->
   case getchar cs of
      Just (str,rest) -> cont (TokenChar str) rest fn ld mod line
      Nothing -> failP (fn++":"++show line++":Unterminated character constant")
		       cs fn ld mod line

isAllowed c = isAlpha c || isDigit c || c `elem` "_\'?#"

lexVar cont cs =
   case span isAllowed cs of
-- Keywords
      ("data",rest) -> cont TokenData rest
      ("abstract",rest) -> cont TokenAbstract rest
      ("type",rest) -> cont TokenType rest
      ("var",rest) -> cont TokenVar rest
      ("Int",rest) -> cont TokenIntType rest
      ("Char",rest)  -> cont TokenCharType rest
      ("Bool",rest)  -> cont TokenBoolType rest
      ("Float",rest)  -> cont TokenRealType rest
      ("String",rest) -> cont TokenStringType rest
      ("File",rest) -> cont TokenFileType rest
      ("Ptr",rest) -> cont TokenPointerType rest
      ("Void",rest)  -> cont TokenVoidType rest
      ("return",rest)  -> cont TokenReturn rest
      ("foreign",rest) -> cont TokenForeign rest
      ("while",rest)  -> cont TokenWhile rest
      ("do",rest)  -> cont TokenDo rest
      ("repeat",rest)  -> cont TokenRepeat rest
      ("for",rest) -> cont TokenFor rest
      ("break",rest) -> cont TokenBreak rest
      ("pass",rest) -> cont TokenPass rest
      ("try",rest) -> cont TokenTry rest
      ("throw",rest) -> cont TokenThrow rest
      ("catch",rest) -> cont TokenCatch rest
      ("finally",rest) -> cont TokenFinally rest
      ("Exception",rest) -> cont TokenException rest
--      ("to",rest) -> cont TokenTo rest
      ("in",rest) -> cont TokenIn rest
      ("case",rest) -> cont TokenCase rest
      ("of",rest) -> cont TokenOf rest
--      ("end",rest) -> cont TokenEnd rest
      ("print",rest)  -> cont TokenPrint rest
--      ("readInt",rest)  -> cont TokenInputNum rest
--      ("readStr",rest)  -> cont TokenInputStr rest
      ("if",rest) -> cont TokenIf rest
      ("else",rest) -> cont TokenElse rest
      ("true",rest) -> cont (TokenBool True) rest
      ("default",rest) -> cont (TokenDefault) rest
      ("false",rest) -> cont (TokenBool False) rest
--      ("size",rest) -> cont TokenSize rest
--      ("CINCLUDE",rest) -> cont TokenCInclude rest
      ("include",rest) -> cont TokenInclude rest
      ("import",rest) -> cont TokenImport rest
      ("program",rest) -> cont TokenProgram rest
      ("module",rest) -> cont TokenModule rest
      ("webapp",rest) -> cont TokenWebapp rest
--      ("fnid",rest) -> cont TokenFNid rest
      ("globals",rest) -> cont TokenGlobal rest
      ("lambda",rest) -> cont TokenLambda rest
      ("public",rest) -> cont TokenPublic rest
      ("private",rest) -> cont TokenPrivate rest
      (var,rest)   -> cont (mkname var) rest

lexSpecial cont cs =
    case span isAllowed cs of
      ("extern",rest) -> cont TokenExtern rest
      ("data",rest) -> cont TokenExtData rest
      ("datacon",rest) -> cont TokenDatacon rest
      ("datatype",rest) -> cont TokenDatatype rest
      ("doc",rest) -> cont TokenDocstring rest
      ("imported",rest) -> cont TokenImported rest
      ("include",rest) -> cont TokenCInclude rest
      ("link",rest) -> cont TokenLink rest
      ("VM",rest) -> cont TokenVM rest
      (thing,rest) -> cont TokenMod (thing++rest)

mkname :: String -> Token
mkname c = TokenName (UN c)

getstr :: String -> (String,String,Int)
getstr cs = let (str,rest,nls) = getstr' "" cs 0 in
		(reverse str,rest,nls)
getstr' acc ('\"':xs) = \nl -> (acc,xs,nl)
getstr' acc ('\\':'n':xs) = getstr' ('\n':acc) xs -- Newline
getstr' acc ('\\':'r':xs) = getstr' ('\r':acc) xs -- CR
getstr' acc ('\\':'t':xs) = getstr' ('\t':acc) xs -- Tab
getstr' acc ('\\':'b':xs) = getstr' ('\b':acc) xs -- Backspace
getstr' acc ('\\':'a':xs) = getstr' ('\a':acc) xs -- Alert
getstr' acc ('\\':'f':xs) = getstr' ('\f':acc) xs -- Formfeed
getstr' acc ('\\':'0':xs) = getstr' ('\0':acc) xs -- null
getstr' acc ('\\':x:xs) = getstr' (x:acc) xs -- Literal
getstr' acc ('\n':xs) = \nl ->getstr' ('\n':acc) xs (nl+1) -- Count the newline
getstr' acc (x:xs) = getstr' (x:acc) xs

getchar :: String -> Maybe (Char,String)
getchar ('\\':'n':'\'':xs) = Just ('\n',xs) -- Newline
getchar ('\\':'r':'\'':xs) = Just ('\r',xs) -- CR
getchar ('\\':'t':'\'':xs) = Just ('\t',xs) -- Tab
getchar ('\\':'b':'\'':xs) = Just ('\b',xs) -- Backspace
getchar ('\\':'a':'\'':xs) = Just ('\a',xs) -- Alert
getchar ('\\':'f':'\'':xs) = Just ('\f',xs) -- Formfeed
getchar ('\\':'0':'\'':xs) = Just ('\0',xs) -- null
getchar ('\\':x:'\'':xs) = Just (x,xs) -- Literal
getchar (x:'\'':xs) = Just (x,xs)
getchar _ = Nothing
