(* A simple lexer, which distinguishes integers, floats and single character
delimiters. Quoted strings are also distinguished, and allow escaped quotes.
Any other non-whitespace-including string is returned as an [Ident].
*)
open Utility

type token =
  | Int of int
  | Float of float
  | Ident of string
  | String of string

let string_of_token = function
  | Int i -> "Int " ^ string_of_int i
  | Float f -> "Float " ^ string_of_float f
  | String s -> "String " ^ s
  | Ident s -> "Ident " ^ s

let string_of_tokens ts =
  fold_left (fun a b -> a ^ "\n " ^ b) "" (map string_of_token ts)

let lex_item s =
  if s = "" then failwith "lex_item: blank string" else
  try
    match hd (explode s) with
    | '\"' when String.length s >= 2 ->
        String (implode (rev (tl (rev (tl (explode s))))))
    | letter when
        (letter >= 'a' && letter <= 'z') ||
        (letter >= 'A' && letter <= 'Z') ->
          Ident s
    | _ ->
        let f = float_of_string s in
          if float (toint f) = f then Int (toint f) else Float f
  with
    e -> dpr ("z" ^ s); Ident s

let is_delimiter = function
  | '(' | ')' | '<' | '>' | '[' | ']' | '{' | '}' | '%' | '/' -> true
  | _ -> false

let is_whitespace = function
  | '\000' | '\009' | '\010' | '\012' | ' ' | '\013' -> true
  | _ -> false

let is_whitespace_or_delimiter c =
  is_whitespace c || is_delimiter c

(* Return the list of characters between and including the current position and
before the next character satisfying a given predicate, leaving the position at
the character following the last one returned. Can raise [EndOfInput]. If [eoi]
is true, end of input is considered a delimiter, and the characters up to it are
returned if it is reached. *)
let getuntil eoi f i =
  let rec getuntil_inner r eoi f i =
    match i.Pdfio.input_byte () with
    | x when x = Pdfio.no_more -> if eoi then rev r else (dpr "C"; raise End_of_file)
    | x ->
        let chr = char_of_int x in
          if f chr
            then (Pdfio.rewind i; rev r)
            else getuntil_inner (chr::r) eoi f i
  in
    getuntil_inner [] eoi f i

(* The same, but don't return anything. (Remove eoi Aug 08) *)
let rec ignoreuntil f i =
  match i.Pdfio.input_byte () with
  | x when x = Pdfio.no_more -> ()
  | x -> if f (char_of_int x) then Pdfio.rewind i else ignoreuntil f i

(* Position on the next non-whitespace character. *)
let dropwhite i =
  ignoreuntil (notpred is_whitespace) i

(* Get a quoted string, including the quotes. Any quotes inside must be escaped. *)
let rec get_string_inner prev i =
  match i.Pdfio.input_byte () with
  | x when x = Pdfio.no_more -> raise End_of_file
  | x when x = int_of_char '\"' -> rev ('\"'::prev)
  | x when x = int_of_char '\\' ->
      begin match i.Pdfio.input_byte () with
      | x when x = Pdfio.no_more-> raise End_of_file
      | x when x = int_of_char '\"' -> get_string_inner ('\"'::prev) i
      | x -> get_string_inner (char_of_int x::'\\'::prev) i
      end
  | x -> get_string_inner (char_of_int x::prev) i

let get_string i =
  Pdfio.nudge i;
  implode ('\"'::get_string_inner [] i)

(* Repeatedly take a whitespace-or-delimiter-delimited section from the input, and scan it *)
let get_section i =
  match Pdfio.peek_char i with
  | None -> None
  | Some _ ->
      dropwhite i;
      match Pdfio.peek_char i with
      | Some '\"' -> Some (get_string i)
      | None -> None
      | Some x ->
          if is_delimiter x
            then (Pdfio.nudge i; Some (implode [x]))
            else Some (implode (getuntil true is_whitespace_or_delimiter i))

let rec lex_inner prev i =
  try
    match get_section i with
    | None -> rev prev
    | Some section -> lex_inner (lex_item section::prev) i
  with
    End_of_file -> dpr "F"; rev prev

let lex = lex_inner []

let lex_string s =
  lex (Pdfio.input_of_bytestream (bytestream_of_string s))

