/* This file is part of Malaga, a system for Natural Language Analysis.
 * Copyright (C) 1995-1999 Bjoern Beutel
 *
 * Bjoern Beutel
 * Universitaet Erlangen-Nuernberg
 * Abteilung fuer Computerlinguistik
 * Bismarckstrasse 12
 * D-91054 Erlangen
 * e-mail: malaga@linguistik.uni-erlangen.de 
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

/* description ==============================================================*/

/* This module contains data structures and functions related to the generation
 * of the allomorph lexicon. */

/* includes =================================================================*/

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <time.h>
#include <setjmp.h>
#include "basic.h"
#include "pools.h"
#include "values.h"
#include "tries.h"
#include "rule_type.h"
#include "rules.h"
#include "scanner.h"
#include "files.h"
#include "malaga_files.h"
#include "symbols.h"
#include "input.h"
#include "commands.h"

#ifdef HANGUL
#include "hangul.h"
#endif

#undef GLOBAL
#define GLOBAL

#include "lex_compiler.h"

/* types ====================================================================*/

typedef struct /* an entry in the lexicon buffer */
{
  int_t surface; /* index of the surface string; used as an offset into
		  * <lex_buffer.strings> */
  int_t cat; /* index of the category value; used as an offset into
	      * <lex_buffer.values> */
} entry_t;

typedef struct CONSTANT_NODE_T /* a node of the constant table,
				* a binary search tree ordered by <name>
				* (compared with "strcmp_no_case") */
{
  struct CONSTANT_NODE_T *left_son; /* sons whose names sort before <name> */
  struct CONSTANT_NODE_T *right_son; /* sons whose names sort after <name> */
  string_t name; /* name of the constant; a private copy created by
		  * "new_string" and released by "free_mem" */
  value_t value; /* value of the constant; released by "free_mem" */
} constant_node_t;

/* variables ================================================================*/

LOCAL constant_node_t *constant_tree = NULL; /* root node of constant tree */

LOCAL string_t current_surf; /* currently generated surface in filter rules;
			      * points into <lex_buffer.strings> while the
			      * output filter is executed */

LOCAL struct /* buffer for the compiled lexicon entries */
{
  pool_t entry_pool; /* pool of "entry_t" items, one per allomorph */
  pool_t value_pool; /* pool of "cell_t" items holding the categories */
  pool_t string_pool; /* pool of characters holding the surfaces */
  
  entry_t *entries; /* vector made from <entry_pool> by "copy_lex_buffer" */
  int_t entries_size; /* number of items in <entries> */

  cell_t *values; /* vector made from <value_pool> by "copy_lex_buffer" */
  int_t values_size; /* number of items in <values> */

  char *strings; /* vector made from <string_pool> by "copy_lex_buffer" */
  int_t strings_size; /* number of items in <strings> */

  int_t lex_entry_count; /* number of lexicon entries compiled by
			  * "parse_lex_value" */
  int_t intermediate_count; /* number of allomorphs before the output filter
			     * was executed, or -1 if no filter was run */
  int_t allomorph_count; /* number of allomorphs added to the buffer */
} lex_buffer;

/* forward declarations =====================================================*/

/* The value parsing functions are mutually recursive: "parse_simple_value"
 * calls "local_parse_value", which is defined after it. */
FORWARD void local_parse_value (void);

/* functions for execution of allomorph rules ===============================*/

LOCAL void lex_add_allo (value_t surf, value_t cat)
/* Store a new allomorph with surface <surf> and category <cat>
 * in the pools of the lexicon buffer.
 * An empty surface is reported as an error. */
{
  entry_t new_entry;
  string_t surface = value_to_string (surf);

  if (surface[0] == EOS)
    error ("allomorph surface is empty");

  /* Copy surface and category into their pools; <new_entry> receives the
   * positions of both copies and is then stored itself. */
  copy_string_to_pool (lex_buffer.string_pool, surface, &new_entry.surface);
  copy_value_to_pool (lex_buffer.value_pool, cat, &new_entry.cat);
  copy_to_pool (lex_buffer.entry_pool, &new_entry, 1, NULL);

  lex_buffer.allomorph_count += 1;
}

/*---------------------------------------------------------------------------*/

LOCAL void execute_allo_rule (value_t lex_entry)
/* Run the allomorph rule of <allo_rule_sys> with <lex_entry> as the only
 * value on the value stack. "lex_add_allo" is installed as the <add_allo>
 * callback before the rule is executed. */
{
  /* Install the callback routine. */
  add_allo = lex_add_allo;

  /* Start from an empty value stack that holds just the lexicon entry. */
  top = 0;
  push_value (lex_entry);

  execute_rule (allo_rule_sys, allo_rule_sys->allo_rule);
}

/* support functions for parsing ============================================*/

LOCAL void free_constants (constant_node_t **node_ptr)
/* Release the constant table whose root is <*node_ptr>:
 * both subtrees first, then name, value and the node itself. */
{
  constant_node_t *node = *node_ptr;

  /* An empty (sub)tree needs no work. */
  if (node == NULL)
    return;

  free_constants (&node->left_son);
  free_constants (&node->right_son);
  free_mem (&node->name);
  free_mem (&node->value);

  /* Release the node through the link that points to it. */
  free_mem (node_ptr);
}

/*---------------------------------------------------------------------------*/

LOCAL constant_node_t *find_constant_node (string_t name, bool_t new_constant)
/* Find and return the node of the constant <name> in the constant tree.
 * Names are compared with "strcmp_no_case".
 * If <new_constant> == TRUE, create a new node for <name> and return it
 * (error if the constant already exists). The <value> of a new node is NULL
 * until the caller assigns one.
 * If <new_constant> == FALSE, return the existing node
 * (error if the constant does not exist).
 * "error" is expected not to return, as in the rest of this module. */
{
  constant_node_t **node_ptr; /* link to the current node (link may be NULL) */
  constant_node_t *node;

  /* Descend in the tree until <name> is found or an empty link is reached. */
  node_ptr = &constant_tree;
  while (*node_ptr != NULL)
  {
    int_t comp_result;

    node = *node_ptr;
    comp_result = strcmp_no_case (name, node->name);
    if (comp_result < 0)
      node_ptr = &node->left_son;
    else if (comp_result > 0)
      node_ptr = &node->right_son;
    else
    {
      /* The node already exists. */
      if (new_constant)
	error ("constant \"@%s\" is already defined", name);
      return node;
    }
  }

  /* The node doesn't exist yet. */
  if (! new_constant)
    error ("constant \"@%s\" is not defined", name);

  /* Allocate and initialise node. <value> is set to NULL explicitly, so a
   * definition that is aborted before its value has been parsed leaves
   * a node behind that "free_constants" can release safely. */
  node = new_mem (sizeof (constant_node_t));
  node->left_son = NULL;
  node->right_son = NULL;
  node->name = new_string (name, NULL);
  node->value = NULL;

  /* Link node into tree. */
  *node_ptr = node;
  return node;
}

/* parse functions ==========================================================*/

LOCAL void parse_symbol (void)
/* Expect an identifier as the current token, look it up as a symbol,
 * push the symbol on the value stack and advance to the next token. */
{
  /* The current token must be an identifier; its text is in <token_name>. */
  test_token (TOK_IDENT);

  push_symbol_value (find_symbol (token_name));

  /* The identifier has been processed; go on with the following token. */
  read_next_token ();
}

/*---------------------------------------------------------------------------*/

LOCAL void parse_simple_value (void)
/* Parse a simple value - a list, a record, a symbol, a string, a number,
 * a constant or a parenthesised value - and leave it on the value stack. */
{
  int_t count; /* number of elements in list or of pairs in record */

  switch (next_token)
  {
  case '<': /* List: "<" [value {"," value}] ">". */
    read_next_token ();
    count = 0;
    if (next_token != '>')
    {
      for (;;)
      {
	local_parse_value ();
	count++;
	if (next_token != ',')
	  break;
	read_next_token ();
      }
    }
    parse_token ('>');
    build_list (count);
    break;

  case '[': /* Record: "[" [value ":" value {"," value ":" value}] "]". */
    read_next_token ();
    count = 0;
    if (next_token != ']')
    {
      for (;;)
      {
	local_parse_value ();
	parse_token (':');
	local_parse_value ();
	count++;
	if (next_token != ',')
	  break;
	read_next_token ();
      }
    }
    parse_token (']');
    build_record (count);
    break;

  case TOK_IDENT: /* Symbol. */
    parse_symbol ();
    break;

  case TOK_STRING: /* String. */
#ifdef HANGUL
    encode_hangul (&token_string);
#endif
    push_string_value (token_string, NULL);
    read_next_token ();
    break;

  case TOK_NUMBER: /* Number. */
    push_number_value (token_number);
    read_next_token ();
    break;

  case TOK_CONSTANT: /* Constant; it must have been defined before. */
    push_value ((find_constant_node (token_name, FALSE))->value);
    read_next_token ();
    break;

  case '(': /* Parenthesised value. */
    read_next_token ();
    local_parse_value ();
    parse_token (')');
    break;

  default:
    error ("value expected, not %s", token_as_text (next_token));
  }
}

/*---------------------------------------------------------------------------*/

LOCAL void parse_dotted_value (void)
/* Parse a simple value that may be followed by a sequence of
 * "." <simple value>. The dot operation is applied after each of them. */
{
  parse_simple_value ();

  for (;;)
  {
    if (next_token != '.')
      break;

    /* Skip the dot, parse its right operand and apply the operation. */
    read_next_token ();
    parse_simple_value ();
    dot_operation ();
  }
}

/*---------------------------------------------------------------------------*/

LOCAL void parse_term_value (void)
/* Parse a sequence of dotted values that are combined by the operators
 * "*" and "/". The operators are applied from left to right. */
{
  parse_dotted_value ();

  for (;;)
  {
    int_t op = next_token; /* the operator that may follow */

    if (op != '*' && op != '/')
      break;

    /* Skip the operator and parse its right operand. */
    read_next_token ();
    parse_dotted_value ();

    if (op == '/')
      slash_operation ();
    else
      asterisk_operation ();
  }
}

/*---------------------------------------------------------------------------*/

LOCAL void local_parse_value (void)
/* Parse any value: an optional unary "-", a term, and any number of
 * further terms combined by "+" or "-".
 * This function is used recursively by the other parse functions;
 * to get a value from outside, use "parse_value". */
{
  int_t leading_minus = (next_token == '-');

  /* Parse the first term; negate it if it was preceded by "-". */
  if (leading_minus)
    read_next_token ();
  parse_term_value ();
  if (leading_minus)
    unary_minus_operation ();

  /* Parse the remaining terms and combine them from left to right. */
  for (;;)
  {
    int_t op = next_token; /* the operator that may follow */

    if (op != '+' && op != '-')
      break;

    read_next_token ();
    parse_term_value ();

    if (op == '+')
      plus_operation ();
    else
      minus_operation ();
  }
}

/*---------------------------------------------------------------------------*/

LOCAL void parse_value (value_t *value)
/* Parse a value, starting with an empty value stack, and store a copy of
 * it, created by "new_value", in *<value>.
 * The caller has to release *<value> with "free_mem" after use. */
{
  /* Parse with an empty stack; exactly one value must remain on it. */
  top = 0;
  local_parse_value ();
  DB_ASSERT (top == 1);

  /* Take the result from the stack and hand a copy to the caller. */
  top--;
  *value = new_value (value_stack[top]);
}

/*---------------------------------------------------------------------------*/

LOCAL void parse_lex_value (void)
/* Parse one lexicon entry (a value followed by ";") at the current scanner
 * position and execute the allomorph rule on it.
 * A warning is written to "stderr" if the rule was not successful. */
{
  value_t value;
  int_t line_number = current_line_number (); /* where lexical entry starts */
  string_t file_name = current_file_name ();
  
  parse_value (&value);
  parse_token (';');
  
  /* Error messages will show <lex_entry_line_number>. */
  lex_entry_line_number = line_number;
  lex_entry_file_name = file_name;
  
  lex_buffer.lex_entry_count++;
  execute_allo_rule (value);

  /* The warning ends with a newline, so consecutive warnings and any
   * following output do not run together on a single line. */
  if (! rule_successful)
    fprintf (stderr, 
	     "warning: file \"%s\", line %ld: no allomorphs generated\n",
	     name_in_path (file_name), line_number);

  free_mem (&value);
  
  /* No longer show <lex_entry_line_number> in error messages. */
  lex_entry_line_number = -1;
  lex_entry_file_name = NULL;
}

/*---------------------------------------------------------------------------*/

LOCAL void parse_lex_values (void)
/* Process the current file up to its end. Each statement is either
 * a constant definition, an include statement (the included file is
 * processed recursively), or a lexicon entry on which the allomorph
 * rule is executed. */
{
  while (next_token != EOF)
  {
    if (next_token == TOK_DEFINE)
    {
      /* Constant definition: TOK_DEFINE <constant> TOK_ASSIGN <value> ";".
       * The node is created first, then its value is parsed into it. */
      constant_node_t *new_node;

      read_next_token ();
      test_token (TOK_CONSTANT);
      new_node = find_constant_node (token_name, TRUE);
      read_next_token ();
      parse_token (TOK_ASSIGN);
      parse_value (&new_node->value);
      parse_token (';');
      continue;
    }

    if (next_token == TOK_INCLUDE)
    {
      /* Include statement: TOK_INCLUDE <string> ";".
       * The file name is resolved with respect to the current file. */
      string_t included_name;

      read_next_token ();
      test_token (TOK_STRING);
      included_name = absolute_path (token_string, current_file_name ());
      begin_include (included_name);
      parse_lex_values ();
      end_include ();
      parse_token (';');
      free_mem (&included_name);
      continue;
    }

    /* Anything else must be a lexicon entry. */
    check_user_break ();
    parse_lex_value ();
  }
}

/* functions for construction of run-time lexicon ===========================*/

#define MARK_LAST_ENTRY(var) \
  ((var) = - ((var) + 1))
/* Macro to mark the last entry of a list in <cat_lists>:
 * <var> is replaced by -(<var> + 1), so a non-negative value becomes
 * a negative one. Note that <var> is evaluated twice. */

/*---------------------------------------------------------------------------*/

LOCAL void init_lex_buffer (void)
/* Initialise the lexicon buffer: create the three pools, mark the vectors
 * as empty and reset the counters. */
{
  lex_buffer.entry_pool = new_pool (sizeof (entry_t));
  lex_buffer.value_pool = new_pool (sizeof (cell_t));
  lex_buffer.string_pool = new_pool (sizeof (char));

  /* No vectors exist yet. The sizes are reset together with the pointers,
   * so they can't be stale when the module is initialised again. */
  lex_buffer.entries = NULL;
  lex_buffer.entries_size = 0;
  lex_buffer.values = NULL;
  lex_buffer.values_size = 0;
  lex_buffer.strings = NULL;
  lex_buffer.strings_size = 0;

  lex_buffer.lex_entry_count = lex_buffer.allomorph_count = 0;
  lex_buffer.intermediate_count = -1; /* no output filter executed yet */
}

/*---------------------------------------------------------------------------*/

LOCAL void free_lex_buffer (void)
/* Release the pools and the vectors of the lexicon buffer. */
{
  /* The pools that collect entries, values and surface strings. */
  free_pool (&lex_buffer.entry_pool);
  free_pool (&lex_buffer.value_pool);
  free_pool (&lex_buffer.string_pool);

  /* The vectors that have been created by "copy_lex_buffer". */
  free_mem (&lex_buffer.entries);
  free_mem (&lex_buffer.values);
  free_mem (&lex_buffer.strings);
}

/*---------------------------------------------------------------------------*/

LOCAL void copy_lex_buffer (void)
/* Copy all pool items in "lex_buffer" to the appropriate table entries
 * and clear the pools.
 * Vectors that are still held from a previous call are released first;
 * otherwise they would be lost when the pointers are overwritten (this
 * happens when the output filter copies the buffer a second time). */
{
  free_mem (&lex_buffer.entries);
  lex_buffer.entries_size = pool_items (lex_buffer.entry_pool);
  lex_buffer.entries = pool_to_vector (lex_buffer.entry_pool);
  clear_pool (lex_buffer.entry_pool);

  free_mem (&lex_buffer.values);
  lex_buffer.values_size = pool_items (lex_buffer.value_pool);
  lex_buffer.values = pool_to_vector (lex_buffer.value_pool);
  clear_pool (lex_buffer.value_pool);

  free_mem (&lex_buffer.strings);
  lex_buffer.strings_size = pool_items (lex_buffer.string_pool);
  lex_buffer.strings = pool_to_vector (lex_buffer.string_pool);
  clear_pool (lex_buffer.string_pool);
}

/*---------------------------------------------------------------------------*/

LOCAL void clear_lex_buffer (void)
/* Clear the content of the lexicon buffer: empty the pools, release the
 * vectors and reset all sizes and counters. */
{
  clear_pool (lex_buffer.entry_pool);
  clear_pool (lex_buffer.value_pool);
  clear_pool (lex_buffer.string_pool);

  /* Release the vectors and reset their sizes as well. If the next
   * generation is aborted before "copy_lex_buffer" is reached, functions
   * that iterate up to <entries_size> must not access released vectors. */
  free_mem (&lex_buffer.entries);
  lex_buffer.entries_size = 0;
  free_mem (&lex_buffer.values);
  lex_buffer.values_size = 0;
  free_mem (&lex_buffer.strings);
  lex_buffer.strings_size = 0;

  lex_buffer.lex_entry_count = lex_buffer.allomorph_count = 0;
  lex_buffer.intermediate_count = -1; /* no output filter executed yet */
}

/*---------------------------------------------------------------------------*/

GLOBAL void print_lex_buffer (FILE *stream, string_t allo_format)
/* Print every entry of the lexicon buffer as one line to <stream>.
 * Each line is built from <allo_format> by "replace_arguments" with the
 * argument letters "scn": surface, category and entry number (counted
 * from 1). Nothing is printed if <allo_format> is empty.
 * A write error on <stream> is reported as an error. */
{
  int_t index;

  /* An empty format suppresses all output. */
  if (*allo_format == EOS)
    return;

  for (index = 0; index < lex_buffer.entries_size; index++)
  {
    string_t number_text, surface_text, cat_text, output_line;

    /* Create the three arguments as strings. */
    number_text = int_to_string (index + 1);
    surface_text = new_string (lex_buffer.strings 
			       + lex_buffer.entries[index].surface, NULL);
#ifdef HANGUL
    decode_hangul (&surface_text);
#endif
    cat_text = value_to_readable (lex_buffer.values 
				  + lex_buffer.entries[index].cat, FALSE);

    /* Insert them into the format and print the resulting line. */
    output_line = replace_arguments (allo_format, "scn", 
				     surface_text, cat_text, number_text);
    fprintf (stream, "%s\n", output_line);

    free_mem (&surface_text);
    free_mem (&output_line);
    free_mem (&cat_text);
    free_mem (&number_text);

    if (ferror (stream))
      error ("can't write results: %s", strerror (errno));
  }
}

/*---------------------------------------------------------------------------*/

GLOBAL void write_lex_buffer (string_t file_name)
/* Write lexicon buffer to file <file_name>.
 * The file consists of a header, a trie that is built from the surfaces,
 * the category lists and the values.
 * The entries in the buffer are expected to be ordered by surface, so
 * entries with the same surface are adjacent. */
{
  lexicon_header_t header;
  FILE *stream;
  pool_t trie_pool;
  int_t trie_root;
  trie_entry_t *trie_entries; /* trie entries used to build the trie */
  int_t i, n;
  int_t *cat_lists;

  /* Merge entries with same surface. */
  trie_entries = new_vector (sizeof (trie_entry_t), lex_buffer.entries_size);
  cat_lists = new_vector (sizeof (int_t), lex_buffer.entries_size);
  n = 0;
  for (i = 0; i < lex_buffer.entries_size; i++)
  {
    cat_lists[i] = lex_buffer.entries[i].cat;
    
    /* Look if the <i>-th entry has another surface than its predecessor. */
    if (i == 0 || 0 !=
	strcmp_no_case (lex_buffer.strings + lex_buffer.entries[i].surface, 
			lex_buffer.strings + lex_buffer.entries[i-1].surface))
    {
      /* Mark the end of the cat_list of the last merged entry. */
      if (i > 0)
	MARK_LAST_ENTRY (cat_lists[i-1]);
      
      /* A new surface starts here; its cat_list begins at index <i>. */
      trie_entries[n].key = lex_buffer.strings + lex_buffer.entries[i].surface;
      trie_entries[n].content = i;
      n++;
    }
  }

  /* Mark the end of the very last cat_list. If the buffer is empty, 
   * there is no such list and <cat_lists> must not be accessed. */
  if (lex_buffer.entries_size > 0)
    MARK_LAST_ENTRY (cat_lists[lex_buffer.entries_size - 1]);

  new_trie (n, trie_entries, &trie_pool, &trie_root);
  free_mem (&trie_entries);

  stream = open_stream (file_name, "wb");
  
  /* Initialise the header. */
  set_header (&header.common_header, LEXICON_FILE, LEXICON_CODE_VERSION);
  header.trie_size = pool_items (trie_pool);
  header.trie_root = trie_root;
  header.cat_lists_size = lex_buffer.entries_size;
  header.values_size = lex_buffer.values_size;

  /* Write header, trie, category lists and values, in this order. */
  write_vector (&header, sizeof (lexicon_header_t), 1, stream, file_name);
  write_pool (trie_pool, stream, file_name);
  write_vector (cat_lists, sizeof (int_t), lex_buffer.entries_size, 
		stream, file_name);
  write_vector (lex_buffer.values, sizeof (cell_t), lex_buffer.values_size, 
		stream, file_name);
  close_stream (&stream, file_name);

  free_pool (&trie_pool);
  free_mem (&cat_lists);
}

/*---------------------------------------------------------------------------*/

LOCAL int compare_surface (const void *entry1, const void *entry2)
/* Comparison function for "qsort":
 * Compare the surfaces of the lexicon entries <entry1> and <entry2> with
 * "strcmp_no_case". If the surfaces are identical, compare their <surface>
 * indexes, so such entries keep the order in which they were created.
 * Return -1, 0 or 1. */
{
  const entry_t *lex_entry1 = (const entry_t *) entry1;
  const entry_t *lex_entry2 = (const entry_t *) entry2;

  /* The result is signed, so it is kept in a signed integer (not in
   * a boolean) and only its sign is handed on to "qsort". */
  long order = strcmp_no_case (lex_buffer.strings + lex_entry1->surface, 
			       lex_buffer.strings + lex_entry2->surface);

  if (order < 0)
    return -1;
  else if (order > 0)
    return 1;
  else if (lex_entry1->surface < lex_entry2->surface)
    return -1;
  else if (lex_entry1->surface > lex_entry2->surface)
    return 1;
  else
    return 0;
}

/*---------------------------------------------------------------------------*/

LOCAL void lex_add_end_state (value_t cat)
/* Store a filtered allomorph in the pools of the lexicon buffer.
 * Its surface is <current_surf>, its category is <cat>. */
{
  entry_t new_entry;

  /* Copy surface and category into their pools; <new_entry> receives the
   * positions of both copies and is then stored itself. */
  copy_string_to_pool (lex_buffer.string_pool, current_surf, 
		       &new_entry.surface);
  copy_value_to_pool (lex_buffer.value_pool, cat, &new_entry.cat);
  copy_to_pool (lex_buffer.entry_pool, &new_entry, 1, NULL);

  lex_buffer.allomorph_count += 1;
}

/*---------------------------------------------------------------------------*/

LOCAL void execute_output_filter (void)
/* Turn the pools of the lexicon buffer into vectors and sort the entries
 * by surface. If <allo_rule_sys> has an output filter rule, execute it on
 * each group of entries that share a surface; the filter gets the list of
 * their categories and stores its results via "lex_add_end_state".
 * Finally, the filtered results replace the buffer content. */
{
  int_t first; /* index of the first entry of the current group */
  int_t end; /* index behind the last entry of the current group */
  int_t k;

  copy_lex_buffer ();

  /* Sort <lex_buffer.entries> by surface. */
  qsort (lex_buffer.entries, lex_buffer.entries_size, 
	 sizeof (entry_t), compare_surface);

  /* Without an allomorph filter rule, the sorted buffer is the result. */
  if (allo_rule_sys->output_filter == -1)
    return;

  /* The allomorphs counted so far become intermediate results. */
  add_end_state = lex_add_end_state;
  lex_buffer.intermediate_count = lex_buffer.allomorph_count;
  lex_buffer.allomorph_count = 0;

  first = 0;
  while (first < lex_buffer.entries_size)
  {
    current_surf = lex_buffer.strings + lex_buffer.entries[first].surface;

    /* Find the end of the group of entries with surface <current_surf>. */
    end = first + 1;
    while (end < lex_buffer.entries_size
	   && strcmp_no_case (lex_buffer.strings 
			      + lex_buffer.entries[end].surface,
			      current_surf) == 0)
      end++;

    /* Build a list of the categories in the group and filter it. */
    top = 0;
    for (k = first; k < end; k++)
      push_value (lex_buffer.values + lex_buffer.entries[k].cat);
    build_list (end - first);
    execute_rule (allo_rule_sys, allo_rule_sys->output_filter);
    if (! rule_successful)
      error ("no allomorphs generated for \"%s\"", current_surf);

    first = end;
  }

  /* Make the filtered allomorphs the new content of the vectors. */
  copy_lex_buffer ();
}

/* interface functions for the lexicon compiler =============================*/

GLOBAL void print_lex_statistics (FILE *stream)
/* Print the counters of the lexicon buffer to <stream>: entries read,
 * intermediates (only if an output filter was executed), allomorphs
 * generated and the average number of allomorphs per entry. */
{
  double allos_per_entry;

  /* Without any entry there is nothing to report (and no average). */
  if (lex_buffer.lex_entry_count == 0)
  {
    fprintf (stream, "no lexicon entries read\n");
    return;
  }

  fprintf (stream, "entries read:            %ld\n", 
	   lex_buffer.lex_entry_count);

  /* -1 means that no output filter has been executed. */
  if (lex_buffer.intermediate_count != -1)
    fprintf (stream, "intermediates generated: %ld\n",
	     lex_buffer.intermediate_count);

  fprintf (stream, "allomorphs generated:    %ld\n", 
	   lex_buffer.allomorph_count);

  allos_per_entry = ((double) lex_buffer.allomorph_count 
		     / (double) lex_buffer.lex_entry_count);
  fprintf (stream, "allomorphs per entry:    %.4G\n", allos_per_entry);
}

/*---------------------------------------------------------------------------*/

GLOBAL void generate_allos_for_file (string_t source_name)
/* Compile the lexicon file <source_name>: execute the allomorph rule on
 * all of its entries, then the output filter.
 * The results replace the previous content of the lexicon buffer. */
{
  /* Start with an empty buffer and without any constants. */
  clear_lex_buffer ();
  free_constants (&constant_tree);

  /* Process all statements of the file. */
  begin_include (source_name);
  parse_lex_values ();
  end_include ();

  execute_output_filter ();
}

/*---------------------------------------------------------------------------*/

GLOBAL void generate_allos_for_line (string_t lexicon_name, int_t line)
/* Read line <line> in lexicon file <lexicon_name> and generate allomorphs.
 * Write allomorphs into lexicon buffer.
 * All statements that start before <line> are parsed, but not executed:
 * include statements are not followed and constant definitions are not
 * entered into the constant table. The first statement at or behind <line>
 * is compiled as a lexicon entry; it is an error if there is none. */
{
  clear_lex_buffer ();
  begin_include (lexicon_name);
  while (next_token != EOF && current_line_number () < line)
  {
    check_user_break ();
    
    if (next_token == TOK_INCLUDE) 
    {
      /* Skip the include statement. The file name must be consumed with
       * "parse_token"; "test_token" would only check it and leave it as the
       * current token, so the following check for ";" could never succeed. */
      read_next_token ();
      parse_token (TOK_STRING);
      parse_token (';');
    }
    else if (next_token == TOK_DEFINE)
    {
      /* Skip the constant definition; its value is parsed and discarded. */
      value_t value;

      read_next_token ();
      parse_token (TOK_CONSTANT);
      parse_token (TOK_ASSIGN);
      parse_value (&value);
      free_mem (&value);
      parse_token (';');
    }
    else
    {
      /* Skip the lexicon entry; its value is parsed and discarded. */
      value_t value;
      
      parse_value (&value);
      free_mem (&value);
      parse_token (';');
    }
  }
  
  if (next_token == EOF)
    error ("no lexicon entry at or behind line %ld", line);

  parse_lex_value ();
  end_include ();
  execute_output_filter ();
}

/*---------------------------------------------------------------------------*/

GLOBAL void generate_allos_for_string (string_t category_string)
/* Generate allomorphs for "category_string".
 * Write allomorphs into lexicon buffer.
 * <category_string> must contain exactly one value, which may be followed
 * by a ";". A warning is printed if the allomorph rule was not successful. */
{
  value_t category;
  
  clear_lex_buffer ();
  set_scanner_input (category_string);
  parse_value (&category);

  /* A terminating semicolon is optional, but nothing else may follow. */
  if (next_token == ';')
    read_next_token ();
  test_token (EOF);

  execute_allo_rule (category);

  /* The warning ends with a newline, so following output starts on a line
   * of its own. */
  if (! rule_successful)
    printf ("warning: no allomorphs generated\n");

  free_mem (&category);
  execute_output_filter ();
}

/*---------------------------------------------------------------------------*/

GLOBAL void init_lex_compiler (string_t allo_rule_file)
/* Set up the lex_compiler module: read the allomorph rule system from
 * "allo_rule_file" into <allo_rule_sys> and initialise the lexicon buffer. */
{
  /* Read the rules first, then create the buffer for their results. */
  allo_rule_sys = read_rule_sys (allo_rule_file);

  init_lex_buffer ();
}

/*---------------------------------------------------------------------------*/

GLOBAL void terminate_lex_compiler (void)
/* Shut down the lex_compiler module: release the allomorph rule system,
 * the lexicon buffer and the constant table. */
{
  /* The rules that were read by "init_lex_compiler". */
  free_rule_sys (&allo_rule_sys);

  /* The pools and vectors of the lexicon buffer. */
  free_lex_buffer ();

  /* The constants that were defined in the lexicon files. */
  free_constants (&constant_tree);
}

/* end of file ==============================================================*/
