/*  $Id: FrenchVerbDictionary.cpp,v 1.22 2009/03/10 02:59:36 sarrazip Exp $
    FrenchVerbDictionary.cpp - Dictionary of verbs and conjugation templates

    verbiste - French conjugation system
    Copyright (C) 2003-2005 Pierre Sarrazin <http://sarrazip.com/>

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 2
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
    02111-1307, USA.
*/

#include "FrenchVerbDictionary.h"

#include <iostream>
#include <errno.h>
#include <string.h>

using namespace std;
using namespace verbiste;


// Scope guard for a libxml2 document: the wrapped pointer is released
// with xmlFreeDoc() when the guard is destroyed.  A NULL pointer is
// accepted and never freed.
// NOTE: copying is not guarded; a copy would hold the same pointer and
// both destructors would free it.
class AutoDoc
{
    xmlDocPtr doc;

public:
    AutoDoc(xmlDocPtr d) : doc(d) {}

    ~AutoDoc()
    {
        if (doc == NULL)
            return;
        xmlFreeDoc(doc);
    }

    // Gives access to the wrapped document; the guard keeps ownership.
    xmlDocPtr get() const { return doc; }

    // True when no document is held.
    bool operator ! () const { return doc == NULL; }
};


// Scope guard for a string allocated by libxml2: the wrapped pointer is
// released with xmlFree() when the guard is destroyed.  A NULL pointer
// is accepted and never freed.
// NOTE: copying is not guarded; a copy would hold the same pointer and
// both destructors would free it.
class AutoString
{
    xmlChar *str;

public:
    AutoString(xmlChar *s) : str(s) {}

    ~AutoString()
    {
        if (str == NULL)
            return;
        xmlFree(str);
    }

    // Gives access to the wrapped string; the guard keeps ownership.
    xmlChar *get() const { return str; }

    // True when no string is held.
    bool operator ! () const { return str == NULL; }

    // Number of bytes before the terminating NUL, or 0 when no string
    // is held.
    size_t length() const
    {
        if (str == NULL)
            return 0;
        return strlen((char *) str);
    }
};


// Views a plain C string as a libxml2 string.  Only the pointer type
// changes; nothing is copied.
inline
const xmlChar *
XMLCHAR(const char *s)
{
    return reinterpret_cast<const xmlChar *>(s);
}


// Returns 1 when the libxml2 string 'a' and the C string 'b' compare
// equal according to xmlStrcmp(), 0 otherwise.
inline
int
equal(const xmlChar *a, const char *b)
{
    const int order = xmlStrcmp(a, XMLCHAR(b));
    return order == 0;
}


// Negation of equal(): returns 1 when the two strings differ, 0 when
// they compare equal.
inline
int
different(const xmlChar *a, const char *b)
{
    if (equal(a, b))
        return 0;
    return 1;
}


// Thin wrapper around xmlGetProp() that takes the attribute name as a
// plain C string.  The result is whatever xmlGetProp() returns.
inline
xmlChar *
getProp(xmlNodePtr node, const char *propName)
{
    const xmlChar *xmlName = XMLCHAR(propName);
    return xmlGetProp(node, xmlName);
}


// Thin wrapper around xmlNodeListGetString() that always passes 1 as
// the third ("inLine") argument.  The result is whatever that call
// returns.
inline
xmlChar *
getString(xmlDocPtr doc, xmlNodePtr node)
{
    const int inLine = 1;
    return xmlNodeListGetString(doc, node, inLine);
}


// Concatenation of a guarded libxml2 string followed by a std::string.
// The bytes of 'a' are used as-is, as a C string; 'a' must hold a
// non-NULL pointer.
inline
string
operator + (const AutoString &a, const string &b)
{
    string joined((char *) a.get());
    joined += b;
    return joined;
}


// Concatenation of a std::string followed by a guarded libxml2 string.
// The bytes of 'b' are used as-is, as a C string; 'b' must hold a
// non-NULL pointer.
inline
string
operator + (const string &a, const AutoString &b)
{
    string joined(a);
    joined += (char *) b.get();
    return joined;
}


// File-local overload that accepts a libxml2 string and forwards it to
// FrenchVerbDictionary::convertModeName().
inline
Mode
convertModeName(const xmlChar *modeName)
{
    const char *plainName = (const char *) modeName;
    return FrenchVerbDictionary::convertModeName(plainName);
}


// File-local overload that accepts a libxml2 string and forwards it to
// FrenchVerbDictionary::convertTenseName().
inline
Tense
convertTenseName(const xmlChar *tenseName)
{
    const char *plainName = (const char *) tenseName;
    return FrenchVerbDictionary::convertTenseName(plainName);
}


// Builds the dictionary from "conjugation-fr.xml" and "verbs-fr.xml"
// found in the data directory.  That directory is the compile-time
// LIBDATADIR; in builds without NDEBUG, a LIBDATADIR environment
// variable overrides it.  Errors from init() propagate to the caller.
FrenchVerbDictionary::FrenchVerbDictionary() throw (logic_error)
  : conjugSys(),
    knownVerbs(),
    inflectionTable(),
    verbTrie(*this)
{
    const char *dataDir = LIBDATADIR;
    #ifndef NDEBUG
    const char *envDir = getenv("LIBDATADIR");
    if (envDir != NULL)
        dataDir = envDir;
    #endif

    const string prefix = string(dataDir) + "/";
    init(prefix + "conjugation-fr.xml", prefix + "verbs-fr.xml");
}


// Builds the dictionary from two explicitly named XML files: the
// conjugation templates and the list of verbs.  All the work is done by
// init(), whose errors propagate to the caller.
FrenchVerbDictionary::FrenchVerbDictionary(
                                const string &conjugationFilename,
                                const string &verbsFilename)
                                        throw (logic_error)
  : conjugSys(),
    knownVerbs(),
    inflectionTable(),
    verbTrie(*this)
{
    init(conjugationFilename, verbsFilename);
}


void
FrenchVerbDictionary::init(const string &conjugationFilename,
			    const string &verbsFilename)
					throw (logic_error)
{
    toUTF8 = iconv_open("UTF-8", "ISO-8859-1");
    if (toUTF8 == (iconv_t) -1)
	throw logic_error("conversion from ISO-8859-1 to UTF-8 not supported");
    toLatin1 = iconv_open("ISO-8859-1", "UTF-8");
    if (toLatin1 == (iconv_t) -1)
	throw logic_error("conversion from UTF-8 to ISO-8859-1 not supported");

    {
	for (int i = 0; i < 0xC0; i++)
	    latin1TolowerTable[i] = char(tolower(char(i)));
	for (int i = 0xC0; i < 0xE0; i++)
	    latin1TolowerTable[i] = char(i + 0x20);
	for (int i = 0xE0; i < 0x100; i++)
	    latin1TolowerTable[i] = char(i);
    }

    loadConjugationDatabase(conjugationFilename.c_str());
    loadVerbDatabase(verbsFilename.c_str());
}


void
FrenchVerbDictionary::loadConjugationDatabase(
				const char *conjugationFilename)
					throw (logic_error)
{
    if (conjugationFilename == NULL)
	throw invalid_argument("conjugationFilename");

    AutoDoc conjDoc = xmlParseFile(conjugationFilename);
    if (!conjDoc)
	throw logic_error("could not parse " + string(conjugationFilename));

    readConjugation(conjDoc.get());
}


void
FrenchVerbDictionary::loadVerbDatabase(
				const char *verbsFilename)
					throw (logic_error)
{
    if (verbsFilename == NULL)
	throw invalid_argument("verbsFilename");

    AutoDoc verbsDoc = xmlParseFile(verbsFilename);
    if (!verbsDoc)
	throw logic_error("could not parse " + string(verbsFilename));

    readVerbs(verbsDoc.get());
}


/*
    Walks a parsed conjugation document and fills 'conjugSys' (template
    name -> mode -> tense -> persons -> inflections) as well as
    'inflectionTable' (template name -> inflection -> list of
    mode/tense/person combinations that produce it).

    Expected layout: a <conjugation-fr> root containing <template>
    elements; each template contains mode elements, which contain tense
    elements, which contain <p> elements (one per person), whose
    children each carry one inflection.

    Throws logic_error when the document is empty, has the wrong root
    element, or contains a template whose 'name' attribute is missing
    or lacks a colon.
*/
void
FrenchVerbDictionary::readConjugation(xmlDocPtr doc) throw(logic_error)
{
    xmlNodePtr root = xmlDocGetRootElement(doc);
    if (root == NULL)
        throw logic_error("empty conjugation document");
    if (!equal(root->name, "conjugation-fr"))
        throw logic_error("wrong top node in conjugation document");

    for (xmlNodePtr templNode = root->xmlChildrenNode;
                        templNode != NULL;
                        templNode = templNode->next)
    {
        // Anything that is not a <template> element is junk between tags.
        if (!equal(templNode->name, "template"))
            continue;

        // A template name is made of the root and the termination,
        // separated by a colon, e.g., "pla:cer".
        string tname = getLatin1XmlProp(templNode, "name");
        if (tname.empty())
            throw logic_error("missing template name attribute");
        if (tname.find(':') == string::npos)
            throw logic_error("missing colon in template name");

        // operator [] creates an empty entry in each table; references
        // to those entries are kept and filled in below.
        TemplateSpec &templateSpec = conjugSys[tname];
        TemplateInflectionTable &inflections = inflectionTable[tname];

        // Modes (infinitive, indicative, conditional, etc).
        // Only nodes named "text" are skipped at this level and at the
        // tense level.
        for (xmlNodePtr modeNode = templNode->xmlChildrenNode;
                            modeNode != NULL;
                            modeNode = modeNode->next)
        {
            if (equal(modeNode->name, "text"))
                continue;

            ModeSpec &modeSpec =
                        templateSpec[::convertModeName(modeNode->name)];

            // Tenses of this mode.
            for (xmlNodePtr tenseNode = modeNode->xmlChildrenNode;
                                tenseNode != NULL;
                                tenseNode = tenseNode->next)
            {
                if (equal(tenseNode->name, "text"))
                    continue;

                TenseSpec &tenseSpec =
                        modeSpec[::convertTenseName(tenseNode->name)];

                // Persons of this tense, numbered from 1 in document
                // order.
                int personNumber = 0;
                for (xmlNodePtr personNode = tenseNode->xmlChildrenNode;
                                    personNode != NULL;
                                    personNode = personNode->next)
                {
                    if (!equal(personNode->name, "p"))
                        continue;

                    ++personNumber;

                    tenseSpec.push_back(PersonSpec());
                    PersonSpec &personSpec = tenseSpec.back();

                    // Inflections for this person.  (Most persons of
                    // most verbs have only one inflection.)
                    for (xmlNodePtr inflNode = personNode->xmlChildrenNode;
                                        inflNode != NULL;
                                        inflNode = inflNode->next)
                    {
                        string variant = getLatin1XmlNodeText(
                                            doc, inflNode->xmlChildrenNode);
                        personSpec.push_back(variant);

                        // Remember which mode/tense/person this
                        // inflection stands for, for deconjugation.
                        inflections[variant].push_back(
                                ModeTensePersonNumber(
                                        (char *) modeNode->name,
                                        (char *) tenseNode->name,
                                        personNumber));
                    }
                }
            }
        }
    }
}


/*
    Returns the text of the node list that starts at 'node', converted
    from UTF-8 to Latin-1.  An empty string is returned when libxml2
    yields no text.

    The buffer returned by xmlNodeListGetString() is owned by an
    AutoString, so it is released on every path, including when the
    conversion throws.

    Throws an int (the errno value) if the conversion fails.
*/
string
FrenchVerbDictionary::getLatin1XmlNodeText(xmlDocPtr doc, xmlNodePtr node)
                                                                throw(int)
{
    AutoString text(getString(doc, node));
    if (!text)
        return string();
    return utf8ToLatin1((char *) text.get());
}


/*
    Returns the value of attribute 'propName' of 'node', converted from
    UTF-8 to Latin-1.  An empty string is returned when xmlGetProp()
    yields NULL.

    The buffer returned by xmlGetProp() is owned by an AutoString, so it
    is released on every path, including when the conversion throws.

    Throws an int (the errno value) if the conversion fails.
*/
string
FrenchVerbDictionary::getLatin1XmlProp(xmlNodePtr node, const char *propName)
                                                                throw(int)
{
    AutoString value(getProp(node, propName));
    if (!value)
        return string();
    return utf8ToLatin1((char *) value.get());
}


/*
    Walks a parsed verbs document and registers every verb in
    'knownVerbs', in 'aspirateHVerbs' when applicable, and in the trie
    used for deconjugation.

    Children of an entry are located by position: the first child is
    taken as the <i> (infinitive) node, the node two siblings later as
    the <t> (template name) node, and the presence of a node two
    siblings after <t> marks the verb as starting with an aspirate h.

    Throws logic_error when the document is empty, has the wrong root
    element, or contains an entry that is malformed or that names a
    template absent from 'conjugSys'.
*/
void
FrenchVerbDictionary::readVerbs(xmlDocPtr doc) throw(logic_error)
{
    xmlNodePtr root = xmlDocGetRootElement(doc);
    if (root == NULL)
        throw logic_error("empty verbs document");
    if (!equal(root->name, "verbs-fr"))
        throw logic_error("wrong top node in verbs document");

    for (xmlNodePtr verbNode = root->xmlChildrenNode;
                        verbNode != NULL;
                        verbNode = verbNode->next)
    {
        // Text and comments between entries are not verbs.
        if (equal(verbNode->name, "text"))
            continue;
        if (equal(verbNode->name, "comment"))
            continue;

        // Infinitive.
        xmlNodePtr infNode = verbNode->xmlChildrenNode;
        if (infNode == NULL || infNode->xmlChildrenNode == NULL)
            throw logic_error("missing <i> node");

        string infinitive =
                        getLatin1XmlNodeText(doc, infNode->xmlChildrenNode);
        if (infinitive.empty())
            throw logic_error("empty <i> node");

        // Template name: one sibling is skipped between <i> and <t>.
        if (infNode->next == NULL)
            throw logic_error("unexpected end after <i> node");
        xmlNodePtr templNode = infNode->next->next;
        if (templNode == NULL)
            throw logic_error("missing <t> node");

        string tname = getLatin1XmlNodeText(doc, templNode->xmlChildrenNode);
        if (tname.empty())
            throw logic_error("empty <t> node");
        const string::size_type colonPos = tname.find(':');
        if (colonPos == string::npos)
            throw logic_error("missing colon in <t> node");
        if (conjugSys.find(tname) == conjugSys.end())
            throw logic_error("unknown template name: " + tname);

        knownVerbs[infinitive] = tname;

        // <aspirate-h>: remember the verbs that start with an aspirate h.
        xmlNodePtr afterTemplate = templNode->next;
        if (afterTemplate != NULL && afterTemplate->next != NULL)
            aspirateHVerbs.insert(infinitive);

        // The trie associates a list of template names with each verb
        // radical.  The radical is the infinitive without the
        // termination given after the colon of the template name.
        const size_t infinitiveLen = infinitive.length();
        const size_t terminationLen = tname.length() - colonPos - 1;
        assert(terminationLen > 0);
        assert(infinitiveLen >= terminationLen);

        string radical = infinitive.substr(0, infinitiveLen - terminationLen);

        vector<string> **slot = verbTrie.getUserDataPointer(radical);
        assert(slot != NULL);
        if (*slot == NULL)
            *slot = new vector<string>();  // first verb with this radical
        (*slot)->push_back(tname);
    }
}


// Releases the two iconv conversion descriptors opened by init().
FrenchVerbDictionary::~FrenchVerbDictionary()
{
    iconv_close(toUTF8);
    iconv_close(toLatin1);
}


// Looks up a conjugation template by name.  Returns a pointer to the
// entry stored in 'conjugSys', or NULL when the name is unknown.
const TemplateSpec *
FrenchVerbDictionary::getTemplate(const string &templateName) const
{
    ConjugationSystem::const_iterator pos = conjugSys.find(templateName);
    if (pos != conjugSys.end())
        return &pos->second;
    return NULL;
}


// Iterator on the first entry of the conjugation system.
ConjugationSystem::const_iterator
FrenchVerbDictionary::beginConjugSys() const
{
    ConjugationSystem::const_iterator first = conjugSys.begin();
    return first;
}


// Past-the-end iterator of the conjugation system.
ConjugationSystem::const_iterator
FrenchVerbDictionary::endConjugSys() const
{
    ConjugationSystem::const_iterator pastEnd = conjugSys.end();
    return pastEnd;
}


const char *
FrenchVerbDictionary::getVerbTemplate(const char *infinitive) const
{
    if (infinitive == NULL)
	return NULL;
    VerbTable::const_iterator it = knownVerbs.find(infinitive);
    if (it == knownVerbs.end())
	return NULL;
    return it->second.c_str();
}


const char *
FrenchVerbDictionary::getVerbTemplate(const string &infinitive) const
{
    return getVerbTemplate(infinitive.c_str());
}


// Iterator on the first entry of the table of known verbs.
VerbTable::const_iterator
FrenchVerbDictionary::beginKnownVerbs() const
{
    VerbTable::const_iterator first = knownVerbs.begin();
    return first;
}


// Past-the-end iterator of the table of known verbs.
VerbTable::const_iterator
FrenchVerbDictionary::endKnownVerbs() const
{
    VerbTable::const_iterator pastEnd = knownVerbs.end();
    return pastEnd;
}


const std::vector<ModeTensePersonNumber> *
FrenchVerbDictionary::getMTPNForInflection(
				const std::string &templateName,
				const std::string &inflection) const
{
    InflectionTable::const_iterator i = inflectionTable.find(templateName);
    if (i == inflectionTable.end())
	return NULL;
    const TemplateInflectionTable &ti = i->second;
    TemplateInflectionTable::const_iterator j = ti.find(inflection);
    if (j == ti.end())
	return NULL;
    return &j->second;
}


/*static*/
// Converts an English mode name, as used in the XML databases, into
// the corresponding Mode value.  INVALID_MODE is returned for NULL and
// for any name that is not listed below.
Mode
FrenchVerbDictionary::convertModeName(const char *modeName)
{
    static const struct { const char *name; Mode mode; } knownModes[] =
    {
        { "infinitive",  INFINITIVE_MODE  },
        { "indicative",  INDICATIVE_MODE  },
        { "conditional", CONDITIONAL_MODE },
        { "subjunctive", SUBJUNCTIVE_MODE },
        { "imperative",  IMPERATIVE_MODE  },
        { "participle",  PARTICIPLE_MODE  },
    };

    if (modeName != NULL)
    {
        const size_t numModes = sizeof(knownModes) / sizeof(knownModes[0]);
        for (size_t k = 0; k < numModes; k++)
            if (strcmp(modeName, knownModes[k].name) == 0)
                return knownModes[k].mode;
    }
    return INVALID_MODE;
}


/*static*/
// Converts an English tense name, as used in the XML databases, into
// the corresponding Tense value.  Several names map to the same value
// (e.g., "infinitive-present" and "present" both give PRESENT_TENSE).
// INVALID_TENSE is returned for NULL and for any unlisted name.
Tense
FrenchVerbDictionary::convertTenseName(const char *tenseName)
{
    static const struct { const char *name; Tense tense; } knownTenses[] =
    {
        { "infinitive-present", PRESENT_TENSE   },
        { "present",            PRESENT_TENSE   },
        { "imperfect",          IMPERFECT_TENSE },
        { "future",             FUTURE_TENSE    },
        { "simple-past",        PAST_TENSE      },
        { "imperative-present", PRESENT_TENSE   },
        { "present-participle", PRESENT_TENSE   },
        { "past-participle",    PAST_TENSE      },
        { "past",               PAST_TENSE      },
    };

    if (tenseName != NULL)
    {
        const size_t numTenses =
                        sizeof(knownTenses) / sizeof(knownTenses[0]);
        for (size_t k = 0; k < numTenses; k++)
            if (strcmp(tenseName, knownTenses[k].name) == 0)
                return knownTenses[k].tense;
    }
    return INVALID_TENSE;
}


// Analyzes a conjugated verb.  'results' is installed as the trie's
// destination for the duration of the lookup; the trie's callback
// (onFoundPrefixWithUserData) appends one InflectionDesc per analysis
// found.  The destination is reset to NULL afterwards.
void
FrenchVerbDictionary::deconjugate(const string &conjugatedVerb,
                                std::vector<InflectionDesc> &results)
{
    std::vector<InflectionDesc> *sink = &results;

    verbTrie.setDestination(sink);
    static_cast<void>(verbTrie.get(conjugatedVerb));  // value not needed
    verbTrie.setDestination(NULL);
}


/*virtual*/
void
FrenchVerbDictionary::VerbTrie::onFoundPrefixWithUserData(
			const string &conjugatedVerb,
			string::size_type index,
			const vector<std::string> *templateList) const throw()
{
    assert(templateList != NULL);
    if (results == NULL)
	return;

    string radical(conjugatedVerb, 0, index);
    string term(conjugatedVerb, index);

    /*
	'templateList' contains the names of conjugated templates that might
	apply to the conjugated verb.  We check each of them to see if there
	is one that accepts the given termination 'term'.
    */
    for (vector<string>::const_iterator i = templateList->begin();
						i != templateList->end(); i++)
    {
	const string &tname = *i;
	const TemplateInflectionTable &ti =
				fvd.inflectionTable.find(tname)->second;
	TemplateInflectionTable::const_iterator j = ti.find(term);
	if (j == ti.end())
	    continue;  // template 'tname' does not accept termination 'term'

	// template 'tname' accepts 'term', so we produce some results.

	string templateTerm(tname, tname.find(':') + 1);
	    // termination of the infinitive form

	const vector<ModeTensePersonNumber> &v = j->second;
	    // list of mode-tense-person combinations that can correspond
	    // to the conjugated verb's termination

	for (vector<ModeTensePersonNumber>::const_iterator k = v.begin();
						    k != v.end(); k++)
	{
	    const ModeTensePersonNumber &mtpn = *k;

	    string infinitive = radical + templateTerm;
		// the infinitive of the conjugated verb is formed from its
		// radical part and from the termination of the template name

	    results->push_back(InflectionDesc(infinitive, tname, mtpn));
		// the InflectionDesc object is an analysis of the
		// conjugated verb
	}
    }
}


/*static*/
// Returns the English name of a mode, or NULL when 'm' lies outside the
// INFINITIVE_MODE..PARTICIPLE_MODE range.
// The lookup subtracts 1 from the numeric value of 'm', i.e., the table
// is laid out for INFINITIVE_MODE being 1 (presumably; confirm against
// the enum declaration).
const char *
FrenchVerbDictionary::getModeName(Mode m)
{
    static const char *names[] =
    {
        "infinitive", "indicative", "conditional",
        "subjunctive", "imperative", "participle"
    };

    const int value = int(m);
    const bool inRange = value >= int(INFINITIVE_MODE)
                            && value <= int(PARTICIPLE_MODE);
    if (!inRange)
        return NULL;

    return names[value - 1];
}


/*static*/
// Returns the English name of a tense, or NULL when 't' lies outside
// the PRESENT_TENSE..FUTURE_TENSE range.
// The lookup subtracts 1 from the numeric value of 't', i.e., the table
// is laid out for PRESENT_TENSE being 1 (presumably; confirm against
// the enum declaration).
const char *
FrenchVerbDictionary::getTenseName(Tense t)
{
    static const char *names[] =
    {
        "present", "past", "imperfect", "future"
    };

    const int value = int(t);
    const bool inRange = value >= int(PRESENT_TENSE)
                            && value <= int(FUTURE_TENSE);
    if (!inRange)
        return NULL;

    return names[value - 1];
}


// Returns a copy of a Latin-1 string in which every byte has been
// replaced by its entry in 'latin1TolowerTable'.
string
FrenchVerbDictionary::tolowerLatin1(const string &latin1String) const
{
    string lowered(latin1String);
    for (string::iterator c = lowered.begin(); c != lowered.end(); ++c)
        *c = latin1TolowerTable[(unsigned char) *c];
    return lowered;
}


/*
    Converts a Latin-1 string to UTF-8 with the 'toUTF8' iconv
    descriptor.

    The terminating NUL is converted along with the text, and the result
    is read back from the output buffer as a C string.  The output
    buffer has room for two bytes per input character plus the
    terminator.

    Throws an int (the errno value set by iconv()) if the conversion
    fails.
*/
string
FrenchVerbDictionary::latin1ToUTF8(const string &latin1String) const throw(int)
{
    const size_t srcLen = latin1String.length();
    size_t inbytesleft = srcLen + 1;
    size_t outbytesleft = srcLen * 2 + 1;

    char *inbuf = new char[inbytesleft];
    strcpy(inbuf, latin1String.c_str());
    char *outbuf = new char[outbytesleft];

    ICONV_CONST char *in = inbuf;
        /*  ICONV_CONST is defined by iconv.m4 to 'const' if the 2nd
            argument of iconv() requires const, as is the case under
            FreeBSD 6.1 with GCC 3.4.4 [2006-07-10].
        */
    char *out = outbuf;

    const size_t rc = iconv(toUTF8, &in, &inbytesleft, &out, &outbytesleft);
    const int savedErrno = errno;  // captured before anything can change it
    const bool failed = (rc == (size_t) -1);

    string result;
    if (!failed)
        result = outbuf;

    // Single cleanup point for both the success and the failure case.
    delete [] inbuf;
    delete [] outbuf;

    if (failed)
        throw savedErrno;
    return result;
}


/*
    Converts a UTF-8 string to Latin-1 with the 'toLatin1' iconv
    descriptor.

    The terminating NUL is converted along with the text, and the result
    is read back from the output buffer as a C string.  The output
    buffer is as large as the input buffer.

    Throws an int (the errno value set by iconv()) if the conversion
    fails.
*/
string
FrenchVerbDictionary::utf8ToLatin1(const string &utf8String) const throw(int)
{
    const size_t srcLen = utf8String.length();
    size_t inbytesleft = srcLen + 1;
    size_t outbytesleft = srcLen + 1;

    char *inbuf = new char[inbytesleft];
    strcpy(inbuf, utf8String.c_str());
    char *outbuf = new char[outbytesleft];

    ICONV_CONST char *in = inbuf;
    char *out = outbuf;

    const size_t rc = iconv(toLatin1, &in, &inbytesleft, &out, &outbytesleft);
    const int savedErrno = errno;  // captured before anything can change it
    const bool failed = (rc == (size_t) -1);

    string result;
    if (!failed)
        result = outbuf;

    // Single cleanup point for both the success and the failure case.
    delete [] inbuf;
    delete [] outbuf;

    if (failed)
        throw savedErrno;
    return result;
}


// Converts, in place, the 'infinitive' field of every element of 'vec'
// from UTF-8 to Latin-1.  Throws an int if a conversion fails; elements
// before the failing one have already been converted.
void
FrenchVerbDictionary::utf8ToLatin1(vector<InflectionDesc> &vec) const throw(int)
{
    // The 'templateName' field is in ASCII and does not need to be converted.

    for (vector<InflectionDesc>::size_type k = 0; k < vec.size(); k++)
    {
        InflectionDesc &desc = vec[k];
        desc.infinitive = utf8ToLatin1(desc.infinitive);
    }
}


// Converts, in place, every string of a two-level vector from UTF-8 to
// Latin-1.  Throws an int if a conversion fails; strings before the
// failing one have already been converted.
void
FrenchVerbDictionary::utf8ToLatin1(
                        vector<vector<string> > &vec) const throw(int)
{
    for (vector<vector<string> >::size_type row = 0; row < vec.size(); row++)
    {
        vector<string> &line = vec[row];
        for (vector<string>::size_type col = 0; col < line.size(); col++)
            line[col] = utf8ToLatin1(line[col]);
    }
}



/*static*/
/*
    Returns the radical of an infinitive, i.e., the infinitive without
    the termination that follows the colon in the template name.
    For example, "placer" with template "pla:cer" gives "pla".

    Throws logic_error if the template name contains no colon, or if the
    termination is longer than the infinitive.  (Without the second
    check, the unsigned subtraction wraps around and the whole
    infinitive is returned as the radical.)
*/
string
FrenchVerbDictionary::getRadical(
                        const string &infinitive,
                        const string &templateName) throw(logic_error)
{
    const string::size_type posColon = templateName.find(':');
    if (posColon == string::npos)
        throw logic_error("no colon found in template name");

    const string::size_type lenSuffix = templateName.length() - posColon - 1;
    if (lenSuffix > infinitive.length())
        throw logic_error("infinitive shorter than template termination");

    return infinitive.substr(0, infinitive.length() - lenSuffix);
}


/*static*/
void
FrenchVerbDictionary::generateTense(const string &radical,
				const TemplateSpec &templ,
				Mode mode,
				Tense tense,
				vector< vector<string> > &dest,
				bool includePronouns,
				bool aspirateH) throw()
{
    const ModeSpec &modeSpec = templ.find(mode)->second;
    const TenseSpec &tenseSpec = modeSpec.find(tense)->second;

    if (mode != INDICATIVE_MODE
	    && mode != CONDITIONAL_MODE
	    && mode != SUBJUNCTIVE_MODE)
	includePronouns = false;

    for (TenseSpec::const_iterator p = tenseSpec.begin();
				    p != tenseSpec.end(); p++)
    {
	dest.push_back(vector<string>());
	for (PersonSpec::const_iterator i = p->begin(); i != p->end(); i++)
	{
	    string pronoun;
	    string v = radical + *i;

	    if (includePronouns)
	    {
		size_t noPers = p - tenseSpec.begin();
		switch (noPers)
		{
		    case 0:
		    {
			bool elideJe = false;
			if (!aspirateH)
			{
			    char init = (v.empty() ? '\0' : v[0]);
			    bool isVowelOrH = (strchr(
				    "aeiouyhAEIOUYH",
								init) != NULL);
			    if (isVowelOrH)
				elideJe = true;
			}
			pronoun = (elideJe ? "j'" : "je ");
			break;
		    }
		    case 1: pronoun = "tu "; break;
		    case 2: pronoun = "il "; break;
		    case 3: pronoun = "nous "; break;
		    case 4: pronoun = "vous "; break;
		    case 5: pronoun = "ils "; break;
		}

		if (mode == SUBJUNCTIVE_MODE)
		{
		    if (noPers == 2 || noPers == 5)
			pronoun = "qu'" + pronoun;
		    else
			pronoun = "que " + pronoun;
		}
	    }

	    dest.back().push_back(pronoun + v);
	}
    }
}


// Tells whether the given infinitive was recorded in 'aspirateHVerbs'
// when the verb database was read.
bool FrenchVerbDictionary::isVerbStartingWithAspirateH(
                                const std::string &infinitive) const throw()
{
    const bool recorded =
                (aspirateHVerbs.find(infinitive) != aspirateHVerbs.end());
    return recorded;
}
