/*
   Name: $RCSfile: parser.c,v $
   Author: Alan Moran
   $Date: 2005/11/13 20:57:21 $
   $Revision: 1.17 $
   $Id: parser.c,v 1.17 2005/11/13 20:57:21 a_j_moran Exp $

   Legal Notice:

   This program is free software; you can redistribute it and/or
   modify it under the terms of the license contained in the
   COPYING file that comes with this distribution.

 */

/**
   @file

   @brief XML parser to normalize documents prior to XSLT transformation.

   For the most part the tools upon which rapple builds perform adequately when
   processing web assets.  In certain known situations, however, they fail to
   parse a document in a fashion that is suitable for rapple.  The parser
   module addresses these deficiencies by providing an entry point in the
   overall processing for specific custom parsing.

*/

#include <expat.h>
#include <ctype.h>
#include "globals.h"
#include "parser.h"
#include "regex.h"

/* NOTE(review): presumably the size of the document being processed; it is
   not referenced anywhere in the visible part of this file -- confirm that
   it is still needed. */
long        doc_size;
/* Output buffer holding the normalized document.  Allocated in rpl_parse()
   and appended to by the expat handlers start(), characters() and end(). */
rpl_str_t   p_doc;
/* Copy of the href attribute value of an <a> element; set in start() and
   consumed in end() when the anchor turned out to contain no text. */
rpl_str_t   a_href_buf;
/* depth:    incremented in start() and decremented in end().
   txt_flag: cleared in start() and set in characters(); end() reads it to
             decide whether an <a> element had any text. */
int         depth, txt_flag;

#ifdef IGNORE
/*
   Lookup table mapping an entity name (without the leading '&' and trailing
   ';') to the body of its replacement reference (likewise without '&' and
   ';').  It is consulted by rpl_parse_resolve_entities(), which compares the
   name with strcmp() and emits '&' + numeric + ';'.

   The whole table is compiled only when IGNORE is defined.
 */
static const rpl_entity_map entities[] = {
	{ "nbsp",   "#160" },
	{ "iexcl",  "#161" },
	{ "cent",   "#162" },
	{ "pound",  "#163" },
	{ "curren", "#164" },
	{ "yen",    "#165" },
	{ "brvbar", "#166" },
	{ "sect",   "#167" },
	{ "uml",    "#168" },
	{ "copy",   "#169" },
	{ "ordf",   "#170" },
	{ "laquo",  "#171" },
	{ "not",    "#172" },
	{ "shy",    "#173" },
	{ "reg",    "#174" },
	{ "macr",   "#175" },
	{ "deg",    "#176" },
	{ "plusmn", "#177" },
	{ "sup2",   "#178" },
	{ "sup3",   "#179" },
	{ "acute",  "#180" },
	{ "micro",  "#181" },
	{ "para",   "#182" },
	{ "middot", "#183" },
	{ "cedil",  "#184" },
	{ "sup1",   "#185" },
	{ "ordm",   "#186" },
	{ "raquo",  "#187" },
	{ "frac14", "#188" },
	{ "frac12", "#189" },
	{ "frac34", "#190" },
	{ "iquest", "#191" },
	{ "Agrave", "#192" },
	{ "Aacute", "#193" },
	{ "Acirc",  "#194" },
	{ "Atilde", "#195" },
	{ "Auml",   "#196" },
	{ "Aring",  "#197" },
	{ "AElig",  "#198" },
	{ "Ccedil", "#199" },
	{ "Egrave", "#200" },
	{ "Eacute", "#201" },
	{ "Ecirc",  "#202" },
	{ "Euml",   "#203" },
	{ "Igrave", "#204" },
	{ "Iacute", "#205" },
	{ "Icirc",  "#206" },
	{ "Iuml",   "#207" },
	{ "ETH",    "#208" },
	{ "Ntilde", "#209" },
	{ "Ograve", "#210" },
	{ "Oacute", "#211" },
	{ "Ocirc",  "#212" },
	{ "Otilde", "#213" },
	{ "Ouml",   "#214" },
	{ "times",  "#215" },
	{ "Oslash", "#216" },
	{ "Ugrave", "#217" },
	{ "Uacute", "#218" },
	{ "Ucirc",  "#219" },
	{ "Uuml",   "#220" },
	{ "Yacute", "#221" },
	{ "THORN",  "#222" },
	{ "szlig",  "#223" },
	{ "agrave", "#224" },
	{ "aacute", "#225" },
	{ "acirc",  "#226" },
	{ "atilde", "#227" },
	{ "auml",   "#228" },
	{ "aring",  "#229" },
	{ "aelig",  "#230" },
	{ "ccedil", "#231" },
	{ "egrave", "#232" },
	{ "eacute", "#233" },
	{ "ecirc",  "#234" },
	{ "euml",   "#235" },
	{ "igrave", "#236" },
	{ "iacute", "#237" },
	{ "icirc",  "#238" },
	{ "iuml",   "#239" },
	{ "eth",    "#240" },
	{ "ntilde", "#241" },
	{ "ograve", "#242" },
	{ "oacute", "#243" },
	{ "ocirc",  "#244" },
	{ "otilde", "#245" },
	{ "ouml",   "#246" },
	{ "divide", "#247" },
	{ "oslash", "#248" },
	{ "ugrave", "#249" },
	{ "uacute", "#250" },
	{ "ucirc",  "#251" },
	{ "uuml",   "#252" },
	{ "yacute", "#253" },
	{ "thorn",  "#254" },
	{ "yuml",   "#255" },
	{ "quot",    "#34" },
	/* NOTE(review): the next two values do not follow the bare "#NNN" form
	   used by every other row; with '&' prepended and ';' appended they
	   yield "&#38#38;;" and "&#38#60;;".  Confirm the intended replacement
	   before this table is enabled. */
	{ "amp",     "#38#38;" },
	{ "lt",      "#38#60;" },
	{ "gt",      "#62" },
	/* the key used to contain a stray tab ("apos\t") and so never matched */
	{ "apos",    "#39" },
	{ "OElig",   "#338" },
	{ "oelig",   "#339" },
	{ "Scaron",  "#352" },
	{ "scaron",  "#353" },
	{ "Yuml",    "#376" },
	{ "circ",    "#710" },
	{ "tilde",   "#732" },
	{ "ensp",    "#8194" },
	{ "emsp",    "#8195" },
	{ "thinsp",  "#8201" },
	{ "zwnj",    "#8204" },
	{ "zwj",     "#8205" },
	{ "lrm",     "#8206" },
	{ "rlm",     "#8207" },
	{ "ndash",   "#8211" },
	{ "mdash",   "#8212" },
	{ "lsquo",   "#8216" },
	{ "rsquo",   "#8217" },
	{ "sbquo",   "#8218" },
	{ "ldquo",   "#8220" },
	{ "rdquo",   "#8221" },
	{ "bdquo",   "#8222" },
	{ "dagger",  "#8224" },
	{ "Dagger",  "#8225" },
	{ "permil",  "#8240" },
	{ "lsaquo",  "#8249" },
	{ "rsaquo",  "#8250" },
	{ "euro",    "#8364" },
	{ "fnof",     "#402" },
	{ "Alpha",    "#913" },
	{ "Beta",     "#914" },
	{ "Gamma",    "#915" },
	{ "Delta",    "#916" },
	{ "Epsilon",  "#917" },
	{ "Zeta",     "#918" },
	{ "Eta",      "#919" },
	{ "Theta",    "#920" },
	{ "Iota",     "#921" },
	{ "Kappa",    "#922" },
	{ "Lambda",   "#923" },
	{ "Mu",       "#924" },
	{ "Nu",       "#925" },
	{ "Xi",       "#926" },
	{ "Omicron",  "#927" },
	{ "Pi",       "#928" },
	{ "Rho",      "#929" },
	{ "Sigma",    "#931" },
	{ "Tau",      "#932" },
	{ "Upsilon",  "#933" },
	{ "Phi",      "#934" },
	{ "Chi",      "#935" },
	{ "Psi",      "#936" },
	{ "Omega",    "#937" },
	{ "alpha",    "#945" },
	{ "beta",     "#946" },
	{ "gamma",    "#947" },
	{ "delta",    "#948" },
	{ "epsilon",  "#949" },
	{ "zeta",     "#950" },
	{ "eta",      "#951" },
	{ "theta",    "#952" },
	{ "iota",     "#953" },
	{ "kappa",    "#954" },
	{ "lambda",   "#955" },
	{ "mu",       "#956" },
	{ "nu",       "#957" },
	{ "xi",       "#958" },
	{ "omicron",  "#959" },
	{ "pi",       "#960" },
	{ "rho",      "#961" },
	{ "sigmaf",   "#962" },
	{ "sigma",    "#963" },
	{ "tau",      "#964" },
	{ "upsilon",  "#965" },
	{ "phi",      "#966" },
	{ "chi",      "#967" },
	{ "psi",      "#968" },
	{ "omega",    "#969" },
	{ "thetasym", "#977" },
	{ "upsih",    "#978" },
	{ "piv",      "#982" },
	{ "bull",     "#8226" },
	{ "hellip",   "#8230" },
	{ "prime",    "#8242" },
	{ "Prime",    "#8243" },
	{ "oline",    "#8254" },
	{ "frasl",    "#8260" },
	{ "weierp",   "#8472" },
	{ "image",    "#8465" },
	{ "real",     "#8476" },
	{ "trade",    "#8482" },
	{ "alefsym",  "#8501" },
	{ "larr",     "#8592" },
	{ "uarr",     "#8593" },
	{ "rarr",     "#8594" },
	{ "darr",     "#8595" },
	{ "harr",     "#8596" },
	{ "crarr",    "#8629" },
	{ "lArr",     "#8656" },
	{ "uArr",     "#8657" },
	{ "rArr",     "#8658" },
	{ "dArr",     "#8659" },
	{ "hArr",     "#8660" },
	{ "forall",   "#8704" },
	{ "part",     "#8706" },
	{ "exist",    "#8707" },
	{ "empty",    "#8709" },
	{ "nabla",    "#8711" },
	{ "isin",     "#8712" },
	{ "notin",    "#8713" },
	{ "ni",       "#8715" },
	{ "prod",     "#8719" },
	{ "sum",      "#8721" },
	{ "minus",    "#8722" },
	{ "lowast",   "#8727" },
	{ "radic",    "#8730" },
	{ "prop",     "#8733" },
	{ "infin",    "#8734" },
	{ "ang",      "#8736" },
	{ "and",      "#8743" },
	{ "or",       "#8744" },
	{ "cap",      "#8745" },
	{ "cup",      "#8746" },
	{ "int",      "#8747" },
	{ "there4",   "#8756" },
	{ "sim",      "#8764" },
	{ "cong",     "#8773" },
	{ "asymp",    "#8776" },
	{ "ne",       "#8800" },
	{ "equiv",    "#8801" },
	{ "le",       "#8804" },
	{ "ge",       "#8805" },
	{ "sub",      "#8834" },
	{ "sup",      "#8835" },
	{ "nsub",     "#8836" },
	{ "sube",     "#8838" },
	{ "supe",     "#8839" },
	{ "oplus",    "#8853" },
	{ "otimes",   "#8855" },
	{ "perp",     "#8869" },
	{ "sdot",     "#8901" },
	{ "lceil",    "#8968" },
	{ "rceil",    "#8969" },
	{ "lfloor",   "#8970" },
	{ "rfloor",   "#8971" },
	{ "lang",     "#9001" },
	{ "rang",     "#9002" },
	{ "loz",      "#9674" },
	{ "spades",   "#9824" },
	{ "clubs",    "#9827" },
	{ "hearts",   "#9829" },
	{ "diams",    "#9830" }
};
/* number of rows in the entities table */
static int entities_sz = sizeof(entities) / sizeof(entities[0]);
#endif

/**
   Escapes every ampersand in a string by replacing each '&' with "&amp;".

   If the string contains no ampersand it is left untouched.  Otherwise a new
   buffer is obtained from rpl_me_malloc(), the escaped text is written into
   it, *input is redirected to the new buffer and the old buffer is released
   with rpl_me_free().

   @param input pointer to the string to escape; *input must not be NULL and
                may be replaced by a newly allocated string.
 */
static void
rpl_parse_escape_amperands(rpl_str_t *input) 
{
    static rpl_c_str_t amp = "&amp;";
    rpl_str_t original, scan, hit, escaped;
    size_t orig_len, amp_len, rd, wr = 0;
    int n_amps = 0;

    assert(*input != NULL);

    original = *input;
    orig_len = strlen(original);

    /* first pass: count the ampersands so the result can be dimensioned */
    for(scan = original; (hit = strchr(scan, '&')) != NULL; scan = hit + 1)
        n_amps++;

    /* nothing to escape: keep the caller's buffer as it is */
    if(n_amps == 0)
        return;

    amp_len = strlen(amp);
    escaped = (rpl_str_t)rpl_me_malloc(orig_len + amp_len * n_amps + 1);

    /* second pass: copy, expanding each '&' into the literal form */
    for(rd = 0; rd < orig_len; rd++)
    {
        if(original[rd] != '&')
        {
            escaped[wr++] = original[rd];
            continue;
        }
        memcpy(escaped + wr, amp, amp_len);
        wr += amp_len;
    }
    escaped[wr] = '\0';

    /* hand the new buffer to the caller and drop the old one */
    *input = escaped;
    rpl_me_free(original);
}

/**
   Search and replace &amp; with & where the &amp; is escaping an entity
   reference.  An occurrence of "&amp;" is collapsed to "&" only when it is
   immediately followed by a run of alphanumeric (or CR/LF) characters that is
   terminated by ';' (e.g. "&amp;euro;" becomes "&euro;"); every other "&amp;"
   is copied through unchanged.  Note the regular expression replace is not
   being used here because it resolves quoted ampersands (&amp;amp;) to & which
   is not what is required here.

   @param ctnt pointer to string to be parsed; *ctnt must not be NULL.  On
               return *ctnt points to a buffer newly obtained from
               rpl_me_malloc() and the previous buffer has been released with
               rpl_me_free().
 */
static void 
rpl_parse_literal_ampersand(rpl_str_t *ctnt)
{
	static const char amp[] = "&amp;";
	const size_t amp_len = sizeof(amp) - 1;
	rpl_str_t src, sp, np, ap, wp;
	const char *rp;
	size_t chunk;

	assert(*ctnt != NULL);
	
	src = *ctnt;
	/* the result is never longer than the input (each "&amp;" is either kept
	   or shortened to "&"), so strlen(src) characters plus the terminating
	   NUL is sufficient */
	np = (rpl_str_t)rpl_me_malloc(strlen(src) + 1);

	/* search and replace any &amp; found; wp is the write cursor so that each
	   piece is appended without rescanning the output from its start */
	sp = src;
	wp = np;
	while((ap = strstr(sp, amp)) != NULL)
	{
		/* copy the text that precedes this "&amp;" */
		chunk = (size_t)(ap - sp);
		memcpy(wp, sp, chunk);
		wp += chunk;

		/* TODO: or one that is used to escape an entity e.g., where &euro; is supposed to appear in output */
		/* ...but not one that represents a literal "&" (can be split over newline?) */
		rp = ap + amp_len;
		/* <ctype.h> functions require a value representable as unsigned char */
		while(isalnum((unsigned char)*rp) || *rp == '\r' || *rp == '\n')
			rp++;
		if(*rp == ';')
		{
			*wp++ = '&';
		} else {
			memcpy(wp, amp, amp_len);
			wp += amp_len;
		}
		sp = ap + amp_len;
	}
	/* copy the remainder, including the terminating NUL */
	strcpy(wp, sp);

	*ctnt = np;
	rpl_me_free(src);
}


/**
   Resolves named entities into numeric entities using the entities table:
   "&name;" becomes '&' + numeric + ';'.  Names that are not in the table are
   reported with a PARSER_UNKNOWN_ENTITY warning and copied through unchanged.
   There is still some work to be done testing this function (it is not used
   now but has been retained as it will likely be of use later); it is only
   compiled when IGNORE is defined.

   @param ctnt pointer to string containing named entities; *ctnt must not be
               NULL.  The input buffer is modified while scanning and is
               released with rpl_me_free(); on return *ctnt points to a buffer
               newly obtained from rpl_me_malloc().
 */
#ifdef IGNORE
static void
rpl_parse_resolve_entities(rpl_str_t *ctnt)
{
	rpl_regex_t *rep;
	rpl_str_t sp, cp, lp, rp, rpl_ctnt, msg;
	size_t name_len, numeric_len, max_numeric = 0, amp_count = 0;
	int i=0;

	assert(*ctnt != NULL);

	/* the expression is created and destroyed within each call so that no
	   dangling pointer survives between calls */
	rep = rpl_regex_create("((?U)(&([a-zA-Z][a-zA-Z0-9]+);))",RPL_REGEX_FLAG_MULTILINE);

	/* dimension the result: every replacement consumes one '&' and grows the
	   text by less than the longest numeric form in the table */
	for(i=0; i<entities_sz; i++)
	{
		numeric_len = strlen(entities[i].numeric);
		if(numeric_len > max_numeric)
			max_numeric = numeric_len;
	}
	for(sp = *ctnt; (sp = strchr(sp, '&')) != NULL; sp++)
		amp_count++;
	rpl_ctnt = (rpl_str_t)rpl_me_malloc(strlen(*ctnt) + amp_count * max_numeric + 1);
	strcpy(rpl_ctnt, "");

	/* search for named entities in the input */
	sp = *ctnt;
	while(rpl_regex_match(rep, sp, 0) > 0)
	{	
		/* there exists a named entity within sp; without a captured name
		   sp cannot be advanced, so stop instead of looping forever */
		if((cp = rpl_regex_capture(rep,3)) == NULL)
			break;

		/* [lp,rp] range delimits the name cp to look up; insist on the
		   surrounding '&' and ';' so that an earlier plain-text occurrence
		   of the same word is not mistaken for the entity */
		name_len = strlen(cp);
		lp = sp;
		while((lp = strstr(lp, cp)) != NULL)
		{
			if((lp > sp) && (lp[-1] == '&') && (lp[name_len] == ';'))
				break;
			lp++;
		}
		if(lp == NULL)
		{
			rpl_me_free(cp);
			break;
		}
		rp = lp + name_len;
		/* terminate sp just after the '&' and cut off the ';' */
		*lp = '\0';
		*rp = '\0';

		/* if a named entity match occurs then resolve */
		for(i=0; i<entities_sz; i++)
		{
			if(strcmp(cp, entities[i].name) == 0)
				break;
		}

		/* text preceding the entity, up to and including its '&' */
		strcat(rpl_ctnt, sp);
		if(i < entities_sz)
		{
			/* having confirmed the match, emit the numeric form */
			strcat(rpl_ctnt, entities[i].numeric);
		} else {
			/* otherwise we know that we have an unknown named entity:
			   warn and keep the name as it was written */
			msg = rpl_message_get("PARSER_UNKNOWN_ENTITY", cp, RPL_EOM);
			rpl_log_warn(msg);
			rpl_me_free(msg);
			strcat(rpl_ctnt, cp);
		}
		strcat(rpl_ctnt, ";");
		sp = rp + 1;
		rpl_me_free(cp);
	}
	strcat(rpl_ctnt, sp);
	sp = *ctnt;
	*ctnt = rpl_ctnt;

	/* free resources */
	rpl_me_free(sp);
	rpl_regex_destroy(rep);
}
#endif

/**
   Handles start of element event. Passed to XML_SetElementHandler.

   Appends the opening tag to p_doc with any namespace prefix removed from the
   element name.  Attributes whose name starts with RPL_XMLNS are dropped; all
   others are re-emitted as name='value'.  For <a> elements the href value is
   copied into a_href_buf.  The elements br, hr, img and link are closed
   immediately (" />").  Clears txt_flag and increments depth.

   @param data user data supplied by expat (unused)
   @param el   element name
   @param attr NULL-terminated array of alternating attribute names and values
 */
static void
start(void *data, const char *el, const char **attr) {
    int         i, xhtml_ns_c=0;
    rpl_c_str_t  sp;
    rpl_str_t    msg;

    (void)data;

    /* locate the position of the first character that does not precede a namespace separator (:) */
    if((sp = strchr(el,':')) != NULL) {
        sp++;
    } else {
        sp = el;
    }

    strcat(p_doc, "<");
    strcat(p_doc, sp);

    for(i=0; attr[i]; i += 2) {
        if(strncmp(RPL_XMLNS, attr[i], strlen(RPL_XMLNS)) != 0) {
            /* emit  name='value'  piece by piece; the former temporary buffer
               was two bytes too small for the formatted attribute */
            strcat(p_doc, " ");
            strcat(p_doc, attr[i]);
            strcat(p_doc, "='");
            strcat(p_doc, attr[i+1]);
            strcat(p_doc, "' ");

            /* note the href value in <a> elements */
            if((strcmp(el, "a") == 0) && (strcmp(attr[i],"href") == 0)) {
                a_href_buf = (rpl_str_t)rpl_me_malloc(strlen(attr[i+1]) + 1);
                strcpy(a_href_buf, attr[i+1]);
            }
        }
        /* count attribute values that mention the XHTML namespace */
        if(strstr(attr[i+1],RPL_XHTML_NS) != NULL)
            xhtml_ns_c++;
        /* presence of < or > not permitted in attribute values - this usually happens
           when scripting languages e.g., PHP, use constructs such as <?= to dynamically
           set attribute values. */
        if((strchr(attr[i+1], '<') != NULL) || (strchr(attr[i+1],'>') != NULL))
        {
            msg = rpl_message_get("PARSER_ILLEGAL_ATTR_CHAR", RPL_EOM);
            rpl_log_warn(msg);
            rpl_me_free(msg);
        }
    }

    /* append RPL_XHTML_NS to the <html> tag when one of its attribute values
       contained it.  NOTE(review): this comment used to read "if not already
       present", which does not match the "> 0" test - confirm the intended
       condition. */
    if((strcmp(el, "html") == 0) && (xhtml_ns_c > 0))
        strcat(p_doc, RPL_XHTML_NS);

    /* if the tag is empty then close it now */
    if((strcmp(el,"br") != 0) && (strcmp(el,"hr") != 0) && (strcmp(el,"img") != 0) && (strcmp(el,"link") != 0)) {
        strcat(p_doc, ">");
    } else {
        strcat(p_doc, " />");
    }

    /* assume that the element contains no text */
    txt_flag = 0;

    depth++;
}

/**
   Handles presence of character text event. Passed to XML_SetCharacterDataHandler.

   Appends the received text to p_doc and sets txt_flag.  The text handed over
   by expat is not NUL-terminated, so only the first txtlen characters may be
   read.

   @param data   user data supplied by expat (unused)
   @param txt    character data, NOT NUL-terminated
   @param txtlen number of characters available in txt
 */
static void
characters(void *data, const char *txt, int txtlen) {
    (void)data;

    if((txtlen > 0) && (txt != NULL)) {
        /* strncat reads at most txtlen characters from txt and always
           NUL-terminates the result */
        strncat(p_doc, txt, (size_t)txtlen);
    }

    /* confirm that text was encountered */
    txt_flag = 1;
}

/**
   Handles end of element event. Passed to XML_SetElementHandler.

   Appends the closing tag (namespace prefix removed) to p_doc, except for
   br, hr, img and link which start() has already closed.  When an <a>
   element ends with txt_flag still clear, the recorded href value is emitted
   as its text.  Decrements depth.

   @param data user data supplied by expat (unused)
   @param el   element name
 */
static void
end(void *data, const char *el) {
    rpl_c_str_t sp;

    (void)data;

    /* there is nothing to do if the tag is empty */
    if((strcmp(el,"br") != 0) && (strcmp(el,"hr") != 0) && (strcmp(el,"img") != 0) && (strcmp(el,"link") != 0)) {
        if((strcmp(el,"a") == 0) && (a_href_buf != NULL)) {
            /* if there is no text in an <a> element then default to the href value */
            if(txt_flag == 0)
                strcat(p_doc, a_href_buf);
            /* the href belongs to this anchor only: always release it and
               reset the pointer so that it is neither reused by a later <a>
               nor freed twice */
            rpl_me_free(a_href_buf);
            a_href_buf = NULL;
        }
        /* locate the position of the first character that does not precede a namespace separator (:) */
        if((sp = strchr(el,':')) != NULL) {
            sp++;
        } else {
            sp = el;
        }
        strcat(p_doc, "</");
        strcat(p_doc, sp);
        strcat(p_doc, ">");
    }
    depth--;
}

/**
   Performs normalization parsing.

   The input is copied, every ampersand in the copy is escaped, the result is
   run through expat with the start/characters/end handlers (which rebuild
   the document in p_doc) and finally the escaped entity references are
   restored by rpl_parse_literal_ampersand().

   @param xml_doc_in document to parse; must not be NULL and is not modified.

   @return the normalized document (the buffer also referenced by the global
           p_doc), or RPL_STR_NUL if expat reports a parse error, in which
           case the error has been logged and the partial output released.
 */
rpl_str_t
rpl_parse(rpl_str_t xml_doc_in) {

    XML_Parser parser;
    rpl_str_t msg, xml_doc;
    char line_no[48];

    assert(xml_doc_in != NULL);

    /* work on a private copy; it comes from rpl_me_malloc() because
       rpl_parse_escape_amperands() may release it with rpl_me_free() */
    xml_doc = (rpl_str_t)rpl_me_malloc(strlen(xml_doc_in) + 1);
    strcpy(xml_doc, xml_doc_in);

    /* NOTE(review): rpl_log_fatal() is presumably not expected to return -
       confirm, since parser would be NULL below otherwise */
    if((parser = XML_ParserCreate(NULL)) == NULL)
        rpl_log_fatal(rpl_message_get("OUT_OF_MEMORY", "normalizing XML parser", RPL_EOM));

    XML_SetElementHandler(parser, start, end);
    XML_SetCharacterDataHandler(parser, characters);

    p_doc = (rpl_str_t)rpl_me_malloc(2 * strlen(xml_doc) + 1);
    /* the handlers append with strcat(), so start from an empty string */
    p_doc[0] = '\0';

    /* entities complicate the parsing somewhat since expat automatically resolves
       predefined entities (&amp; &lt; etc.) and sometimes these need to be
       interpreted literally (e.g., when wishing to display an ampersand in a
       document).  The easiest solution is to "escape" all ampersands prior to
       parsing, let expat resolve what it finds and then quote the leftover
       ampersands (this passes the document faithfully onto the next parsing phase) */
    rpl_parse_escape_amperands(&xml_doc);
    if (XML_Parse(parser, xml_doc, (int)strlen(xml_doc), 1) == XML_STATUS_ERROR)
    {
        /* bounded formatting: the line number can have any number of digits */
        snprintf(line_no, sizeof(line_no), "(line %lu)",
                 (unsigned long)XML_GetCurrentLineNumber(parser));
        msg = rpl_str_concat("XML Parse Error: ", XML_ErrorString(XML_GetErrorCode(parser)), 
                                    " ", line_no, RPL_STR_EOC);
        rpl_log_error(msg);
        rpl_me_free(msg);

        /* release everything acquired above before reporting failure */
        XML_ParserFree(parser);
        rpl_me_free(xml_doc);
        rpl_me_free(p_doc);
        p_doc = NULL;
        return RPL_STR_NUL;
    }

    /* the parser and the escaped copy are no longer needed */
    XML_ParserFree(parser);
    rpl_me_free(xml_doc);

    /* rpl_parse_resolve_entities(&p_doc); */
    rpl_parse_literal_ampersand(&p_doc);
    return p_doc;
}

