/*
   Name: $RCSfile: mod_link.c,v $
   Author: Alan Moran
   $Date: 2005/11/26 21:37:37 $
   $Revision: 1.4 $
   $Id: mod_link.c,v 1.4 2005/11/26 21:37:37 a_j_moran Exp $

   Legal Notice:

   This program is free software; you can redistribute it and/or
   modify it under the terms of the license contained in the
   COPYING file that comes with this distribution.

 */

/**
   @file

   @brief Functions to support the link checker module.

 */

#include <expat.h>
#include "globals.h"
/* these conditionals must appear after the inclusion of the globals header */
#if SUPPORT_XSLT_PROC == 1
#include <sablot.h>
#elif SUPPORT_XSLT_PROC == 2
#include <libxml/xmlmemory.h>
#include <libxml/debugXML.h>
#include <libxml/HTMLtree.h>
#include <libxml/xmlIO.h>
/* #include <libxml/DOCBparser.h> */
#include <libxml/xinclude.h>
#include <libxml/catalog.h>
#include <libxslt/xslt.h>
#include <libxslt/xsltInternals.h>
#include <libxslt/transform.h>
#include <libxslt/xsltutils.h>
#endif
#include "mod_link.h"

/* Base directory that link targets are resolved against; assigned in
   rpl_mod_link_configure() from rpl_wk_get_webdir(). */
static rpl_str_t rpl_link_base_dir;
/* Non-zero when rpl_mod_link_init() found all link checker settings; the
   process and cleanup steps do nothing while it is zero. */
int process_flag = 0;
/* XML report stream: opened by rpl_mod_link_init(), written by the process
   step and the error reporter, closed by rpl_mod_link_cleanup(). */
FILE *xml_rpt_fp;
/* Expat parser for the document currently being scanned; created and freed
   per document in rpl_mod_link_process() and queried for line/column by
   rpl_mod_link_report_error(). */
XML_Parser parser;

/**
   Produce a copy of a link reduced to its resource part: everything from
   the first fragment marker ('#', e.g. index.html#intro) or query marker
   ('?', e.g. index.html?name=Joe) onwards is dropped.  The copy is made
   with strdup() and ownership passes to the caller.

   @param link the link to be normalized (must not be NULL).
   @return newly allocated normalized copy of the link.
 */
static rpl_str_t
rpl_mod_link_normalize(rpl_c_str_t link)
{
	rpl_str_t copy;

	assert(link != NULL);

	copy = strdup(link);
	/* cutting at whichever of '#' or '?' occurs first gives the same result
	   as cutting at '#' and then at '?'; with neither present this merely
	   rewrites the existing terminator */
	copy[strcspn(copy, "#?")] = '\0';

	return copy;
}

/**
   Resolves a link and, when it refers to a local resource, checks that the
   resource exists below the module base directory.

   A link is treated as local when its normalized form either starts with
   the configured template domain (internal fully qualified link) or does
   not start with "http://", "https://" or "mailto:" (internal relative
   link).  All other links are not checked.

   @param link link pointing to a (possible) local resource (must not be NULL).
   @param rdp directory of the referring document relative to the base
              directory; NULL is treated as the empty string.
   @return the result of rpl_fs_file_exists() for local links (callers treat
           a negative value as "missing"), otherwise 0.
 */
static int
rpl_mod_link_resolve(rpl_c_str_t link, rpl_str_t rdp)
{
	rpl_str_t domain, nlink;
	rpl_str_t resource = NULL;
	size_t domain_len, size;
	int rc = 0;

	assert(link != NULL); 

	if(rdp == NULL)
		rdp = "";
	nlink = rpl_mod_link_normalize(link);
	if(nlink == NULL)
		return 0;	/* nothing to check if the copy could not be made */
	domain = rpl_cfg_get_trf_tpl_domain();
	/* an unset or empty domain can never identify an internal link; without
	   this guard an empty domain would match every link */
	domain_len = (domain != NULL) ? strlen(domain) : 0;

	/* check if the link refers to a local resource; the prefix test is made
	   on the normalized link because that is the string the offset below is
	   applied to */
	if((domain_len > 0) && (strncmp(nlink, domain, domain_len) == 0))
	{
		/* matches the domain so must be an internal fully qualified name:
		   base + '/' + remainder + NUL */
		size = strlen(rpl_link_base_dir) + (strlen(nlink) - domain_len) + 2;
		resource = rpl_me_malloc(size);
		snprintf(resource, size, "%s/%s", rpl_link_base_dir, nlink + domain_len);
	} else if((strncmp(nlink, "http://", 7) != 0) && (strncmp(nlink, "https://", 8) != 0) 
				&& (strncmp(nlink, "mailto:", 7) != 0)) {
		/* not an external reference so must be an internal relative link:
		   base + '/' + rdp + '/' + link + NUL (two separators and the
		   terminator, hence + 3) */
		size = strlen(rpl_link_base_dir) + strlen(rdp) + strlen(nlink) + 3;
		resource = rpl_me_malloc(size);
		snprintf(resource, size, "%s/%s/%s", rpl_link_base_dir, rdp, nlink);
	} 

	/* check presence of the resource */
	if(resource != NULL)
	{
		rc = rpl_fs_file_exists(resource);
		rpl_me_free(resource);
	}
	rpl_me_free(nlink);

	return rc;
}

/**
   Appends one broken-link record to the XML link check report.  The record
   layout comes from RPL_MOD_LINK_ERROR (defined outside this file); the
   position reported is the current line/column of the module's parser.

   @param element name of the element that contains the broken link.
   @param attribute name of the attribute that contains the broken link.
   @param link the link that is broken.
 */
static void
rpl_mod_link_report_error(rpl_c_str_t element, rpl_str_t attribute, rpl_c_str_t link)
{
	assert((element != NULL) && (attribute != NULL) && (link != NULL));

	/* arguments are passed as: element, attribute, line, column, link */
	fprintf(xml_rpt_fp, RPL_MOD_LINK_ERROR, element, attribute,
			XML_GetCurrentLineNumber(parser), XML_GetCurrentColumnNumber(parser),
			link);
}

/**
   Scan the input document for elements that have attribtues pointing to 
   resources and check that the resources exist if they are local.

   @param data user defined data (in this case the relative path of the parsed file)
   @param el the name of the element being handled.
   @param attr the key/value pairs for the attributes of the element.
 */
static void 
rpl_mod_link_start(void *data, const char *el, const char **attr)
{
	int i=0;
	rpl_str_t rdp = (rpl_str_t)data;

	if((strcmp(el, "a") == 0) || (strcmp(el, "A") == 0) || (strcmp(el, "link") == 0) || (strcmp(el, "LINK") == 0))
	{
		/* process elements with "href" attributes */
		for(i=0; attr[i]; i+= 2)
		{
			if((strcmp(attr[i],"href")==0) || (strcmp(attr[i],"HREF")==0))
			{
				if(rpl_mod_link_resolve(attr[i+1], rdp) < 0)
					rpl_mod_link_report_error(el, "href", attr[i+1]);
			}
		}
	} else if((strcmp(el, "img") == 0) || (strcmp(el, "IMG") == 0)) {
		/* process elements with "src" attributes */
		for(i=0; attr[i]; i+= 2)
		{
			if((strcmp(attr[i],"src")==0) || (strcmp(attr[i],"SRC")==0))
			{
				if(rpl_mod_link_resolve(attr[i+1], rdp) < 0)
					rpl_mod_link_report_error(el, "src", attr[i+1]);
			}
		}
	}
}

/**
   Expat end-element handler.  Link checking needs no work when an element
   closes, so this is intentionally a no-op; it exists only because the
   element handlers are registered as a pair.

   @param data user defined data (in this case the relative path of the parsed file)
   @param el the name of the element being handled.
 */
static void
rpl_mod_link_end(void *data, const char *el)
{
	/* both arguments are deliberately unused */
	(void)data;
	(void)el;
}

/**
   Fill in the module interface for the link module: the three lifecycle
   callbacks and the base directory.  The base directory is obtained from
   rpl_wk_get_webdir() and is also remembered in this file for resolving
   links.

   @param fns pointer to module interface to be configured.
 */
void
rpl_mod_link_configure(rpl_mod_fns *fns)
{
	/* lifecycle callbacks */
	fns->init    = rpl_mod_link_init;
	fns->process = rpl_mod_link_process;
	fns->cleanup = rpl_mod_link_cleanup;

	/* one lookup feeds both the file-level copy and the interface */
	fns->basedir = rpl_link_base_dir = rpl_wk_get_webdir();
}

/**
   Check that the link checker is fully configured (stylesheet, XML report
   and HTML report locations) and, if so, open the XML report file and write
   the report header.  When any setting is missing a warning is logged and
   the module stays disabled.

   The process flag is only raised once the report file is actually open, so
   a failed open can never leave the module enabled with an unusable report
   stream.

   @return RPL_WK_ERR if the XML report file cannot be opened, otherwise
           RPL_WK_OK (including the "not configured" case).
 */
rpl_wk_status 
rpl_mod_link_init()
{
	rpl_wk_status status = RPL_WK_OK;
	rpl_str_t msg;

	/* disabled until proven otherwise */
	process_flag = 0;

	if((rpl_cfg_get_linkchecker_xslt() != NULL) 
			&& (rpl_cfg_get_linkchecker_xml_report() != NULL) 
				&& (rpl_cfg_get_linkchecker_html_report() != NULL))
	{
		if((xml_rpt_fp = fopen((rpl_c_str_t)rpl_cfg_get_linkchecker_xml_report(), "wb")) == NULL)
			return RPL_WK_ERR;
		process_flag = 1;
		fprintf(xml_rpt_fp,"%s", RPL_MOD_LINK_HDR);
	} else {
		msg = rpl_message_get("LINK_CHECKER_MISSING_ENTRIES", RPL_EOM);
		rpl_log_warn(msg);
		rpl_me_free(msg);
	}

	return status; 
}

/**
   Checks the links of one web asset.  Regular files that are registered and
   transformable are parsed with Expat; every broken local link found by the
   element handler is written to the XML report between the per-file entry
   markers.  Does nothing while the module is disabled.

   @param filename name of file to be checked relative to website base directory
                   (must not be NULL).
   @param statbuf stat of file.
   @return RPL_WK_OK on success (or when nothing had to be done),
           RPL_WK_WARN if the document could not be parsed as XML,
           RPL_WK_ERR if paths cannot be resolved, the asset is not in the
           registry, the file cannot be read or no parser can be created.
 */
rpl_wk_status 
rpl_mod_link_process(rpl_c_str_t filename, struct stat statbuf)
{
	rpl_str_t msg, prefix, rdp, fp, key;
	rpl_str_t file = NULL, ctnt = NULL;
	char line_no[48];	/* "(line " + any unsigned long + ")" fits easily */
	rpl_wk_status status = RPL_WK_OK;
	rpl_reg_item item;

	assert(filename != NULL);

	if(!process_flag) 
		return status;

	/* extract key information */
	if(rpl_fs_resolve_paths(filename, rpl_link_base_dir, &rdp, &fp))
		return RPL_WK_ERR;
	/* NOTE(review): key is never released here (nor was it originally);
	   confirm who owns the string returned by rpl_reg_create_key() */
	key = rpl_reg_create_key(rdp, fp);

	/* only parse the files */
	if(!S_ISREG(statbuf.st_mode))
		goto cleanup;

	/* retrieve asset from registry */
	item = rpl_reg_search(key);
	if(item == &RPL_REG_WA_NULL)
	{
		msg = rpl_message_get("REG_ASSET_NOT_FOUND", key, RPL_EOM);
		rpl_log_error(msg);
		rpl_me_free(msg);
		status = RPL_WK_ERR;
		goto cleanup;
	}

	if(!rpl_wa_is_transformable(*item))
		goto cleanup;

	fprintf(xml_rpt_fp,"%s%s%s", RPL_MOD_LINK_FILE_ENTRY_1, filename, RPL_MOD_LINK_FILE_ENTRY_2);
	/* the message text is kept in its own variable so it can be released
	   once the concatenated log line has been built */
	prefix = rpl_message_get("WK_PROCESSING", RPL_EOM);
	msg = rpl_str_concat(prefix, "link ", filename, RPL_STR_EOC);
	rpl_log_info(msg);
	rpl_me_free(msg);
	rpl_me_free(prefix);

	/* base + '/' + rdp + '/' + fp + NUL */
	file = rpl_me_malloc(strlen(rpl_link_base_dir) + strlen(rdp) + strlen(fp) + 3);
	sprintf(file, "%s/%s/%s", rpl_link_base_dir, rdp, fp);
	ctnt = rpl_fs_f2str(file);

	if(ctnt == NULL)
	{
		/* NOTE(review): assumes rpl_fs_f2str() reports failure with NULL */
		msg = rpl_str_concat("Unable to read file: ", file, RPL_STR_EOC);
		rpl_log_error(msg);
		rpl_me_free(msg);
		status = RPL_WK_ERR;
	}
	/* new parser created for each document (is there a better way ?) */
	else if((parser = XML_ParserCreate(NULL)) == NULL)
	{
		msg = rpl_message_get("OUT_OF_MEMORY", "link checker XML parser", RPL_EOM);
		rpl_log_fatal(msg);
		/* only reached if rpl_log_fatal() returns */
		rpl_me_free(msg);
		status = RPL_WK_ERR;
	}
	else
	{
		XML_SetElementHandler(parser, rpl_mod_link_start, rpl_mod_link_end);
		/* the handlers need the document directory to resolve relative links */
		XML_SetUserData(parser, (void *)rdp);

		if (XML_Parse(parser, ctnt, (int)strlen(ctnt), 1) == XML_STATUS_ERROR)
		{
			/* bounded formatting: the line number may have any number of digits */
			snprintf(line_no, sizeof line_no, "(line %lu)",
					(unsigned long)XML_GetCurrentLineNumber(parser));
			msg = rpl_str_concat("XML Parse Error: ", XML_ErrorString(XML_GetErrorCode(parser)), 
					" ", line_no, RPL_STR_EOC);
			rpl_log_error(msg);
			rpl_me_free(msg);
			status = RPL_WK_WARN;
		}

		XML_ParserFree(parser);
	}

	/* always close the per-file entry opened above so the report stays well formed */
	fprintf(xml_rpt_fp, "%s", RPL_MOD_LINK_FILE_ENTRY_3);

	rpl_me_free(file);
	if(ctnt != NULL)
		rpl_me_free(ctnt);

cleanup:
	/* free resources (every exit after path resolution passes through here);
	   empty strings are not released, as in the original code */
	if(strlen(rdp) > 0)
		rpl_me_free(rdp);
	if(strlen(fp) > 0)
		rpl_me_free(fp);

	return status; 
}

/**
   Release resources (and transform XML report into HTML) held during processing. 
 */
rpl_wk_status 
rpl_mod_link_cleanup()
{
	rpl_wk_status status = RPL_WK_OK;
	rpl_str_t xslt = NULL, input;

#if SUPPORT_XSLT_PROC == 1
	/* define Sablot variables */
	SablotSituation situation;
	SablotHandle proc;
	SDOM_Document xml, xsl;
	int sablotronFlags;
	rpl_str_t output;
#elif SUPPORT_XSLT_PROC == 2
	static xsltStylesheetPtr cur;
	rpl_c_str_t catalog_loc;
	xmlDocPtr doc, res = NULL;
	rpl_str_t msg;
	FILE *fp;
#endif

	if(process_flag) 
	{
		/* acquire XSLT file and input */
		fprintf(xml_rpt_fp,"%s", RPL_MOD_LINK_TRL);
		fclose(xml_rpt_fp);
		input = rpl_fs_f2str(rpl_cfg_get_linkchecker_xml_report());

#if SUPPORT_XSLT_PROC == 1
		xslt = rpl_fs_f2str(rpl_cfg_get_linkchecker_xslt());
		/* prepare input and stylehseet for processing */
		SablotCreateSituation(&situation);
		SablotParseStylesheetBuffer(situation, xslt, &xsl);
		SablotParseBuffer(situation, input, &xml);
		SablotCreateProcessorForSituation(situation,&proc);
		SablotAddArgTree(situation, proc, RPL_MOD_LINK_SABLOT_XSLT, xsl);
		SablotAddArgTree(situation, proc, RPL_MOD_LINK_SABLOT_DATA, xml);

		sablotronFlags = SablotGetOptions (situation);
		sablotronFlags |= SAB_PARSE_PUBLIC_ENTITIES;
		SablotSetOptions (situation, sablotronFlags);

		/* perform transformation */
		SablotRunProcessorGen(situation, proc, RPL_MOD_LINK_SABLOT_ARG_XSLT, RPL_MOD_LINK_SABLOT_ARG_DATA, 
									RPL_MOD_LINK_SABLOT_ARG_OUT);
		SablotGetResultArg(proc, RPL_MOD_LINK_SABLOT_ARG_OUT, &output);
		rpl_fs_str2f(output, rpl_cfg_get_linkchecker_html_report());

		/* free resources */
		SablotDestroyDocument(situation, xml);
		SablotDestroyDocument(situation, xsl);
		SablotDestroyProcessor(proc);
		SablotDestroySituation(situation);
		rpl_me_free(output);

#elif SUPPORT_XSLT_PROC == 2

		/* attempt to identify the catalog location */
		catalog_loc = (rpl_cfg_get_trf_catalog()) ?  rpl_cfg_get_trf_catalog() : getenv("XML_CATALOG_FILES");
		if(catalog_loc == NULL)
		{
			msg = rpl_message_get("XML_CATALOG_DEFAULT", RPL_EOM);
			rpl_log_warn(msg);
		} else {
			xmlLoadCatalogs(catalog_loc);
		}

		/* set up libxslt internals */
		xmlSubstituteEntitiesDefault(1);
		xmlLoadExtDtdDefaultValue = 1;

		/* perform xsltlib operations */
		if((cur = xsltParseStylesheetFile((const xmlChar *) rpl_cfg_get_linkchecker_xslt())) == NULL)
		{
			msg = rpl_message_get("XSLT_STYLESHEET_PARSE_FAILED", xslt, RPL_EOM);
			rpl_log_fatal(msg);
		}

		if((doc = xmlParseMemory(input, strlen(input))) == NULL)
		{
			msg = rpl_message_get("XSLT_INPUT_PARSE_FAILED", input, RPL_EOM);
			rpl_log_error(msg);

			return -1;
		}
		res = xsltApplyStylesheet(cur, doc, NULL);
		/* xsltSaveResultToString((xmlChar **)&output, &out_size, res, cur); */
		if((fp = fopen(rpl_cfg_get_linkchecker_html_report(), "wb")) == NULL)
			return RPL_WK_ERR;
		xsltSaveResultToFile(fp, res, cur);

		/*
		   xmlFreeDoc(res);
		   xmlFreeDoc(doc);
		 */
		xsltFreeStylesheet(cur);
		xsltCleanupGlobals();
		xmlCleanupParser();
#endif

		/* release string resources 
		   rpl_me_free(rpl_link_base_dir);
		 */
	}

	return status;
}

