/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>
#include <errno.h>
#include <ctype.h>
#include <signal.h>
#include <assert.h>

#ifdef WIN32
#include <time.h>
#endif

#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif

#ifdef HAVE_LIBUTIL_H
#include <libutil.h>
#endif

#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif

#include "udm_common.h"
#include "udm_utils.h"
#include "udm_log.h"
#include "udm_conf.h"
#include "udm_indexer.h"
#include "udm_robots.h"
#include "udm_db.h"
#include "udm_url.h"
#include "udm_parser.h"
#include "udm_proto.h"
#include "udm_hrefs.h"
#include "udm_mutex.h"
#include "udm_crc32.h"
#include "udm_hash.h"
#include "udm_xmalloc.h"
#include "udm_http.h"
#include "udm_host.h"
#include "udm_server.h"
#include "udm_alias.h"
#include "udm_word.h"
#include "udm_crossword.h"
#include "udm_parsehtml.h"
#include "udm_parsexml.h"
#include "udm_spell.h"
#include "udm_execget.h"
#include "udm_agent.h"
#include "udm_match.h"
#include "udm_doc.h"
#include "udm_result.h"
#include "udm_parsedate.h"
#include "udm_unicode.h"
#include "udm_contentencoding.h"
#include "udm_vars.h"
#include "udm_guesser.h"
#include "udm_textlist.h"
#include "udm_id3.h"
#include "udm_stopwords.h"
#include "udm_wild.h"
#ifdef HAVE_ZLIB
#include "udm_store.h"
#endif

/* This should be last include */
#ifdef DMALLOC
#include "dmalloc.h"
#endif

/*
 * Invoke the optional ThreadInfo callback stored in A->Conf, passing it the
 * agent A together with s and m. Nothing happens when no callback is set.
 * NOTE(review): this expands to a bare `if` statement, so be careful when
 * placing it directly in front of an `else`.
 */
#define UDM_THREADINFO(A,s,m)	if(A->Conf->ThreadInfo)A->Conf->ThreadInfo(A,s,m)

/***************************************************************************/

/*
 * Tunable limit on the number of hrefs kept in an agent's in-memory list.
 * The expansion is parenthesized so that it evaluates as a single value
 * whatever operators surround it (e.g. `x / MAXHSIZE`).
 */
#define MAXHSIZE	(1023*4)	/* TUNE */

/*
 * Flush the agent's pending hrefs to the URL storage.
 *
 * Walks Indexer->Hrefs from index `dhrefs` up to `nhrefs`. For every entry
 * that is not yet marked as stored, a scratch document is filled with
 * "Referrer-ID", "Hops", "URL", "URL_ID", "Site_id" and "Server_id" and
 * handed to UdmURLAction(UDM_URL_ACTION_ADD); the entry is then marked stored.
 *
 * Returns UDM_OK on success, otherwise the first non-UDM_OK code reported
 * by UdmURLAction(). The scratch document is released on every path.
 */
__C_LINK int __UDMCALL UdmStoreHrefs(UDM_AGENT * Indexer) {
	size_t		i;
	int		res;
	UDM_DOCUMENT	Doc;

	UdmDocInit(&Doc);

	for (i = Indexer->Hrefs.dhrefs; i < Indexer->Hrefs.nhrefs; i++) {
		UDM_HREF	*H = &Indexer->Hrefs.Href[i];
		const char	*url;

		if (H->stored) continue;

		/* FIXME: add a configurable URL length check (strlen(H->url) <= UDM_URLSIZE) */
		url = H->url ? H->url : "";
		UdmVarListReplaceInt(&Doc.Sections, "Referrer-ID", H->referrer);
		UdmVarListReplaceUnsigned(&Doc.Sections,"Hops", H->hops);
		UdmVarListReplaceStr(&Doc.Sections,"URL", url);
		UdmVarListReplaceInt(&Doc.Sections, "URL_ID", UdmStrHash32(url));
		UdmVarListReplaceInt(&Doc.Sections,"Site_id", H->site_id);
		UdmVarListReplaceInt(&Doc.Sections,"Server_id", H->server_id);

		res = UdmURLAction(Indexer, &Doc, UDM_URL_ACTION_ADD);
		if (res != UDM_OK) {
			/* Do not leak the scratch document on failure */
			UdmDocFree(&Doc);
			return res;
		}
		H->stored = 1;
	}
	UdmDocFree(&Doc);

	/* Remember the index of the last stored URL.  */
	/* Note that it will become 0 after the next   */
	/* sort in AddUrl.                             */
	Indexer->Hrefs.dhrefs = Indexer->Hrefs.nhrefs;

	/* We should not free the URL list with our own database, */
	/* to avoid double indexing of the same document.         */
	/* So, do it if compiled with SQL only.                   */

	/* FIXME: this is incorrect with both SQL and built-in compiled */
	if (Indexer->Hrefs.nhrefs > MAXHSIZE) {
		UdmHrefListFree(&Indexer->Hrefs);
	}
	return UDM_OK;
}

/*
 * Build a full link string for newURL, taking the schema, host info, path and
 * file name from curURL wherever newURL does not supply them.
 *
 * On return *str points to a freshly malloc'ed string that the caller must
 * free, or is NULL when memory could not be allocated.
 *
 *   "mailto" / "javascript" -> "<schema>:<newURL->specific>"
 *   "htdb"                  -> "<schema>:<normalized path+file>"
 *   anything else           -> "<schema>://<hostinfo><normalized path+file>"
 *
 * For results starting with "ftp://" that contain ";type=", the string is cut
 * at the first ";type".
 */
static void RelLink(UDM_URL *curURL, UDM_URL *newURL, char **str){
	const char	*schema = newURL->schema ? newURL->schema : curURL->schema;
	const char	*hostinfo = newURL->hostinfo ? newURL->hostinfo : curURL->hostinfo;
	const char	*path = (newURL->path && newURL->path[0]) ? newURL->path : curURL->path;
	const char	*fname = ((newURL->filename && newURL->filename[0]) || (newURL->path && newURL->path[0])) ? 
	  newURL->filename : curURL->filename;
	char		*pathfile;
	size_t		size;

	/* Give the caller a well-defined result on every failure path */
	*str = NULL;

	pathfile = (char*)malloc(strlen(UDM_NULL2EMPTY(path)) + strlen(UDM_NULL2EMPTY(fname)) + 5);
	if (pathfile == NULL) return;
	sprintf(pathfile, "/%s%s",  UDM_NULL2EMPTY(path), UDM_NULL2EMPTY(fname));

	UdmURLNormalizePath(pathfile);

	if (!strcasecmp(UDM_NULL2EMPTY(schema), "mailto") || !strcasecmp(UDM_NULL2EMPTY(schema), "javascript")) {
		size = strlen(UDM_NULL2EMPTY(schema)) + strlen(UDM_NULL2EMPTY(newURL->specific)) + 4;
		if ((*str = (char*)malloc(size)) != NULL) {
			snprintf(*str, size, "%s:%s", UDM_NULL2EMPTY(schema), UDM_NULL2EMPTY(newURL->specific));
		}
	} else if(/*!strcasecmp(UDM_NULL2EMPTY(schema), "file") ||*/ !strcasecmp(UDM_NULL2EMPTY(schema), "htdb")) {
		size = strlen(UDM_NULL2EMPTY(schema)) + strlen(pathfile) + 4;
		if ((*str = (char*)malloc(size)) != NULL) {
			snprintf(*str, size, "%s:%s", UDM_NULL2EMPTY(schema), pathfile);
		}
	}else{
		size = strlen(UDM_NULL2EMPTY(schema)) + strlen(pathfile) + strlen(UDM_NULL2EMPTY(hostinfo)) + 8;
		if ((*str = (char*)malloc(size)) != NULL) {
			snprintf(*str, size, "%s://%s%s", UDM_NULL2EMPTY(schema), UDM_NULL2EMPTY(hostinfo), pathfile);
		}
	}

	/* Strip the FTP transfer-type suffix; skipped when allocation failed */
	if (*str != NULL && !strncmp(*str, "ftp://", 6) && strstr(*str, ";type=")) {
		char *type = strstr(*str, ";type");
		*type = '\0';
	}
	UDM_FREE(pathfile);
}

/*
 * Apply a <BASE HREF="..."> value, read from the "base.href" section, to the
 * document.
 *
 * The value is first parsed into a temporary URL. When that parse succeeds it
 * is parsed again straight into Doc->CurURL; when it fails, Doc->CurURL is
 * left untouched and the problem is logged. Always returns UDM_OK.
 */
static int UdmDocBaseHref(UDM_AGENT *Indexer,UDM_DOCUMENT *Doc){
	const char	*href = UdmVarListFindStr(&Doc->Sections, "base.href", NULL);
	UDM_URL		*probe;
	int		rc;

	if (href == NULL) {
		return UDM_OK;
	}

	/* Validate on a scratch URL so a bad BASE never clobbers CurURL */
	probe = UdmURLInit(NULL);
	rc = UdmURLParse(probe, href);

	if (rc == 0) {
		UdmURLParse(&Doc->CurURL, href);
		UdmLog(Indexer, UDM_LOG_ERROR, "BASE HREF '%s'", href);
	} else if (rc == UDM_URL_LONG) {
		UdmLog(Indexer, UDM_LOG_ERROR, "BASE HREF too long: '%s'", href);
	} else {
		/* UDM_URL_BAD and any other failure code */
		UdmLog(Indexer, UDM_LOG_ERROR, "Error in BASE HREF URL: '%s'", href);
	}

	UdmURLFree(probe);
	return UDM_OK;
}

/*
 * Look newhref up in the filter list L.
 *
 * Returns the method of the first matching filter (UdmMethod(M->arg)), or
 * UDM_METHOD_GET when nothing matches. A human readable explanation is
 * written to `reason`.
 *
 * `reason` carries no size of its own; the callers in this file pass a
 * char[1024], so the formatted text is capped at that many bytes instead of
 * being written unbounded (filter patterns come from configuration and can
 * be arbitrarily long).
 */
static int UdmFilterFind(UDM_MATCHLIST *L,const char *newhref,char *reason){
	enum { REASON_SIZE = 1024 };
	UDM_MATCH_PART	P[10];
	UDM_MATCH	*M = UdmMatchListFind(L, newhref, 10, P);

	if (M == NULL) {
		snprintf(reason, REASON_SIZE, "Allow by default");
		return UDM_METHOD_GET;
	}

	snprintf(reason, REASON_SIZE, "%s %s %s '%s'",
		M->arg,
		UdmMatchTypeStr(M->match_type),
		M->case_sense ? "Sensitive" : "InSensitive",
		M->pattern);
	return UdmMethod(M->arg);
}

/*
 * Turn the raw link in Href->url into the final URL to be indexed.
 *
 * Steps:
 *   1. parse Href->url and combine it with CurURL via RelLink();
 *   2. repeatedly apply matching ReverseAlias rules (at most 1024 rounds);
 *   3. find the Server entry for the result; skip the link (method set to
 *      UDM_METHOD_DISALLOW) when there is none, when the schema is
 *      "mailto"/"javascript", or when Href->hops exceeds the server's MaxHops;
 *   4. run the Allow/Disallow filters and store the outcome in Href->method;
 *   5. unless disallowed, replace Href->url with the converted URL and set
 *      Href->server_id from the server.
 *
 * Href->site_id is reset to 0 once a converted URL is available.
 * Always returns UDM_OK; allocation failures are logged and leave Href->url
 * unchanged.
 */
int UdmConvertHref(UDM_AGENT *Indexer, UDM_URL *CurURL, UDM_HREF *Href){
	int		parse_res, cascade;
	UDM_URL		*newURL = UdmURLInit(NULL);
	char		*newhref = NULL;
	UDM_MATCH	*Alias;
	char		*alias = NULL;
	size_t		aliassize, nparts = 10;
	UDM_MATCH_PART	Parts[10];
	UDM_SERVER	*Srv;
	char		reason[1024]="";
	
	if((parse_res=UdmURLParse(newURL, Href->url))){
		switch(parse_res){
			case UDM_URL_LONG:
				UdmLog(Indexer,UDM_LOG_DEBUG,"URL too long: '%s'",Href->url);
				break;
			case UDM_URL_BAD:
			default:
				UdmLog(Indexer,UDM_LOG_DEBUG,"Error in URL: '%s'",Href->url);
		}
	}
	
	RelLink(CurURL, newURL, &newhref);
	if (newhref == NULL) {
		/* RelLink could not build the link; nothing to convert */
		UdmLog(Indexer, UDM_LOG_ERROR, "No memory. %s line %d", __FILE__, __LINE__);
		goto ret;
	}
	aliassize = 128 + 10 * strlen(newhref);
	alias = (char*)malloc(aliassize);
	if (alias == NULL) {
	  UdmLog(Indexer, UDM_LOG_ERROR, "No memory. %s line %d", __FILE__, __LINE__);
	  goto ret;
	}
	
	UdmLog(Indexer,UDM_LOG_DEBUG,"Link '%s' %s",Href->url,newhref);
	for(cascade = 0; ((Alias=UdmMatchListFind(&Indexer->Conf->ReverseAliases,newhref,nparts,Parts))) && (cascade < 1024); cascade++) {
		char *rewritten;

		UdmMatchApply(alias,aliassize,newhref,Alias->arg,Alias,nparts,Parts);
		if (!alias[0]) break;

		UdmLog(Indexer,UDM_LOG_DEBUG,"ReverseAlias%d: '%s'", cascade, alias);
		/* The rewritten URL may be longer than the buffer RelLink() allocated */
		/* for newhref, so swap in a fresh copy instead of copying in place.   */
		rewritten = (char*)strdup(alias);
		if (rewritten == NULL) {
			UdmLog(Indexer, UDM_LOG_ERROR, "No memory. %s line %d", __FILE__, __LINE__);
			goto ret;
		}
		UDM_FREE(newhref);
		newhref = rewritten;
	}
	
	UdmURLParse(newURL, newhref);
	
	Href->site_id = 0;
	UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
	if(!(Srv = UdmServerFind(Indexer->Conf, &Indexer->Conf->Servers, newhref, NULL))) {
		UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
		UdmLog(Indexer,UDM_LOG_DEBUG,"no Server, skip it");
		Href->method=UDM_METHOD_DISALLOW;
		goto ret;
	}
	UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);

	if (!strcasecmp(UDM_NULL2EMPTY(newURL->schema), "mailto") || !strcasecmp(UDM_NULL2EMPTY(newURL->schema), "javascript")) {
		UdmLog(Indexer,UDM_LOG_DEBUG,"'%s' schema, skip it", newURL->schema);
		Href->method=UDM_METHOD_DISALLOW;
		goto ret;
	}
	
	if (Href->hops > Srv->MaxHops) {
		UdmLog(Indexer, UDM_LOG_DEBUG, "too many hops (%d), skip it", Href->hops);
		Href->method = UDM_METHOD_DISALLOW;
		goto ret;
	}

	UdmLog(Indexer, UDM_LOG_DEBUG, " Server applied: site_id: %d URL: %s", Srv->site_id, Srv->Match.pattern);
	
	/* Check Allow/Disallow/CheckOnly stuff */
	Href->method=UdmFilterFind(&Indexer->Conf->Filters,newhref,reason);
	if(Href->method==UDM_METHOD_DISALLOW){
		UdmLog(Indexer,UDM_LOG_DEBUG,"%s, skip it",reason);
		goto ret;
	}else{
		UdmLog(Indexer,UDM_LOG_DEBUG,"%s",reason);
	}
	
	/* FIXME: add MaxHops, Robots */
	UDM_FREE(Href->url);
	Href->url = (char*)strdup(newhref);
	Href->server_id = Srv->site_id;
ret:	
	UDM_FREE(newhref);
	UDM_FREE(alias);
	UdmURLFree(newURL);
	return UDM_OK;
}

/*
 * Convert every link collected in Doc->Hrefs.
 *
 * Each link gets the document's "Hops" value plus one, is passed through
 * UdmConvertHref() against Doc->CurURL, and gets the document "ID" as its
 * referrer. Links whose hop count is not below the document's "MaxHops"
 * (default 255) are marked as already stored and disallowed; the rest are
 * marked as not stored. Always returns UDM_OK.
 */
static int UdmDocConvertHrefs(UDM_AGENT *Indexer, UDM_DOCUMENT *Doc){
	int		doc_hops = UdmVarListFindInt(&Doc->Sections, "Hops", 0);
	urlid_t		referrer = (urlid_t)UdmVarListFindInt(&Doc->Sections, "ID", 0);
	uint4		hop_limit = UdmVarListFindUnsigned(&Doc->Sections, "MaxHops", 255);
	size_t		n = 0;

	while (n < Doc->Hrefs.nhrefs) {
		UDM_HREF *link = Doc->Hrefs.Href + n;

		link->hops = doc_hops + 1;
		UdmConvertHref(Indexer, &Doc->CurURL, link);
		link->referrer = referrer;

		if (hop_limit > link->hops) {
			link->stored = 0;
		} else {
			/* Too deep: pretend it is stored so it is never queued */
			link->stored = 1;
			link->method = UDM_METHOD_DISALLOW;
		}
		n++;
	}
	return UDM_OK;
}

/*
 * Move the links found in a document into the agent's href list.
 *
 * Nothing is done for documents fetched with UDM_METHOD_HEAD. Otherwise the
 * BASE HREF is applied, all links are converted, and every link whose method
 * is not UDM_METHOD_DISALLOW is added to Indexer->Hrefs.
 * Always returns UDM_OK.
 */
int UdmDocStoreHrefs(UDM_AGENT *Indexer, UDM_DOCUMENT *Doc){
	size_t n;

	if (Doc->method != UDM_METHOD_HEAD) {
		UdmDocBaseHref(Indexer, Doc);
		UdmDocConvertHrefs(Indexer, Doc);

		for (n = 0; n < Doc->Hrefs.nhrefs; n++) {
			UDM_HREF *link = Doc->Hrefs.Href + n;

			if (link->method == UDM_METHOD_DISALLOW) {
				continue;
			}
			UdmHrefListAdd(&Indexer->Hrefs, link);
		}
	}
	return UDM_OK;
}

/*********************** 'UrlFile' stuff (for -f option) *******************/

/*
 * Process a file of URLs (the -f option), one entry per line.
 *
 * fname  - path of the file, or "-" to read standard input.
 * action - what to do with each entry:
 *            UDM_URL_FILE_REINDEX  set the "ul" variable to the entry and run
 *                                  UdmURLAction(UDM_URL_ACTION_EXPIRE);
 *            UDM_URL_FILE_CLEAR    set "ul" to the entry and run UdmClearDatabase();
 *            UDM_URL_FILE_INSERT   add the entry to Indexer->Hrefs;
 *            UDM_URL_FILE_PARSE    only check that the entry parses as a URL
 *                                  and has a schema.
 *
 * Line format: trailing CR/LF characters are removed, empty lines and lines
 * starting with '#' are ignored, and a line ending in '\' is joined with the
 * following line.
 *
 * Returns UDM_OK on success. On failure it returns the code of the failed
 * expire action, or UDM_ERROR when the file cannot be opened, an entry does
 * not fit the 1024-byte entry buffer, clearing fails, or a URL is rejected.
 * The file is closed and the scratch URL released on every path, and "ul"
 * is removed again even when the action failed.
 */
__C_LINK int __UDMCALL UdmURLFile(UDM_AGENT *Indexer, const char *fname,int action){
	FILE		*url_file;
	char		str[1024]="";	/* entry assembled from continuation lines */
	char		str1[1024]="";	/* line currently being read */
	int		rc = UDM_OK;
	int		result, res;
	UDM_URL		*myurl;
	UDM_HREF	Href;

	if(!strcmp(fname,"-")) {
		url_file=stdin;
	} else if ((url_file=fopen(fname,"r")) == NULL) {
		UdmLog(Indexer, UDM_LOG_ERROR, "Can't open URL file '%s': %s", fname, strerror(errno));
		return UDM_ERROR;
	}

	myurl = UdmURLInit(NULL);

	while(fgets(str1,sizeof(str1),url_file)){
		char	*end;
		int	continued;

		if(!str1[0])continue;
		end=str1+strlen(str1)-1;
		while((end>=str1)&&(*end=='\r'||*end=='\n')){
			*end=0;if(end>str1)end--;
		}
		if(!str1[0])continue;
		if(str1[0]=='#')continue;

		continued = (*end=='\\');
		if (continued) *end=0;

		/* Joined continuation lines must still fit into str */
		if (strlen(str) + strlen(str1) >= sizeof(str)) {
			UdmLog(Indexer, UDM_LOG_ERROR, "URL file entry too long (limit is %d bytes): '%s%s'",
				(int)sizeof(str) - 1, str, str1);
			rc = UDM_ERROR;
			goto done;
		}
		strcat(str,str1);
		if (continued) continue;
		str1[0]=0;

		switch(action){
		case UDM_URL_FILE_REINDEX:
			UdmVarListReplaceStr(&Indexer->Conf->Vars, "ul", str);
			result = UdmURLAction(Indexer, NULL, UDM_URL_ACTION_EXPIRE);
			UdmVarListDel(&Indexer->Conf->Vars, "ul");
			if(result!=UDM_OK) { rc = result; goto done; }
			break;
		case UDM_URL_FILE_CLEAR:
			UdmVarListReplaceStr(&Indexer->Conf->Vars, "ul", str);
			result=UdmClearDatabase(Indexer);
			UdmVarListDel(&Indexer->Conf->Vars, "ul");
			if(result!=UDM_OK) { rc = UDM_ERROR; goto done; }
			break;
		case UDM_URL_FILE_INSERT:
			UdmHrefInit(&Href);
			Href.url=str;
			Href.method=UDM_METHOD_GET;
			UdmHrefListAdd(&Indexer->Hrefs, &Href);
			break;
		case UDM_URL_FILE_PARSE:
			res=UdmURLParse(myurl, str);
			/* A URL that parses but carries no schema is not acceptable either */
			if((res == UDM_OK) && (myurl->schema == NULL))
				res=UDM_URL_BAD;
			if(res){
				switch(res){
				case UDM_URL_LONG:
					UdmLog(Indexer,UDM_LOG_ERROR,"URL too long: '%s'",str);
					break;
				case UDM_URL_BAD:
				default:
					UdmLog(Indexer,UDM_LOG_ERROR,"Error in URL: '%s'",str);
				}
				rc = UDM_ERROR;
				goto done;
			}
			break;
		default:
			break;
		}
		str[0]=0;
	}

done:
	if(url_file!=stdin)
		fclose(url_file);
	UdmURLFree(myurl);
	return rc;
}




/*******************************************************************/


/*
 * Work out a non-primary alias for the document's URL and store it in the
 * "Alias" section.
 *
 * When the "AliasProg" variable is configured, UdmAliasProg() is asked first;
 * a failure there is returned to the caller. If that yields nothing (or no
 * program is configured), the first matching entry of Conf->Aliases is
 * applied instead. "Alias" is only set when a non-empty alias was produced.
 *
 * Returns UDM_OK, UDM_ERROR when the work buffer cannot be allocated, or the
 * error code from UdmAliasProg().
 */
static int UdmDocAlias(UDM_AGENT *Indexer,UDM_DOCUMENT *Doc){
	UDM_MATCH	*Alias;
	UDM_MATCH_PART	Parts[10];
	size_t		alstrlen, nparts=10;
	const char	*alias_prog=UdmVarListFindStr(&Indexer->Conf->Vars,"AliasProg",NULL);
	char		*alstr;
	int		result=UDM_OK;
	const char	*url=UdmVarListFindStr(&Doc->Sections,"URL","");

	alstrlen = 256 + strlen(url);
	if ((alstr = (char*)malloc(alstrlen)) == NULL) return UDM_ERROR;
	alstr[0] = '\0';
	if(alias_prog){
		result = UdmAliasProg(Indexer, alias_prog, url, alstr, alstrlen - 1);
		UdmLog(Indexer,UDM_LOG_EXTRA,"AliasProg result: '%s'",alstr);
		if(result!=UDM_OK) { UDM_FREE(alstr); return result; }
	}
	
	/* Consult the Alias list only when alstr is still empty, i.e. */
	/* when AliasProg is not configured or produced no alias.      */
	/* An alias from AliasProg falls through and is stored below.  */
	if(!alstr[0] && (Alias=UdmMatchListFind(&Indexer->Conf->Aliases,url,nparts,Parts))){
		UdmMatchApply(alstr, alstrlen - 1, url, Alias->arg, Alias, nparts, Parts);
	}
	if(alstr[0]){
		UdmVarListReplaceStr(&Doc->Sections,"Alias",alstr);
	}
	UDM_FREE(alstr);
	return UDM_OK;
}




/*
 * Fill in Doc->connp (host name and port) and resolve it through the host
 * cache in Indexer->Conf->Hosts.
 *
 * A "Proxy" request header, when present, is used as the connection target
 * in "host[:port]" form, with port 3128 when none is given. Otherwise the
 * host and port of Doc->CurURL are used, falling back to the URL's default
 * port when its port is zero.
 *
 * When the lookup reports failure for a URL that has a non-empty host name,
 * the document is marked UDM_METHOD_VISITLATER and its "Status" is set to
 * UDM_HTTP_STATUS_SERVICE_UNAVAILABLE. Always returns UDM_OK.
 */
static int UdmDocLookupConn(UDM_AGENT *Indexer,UDM_DOCUMENT *Doc){
	const char	*via = UdmVarListFindStr(&Doc->RequestHeaders, "Proxy", NULL);
	int		failed;

	if (via != NULL) {
		char *colon;

		UdmLog(Indexer, UDM_LOG_DEBUG, "Using Proxy: %s", via);
		Doc->connp.hostname = (char*)strdup(via);
		colon = strchr(Doc->connp.hostname, ':');
		if (colon == NULL) {
			Doc->connp.port = 3128;
		} else {
			/* Split "host:port" in place */
			*colon = '\0';
			Doc->connp.port = atoi(colon + 1);
		}
	} else if (Doc->CurURL.hostname) {
		Doc->connp.hostname = (char*)strdup(Doc->CurURL.hostname);
		if (Doc->CurURL.port) {
			Doc->connp.port = Doc->CurURL.port;
		} else {
			Doc->connp.port = Doc->CurURL.default_port;
		}
	}

	UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
	failed = UdmHostLookup(&Indexer->Conf->Hosts, &Doc->connp);
	UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);

	if (failed && Doc->CurURL.hostname != NULL && *Doc->CurURL.hostname != '\0') {
		UdmLog(Indexer, UDM_LOG_WARN, "Can't resolve host '%s'", Doc->connp.hostname);
		Doc->method = UDM_METHOD_VISITLATER;
		UdmVarListReplaceInt(&Doc->Sections, "Status", UDM_HTTP_STATUS_SERVICE_UNAVAILABLE);
	}
	return UDM_OK;
}



/*
 * Decide how (and whether) the document should be fetched; the decision is
 * left in Doc->method.
 *
 *   - the server's "Method" variable (default "Allow") gives the initial
 *     method; when that is UDM_METHOD_GET the Allow/Disallow filters are
 *     consulted for the document URL;
 *   - a document with more hops than Doc->Spider.maxhops is disallowed;
 *   - the connection is looked up, and when the host already has at least
 *     Doc->Spider.max_net_errors network errors (and that limit is non-zero)
 *     the document is postponed: "Status" becomes
 *     UDM_HTTP_STATUS_SERVICE_UNAVAILABLE, "Next-Index-Time" is set to now
 *     plus net_error_delay_time, and the method becomes UDM_METHOD_VISITLATER.
 *
 * Always returns UDM_OK.
 */
static int UdmDocCheck(UDM_AGENT *Indexer,UDM_SERVER *CurSrv,UDM_DOCUMENT *Doc){
	char		why[1024]="";
	int		net_errors;
	int		doc_hops = UdmVarListFindInt(&Doc->Sections, "Hops", 0);
	const char	*srv_method = UdmVarListFindStr(&CurSrv->Vars, "Method", "Allow");
	int		mtype = CurSrv->Match.match_type;

	/* Say which kind of server entry matched */
	if (mtype == UDM_MATCH_WILD) {
		UdmLog(Indexer,UDM_LOG_DEBUG, "Realm %s wild '%s'", srv_method, CurSrv->Match.pattern);
	} else if (mtype == UDM_MATCH_REGEX) {
		UdmLog(Indexer,UDM_LOG_DEBUG, "Realm %s regex '%s'", srv_method, CurSrv->Match.pattern);
	} else if (mtype == UDM_MATCH_SUBNET) {
		UdmLog(Indexer,UDM_LOG_DEBUG, "Subnet %s '%s'", srv_method, CurSrv->Match.pattern);
	} else {
		/* UDM_MATCH_BEGIN and everything else */
		UdmLog(Indexer,UDM_LOG_DEBUG, "Server %s '%s'", srv_method, CurSrv->Match.pattern);
	}

	Doc->method = UdmMethod(srv_method);
	if (Doc->method == UDM_METHOD_GET) {
		/* Check Allow/Disallow/CheckOnly stuff */
		const char *doc_url = UdmVarListFindStr(&Doc->Sections, "URL", "");

		Doc->method = UdmFilterFind(&Indexer->Conf->Filters, doc_url, why);
		UdmLog(Indexer, UDM_LOG_DEBUG, "%s", why);
	}

	if (Doc->method == UDM_METHOD_DISALLOW) {
		return UDM_OK;
	}

	/* Refuse documents that are too many hops away */
	if (doc_hops > Doc->Spider.maxhops) {
		UdmLog(Indexer, UDM_LOG_WARN, "Too many hops (%d)", doc_hops);
		Doc->method = UDM_METHOD_DISALLOW;
		return UDM_OK;
	}

	/* Back off from hosts that keep failing */
	UdmDocLookupConn(Indexer, Doc);
	if (Doc->connp.Host != NULL) {
		net_errors = Doc->connp.Host->net_errors;
	} else {
		net_errors = 0;
	}

	if ((Doc->Spider.max_net_errors) && (net_errors >= Doc->Spider.max_net_errors)) {
		size_t	retry_at = time(NULL) + Doc->Spider.net_error_delay_time;
		char	stamp[64];

		UdmLog(Indexer, UDM_LOG_WARN, "Too many network errors (%d) for this server", net_errors);
		UdmVarListReplaceInt(&Doc->Sections, "Status", UDM_HTTP_STATUS_SERVICE_UNAVAILABLE);
		UdmTime_t2HttpStr((int)retry_at, stamp);
		UdmVarListReplaceStr(&Doc->Sections, "Next-Index-Time", stamp);
		Doc->method = UDM_METHOD_VISITLATER;
	}

	return UDM_OK;
}


/*
 * Append a new target document (url, Accept-Language = lang, Hops = hops) to
 * Indexer->Conf->Targets and register the URL through
 * UdmURLAction(UDM_URL_ACTION_ADD).
 *
 * Nothing is added when a target with the same URL (compared ignoring case)
 * and the same Accept-Language value is already in the list, or when the
 * list cannot be grown. An empty lang adds no Accept-Language header.
 *
 * UDM_LOCK_THREAD is held for the whole call; UDM_LOCK_CONF is held while
 * the target list is scanned and grown.
 */
static void UdmAppendTarget(UDM_AGENT *Indexer, const char *url, const char *lang, const int hops) {
  UDM_DOCUMENT *Doc, *Grown;
  size_t i;

  UDM_GETLOCK(Indexer, UDM_LOCK_THREAD);
  UDM_GETLOCK(Indexer, UDM_LOCK_CONF);

  /* Scan every existing row, newest first, including row 0 */
  for (i = Indexer->Conf->Targets.num_rows; i-- > 0; ) {
    Doc = &Indexer->Conf->Targets.Doc[i];
    if (strcasecmp(UdmVarListFindStr(&Doc->Sections, "URL", ""), url) == 0 
	&& strcmp(UdmVarListFindStr(&Doc->RequestHeaders, "Accept-Language", ""), lang) == 0) {
      UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
      UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);
      return;
    }
  }

  /* Grow through a temporary so the old array survives a failed realloc */
  Grown = (UDM_DOCUMENT*)realloc(Indexer->Conf->Targets.Doc,
				 (Indexer->Conf->Targets.num_rows + 1) * sizeof(UDM_DOCUMENT));
  if (Grown == NULL) {
    UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
    UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);
    return;
  }
  Indexer->Conf->Targets.Doc = Grown;
  Doc = &Indexer->Conf->Targets.Doc[Indexer->Conf->Targets.num_rows++];
  UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);

  UdmDocInit(Doc);
  UdmVarListAddStr(&Doc->Sections, "URL", url);
  UdmVarListAddInt(&Doc->Sections, "Hops", hops);
  UdmVarListReplaceInt(&Doc->Sections, "URL_ID", UdmStrHash32(url));
  UdmURLAction(Indexer, Doc, UDM_URL_ACTION_ADD);
  if (*lang != '\0') UdmVarListAddStr(&Doc->RequestHeaders, "Accept-Language", lang);
  UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);
  return;
}


/*
 * Post-process the response headers stored in Doc->Sections.
 *
 *   - "Vary: accept-language": "Status" is set to "300" and, unless the
 *     document is robots.txt, extra targets are queued: one for the
 *     Content-Location variant and one for the original URL per language
 *     listed in "VaryLang" (default "en").
 *   - "Content-Type" carrying "charset=": the charset goes to
 *     "Server-Charset" and the parameter is cut off the Content-Type value.
 *   - When "UseRemoteContentType" is not "yes", or no Content-Type was
 *     received, the type is taken from the MimeTypes list by file name
 *     ("index.html" when the URL has none).
 *   - "ForceIISCharset1251": forces windows-1251 for Microsoft/IIS servers.
 *   - A missing Content-Type becomes "application/octet-stream".
 *   - Every "Location" header that parses is added to Doc->Hrefs as a redirect.
 *
 * Always returns UDM_OK.
 * NOTE(review): the original remark here said this function must have
 * exclusive access to Conf; the body itself takes UDM_LOCK_CONF around the
 * configuration reads - confirm what callers are expected to hold.
 */
static int UdmDocProcessResponseHeaders(UDM_AGENT *Indexer,UDM_DOCUMENT *Doc){
	UDM_VAR		*var;
	UDM_MATCH_PART	P[10];
	size_t		sec;
	const char      *content_type = UdmVarListFindStr(&Doc->Sections, "Content-Type", NULL);
	const char      *vary = UdmVarListFindStr(&Doc->Sections, "Vary", NULL);

	if (vary != NULL) {
	  if (strcasestr(vary, "accept-language") != NULL) {
	    UDM_URL *newURL = UdmURLInit(NULL);
	    char *url; const char *ourl;
	    const char *VaryLang = UdmVarListFindStr(&Doc->Sections, "VaryLang", "en"), *CL;
	    const int       hops = UdmVarListFindInt(&Doc->Sections, "Hops", 0);
	    char *tok, *lt, *langs;
	    size_t urlen;

	    UdmVarListReplaceStr(&Doc->Sections, "Status", "300");
	    UdmURLParse(newURL, ourl = UdmVarListFindStr(&Doc->Sections, "URL", ""));
	    if (strcmp(UDM_NULL2EMPTY(newURL->filename), "robots.txt") != 0) {
	      CL = UdmVarListFindStr(&Doc->Sections, "Content-Location", UDM_NULL2EMPTY(newURL->filename));
	      urlen = 128 + strlen(UDM_NULL2EMPTY(newURL->hostinfo)) + strlen(UDM_NULL2EMPTY(newURL->path)) + strlen(CL);
	      if ((url = (char*)malloc(urlen)) != NULL) {
		snprintf(url, urlen, "%s://%s%s%s", UDM_NULL2EMPTY(newURL->schema), UDM_NULL2EMPTY(newURL->hostinfo), 
			 UDM_NULL2EMPTY(newURL->path), CL );
		UdmAppendTarget(Indexer,  url, "", hops);
		/* Tokenize a private copy: the tokenizer writes into its input, and */
		/* VaryLang points at the stored section value or a string literal.  */
		if ((langs = (char*)strdup(VaryLang)) != NULL) {
		  tok = udm_strtok_r(langs, " ,\t", &lt);
		  while (tok != NULL) {
		    UdmAppendTarget(Indexer, ourl, tok, hops );
		    tok = udm_strtok_r(NULL, " ,\t", &lt);
		  }
		  UDM_FREE(langs);
		}
		UDM_FREE(url);
	      }
	    }
	    UdmURLFree(newURL);
	  }
	}

	if (content_type != NULL) {
	  char *p;
	  if ((p = strstr(content_type, "charset=")) != NULL) {
	    /* Resolve the charset first, then cut the parameter off in place */
	    const char *cs = UdmCharsetCanonicalName(p + 8);
	    *p = '\0';
	    UdmRTrim((char*)content_type, "; ");
	    UdmVarListReplaceStr(&Doc->Sections, "Server-Charset", cs ? cs : (p + 8));
	  }
	}
	
	UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
	if((strcasecmp(UdmVarListFindStr(&Indexer->Conf->Vars,"UseRemoteContentType","yes"),"yes") != 0) || (content_type == NULL) ) {
	   	UDM_MATCH	*M;
	   	const char	*fn = (Doc->CurURL.filename && Doc->CurURL.filename[0]) ? Doc->CurURL.filename : "index.html";
		
		if((M=UdmMatchListFind(&Indexer->Conf->MimeTypes,fn,10,P)))
			UdmVarListReplaceStr(&Doc->Sections,"Content-Type",M->arg);
	}
	if ((var=UdmVarListFind(&Doc->Sections,"Server"))){
		if(!strcasecmp("yes",UdmVarListFindStr(&Indexer->Conf->Vars,"ForceIISCharset1251","no"))){
			if (!UdmWildCaseCmp(var->val,"*Microsoft*")||!UdmWildCaseCmp(var->val,"*IIS*")){
				const char *cs;
				if((cs=UdmCharsetCanonicalName("windows-1251")))
					UdmVarListReplaceStr(&Doc->Sections, "Server-Charset", cs);
			}
		}
	}
	UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
	if(!UdmVarListFind(&Doc->Sections,"Content-Type")) {
		UdmVarListAddStr(&Doc->Sections,"Content-Type","application/octet-stream");
	}

	/* Turn every Location header into a redirect href */
	for(sec=0;sec<Doc->Sections.nvars;sec++){
		var=&Doc->Sections.Var[sec];
		
		if(!strcasecmp(var->name,"Location")){
			UDM_URL *newURL = UdmURLInit(NULL);
			switch(UdmURLParse(newURL, var->val)) {
				case UDM_URL_OK:
					/* NOTE(review): UDM_NULL2EMPTY() is used elsewhere to replace NULL */
					/* with "", which would make this test always true - confirm        */
					/* whether `newURL->schema != NULL` was intended.                   */
					if (UDM_NULL2EMPTY(newURL->schema) != NULL) {
						UDM_HREF Href;
						UdmHrefInit(&Href);
						Href.url=var->val;
						Href.hops=UdmVarListFindInt(&Doc->Sections,"Hops",0)+1;
						Href.referrer=UdmVarListFindInt(&Doc->Sections,"Referrer-ID",0);
						Href.method=UDM_METHOD_GET;
						Href.site_id = UdmVarListFindInt(&Doc->Sections, "Site_id", 0);
						Href.server_id = UdmVarListFindInt(&Doc->Sections,"Server_id", 0);
						UdmHrefListAdd(&Doc->Hrefs,&Href);
					}
					break;
				case UDM_URL_LONG:
					UdmLog(Indexer,UDM_LOG_ERROR,"Redirect URL too long: '%s'",var->val);
					break;
				case UDM_URL_BAD:
				default:
					UdmLog(Indexer,UDM_LOG_ERROR,"Error in redirect URL: '%s'",var->val);
			}
			UdmURLFree(newURL);
		}
	}
	return UDM_OK;
}




/*
 * Decode and parse the body of a fetched document.
 *
 *   1. robots.txt documents are skipped entirely.
 *   2. With zlib support, gzip/deflate/compress Content-Encodings are undone
 *      and "Content-Length" is updated; any other encoding except "identity"
 *      or none sets "Status" to UDM_HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE.
 *   3. With external parser support, a parser registered for the
 *      Content-Type is run; its target MIME type (and charset, if it names
 *      one) then replaces the received ones.
 *   4. The resulting type is recorded as "Parser-Content-Type"; with zlib
 *      support, text documents are handed to UdmStoreDoc().
 *   5. "crc32" is computed over the content without the headers.
 *   6. Unless the document was fetched with HEAD, the body goes to the
 *      text, HTML, XML (if compiled in) or MP3 parser; unknown types set
 *      "Status" to UDM_HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE.
 *
 * Returns UDM_OK.
 */
static int UdmDocParseContent(UDM_AGENT * Indexer, UDM_DOCUMENT * Doc){
	
#ifdef USE_PARSER
	UDM_PARSER	*Parser;
#endif
	const char	*real_content_type=NULL;
	const char	*url=UdmVarListFindStr(&Doc->Sections,"URL","");
	const char	*ct=UdmVarListFindStr(&Doc->Sections,"Content-Type","");
	const char	*ce=UdmVarListFindStr(&Doc->Sections,"Content-Encoding","");
	int		result=UDM_OK;
	
	if(!strcmp(UDM_NULL2EMPTY(Doc->CurURL.filename), "robots.txt")) return UDM_OK;
	
#ifdef HAVE_ZLIB
	if(!strcasecmp(ce,"gzip") || !strcasecmp(ce,"x-gzip")){
		UDM_THREADINFO(Indexer,"UnGzip",url);
		UdmUnGzip(Doc);
		UdmVarListReplaceInt(&Doc->Sections, "Content-Length", Doc->Buf.buf - Doc->Buf.content + (int)Doc->Buf.size);
	}else
	if(!strcasecmp(ce,"deflate")){
		UDM_THREADINFO(Indexer,"Inflate",url);
		UdmInflate(Doc);
		UdmVarListReplaceInt(&Doc->Sections, "Content-Length", Doc->Buf.buf - Doc->Buf.content + (int)Doc->Buf.size);
	}else
	if(!strcasecmp(ce,"compress") || !strcasecmp(ce,"x-compress")){
		UDM_THREADINFO(Indexer,"Uncompress",url);
		UdmUncompress(Doc);
		UdmVarListReplaceInt(&Doc->Sections, "Content-Length", Doc->Buf.buf - Doc->Buf.content + (int)Doc->Buf.size);
	}else
#endif	
	if(!strcasecmp(ce,"identity") || !strcasecmp(ce,"")){
		/* Nothing to do*/
	}else{
		UdmLog(Indexer,UDM_LOG_ERROR,"Unsupported Content-Encoding");
		UdmVarListReplaceInt(&Doc->Sections,"Status",UDM_HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE);
	}
	
#ifdef USE_PARSER
	/* Let's try to start external parser for this Content-Type */
	
	if((Parser=UdmParserFind(&Indexer->Conf->Parsers,ct))){
		UdmLog(Indexer,UDM_LOG_DEBUG,"Found external parser '%s' -> '%s'",
			Parser->from_mime?Parser->from_mime:"NULL",
			Parser->to_mime?Parser->to_mime:"NULL");
	}
	if(Parser && UdmParserExec(Indexer,Parser,Doc)){
		char *to_charset;
		real_content_type=Parser->to_mime?Parser->to_mime:"unknown";
		UdmLog(Indexer,UDM_LOG_DEBUG,"Parser-Content-Type: %s",real_content_type);
		if((to_charset=strstr(real_content_type,"charset="))){
			const char *cs_name = UdmTrim(to_charset + 8, " \t");
			const char *cs = UdmCharsetCanonicalName(cs_name);
			/* The canonical-name lookup can return NULL (other callers in */
			/* this file guard against it); fall back to the name as given */
			if (cs == NULL) cs = cs_name;
			UdmVarListReplaceStr(&Doc->Sections, "Server-Charset", cs);
			UdmLog(Indexer,UDM_LOG_DEBUG, "to_charset='%s'", cs);
		}
#ifdef DEBUG_PARSER
		fprintf(stderr,"content='%s'\n",Doc->content);
#endif
	}
#endif
	
	if(!real_content_type)real_content_type=ct;
	UdmVarListAddStr(&Doc->Sections,"Parser-Content-Type",real_content_type);
	
#ifdef HAVE_ZLIB
	if ( strncmp(real_content_type, "text/", 5) == 0) {
	  UdmStoreDoc(Indexer, Doc);
	}
#endif
	
	/* CRC32 without headers */
	{
	  size_t crclen=Doc->Buf.size - (Doc->Buf.content-Doc->Buf.buf);
	  UdmVarListReplaceInt(&Doc->Sections, "crc32", (int)UdmCRC32(Doc->Buf.content,crclen));
	}

	/* A HEAD request carries no body to parse */
	if(Doc->method!=UDM_METHOD_HEAD){
		if(!strncasecmp(real_content_type,"text/plain",10)){
			UdmParseText(Indexer,Doc);
		}else
		if(!strncasecmp(real_content_type,"text/tab-separated-values",25)){
			UdmParseText(Indexer,Doc);
		}else
		if(!strncasecmp(real_content_type,"text/css",8)){
			UdmParseText(Indexer,Doc);
		}else	
		if(!strncasecmp(real_content_type,"text/html",9)){
			UdmHTMLParse(Indexer,Doc);
		}else
#ifdef USE_XML
		if(!strncasecmp(real_content_type, "text/xml",8)) {
			UdmXMLParse(Indexer, Doc);
		}else
#endif
		if(!strncasecmp(real_content_type, "audio/mpeg",10)) {
			UdmMP3Parse(Indexer,Doc);
		}else{
			/* Unknown Content-Type  */
			UdmLog(Indexer,UDM_LOG_ERROR,"Unsupported Content-Type '%s'",real_content_type);
			UdmVarListReplaceInt(&Doc->Sections,"Status",UDM_HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE);
		}
	}
	return result;
}


/*
 * Add the "Host" request header (needed for virtual hosts) from the
 * document's current URL: "hostname:port" when the URL has a non-zero port,
 * plain "hostname" otherwise. Nothing is added when the URL has no host name.
 *
 * The "host:port" value is built in a buffer sized from the host name, so an
 * arbitrarily long host name cannot overrun a fixed-size array.
 *
 * Returns UDM_OK, or UDM_ERROR when that buffer cannot be allocated.
 */
static int UdmDocAddDocExtraHeaders(UDM_DOCUMENT *Doc){
	const char	*host = Doc->CurURL.hostname;
	char		*arg;
	size_t		argsize;

	if (host == NULL) return UDM_OK;

	if (!Doc->CurURL.port) {
		UdmVarListReplaceStr(&Doc->RequestHeaders, "Host", host);
		return UDM_OK;
	}

	/* Room for the host name, ':', any int rendered in decimal, and the NUL */
	argsize = strlen(host) + 32;
	if ((arg = (char*)malloc(argsize)) == NULL) return UDM_ERROR;
	snprintf(arg, argsize, "%s:%d", host, Doc->CurURL.port);
	UdmVarListReplaceStr(&Doc->RequestHeaders, "Host", arg);
	UDM_FREE(arg);
	return UDM_OK;
}

/*
 * Add request headers that come from the global configuration:
 *   - "Accept-Charset", built from the "LocalCharset" variable when set;
 *   - one header per configuration variable named "Request.<header>",
 *     inserted under the name that follows the prefix;
 *   - with zlib support, "Accept-Encoding: gzip,deflate,compress".
 * Always returns UDM_OK.
 */
static int UdmDocAddConfExtraHeaders(UDM_ENV *Conf,UDM_DOCUMENT *Doc){
	char		accept[128]="";
	const char	*local_cs = UdmVarListFindStr(&Conf->Vars, "LocalCharset", NULL);
	size_t		n;

	/* If LocalCharset is specified, advertise it with the highest quality */
	if (local_cs != NULL) {
		snprintf(accept, sizeof(accept)-1, "%s;q=1.0, *;q=0.9, utf-8;q=0.8", local_cs);
		accept[sizeof(accept)-1] = '\0';
		UdmVarListAddStr(&Doc->RequestHeaders, "Accept-Charset", accept);
	}

	for (n = 0; n < Conf->Vars.nvars; n++) {
		UDM_VAR *cur = Conf->Vars.Var + n;

		if (strncmp(cur->name, "Request.", 8) != 0) {
			continue;
		}
		UdmVarListInsStr(&Doc->RequestHeaders, cur->name + 8, cur->val);
	}

#ifdef HAVE_ZLIB
	UdmVarListInsStr(&Doc->RequestHeaders,"Accept-Encoding","gzip,deflate,compress");
#endif
	return UDM_OK;
}

/*
 * Add request headers that come from the Server entry's variables:
 *   - "AuthBasic": for http/https/ftp URLs sets "Authorization: Basic <val>";
 *     for nntp/news URLs passes a non-empty value through unchanged as
 *     "Authorization";
 *   - "ProxyAuthBasic": sets "Proxy-Authorization: Basic <val>" when non-empty;
 *   - "Proxy": copied as-is when non-empty;
 *   - "Request.<header>": sets <header> to the variable's value.
 * A missing (NULL) AuthBasic value is formatted as an empty string.
 * Always returns UDM_OK.
 */
static int UdmDocAddServExtraHeaders(UDM_SERVER *Server,UDM_DOCUMENT *Doc){
	char	arg[128]="";
	size_t	i;
	
	for( i=0 ; i<Server->Vars.nvars ; i++){
		UDM_VAR *Hdr=&Server->Vars.Var[i];
		
		if(!strcasecmp(Hdr->name,"AuthBasic")){
			const char *schema = UDM_NULL2EMPTY(Doc->CurURL.schema);

			/* HTTP and FTP specific stuff */
			if((!strcasecmp(schema, "http")) ||
				(!strcasecmp(schema, "https")) ||
				(!strcasecmp(schema, "ftp"))) {
				
				snprintf(arg,sizeof(arg)-1, "Basic %s", UDM_NULL2EMPTY(Hdr->val));
				arg[sizeof(arg)-1]='\0';
				UdmVarListReplaceStr(&Doc->RequestHeaders,"Authorization",arg);
			}
			
			if(!strcasecmp(schema, "nntp") || 
			   !strcasecmp(schema, "news")) {
				/* Auth if required                      */
				/* NNTPGet will parse this header        */
				/* We'll pass authinfo still in base64   */
				/* form to avoid plain user name in core */
				/* file on crashes if any                */
				
				if(Hdr->val && Hdr->val[0]){
					UdmVarListReplaceStr(&Doc->RequestHeaders,"Authorization",Hdr->val);
				}
			}
		}else
		if(!strcasecmp(Hdr->name,"ProxyAuthBasic")){
			if(Hdr->val && Hdr->val[0]){
				snprintf(arg,sizeof(arg)-1,"Basic %s", Hdr->val);
				arg[sizeof(arg)-1]='\0';
				UdmVarListReplaceStr(&Doc->RequestHeaders,"Proxy-Authorization",arg);
			}
		}else
		if(!strcasecmp(Hdr->name, "Proxy")){
			if(Hdr->val && Hdr->val[0]){
				UdmVarListReplaceStr(&Doc->RequestHeaders, Hdr->name, Hdr->val);
			}
		}else{
			if(!strncmp(Hdr->name,"Request.",8))
				UdmVarListReplaceStr(&Doc->RequestHeaders,Hdr->name+8,Hdr->val);
		}
		
	}
	return UDM_OK;
}


/*
 * Hand out the next document to index.
 *
 * Returns UDM_NOTARGET as soon as Conf->url_number is zero or less.
 * Otherwise, when all cached targets have been consumed, UdmTargets() is
 * called and its error code, if any, is returned. The next cached target's
 * sections, the configured sections and the target's request headers are
 * then merged into Result; the cursor advances and url_number is
 * decremented.
 *
 * Returns UDM_OK when Result was filled, UDM_NOTARGET when nothing is left.
 *
 * UDM_LOCK_THREAD is held across the reload-and-pick sequence and
 * UDM_LOCK_CONF around each individual access to Conf.
 */
static int UdmNextTarget(UDM_AGENT * Indexer,UDM_DOCUMENT *Result){
	UDM_DOCUMENT	*Next;
	int		flag;
	int		rc;

	/* Quick exit when the URL budget is used up */
	UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
	flag = (Indexer->Conf->url_number <= 0);
	UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
	if (flag) {
		return UDM_NOTARGET;
	}

	/* Load targets into memory cache */
	UDM_GETLOCK(Indexer, UDM_LOCK_THREAD);
	UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
	flag = (Indexer->Conf->Targets.cur_row >= Indexer->Conf->Targets.num_rows);
	UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
	if (flag) {
		rc = UdmTargets(Indexer);
		if (rc != UDM_OK) {
			UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);
			return rc;
		}
	}

	/* Choose next target */
	UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
	if (!Indexer->Conf->Targets.num_rows || (Indexer->Conf->Targets.cur_row >= Indexer->Conf->Targets.num_rows)) {
		UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
		UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);
		return UDM_NOTARGET;
	}

	Next = &Indexer->Conf->Targets.Doc[Indexer->Conf->Targets.cur_row];

	UdmVarListReplaceLst(&Result->Sections, &Next->Sections, NULL, "*");
	UdmVarListReplaceLst(&Result->Sections, &Indexer->Conf->Sections, NULL, "*");
	UdmVarListReplaceLst(&Result->RequestHeaders, &Next->RequestHeaders, NULL, "*");

	Indexer->Conf->Targets.cur_row++;
	Indexer->Conf->url_number--;
	UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
	UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);
	return UDM_OK;
}

/*
 * Log every variable of V at level l as "<pre>.<name>: <value>", printing
 * "<NULL>" for variables without a value. The list is not walked at all
 * when UdmNeedLog(l) is false. Always returns UDM_OK.
 */
static int UdmVarListLog(UDM_AGENT *A,UDM_VARLIST *V,int l,const char *pre){
	size_t idx;

	if (!UdmNeedLog(l)) {
		return UDM_OK;
	}

	for (idx = 0; idx < V->nvars; idx++) {
		UDM_VAR		*cur = V->Var + idx;
		const char	*shown = cur->val ? cur->val : "<NULL>";

		UdmLog(A, l, "%s.%s: %s", pre, cur->name, shown);
	}
	return UDM_OK;
}

/*
 * Fill the spider parameters S from the variable list V. Each field falls
 * back to the default passed to UdmVarListFindInt() when its variable is
 * absent. Always returns UDM_OK.
 */
static int UdmVarList2Spider(UDM_SPIDERPARAM *S,UDM_VARLIST *V){
	/* Scheduling and crawl-depth limits */
	S->period = UdmVarListFindInt(V, "Period", UDM_DEFAULT_REINDEX_TIME);
	S->maxhops = UdmVarListFindInt(V, "MaxHops", UDM_DEFAULT_MAX_HOPS);
	S->follow = UdmVarListFindInt(V, "Follow", UDM_FOLLOW_PATH);

	/* Network error handling and timeouts */
	S->max_net_errors = UdmVarListFindInt(V, "MaxNetErrors", UDM_MAXNETERRORS);
	S->net_error_delay_time = UdmVarListFindInt(V, "NetErrorDelayTime", UDM_DEFAULT_NET_ERROR_DELAY_TIME);
	S->read_timeout = UdmVarListFindInt(V, "ReadTimeOut", UDM_READ_TIMEOUT);
	S->doc_timeout = UdmVarListFindInt(V, "DocTimeOut", UDM_DOC_TIMEOUT);

	/* On/off switches, all enabled by default */
	S->index = UdmVarListFindInt(V, "Index", 1);
	S->use_robots = UdmVarListFindInt(V, "Robots", 1);
	S->use_clones = UdmVarListFindInt(V, "DetectClones", 1);

	return UDM_OK;
}

/*
 * Index one URL.
 *
 * Steps, in order:
 *   1. Store pending hrefs (UdmStoreHrefs) and pick the next target
 *      document (UdmNextTarget).
 *   2. Allocate the download buffer ("MaxDocSize" configuration variable).
 *   3. Validate the URL, find the matching Server, copy its spider
 *      parameters and extra headers into the document, apply an alias if
 *      one is configured, and run UdmDocCheck().
 *   4. For URLs whose schema starts with "http": load robots.txt for the
 *      host if no entry is known yet, then apply the matching robots rule.
 *   5. Download the document (with one extra ranged request for the
 *      CHECKMP3 methods), parse headers and content, build the language
 *      map, guess the charset, prepare words and drop stop words.
 *   6. Store the collected hrefs and flush the document with
 *      UDM_URL_ACTION_FLUSH.
 *
 * While an alias is in effect the "URL"/"URL_ID" sections and Doc.CurURL
 * are temporarily switched to the alias for checks and downloads and then
 * switched back to the original URL.
 *
 * Returns the first non-UDM_OK code produced by a helper (including
 * UDM_NOTARGET from UdmNextTarget), UDM_ERROR on allocation failure, or,
 * when everything succeeded, the current value of Indexer->action.
 *
 * Every error exit goes through the `fail` label, which releases origurl,
 * aliasurl and Doc.  The success exit does NOT call UdmDocFree(&Doc),
 * exactly as before.
 * NOTE(review): presumably UDM_URL_ACTION_FLUSH takes over the document;
 * confirm against UdmURLAction before changing this.
 */
__C_LINK int __UDMCALL UdmIndexNextURL(UDM_AGENT *Indexer){
	int		result=UDM_OK;
	UDM_DOCUMENT	Doc;
	const char	*url, *alias = NULL;
	char		*origurl = NULL, *aliasurl = NULL;
	
	UdmDocInit(&Doc);
	
	UDM_THREADINFO(Indexer,"Selecting","");

	if(UDM_OK==(result=UdmStoreHrefs(Indexer))) {
		if (Indexer->action != UDM_OK) {
			/* Doc is already initialised, so it must be released too */
			result = Indexer->action;
			goto fail;
		}
		result = UdmNextTarget(Indexer, &Doc);
	}
	
	/* No target or an error: Doc still has to be freed (Doc.connp->connp) */
	if(result==UDM_NOTARGET || result!=UDM_OK)
		goto fail;
	
	/*
	 * NOTE(review): `url` is borrowed from Doc.Sections and is used again
	 * further down after the "URL" section has been replaced; verify that
	 * UdmVarListReplaceStr keeps this pointer valid.
	 */
	url=UdmVarListFindStr(&Doc.Sections,"URL","");
	UdmVarListReplaceInt(&Doc.Sections,"crc32old",UdmVarListFindInt(&Doc.Sections,"crc32",0));
	UdmLog(Indexer,UDM_LOG_INFO,"URL: %s",url);
	
#ifdef HAVE_SETPROCTITLE
	/* To see the URL being indexed in "ps" output on FreeBSD */
	/* Do it if single thread version */
	if(!(Indexer->handle)) setproctitle("%s",url);
#endif
	
	
	/* Collect information from Conf */
	
	if(!Doc.Buf.buf){
		/* Alloc buffer for document */
		UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
		Doc.Buf.maxsize=(size_t)UdmVarListFindInt(&Indexer->Conf->Vars,"MaxDocSize",UDM_MAXDOCSIZE);
		UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);
		if ((Doc.Buf.buf=(char*)malloc(Doc.Buf.maxsize + 1)) == NULL) {
			UdmLog(Indexer, UDM_LOG_ERROR, "Out of memory.");
			result = UDM_ERROR;
			goto fail;
		}
		Doc.Buf.buf[0]='\0';
	}
	
	/* Check that URL has valid syntax */
	if(UdmURLParse(&Doc.CurURL,url)){
		UdmLog(Indexer,UDM_LOG_WARN,"Invalid URL: %s",url);
		Doc.method = UDM_METHOD_DISALLOW;
	}else
	if ((Doc.CurURL.filename != NULL) && (!strcmp(Doc.CurURL.filename, "robots.txt"))) {
		/* robots.txt itself is never indexed as a document */
		Doc.method = UDM_METHOD_DISALLOW;
	}else{
		UDM_SERVER	*Server;
		char		*alstr = NULL;
		
		/* Find correspondent Server */
		UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
		Server = UdmServerFind(Indexer->Conf, &Indexer->Conf->Servers, url, &alstr);
		UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);
		if ( !Server ) {
			UdmLog(Indexer,UDM_LOG_WARN,"No 'Server' command for url");
			Doc.method = UDM_METHOD_DISALLOW;
		}else{
			UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
			Doc.lcs=Indexer->Conf->lcs;
			UdmVarList2Spider(&Doc.Spider,&Server->Vars);
			
			UdmDocAddConfExtraHeaders(Indexer->Conf,&Doc);
			UdmDocAddServExtraHeaders(Server,&Doc);
			UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);

			UdmVarListReplaceLst(&Doc.Sections, &Server->Vars,NULL,"*");
			UdmVarListReplaceInt(&Doc.Sections, "Site_id", UdmServerGetSiteId(Indexer, Server, &Doc.CurURL));
			UdmVarListReplaceInt(&Doc.Sections, "Server_id", Server->site_id);
			UdmVarListReplaceInt(&Doc.Sections, "MaxHops", Doc.Spider.maxhops);
			
			if(alstr != NULL) {
				/* Server Primary alias found */
				UdmVarListReplaceStr(&Doc.Sections,"Alias",alstr);
			}else{
				/* Apply non-primary alias */
				result=UdmDocAlias(Indexer,&Doc);
			}

			if((alias = UdmVarListFindStr(&Doc.Sections, "Alias", NULL))) {
				const char *u = UdmVarListFindStr(&Doc.Sections, "URL", NULL);
				/* Private copies: the section values may change later */
				origurl = (char*)strdup(u);
				aliasurl = (char*)strdup(alias);
				if (origurl == NULL || aliasurl == NULL) {
					/* Without both copies the original URL could not be restored */
					UdmLog(Indexer, UDM_LOG_ERROR, "Out of memory.");
					UDM_FREE(alstr);
					result = UDM_ERROR;
					goto fail;
				}
				UdmLog(Indexer,UDM_LOG_EXTRA,"Alias: '%s'", alias);
				UdmVarListReplaceStr(&Doc.Sections, "URL", alias);
				UdmVarListReplaceInt(&Doc.Sections, "URL_ID", UdmStrHash32(alias));
				UdmURLParse(&Doc.CurURL, alias);
			}

			/* Check hops, network errors, filters */
			result=UdmDocCheck(Indexer,Server,&Doc);
			
		}
		UDM_FREE(alstr);
	}
	
	
	if(result!=UDM_OK)
		goto fail;
	
	if(Doc.method!=UDM_METHOD_DISALLOW && Doc.method!=UDM_METHOD_VISITLATER){
		UdmDocAddDocExtraHeaders(&Doc);
		if(!strncmp(UDM_NULL2EMPTY(Doc.CurURL.schema), "http", 4)) {
			if(!Doc.Spider.use_robots){
				UdmLog(Indexer,UDM_LOG_WARN, "robots.txt support is disallowed for '%s'", 
				       UDM_NULL2EMPTY(Doc.CurURL.hostinfo));
				UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
				result = UdmRobotParse(Indexer, NULL, UDM_NULL2EMPTY(Doc.CurURL.hostinfo));
				UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);
			}else{
				UDM_ROBOT_RULE	*rule;
				int		take_robots;

				/* UDM_LOCK_THREAD is held for the whole robots.txt lookup and fetch */
				UDM_GETLOCK(Indexer, UDM_LOCK_THREAD);
				UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
				take_robots = !UdmRobotFind(&Indexer->Conf->Robots, UDM_NULL2EMPTY(Doc.CurURL.hostinfo));
				UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);
				
				if(take_robots){
					UDM_SERVER	*Server;
					char		*rurl;
					UDM_DOCUMENT	rDoc;
					size_t          rurlen;
					int             status;
					
					UdmDocInit(&rDoc);
					rDoc.Buf.maxsize=Doc.Buf.maxsize;
					rDoc.Spider=Doc.Spider;
					rDoc.Buf.buf=(char*)malloc(Doc.Buf.maxsize + 1);

					/* 32 extra bytes cover "://", "/robots.txt" and the terminator */
					rurlen = 32 + strlen(UDM_NULL2EMPTY(Doc.CurURL.schema)) + 
					  strlen(UDM_NULL2EMPTY(Doc.CurURL.hostinfo));
					rurl = (char*)malloc(rurlen);

					if (rDoc.Buf.buf == NULL || rurl == NULL) {
						UdmLog(Indexer, UDM_LOG_ERROR, "Out of memory.");
						UDM_FREE(rurl);
						UdmDocFree(&rDoc);
						/* Do not leave with the thread lock held */
						UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);
						result = UDM_ERROR;
						goto fail;
					}
					rDoc.Buf.buf[0]='\0';

					snprintf(rurl, rurlen, "%s://%s/robots.txt", UDM_NULL2EMPTY(Doc.CurURL.schema),
						 UDM_NULL2EMPTY(Doc.CurURL.hostinfo));
					UdmVarListAddStr(&rDoc.Sections,"URL",rurl);
					UdmVarListReplaceInt(&rDoc.Sections, "URL_ID", UdmStrHash32(rurl));
					UdmURLParse(&rDoc.CurURL,rurl);
					UdmLog(Indexer,UDM_LOG_INFO,"ROBOTS: %s",rurl);

					UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
					Server = UdmServerFind(Indexer->Conf, &Indexer->Conf->Servers, rurl, NULL);
					UdmDocAddDocExtraHeaders(&rDoc);
					UdmDocAddConfExtraHeaders(Indexer->Conf,&rDoc);
					/* Register the host before the fetch so UdmRobotFind succeeds next time */
					UdmRobotAddEmpty(&Indexer->Conf->Robots, UDM_NULL2EMPTY(Doc.CurURL.hostinfo));
					UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);

					if (Server != NULL) {
						UdmVarListReplaceLst(&rDoc.Sections, &Server->Vars, NULL, "*");
						UdmDocAddServExtraHeaders(Server, &rDoc);
					}
					UdmVarListLog(Indexer,&rDoc.RequestHeaders,UDM_LOG_DEBUG,"Request");
					
					UDM_THREADINFO(Indexer,"Getting",rurl);
					UdmDocLookupConn(Indexer,&rDoc);
					result=UdmGetURL(Indexer,&rDoc);
					UdmParseHTTPResponse(Indexer,&rDoc);
					UdmDocProcessResponseHeaders(Indexer,&rDoc);
					UdmVarListLog(Indexer,&rDoc.Sections,UDM_LOG_DEBUG,"Response");
					UDM_FREE(rurl);

					if ((status = UdmVarListFindInt(&rDoc.Sections, "Status", 0)) == 200 ) {
					
						UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
						result = UdmRobotParse(Indexer, rDoc.Buf.content, UDM_NULL2EMPTY(rDoc.CurURL.hostinfo));
						UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);
					} else if (status >= 500) {
						/* Server-side failure on robots.txt: postpone the document */
						Doc.method = UDM_METHOD_VISITLATER;
					}

					UdmDocFree(&rDoc);
				}
				
				UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);

				/* Check whether URL is disallowed by robots.txt */
				UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
				rule = UdmRobotRuleFind(&Indexer->Conf->Robots, &Doc.CurURL);
				UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);
				if(rule) {
					UdmLog(Indexer,UDM_LOG_WARN,"robots.txt: '%s %s'",(rule->cmd==UDM_METHOD_DISALLOW)?"Disallow":"Allow",rule->path);
					if(rule->cmd==UDM_METHOD_DISALLOW)
						Doc.method=rule->cmd;
				}
			}
		}
		/* Robots checks ran against the alias; switch back to the original URL */
		if(origurl != NULL){
			UdmVarListReplaceStr(&Doc.Sections,"URL",origurl);
			UdmVarListReplaceInt(&Doc.Sections, "URL_ID", UdmStrHash32(origurl));
			UdmURLParse(&Doc.CurURL,origurl);
		}
	}
	
	
	if(result!=UDM_OK)
		goto fail;

	if(Doc.method!=UDM_METHOD_DISALLOW && Doc.method!=UDM_METHOD_VISITLATER){
		int	start,state,status=0;
		int	mp3type=UDM_MP3_UNKNOWN;
		
		if(!(Indexer->flags&UDM_FLAG_REINDEX)){
			const char *l=UdmVarListFindStr(&Doc.Sections,"Last-Modified",NULL);
			if (l) UdmVarListReplaceStr(&Doc.RequestHeaders, "If-Modified-Since", l);
		}
		
		UDM_THREADINFO(Indexer,"Getting",url);
		
		/* The CHECKMP3 methods start in state 1: a ranged probe request first */
		start=(Doc.method==UDM_METHOD_CHECKMP3 || Doc.method==UDM_METHOD_CHECKMP3ONLY)?1:0;
		
		for(state=start;state>=0;state--){
			const char	*hdr=NULL;
			
			if(state==1)hdr="bytes=0-256";
			if(mp3type==UDM_MP3_TAG)hdr="bytes=-128";
			
			UdmVarListReplaceInt(&Doc.Sections, "Status", UDM_HTTP_STATUS_UNKNOWN);
			
			/*
			 * Download through the alias.  The private copy is used here
			 * because the pointer obtained from Doc.Sections earlier may
			 * no longer be valid once sections have been rewritten.
			 */
			if(aliasurl != NULL) {
				UdmVarListReplaceStr(&Doc.Sections,"URL",aliasurl);
				UdmVarListReplaceInt(&Doc.Sections, "URL_ID", UdmStrHash32(aliasurl));
				UdmURLParse(&Doc.CurURL,aliasurl);
			}
				
			UdmVarListLog(Indexer,&Doc.RequestHeaders,UDM_LOG_DEBUG,"Request");
				
			if(hdr) {
				UdmVarListAddStr(&Doc.RequestHeaders, "Range", hdr);
				UdmLog(Indexer, UDM_LOG_INFO, "Range: [%s]", hdr);
			}
				
			result = UdmGetURL(Indexer, &Doc);
				
			/* The Range header applies to this single request only */
			if(hdr) {
				UdmVarListDel(&Doc.RequestHeaders, "Range");
			}
				
			if(origurl != NULL) {
				UdmVarListReplaceStr(&Doc.Sections,"URL",origurl);
				UdmVarListReplaceInt(&Doc.Sections, "URL_ID", UdmStrHash32(origurl));
				UdmURLParse(&Doc.CurURL,origurl);
			}
			
			if(result!=UDM_OK)
				goto fail;
			
			UdmParseHTTPResponse(Indexer,&Doc);
			UdmDocProcessResponseHeaders(Indexer,&Doc);
			UdmVarListLog(Indexer,&Doc.Sections,UDM_LOG_DEBUG,"Response");
			
			status=UdmVarListFindInt(&Doc.Sections,"Status",0);
			
			UdmLog(Indexer, UDM_LOG_EXTRA, "Status: %d %s", status, UdmHTTPErrMsg(status));

			if(status!=206 && status!=200)
				break;
			
			if(state==1){	/* Needs guessing */
				if(UDM_MP3_UNKNOWN != (mp3type=UdmMP3Type(&Doc))){
					UdmVarListReplaceStr(&Doc.Sections,"Content-Type","audio/mpeg");
					if(Doc.method == UDM_METHOD_CHECKMP3ONLY && mp3type != UDM_MP3_TAG) break;
				}
				if(Doc.method == UDM_METHOD_CHECKMP3ONLY) break;
			}
		}
		
		/* Add URL from Location: header */
		/* This is to give a chance for  */
		/* a concurrent thread to take it */
		result=UdmDocStoreHrefs(Indexer,&Doc);
		if(result!=UDM_OK)
			goto fail;
		
		/* Increment indexer's download statistics */
		Indexer->nbytes+=Doc.Buf.size;
		Indexer->ndocs++;
		
		if((!Doc.Buf.content) && (status<500)) {
			UdmLog(Indexer,UDM_LOG_ERROR,"Illegal HTTP headers in response");
			status=UDM_HTTP_STATUS_SERVICE_UNAVAILABLE;
			UdmVarListReplaceInt(&Doc.Sections,"Status",status);
		}
		
		if(status==UDM_HTTP_STATUS_OK || status==UDM_HTTP_STATUS_PARTIAL_OK){
			size_t		wordnum;
			const char *lang = NULL;
			
			UDM_THREADINFO(Indexer,"Parsing",url);
			
			result = UdmDocParseContent(Indexer, &Doc);
			if(result!=UDM_OK)
				goto fail;
			
			UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
			if(Indexer->Conf->LangMaps.nmaps && Doc.method != UDM_METHOD_DISALLOW){
				register size_t t;
				int flag = !strcasecmp(UdmVarListFindStr(&Indexer->Conf->Vars, "LangMapUpdate", "no"), "yes");

				/* Reset the per-agent language map, then feed it every text item */
				bzero((void*)Indexer->LangMap, sizeof(UDM_LANGMAP));
				for (t = 0; t <= UDM_LM_HASHMASK; t++) Indexer->LangMap->memb[t].index = t;
				for(t=0;t<Doc.TextList.nitems;t++){
					UDM_TEXTITEM *Item=&Doc.TextList.Item[t];
					UdmBuildLangMap(Indexer->LangMap, Item->str, strlen(Item->str), flag);
				}
			}

			UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);
			
			UdmGuessCharSet(Indexer, &Doc, &Indexer->Conf->LangMaps, Indexer->LangMap);
				
			UdmLog(Indexer, UDM_LOG_EXTRA, "Guesser: Lang: %s, Charset: %s",
			       lang = UdmVarListFindStr(&Doc.Sections,"Content-Language",""),
			       UdmVarListFindStr(&Doc.Sections,"Charset",""));
			
			UdmParseURLText(Indexer,&Doc);
			UdmParseHeaders(Indexer,&Doc);
			if (Doc.method!=UDM_METHOD_HREFONLY) {
				UdmPrepareWords(Indexer,&Doc);
			}
			
			/* Remove StopWords */
			/* A word is dropped by zeroing its coord (or weight for cross words) */
			UDM_GETLOCK(Indexer,UDM_LOCK_CONF);
			for(wordnum=0;wordnum<Doc.Words.nwords;wordnum++){
				const char	*w=Doc.Words.Word[wordnum].word;
				size_t		wlen=strlen(w);
				
				if(wlen>Indexer->Conf->WordParam.max_word_len ||
				   wlen<Indexer->Conf->WordParam.min_word_len ||
				   UdmStopListFind(&Indexer->Conf->StopWords, w, lang ) != NULL)
				{
					Doc.Words.Word[wordnum].coord=0;
				}	
			}
			for(wordnum=0;wordnum<Doc.CrossWords.ncrosswords;wordnum++){
				const char	*w=Doc.CrossWords.CrossWord[wordnum].word;
				size_t		wlen=strlen(w);
				
				if(wlen>Indexer->Conf->WordParam.max_word_len ||
				   wlen<Indexer->Conf->WordParam.min_word_len ||
				   UdmStopListFind(&Indexer->Conf->StopWords,w, lang) != NULL)
				{
					Doc.CrossWords.CrossWord[wordnum].weight=0;
				}	
			}
			UDM_RELEASELOCK(Indexer,UDM_LOCK_CONF);
			if(UDM_OK != (result = UdmURLAction(Indexer, &Doc, UDM_URL_ACTION_LINKS_DELETE)))
				goto fail;
		}
	}
	
	if(UDM_OK==(result=UdmDocStoreHrefs(Indexer,&Doc))){
		if(UDM_OK!=(result=UdmStoreHrefs(Indexer)))
			result=UDM_ERROR;
	}
	
	if(result!=UDM_OK)
		goto fail;
	
	/* Free unnecessary information */
	UdmHrefListFree(&Doc.Hrefs);
	UdmVarListFree(&Doc.RequestHeaders);
	UdmTextListFree(&Doc.TextList);
	UDM_FREE(Doc.Buf.buf);
	Doc.Buf.maxsize=0;
	
	/* Doc is intentionally not freed after this call (see header comment) */
	result = UdmURLAction(Indexer, &Doc, UDM_URL_ACTION_FLUSH);
	
	UDM_FREE(origurl); UDM_FREE(aliasurl);
	UDM_GETLOCK(Indexer, UDM_LOCK_THREAD);
	if (result == UDM_OK) result = Indexer->action;
	UDM_RELEASELOCK(Indexer, UDM_LOCK_THREAD);
	return result;

fail:
	/* Common error exit: release everything this function still owns */
	UDM_FREE(origurl);
	UDM_FREE(aliasurl);
	UdmDocFree(&Doc);
	return result;
}
