/* Distributed Checksum Clearinghouse
 *
 * compute fuzzy body checksum #1
 *
 * Copyright (c) 2005 by Rhyolite Software, LLC
 *
 * This agreement is not applicable to any entity which sells anti-spam
 * solutions to others or provides an anti-spam solution as part of a
 * security solution sold to other entities, or to a private network
 * which employs the DCC or uses data provided by operation of the DCC
 * but does not provide corresponding data to other users.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * Parties not eligible to receive a license under this agreement can
 * obtain a commercial license to use DCC and permission to use
 * U.S. Patent 6,330,590 by contacting Commtouch at http://www.commtouch.com/
 * or by email to nospam@commtouch.com.
 *
 * A commercial license would be for Distributed Checksum and Reputation
 * Clearinghouse software.  That software includes additional features.  This
 * free license for Distributed ChecksumClearinghouse Software does not in any
 * way grant permision to use Distributed Checksum and Reputation Clearinghouse
 * software
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE, LLC DISCLAIMS ALL
 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE, LLC
 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Rhyolite Software DCC 1.3.42-1.47 $Revision$
 */

#include "dcc_ck.h"

/* Shorthand for the fuzzy checksum #1 state.  It expands to a member of
 * "cks", so it can be used only where a DCC_GOT_CKS pointer with that
 * name is in scope. */
#define FZ1  cks->fuz1

/* stop adding to the checksum once this many bytes have been summed */
#define MAX_FUZ1_LEN	(4*1024)


/* Get ready to compute fuzzy checksum #1 for a new message.
 *	cks	checksum state to reset
 */
void
dcc_ck_fuz1_init(DCC_GOT_CKS *cks)
{
	/* empty line buffer, idle URL parser, fresh MD5 context,
	 * and no bytes summed so far */
	FZ1.cp = FZ1.buf;
	FZ1.eol = FZ1.buf;
	FZ1.total = 0;
	FZ1.url.st = DCC_URL_ST_IDLE;
	MD5Init(&FZ1.md5);

	/* The entry is typed DCC_CK_FUZ1 so that dcc_ck_fuz1() and
	 * dcc_ck_fuz1_fin() will work on it, but it is not to be
	 * reported until dcc_ck_fuz1_fin() says so. */
	cks->sums[DCC_CK_FUZ1].rpt2srvr = 0;
	cks->sums[DCC_CK_FUZ1].tgts = DCC_TGTS_INVALID;
	cks->sums[DCC_CK_FUZ1].type = DCC_CK_FUZ1;
}



/* Decide whether a collected line looks like a salutation or similar
 * boilerplate that should be left out of the checksum.
 *	cp	start of the line; it need not be NUL terminated
 *	llen	number of bytes in the line
 * Returns 1 to discard the line or 0 to keep it.
 *
 * A line is discarded only if it starts with one of the words below and
 * is at least one byte longer than that word.
 */
static inline u_char			/* 0=keep the line, 1=discard it */
dear_sucker(const char *cp, u_int llen)
{
/* sizeof(w) counts the NUL that ends the literal, so the length test
 * demands more bytes than the word itself, while the comparison looks
 * only at the word */
#define CK_WORD(w) (llen >= sizeof(w) && !strncmp(cp, w, sizeof(w)-1))

	if (CK_WORD("dear"))
		return 1;
	if (CK_WORD("hello"))
		return 1;
	if (CK_WORD("greeting"))
		return 1;
	if (CK_WORD("date"))
		return 1;

	return 0;

/* CK_WORD refers to the locals of this function, so do not let it
 * escape into the rest of the file */
#undef CK_WORD
}



/* Add the first len bytes of the line buffer to the MD5 checksum.
 *	cks	checksum state
 *	len	number of bytes at the start of FZ1.buf to add
 * Returns 1 if more can be summed later or 0 if MAX_FUZ1_LEN bytes
 * have now been summed.
 */
static inline u_char
add_sum(DCC_GOT_CKS *cks, int len)
{
	int excess;

	if (len == 0)
		return 1;

	/* The end of very long spam is likely to make the checksum
	 * differ, so drop whatever would go beyond the limit. */
	excess = (FZ1.total + len) - MAX_FUZ1_LEN;
	if (excess > 0)
		len -= excess;

	MD5Update(&FZ1.md5, FZ1.buf, len);
	FZ1.total += len;
	return FZ1.total < MAX_FUZ1_LEN;
}



/* Feed another chunk of the message body to fuzzy checksum #1.
 *	cks	checksum state; nothing is done unless its FUZ1 entry is
 *		typed DCC_CK_FUZ1, as dcc_ck_fuz1_init() leaves it
 *	bp	bytes to digest
 *	bp_len	number of bytes at bp
 *
 * Every byte is passed through dcc_ck_url().  Of the characters that come
 * back, only 'a' through 'z' are stored in FZ1.buf and '\n' ends a line;
 * everything else is dropped.  Short lines recognized by dear_sucker()
 * are thrown away.  The buffer is given to add_sum() as it fills, and
 * nothing more is done once MAX_FUZ1_LEN bytes have been summed.
 * Characters flagged DCC_CK_URL_HOST or DCC_CK_URL_DOT are also appended
 * to dnsbl->tgt_dom when there is a DNSBL work area without a hit.
 *
 * FZ1.cp carries the buffer position from one call to the next.
 * FZ1.eol is the start of the line being collected, or 0 when the
 * start of that line has already been summed.
 */
void
dcc_ck_fuz1(DCC_GOT_CKS *cks, const char *bp, u_int bp_len)
{
	char *cp;
	DNSBL_WORK* dnsbl;
	int i, len, c;

	/* this checksum is not active */
	if (cks->sums[DCC_CK_FUZ1].type != DCC_CK_FUZ1)
		return;

	/* the limit has been reached, so ignore the rest of the message */
	if (FZ1.total >= MAX_FUZ1_LEN)
		return;

	cp = FZ1.cp;

	for (;;) {
		if (bp_len == 0) {
			/* Sum the buffer if it ends with a line.  Note that
			 * every message always ends with an artificial "\n". */
			if (FZ1.eol == cp) {
				/* The result of add_sum() is not needed
				 * here, because the test at the top of this
				 * function stops the next call if the limit
				 * has been reached. */
				add_sum(cks, cp - FZ1.buf);
				FZ1.eol = cp = FZ1.buf;
			}
			/* remember the position for the next chunk */
			FZ1.cp = cp;
			return;
		}
		--bp_len;
		c = *bp++;

		/* The result of dcc_ck_url() has an action code in the bits
		 * selected by DCC_CK_URL_MASK and the character to use
		 * above DCC_CK_URL_SHIFT.  It is given &cp, so presumably
		 * it can store into the buffer and advance cp by itself
		 * -- TODO confirm in dcc_ck_url(). */
		i = dcc_ck_url(&FZ1.url, c, &cp);
		c = i>>DCC_CK_URL_SHIFT;
		switch ((DCC_CK_URL)(i & DCC_CK_URL_MASK)) {
		case DCC_CK_URL_CHAR:
			break;
		case DCC_CK_URL_CK_LEN:
			/* Make room before starting a URL
			 * if we are too close to the end of
			 * the buffer for a maximum size URL */
			if (cp >= &FZ1.buf[sizeof(FZ1.buf)-DCC_URL_MAX]) {
				if (!FZ1.eol
				    || FZ1.eol < cp-DCC_FUZ1_MAX_LINE) {
					/* The start of the current line is
					 * unknown or more than
					 * DCC_FUZ1_MAX_LINE bytes back, so
					 * sum the whole buffer. */
					if (!add_sum(cks, cp - FZ1.buf))
					    return;
					FZ1.eol = 0;
					cp = FZ1.buf;
				} else {
					/* Sum only the complete lines and
					 * slide the current line to the
					 * front of the buffer. */
					len = FZ1.eol - FZ1.buf;
					if (!add_sum(cks, len))
					    return;
					memmove(FZ1.buf, FZ1.eol, cp - FZ1.eol);
					FZ1.eol = FZ1.buf;
					cp -= len;
				}
			}
			/* start collecting the target domain from scratch
			 * unless the DNSBL has already hit */
			if ((dnsbl = cks->dnsbl) != 0
			    && dnsbl->hit == DNSBL_HIT_NONE)
				dnsbl->tgt_dom_len = 0;
			/* nothing is stored for this byte */
			continue;
		case DCC_CK_URL_HOST:
		case DCC_CK_URL_DOT:
			/* append the character to the target domain
			 * if there is still room for it */
			if ((dnsbl = cks->dnsbl) != 0
			    && dnsbl->hit == DNSBL_HIT_NONE
			    && dnsbl->tgt_dom_len<ISZ(dnsbl->tgt_dom))
				dnsbl->tgt_dom[dnsbl->tgt_dom_len++] = c;
			break;
		case DCC_CK_URL_HOST_END:
			/* NOTE(review): unlike the neighboring cases, this
			 * call is made even when cks->dnsbl is 0.
			 * Presumably dcc_dnsbl_url() checks for that
			 * itself -- confirm. */
			dcc_dnsbl_url(cks->dnsbl);
			break;
		case DCC_CK_URL_HOST_RESET:
			/* discard what has been collected of the target
			 * domain unless the DNSBL has already hit */
			if ((dnsbl = cks->dnsbl) != 0
			    && dnsbl->hit == DNSBL_HIT_NONE)
				dnsbl->tgt_dom_len = 0;
			break;
		case DCC_CK_URL_SKIP:
			/* nothing is stored for this byte */
			continue;
		}

		/* collect only ASCII letters */
		if (c >= 'a' && c <= 'z') {
			/* Collect more of a new line */
			*cp = c;
			if (++cp < &FZ1.buf[sizeof(FZ1.buf)])
				continue;

			/* We are at the end of the buffer,
			 * so add it to the checksum */
			if (!add_sum(cks, cp - FZ1.buf))
				return;
			cp = FZ1.buf;
			/* the start of the current line is now gone */
			FZ1.eol = 0;
			continue;
		}

		if (c == '\n') {
			/* Ignore short lines starting with some strings */
			if (FZ1.eol
			    && (len = cp - FZ1.eol) > 0
			    && len <= DCC_FUZ1_MAX_LINE
			    && dear_sucker(FZ1.eol, len)) {
				/* back up so that the next line
				 * overwrites this one */
				cp = FZ1.eol;
				continue;
			}

			/* Add the line to the checksum if we do not
			 * have room in the buffer for another line */
			if (cp >= &FZ1.buf[sizeof(FZ1.buf) - (DCC_FUZ1_MAX_LINE
							+ DCC_HTTPS_LEN)]) {
				if (!add_sum(cks, cp - FZ1.buf))
					return;
				cp = FZ1.buf;
			}
			/* the next line starts here */
			FZ1.eol = cp;
		}
		/* any other character is dropped */
	}
}



/* Finish fuzzy checksum #1 at the end of the message.
 *	cks	checksum state
 * The digest is stored in the FUZ1 entry and marked for reporting, or
 * the entry is invalidated if too little text was summed.
 */
void
dcc_ck_fuz1_fin(DCC_GOT_CKS *cks)
{
	/* do nothing unless this checksum is active */
	if (cks->sums[DCC_CK_FUZ1].type == DCC_CK_FUZ1) {
		if (FZ1.total >= 30) {
			MD5Final(cks->sums[DCC_CK_FUZ1].sum, &FZ1.md5);
			cks->sums[DCC_CK_FUZ1].rpt2srvr = 1;
		} else {
			/* an empty or nearly empty message does not
			 * give a usable checksum */
			cks->sums[DCC_CK_FUZ1].type = DCC_CK_INVALID;
		}
	}
}
