/* Distributed Checksum Clearinghouse server database functions
 *
 * Copyright (c) 2004 by Rhyolite Software
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND RHYOLITE SOFTWARE DISCLAIMS ALL
 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL RHYOLITE SOFTWARE
 * BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Rhyolite Software DCC 1.2.66-1.93 $Revision$
 */

#include "srvr_defs.h"
#include <sys/resource.h>
#ifdef HAVE_HW_PHYSMEM
#include <sys/sysctl.h>
#endif

/* Storage class of this file's helper functions.  Define PSTATIC as
 * nothing to give the helpers external names for profiling. */
#define PSTATIC static
/* #define PSTATIC */			/* for profiling */

/* expected contents of the hash table entry at DB_HADDR_MAGIC */
static const u_char hash_magic[sizeof(HASH_ENTRY)] = DB_HASH_MAGIC;

DB_STATS db_stats;			/* cleared by db_open() */

DB_STATES db_sts;			/* the DB_STATE slots used below */

int db_fd = -1;				/* main database file, -1=closed */
DCC_PATH db_nm;				/* name of the main database file */
int db_hash_fd = -1;			/* hash table file, -1=closed */
DCC_PATH db_hash_nm;			/* db_nm followed by DB_HASH_SUFFIX */
struct timeval db_locked;		/* when locked; tv_sec=0 if unlocked */
u_int db_flags;				/* same as magic.s.flags */

struct timeval db_time;

/* Common UNIX buffer cache flushing mechanisms are too quick for
 * this database, causing far too much disk traffic.  However, it
 * is necessary to push the database toward the disk so that when dccd
 * shuts down, the system need not be confronted with GBytes to write.
 * So choose a delay that tries to push the database to the disk in less
 * than an hour but no more than 1 MByte/sec for a 4 GByte database.
 * 128 buffers covering 4 GBytes implies 30 MByte/buffer.  If 20% of
 * each buffer is dirty, then we don't want to flush more than about
 * 1 buffer every 5 seconds.
 * NOTE(review): the reasoning above arrives at 5 seconds but MSYNC_TIME
 * is 10; confirm which value is intended. */
#define MSYNC_TIME 10
/* time used with MSYNC_TIME by db_sync_some() to pace msync() */
static time_t msync_time;

int db_debug;

u_char grey_on;				/* 1=must be a greylist database */
static u_char db_no_mmap;		/* DB_OPEN_NO_MMAP and not read-only */
static u_char db_dirty;			/* hash file is marked dirty */
static u_char db_extended;		/* db_make_clean() must fsync() */
static u_char db_rdonly;		/* opened with DB_OPEN_RDONLY */
DB_PG_NUM db_end_pg_num = -1;		/* have rebuilt hash to this page */
u_char db_failed;			/* something bad happened */
static u_char db_invalidate;		/* do not write to the files */


int db_buf_total;			/* total # of db buffers */
DB_PTR db_max_rss;			/* maximum db resident set size */
/* use DB_PTR instead of off_t because off_t is often only 32-bits */

/* The buffer headers and the hash table used to find them by page number
 * have the same number of entries. */
#define DB_HASH_TOTAL DB_BUF_MAX
static DB_BUF *db_buf_hash[DB_HASH_TOTAL];
static DB_BUF db_bufs[DB_HASH_TOTAL];	/* control mmap()'ed blocks */
static DB_BUF *buf_oldest, *buf_newest;	/* ends of the older/newer chain */

/* printable path of the file behind a buffer */
#define B2PATH(b) ((b)->type == DB_BUF_TYPE_DB			\
		   ? DCC_NM2PATH(db_nm)				\
		   : DCC_NM2PATH(db_hash_nm))

/* head of the hash chain of buffer headers for a page number */
#define DB_BUF_HASH(pg_num) (&db_buf_hash[(pg_num) % DIM(db_buf_hash)])

/* version string expected at the start of the main database file */
static const DB_VERSION_BUF version_buf = DB_VERSION_STR;

u_int db_page_size;			/* size of 1 mmap()'ed buffer */

static off_t hash_fsize;		/* size of hash table file */
DB_HADDR db_hash_len;			/* # of hash table entries */
DB_HADDR db_hash_used;			/* # of hash table entries in use */
u_int db_hash_page_len;			/* # of HASH_ENTRY's per buffer */
DB_HADDR db_max_hash_entries = 0;	/* after db_buf_init()*/
static off_t db_fsize;			/* size of database file */
DB_PTR db_csize;			/* size of database contents in bytes */
static DB_PTR db_csize_stored_hash;	/* DB size stored in hash file */
static DB_PTR db_csize_stored_db;	/*  "  "     "    in database file */
DB_SN db_sn, db_sn_stored;		/* creation or expiration serial # */
DB_EX_TS db_ex_ts;			/* cleaned to these dates */
DB_EX_SECS db_ex_secs;			/*	with these durations */
static DB_EX_SECS db_ex_secs_stored;
DB_NOKEEP_CKS db_nokeep_cks;		/* ignore some checksums */
static DB_NOKEEP_CKS db_nokeep_cks_stored;
DB_FLOD_THOLDS db_flod_tholds;
static DB_FLOD_THOLDS db_flod_tholds_stored;
u_int db_page_max;			/* only padding after this in DB buf */
char db_window_size[32];		/* size of mmap() window */

/* Fuzziness level of each checksum type, indexed by checksum type.
 * This table is selected by db_open() when grey_on is not set.
 * NOTE(review): entry 10 is labeled DCC_CK_FUZ3 here but DCC_CK_GREY_MSG
 * in grey_ck_fuzziness[] below; confirm which name is current. */
static const u_char dcc_ck_fuzziness[DCC_DIM_CKS] = {
	0,				/* DCC_CK_INVALID */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_IP */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_ENV_FROM */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_FROM */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_SUB */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_MESSAGE_ID */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_RECEIVED */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_BODY */
	DCC_CK_FUZ_LVL1,		/* DCC_CK_FUZ1 */
	DCC_CK_FUZ_LVL2,		/* DCC_CK_FUZ2 */
	DCC_CK_FUZ_LVL2,		/* DCC_CK_FUZ3 */
	DCC_CK_FUZ_LVL2,		/* DCC_CK_GREY_TRIPLE */
	DCC_CK_FUZ_LVL2,		/* DCC_CK_SRVR_ID */
	DCC_CK_FUZ_LVL2			/* DCC_CK_ENV_TO */
};
/* The same table for greylist databases, selected by db_open() when
 * grey_on is set. */
static const u_char grey_ck_fuzziness[DCC_DIM_CKS] = {
	0,				/* DCC_CK_INVALID */
	DCC_CK_FUZ_LVL2,		/* DCC_CK_IP */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_ENV_FROM */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_FROM */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_SUB */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_MESSAGE_ID */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_RECEIVED */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_BODY */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_FUZ1 */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_FUZ2 */
	DCC_CK_FUZ_LVL_NO,		/* DCC_CK_GREY_MSG */
	DCC_CK_FUZ_LVL1,		/* DCC_CK_GREY_TRIPLE */
	DCC_CK_FUZ_LVL1,		/* DCC_CK_SRVR_ID */
	DCC_CK_FUZ_LVL1			/* DCC_CK_ENV_TO */
};
/* the table in use; the DCC table until db_open() chooses */
const u_char *db_ck_fuzziness = dcc_ck_fuzziness;


/* Forward declarations of helpers that are used before they are defined.
 * They are file-local unless PSTATIC is redefined for profiling. */
PSTATIC u_char buf_msync(DCC_EMSG, DB_BUF *);
PSTATIC u_char buf_munmap(DCC_EMSG, DB_BUF *);
PSTATIC u_char buf_mmap(DCC_EMSG, DB_BUF *, DB_PG_NUM);
PSTATIC DB_BUF *find_buf(DCC_EMSG, DB_BUF_TYPE, DB_PG_NUM);
PSTATIC u_char map_hash(DCC_EMSG, DB_HADDR, DB_STATE *);
PSTATIC u_char map_db(DCC_EMSG, DB_PTR, u_int, DB_STATE *);
PSTATIC void rel_db_states(u_char);
PSTATIC u_char db_flush_len(DCC_EMSG);
PSTATIC u_char db_make_clean(DCC_EMSG, u_char);


/* Greatest common divisor of n and m by Euclid's algorithm.
 *	A zero argument yields the other argument, so that
 *	gcd(0,x) and gcd(x,0) are x instead of a division by zero. */
static u_int
gcd(u_int n, u_int m)
{
	u_int r;

	/* start with n as the smaller value */
	if (n > m) {
		r = m; m = n; n = r;
	}
	/* replace (m,n) with (n, m mod n) until nothing remains */
	while (n != 0) {
		r = m % n;
		m = n;
		n = r;
	}
	return m;
}



/* Build the default set of checksum types that are not kept.
 *	DCC_CK_INVALID and DCC_CK_FLOD_PATH are always in the set.  The
 *	other members come from DB_GREY_NOKEEP() for a greylist database
 *	and from DB_DEF_NOKEEP() otherwise. */
DB_NOKEEP_CKS
def_nokeep_cks(void)
{
	DB_NOKEEP_CKS set = 0;
	DCC_CK_TYPES ck;

	DB_SET_NOKEEP(set, DCC_CK_INVALID);
	DB_SET_NOKEEP(set, DCC_CK_FLOD_PATH);

	for (ck = 0; ck < DCC_NUM_CKS; ++ck) {
		if (grey_on) {
			if (DB_GREY_NOKEEP(ck)) {
				DB_SET_NOKEEP(set, ck);
			}
		} else {
			if (DB_DEF_NOKEEP(ck)) {
				DB_SET_NOKEEP(set, ck);
			}
		}
	}

	return set;
}



/* Extend a file from old_size to new_size bytes by writing zeros.
 *	emsg	 error message buffer
 *	fd	 open file to extend
 *	nm	 name of the file for error messages
 *	new_size desired size, which must be larger than old_size
 *	old_size current size, where writing starts
 * At least in BSD/OS, mmap() cannot extend a file */
u_char					/* 0=failed, 1=file extended */
db_extend(DCC_EMSG emsg, int fd, const char *nm,
	  DB_PTR new_size, DB_PTR old_size)
{
	static u_char zeros[4096];
	DB_PTR remaining;
	int len, i;

	if (new_size > DB_PTR_MAX) {
		dcc_pemsg(EX_SOFTWARE, emsg,
			  "invalid new size %#llx for %s",
			  new_size, DCC_NM2PATH(nm));
		return 0;
	}
	if (new_size <= old_size) {
		dcc_pemsg(EX_SOFTWARE, emsg,
			  "new_size %#llx <= old_size %#llx in db_extend(%s)",
			  new_size, old_size, DCC_NM2PATH(nm));
		return 0;
	}

	/* Use write() because FreeBSD documentation cautions against mmap() on
	 * files with holes. */
	if (old_size != (DB_PTR)lseek(fd, old_size, SEEK_SET)) {
		dcc_pemsg(EX_IOERR, emsg, "lseek(%s,%#llx): %s",
			  DCC_NM2PATH(nm), old_size, ERROR_STR());
		return 0;
	}

	/* Compute what remains as a DB_PTR.  Squeezing the difference
	 * into an int would turn an extension of 4 GBytes into 0 bytes
	 * and report success without writing anything. */
	while (old_size < new_size) {
		remaining = new_size - old_size;
		if (remaining > (DB_PTR)sizeof(zeros))
			len = sizeof(zeros);
		else
			len = (int)remaining;
		i = write(fd, zeros, len);
		if (i != len) {
			/* requested length first and then the result */
			dcc_pemsg(EX_IOERR, emsg, "write(%s,%d)=%d: %s",
				  DCC_NM2PATH(nm), len, i, ERROR_STR());
			return 0;
		}
		old_size += len;
	}
	return 1;
}



/* Unmap every buffer that is in use but not locked.
 *	All buffers are tried even after a failure.  Only the first
 *	failure is reported through emsg. */
u_char					/* 0=problem 1=finished */
db_unload(DCC_EMSG emsg)
{
	u_char ok = 1;
	DB_BUF *buf;

	buf = buf_oldest;
	while (buf != 0) {
		if (buf->type != DB_BUF_TYPE_FREE
		    && buf->lock_cnt == 0
		    && !buf_munmap(emsg, buf)) {
			ok = 0;
			emsg = 0;
		}
		buf = buf->newer;
	}

	return ok;
}



/* msync() dirty, unlocked buffers that are marked DB_BUF_FG_MSYNC,
 * and all dirty, unlocked buffers when DB_IS_TIME() says it is time.
 *	This does not seem to have any effects on many systems */
u_char					/* 0=some msync() failed, 1=ok */
db_sync_some(DCC_EMSG emsg)
{
	u_char ok = 1;
	DB_BUF *buf;

	for (buf = buf_oldest; buf != 0; buf = buf->newer) {
		/* only buffers that are in use, unlocked, and dirty */
		if (buf->type != DB_BUF_TYPE_FREE
		    && buf->lock_cnt == 0
		    && (buf->flags & DB_BUF_FG_DIRTY)) {
			if ((buf->flags & DB_BUF_FG_MSYNC)
			    || DB_IS_TIME(msync_time, MSYNC_TIME)) {
				if (!buf_msync(emsg, buf)) {
					/* report only the first failure */
					ok = 0;
					emsg = 0;
				}
			}
		}
	}
	return ok;
}



/* Detach a DB_STATE from its buffer and drop one lock on the buffer.
 *	The buffer is unmapped when its last lock goes away while
 *	the database is not locked. */
static void
db_rel_state(DB_STATE *st)
{
	DB_BUF *buf = st->b;

	/* nothing to do for a state without a buffer */
	if (buf == 0)
		return;

	st->b = 0;
	st->d.p = 0;
	st->s.rptr = DB_PTR_BAD;

	--buf->lock_cnt;
	if (buf->lock_cnt == 0) {
		if (!DB_IS_LOCKED())
			buf_munmap(0, buf);
	} else if (buf->lock_cnt < 0) {
		dcc_logbad(EX_SOFTWARE,
			   "negative database buffer lock");
	}
}



/* Release the DB_STATEs in db_sts from rcd up to hash_ctl, and release
 * the state reached at the end of that walk only if not_hash_magic is 0. */
PSTATIC void
rel_db_states(u_char not_hash_magic)
{
	DB_STATE *st = &db_sts.rcd;

	while (st < &db_sts.hash_ctl) {
		db_rel_state(st);
		++st;
	}

	/* release the buffer with the dirty flag only if allowed */
	if (not_hash_magic)
		return;
	db_rel_state(st);
}



/* Shut down the database: flush what may be flushed, release every
 * DB_STATE and unlocked mmap()'ed buffer, close both files, and
 * reset the state flags of this module.
 *	Only the first failure is reported through emsg. */
u_char					/* 0=something failed, 1=ok */
db_close(DCC_EMSG emsg,
	 int mode)			/* -1=invalidate cache, 0=flush, 1=ok */
{
	u_char ok = 1;

	/* invalidating is flushing with db_invalidate set */
	if (mode < 0) {
		db_invalidate = 1;
		mode = 0;
	}

	/* flush the data and then release and flush the dirty flags */
	if (!db_make_clean(emsg, mode)) {
		ok = 0;
		emsg = 0;
	}
	rel_db_states(0);
	if (!db_unload(emsg)) {
		ok = 0;
		emsg = 0;
	}

	/* close the hash table first because the server is often
	 * waiting for the lock on the main file held by dbclean */
	if (db_hash_fd >= 0) {
		if (close(db_hash_fd) < 0) {
			dcc_pemsg(EX_IOERR, emsg, "close(%s): %s",
				  DCC_NM2PATH(db_hash_nm), ERROR_STR());
			ok = 0;
			emsg = 0;	/* print next error message directly */
		}
		db_hash_fd = -1;
	}
	if (db_fd >= 0) {
		if (close(db_fd) < 0) {
			dcc_pemsg(EX_IOERR, emsg, "close(%s): %s",
				  DCC_NM2PATH(db_nm), ERROR_STR());
			ok = 0;
			emsg = 0;	/* print next error message directly */
		}
		db_fd = -1;
	}

	/* nothing is open, so nothing is locked, dirty, or broken */
	db_locked.tv_sec = 0;		/* clear DB_IS_LOCKED() */
	db_extended = 0;
	db_dirty = 0;
	db_failed = 0;
	db_invalidate = 0;
	return ok;
}



/* Lock the main database file for exclusive use and check that its
 * size has not changed unexpectedly.
 *	A file that has grown is accepted only for a read-only database.
 *	The lock is released again if the checks fail, because
 *	db_unlock() does nothing unless db_locked has been set.
 * This locking does only multiple-readers/single-writer */
int					/* -1=failed, 0=was not locked, 1=was */
db_lock(DCC_EMSG emsg)
{
	struct stat sb;

	if (DB_IS_LOCKED())
		return 1;

	if (!dcc_exlock_fd(emsg, db_fd, DCC_LOCK_ALL_FILE, "", db_nm))
		return -1;
	if (0 > fstat(db_fd, &sb)) {
		/* build the message before errno can be disturbed */
		dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s",
			  DCC_NM2PATH(db_nm), ERROR_STR());
		dcc_unlock_fd(0, db_fd, DCC_LOCK_ALL_FILE, "", db_nm);
		return -1;
	}
	if (db_fsize != sb.st_size) {
		if (db_fsize > sb.st_size || !db_rdonly) {
			dcc_pemsg(EX_IOERR, emsg,
				  "%s changed from "OFF_HPAT" to "OFF_HPAT,
				  DCC_NM2PATH(db_nm), db_fsize, sb.st_size);
			dcc_unlock_fd(0, db_fd, DCC_LOCK_ALL_FILE, "", db_nm);
			return -1;
		}
		db_fsize = sb.st_size;
	}

	/* a non-zero time marks the database as locked */
	gettimeofday(&db_locked, 0);
	return 0;
}



/* Push all changes toward the disk and, if allowed, mark the hash file
 * clean by storing the real count of used hash table entries.
 *	emsg	error message buffer; it is dropped after the first
 *		failure so that later messages are printed directly
 *	ok	0=leave the hash file marked dirty
 * A failure of any step is remembered in the result, and the hash
 * control entry is touched only if it was successfully mapped.
 * Only 0 or 1 is returned by this code. */
PSTATIC u_char				/* 0=failed, 1=clean, 2=too much work */
db_make_clean(DCC_EMSG emsg, u_char ok)
{
	u_char result;
	u_char hash_ctl_mapped;

	if (!db_dirty)
		return 1;

	result = 1;
	hash_ctl_mapped = 0;

	/* send any changes to the disk,
	 * but keep the database-dirty flags in RAM */
	if (!db_failed
	    && ok
	    && db_hash_fd >= 0) {
		if (map_hash(emsg, DB_HADDR_SIZES, &db_sts.hash_ctl)) {
			hash_ctl_mapped = 1;
		} else {
			emsg = 0;
			result = 0;
		}
	}

	/* release everything except the state with the dirty flag */
	rel_db_states(1);
	if (!db_unload(emsg)) {
		/* do not forget an earlier failure */
		emsg = 0;
		result = 0;
	}

	if (db_extended) {
		/* Send the meta-data to disk so that other processes
		 * such as dbclean can find the new length of the file
		 * on Solaris. */
		if (0 > fsync(db_fd)) {
			dcc_pemsg(EX_IOERR, emsg, "fsync(%s): %s",
				  DCC_NM2PATH(db_nm), ERROR_STR());
			emsg = 0;
			result = 0;
		}
		db_extended = 0;
	}

	/* Clean the dirty flag in the hash table.
	 * With luck, this will reach the disk after everything else */
	if (hash_ctl_mapped
	    && !db_failed
	    && ok
	    && db_hash_fd >= 0
	    && (DB_HADDR_EX(db_sts.hash_ctl.d.h
			    ->HASH_STORE_USED) != db_hash_used)) {
		DB_HADDR_CP(db_sts.hash_ctl.d.h->HASH_STORE_USED, db_hash_used);
		db_sts.hash_ctl.b->flags |= (DB_BUF_FG_MSYNC | DB_BUF_FG_DIRTY);
		if (!buf_msync(emsg, db_sts.hash_ctl.b))
			result = 0;
	}

	db_dirty = 0;
	return result;
}



/* Mark the hash file, and so the database, dirty by zeroing the stored
 * count of used hash table entries and pushing that toward the disk.
 *	The database must be locked. */
static u_char				/* 0=failed, 1=marked dirty */
db_make_dirty(DCC_EMSG emsg)
{
	DB_STATE *ctl = &db_sts.hash_ctl;

	if (!db_dirty) {
		if (!DB_IS_LOCKED()) {
			dcc_pemsg(EX_SOFTWARE, emsg,
				  "dirtying unlocked database");
			return 0;
		}

		if (db_rdonly)
			dcc_logbad(EX_SOFTWARE, "dirtying read-only database");

		/* a stored count of 0 is what says "dirty" */
		if (!map_hash(emsg, DB_HADDR_SIZES, ctl))
			return 0;
		DB_HADDR_CP(ctl->d.h->HASH_STORE_USED, 0);
		ctl->b->flags |= DB_BUF_FG_MSYNC;
		if (!buf_msync(emsg, ctl->b))
			return 0;

		db_dirty = 1;
	}
	return 1;
}



/* (start to) unlock the database
 *	The database is made clean before the lock on the main file is
 *	released.  The database is marked unlocked even after a failure. */
u_char					/* 0=failed, 1=at least started */
db_unlock(DCC_EMSG emsg)
{
	int ok;

	if (DB_IS_LOCKED()) {
		/* when we unlock frequently, we cannot use the write() hack */
		db_no_mmap = 0;

		ok = db_make_clean(emsg, 1);
		if (!ok) {
			/* print next error message directly */
			emsg = 0;
		}
		if (!dcc_unlock_fd(emsg, db_fd, DCC_LOCK_ALL_FILE, "", db_nm))
			ok = 0;
		db_locked.tv_sec = 0;
		return ok;
	}
	return 1;
}



/* Set db_max_rss, the limit on the total size of the mmap()'ed window.
 *	The amount of physical memory is obtained from whichever of
 *	physmem_total(), sysconf(_SC_PHYS_PAGES), or sysctl(HW_PHYSMEM)
 *	was found when the program was configured.  The result is kept
 *	between DEF_DB_MBYTE and DCC_MAX_DB_MBYTE MBytes. */
static void
get_db_max_rss(void)
{
#define MIN_DB_MBYTE 32
#define MAX_DB_MBYTE 3072		/* 3 GByte on 32 bit machines is fair */
	/* ignore an unreasonable configured maximum */
#if DCC_MAX_DB_MBYTE < MIN_DB_MBYTE || DCC_MAX_DB_MBYTE > MAX_DB_MBYTE
#undef DCC_MAX_DB_MBYTE
#define DCC_MAX_DB_MBYTE MAX_DB_MBYTE
#endif
	/* use the configured default only if it is reasonable */
#if DCC_DB_MBYTE >= MIN_DB_MBYTE && DCC_DB_MBYTE <= DCC_MAX_DB_MBYTE
#define DEF_DB_MBYTE DCC_DB_MBYTE
#else
#define DEF_DB_MBYTE 64
#endif
	DB_PTR physmem = 0;		/* 0=could not ask the kernel */

#ifdef HAVE_PHYSMEM_TOTAL
	/* maybe someday physmem_total() will be widely available */
	physmem = physmem_total();
#else
#ifdef HAVE__SC_PHYS_PAGES
	long pages, pagesize;

	if ((pages = sysconf(_SC_PHYS_PAGES)) == -1) {
		dcc_error_msg("sysconf(_SC_PHYS_PAGES): %s",
			      ERROR_STR());
	} else if ((pagesize = sysconf(_SC_PAGESIZE)) == -1) {
		dcc_error_msg("sysconf(_SC_PAGESIZE): %s",
			      ERROR_STR());
	} else {
		/* multiply as DB_PTR in case long is too small */
		physmem = (DB_PTR)pages * (DB_PTR)pagesize;
	}
#else
#ifdef HAVE_HW_PHYSMEM
	int mib[2] = {CTL_HW, HW_PHYSMEM};
	int hw_physmem;
	size_t hw_physmem_len = sizeof(hw_physmem);

	/* NOTE(review): an int cannot describe 2 GBytes or more;
	 * confirm this is adequate on the systems that use this branch. */
	if (0 <= sysctl(mib, 2, &hw_physmem, &hw_physmem_len, 0, 0)) {
		physmem = hw_physmem;
	} else {
		dcc_error_msg("sysctl(HW_PHYSMEM): %s", ERROR_STR());
	}
#endif
#endif /* HAVE__SC_PHYS_PAGES */
#endif /* HAVE_PHYSMEM_TOTAL */

	/* Try to use physical memory less 512 MByte, or half of it if
	 * there is no more than 1 GByte.
	 * If that gives a reasonable size, use it.
	 * Otherwise use the default or the maximum. */
	if (physmem > 1024*1024*1024)
		db_max_rss = physmem - 512*1024*1024;
	else
		db_max_rss = physmem/2;
	if (db_max_rss/(1024*1024) < DEF_DB_MBYTE) {
		db_max_rss = DEF_DB_MBYTE;
		db_max_rss *= 1024*1024;
	} else if (db_max_rss/(1024*1024) > DCC_MAX_DB_MBYTE) {
		db_max_rss = DCC_MAX_DB_MBYTE;
		db_max_rss *= 1024*1024;
	}
#undef MIN_DB_MBYTE
#undef MAX_DB_MBYTE
#undef DEF_DB_MBYTE
}



/* Pick a buffer size that will hold an integral number of DB hash
 * table entries and is a multiple of system's page size.
 * The entire hash table should reside in memory
 * if the system has enough memory.
 *	db_buf_total is set as a side effect to the number of buffers
 *	that fit the window of db_max_rss bytes, kept between DB_BUF_MIN
 *	and DB_BUF_MAX when an old page size is required. */
int					/* 0=old page size is impossible */
db_get_page_size(u_int old_page_size,	/* 0 or required page size */
		 u_int tgt_page_size)	/* 0 or target page size */
{
	u_int min_page_size, max_page_size;

	/* Ask the operating system only once so we don't get differing
	 * answers and so compute a varying page size.
	 * Some systems can't keep their stories straight. */
	if (db_max_rss == 0)
		get_db_max_rss();

	/* Compute the least common multiple of the system page size
	 * and the DB hash table entry size.  This will give us the
	 * smallest page size that we can use. */
	min_page_size = getpagesize();
	min_page_size *= (sizeof(HASH_ENTRY)
			  / gcd(sizeof(HASH_ENTRY), min_page_size));

	/* The DB buffer or page size must also be a multiple of
	 * the end-of-page padding used in the main database file. */
	if (sizeof(DB_RCD) % DB_RCD_PAD != 0)
		dcc_logbad(EX_SOFTWARE,
			   "DB padding size %d"
			   " is not a divisor of DB entry size %d",
			   DB_RCD_PAD, ISZ(DB_RCD));
	min_page_size *= (DB_RCD_PAD / gcd(DB_RCD_PAD, min_page_size));

	/* Use the old buffer size if possible so we are not confused
	 * by padding at the ends of the old pages.
	 * Fail if it is impossible.  This should cause dbclean to
	 * rebuild the database. */
	if (old_page_size != 0) {
		if ((old_page_size % min_page_size) != 0)
			return 0;
		/* adjust the number of buffers to fit our window size */
		db_buf_total = (db_max_rss + old_page_size-1) / old_page_size;
		if (db_buf_total > DB_BUF_MAX)
			db_buf_total = DB_BUF_MAX;
		/* Too few buffers must become the minimum.  Using the
		 * maximum here would map far more than db_max_rss. */
		if (db_buf_total < (int)DB_BUF_MIN)
			db_buf_total = (int)DB_BUF_MIN;
		return old_page_size;
	}

	/* otherwise use all of the buffers and size them to the window */
	db_buf_total = DB_BUF_MAX;
	max_page_size = db_max_rss / db_buf_total;
	max_page_size -= max_page_size % min_page_size;

	/* If we have a target page size, try to use it.
	 * A target page size is big enough to hold 25% of the entire
	 * greylist database */
	if (tgt_page_size != 0 && tgt_page_size < max_page_size) {
		tgt_page_size -= tgt_page_size % min_page_size;
		if (tgt_page_size < min_page_size)
			tgt_page_size = min_page_size;
		return tgt_page_size;
	} else if (max_page_size > min_page_size) {
		return max_page_size;
	} else {
		return min_page_size;
	}
}



/* (re)create the buffer pool
 * The buffers are small blocks that point to the real mmap()'ed memory.
 *	The page size and the values derived from it are computed first.
 *	Then every buffer header is cleared, the first db_buf_total
 *	headers are chained from buf_oldest to buf_newest, and the hash
 *	table of buffer headers is emptied.
 */
u_char					/* 0=impossible page size, 1=ok */
db_buf_init(u_int old_page_size)	/* 0 or required page size */
{
	DB_BUF *b, *bprev;
	int i;


	db_page_size = db_get_page_size(old_page_size, 0);
	if (!db_page_size)
		return 0;

	db_page_max = db_page_size - DB_RCD_PAD;
	db_hash_page_len = db_page_size/sizeof(HASH_ENTRY);

	db_max_hash_entries = (MAX_HASH_ENTRIES
			       - MAX_HASH_ENTRIES % db_hash_page_len);

	/* Release what is attached to every header, including the last
	 * one, before the headers are wiped.
	 * NOTE(review): clear_hash() puts mmap()'ed memory into buf.v;
	 * confirm that such buffers have always been unmapped and buf.v
	 * cleared before this point, since free() is only right for
	 * malloc()'ed memory. */
	for (b = db_bufs; b < &db_bufs[DIM(db_bufs)]; ++b)
		free(b->buf.v);
	memset(db_bufs, 0, sizeof(db_bufs));

	/* chain the headers in use from the oldest to the newest */
	bprev = 0;
	b = db_bufs;
	buf_oldest = b;
	for (i = 1; i < db_buf_total; ++i) {
		b->older = bprev;
		b->newer = b+1;
		bprev = b;
		++b;
	}
	b->older = bprev;
	buf_newest = b;

	memset(db_buf_hash, 0, sizeof(db_buf_hash));

	return 1;
}



/* Build the initial contents of one entry of a new hash table.
 *	hash	where to build the entry
 *	rcd_num	address of the entry in the hash table
 * The entry at DB_HADDR_MAGIC gets the magic string, the entry at
 * DB_HADDR_SIZES gets the table length and the initial used count,
 * and every other entry is linked to its neighbors in a free list
 * that starts and ends at DB_HADDR_FREE. */
static void
clear_hash_entry(HASH_ENTRY *hash, DB_HADDR rcd_num)
{
	DB_HADDR prev, next;

	if (rcd_num == DB_HADDR_MAGIC) {
		memcpy(hash, &hash_magic, sizeof(hash_magic));
		return;
	}

	memset(hash, 0, sizeof(*hash));

	if (rcd_num == DB_HADDR_SIZES) {
		DB_HADDR_CP(hash->HASH_STORE_LEN, db_hash_len);
		DB_HADDR_CP(hash->HASH_STORE_USED, DB_HADDR_MIN);
		return;
	}

	/* The first ordinary entry points back to the list head,
	 * and the list head points back to the last entry. */
	if (rcd_num == DB_HADDR_MIN)
		prev = DB_HADDR_FREE;
	else if (rcd_num == DB_HADDR_FREE)
		prev = db_hash_len - 1;
	else
		prev = rcd_num - 1;

	/* The list head points forward to the first ordinary entry,
	 * and the last entry points forward to the list head. */
	if (rcd_num == DB_HADDR_FREE) {
		next = DB_HADDR_MIN;
	} else {
		next = rcd_num+1;
		if (next >= db_hash_len)
			next = DB_HADDR_FREE;
	}

	DB_HADDR_CP(hash->bak, prev);
	DB_HADDR_CP(hash->fwd, next);
}



#ifdef MAP_ANON
/* Clear new hash file by linking all of its entries into
 * the free list using the DB buffer/page mechanism
 *	Each page of the hash table is obtained with find_buf().  A
 *	buffer that comes back without memory gets an anonymous private
 *	mapping and is flagged DB_BUF_FG_NO_MMAP and DB_BUF_FG_ANON.
 *	db_hash_len and hash_fsize must have been set by the caller. */
static u_char				/* 0=failed, 1=ok */
clear_hash(DCC_EMSG emsg)
{
	DB_HADDR haddr;
	DB_BUF *b;
	HASH_ENTRY *hash, *hash_lim;
	void *p;
	int i;

	/* the new table is not clean until it has been written */
	db_dirty = 1;
	haddr = DB_HADDR_NULL;
	/* equal pointers force a buffer to be found for the first entry */
	hash = hash_lim = 0;
	for (haddr = 0; haddr < db_hash_len; ++haddr, ++hash) {
		if (hash >= hash_lim) {
			/* reached the end of a page, so get the next */
			b = find_buf(emsg, DB_BUF_TYPE_HASH,
				     haddr / db_hash_page_len);
			if (!b)
				return 0;
			hash = b->buf.h;
			if (!hash) {
				p = mmap(0, db_page_size,
					 PROT_READ | PROT_WRITE,
					 MAP_ANON | MAP_PRIVATE, -1, 0);
				if (p == MAP_FAILED) {
					dcc_pemsg(EX_IOERR, emsg,
						  "mmap(anon, %d): %s",
						  db_page_size, ERROR_STR());
					return 0;
				}
				b->buf.v = p;
				hash = p;
				b->flags |= (DB_BUF_FG_NO_MMAP
					     | DB_BUF_FG_ANON);
			}
			/* stop at the end of this page or of the table,
			 * whichever comes first */
			hash_lim = hash;
			if (db_hash_len - haddr > db_hash_page_len)
				hash_lim += db_hash_page_len;
			else
				hash_lim += db_hash_len - haddr;
		}
		clear_hash_entry(hash, haddr);
	}

	/* The hash table might not be an even number of pages,
	 * but the file must be.  We know the excess is less than
	 * one hash table entry. */
	i = hash_fsize - db_hash_len*sizeof(*hash);
	if (i > 0)
		memset(hash, 0, i);

	return 1;
}

#else /* !defined(MAP_ANON) */

/* Without MAP_ANON, a new hash file is cleared by linking all of its
 * entries into the free list and using write().
 *
 * Write the bytes from buf up to but not including ptr to the hash file.
 *	Writing nothing is a success. */
static u_char				/* 0=failed, 1=ok */
write_hash_buf(DCC_EMSG emsg, void *buf, void *ptr)
{
	int want, got;

	want = (char *)ptr - (char *)buf;
	if (want != 0) {
		got = write(db_hash_fd, buf, want);
		if (got != want) {
			dcc_pemsg(EX_IOERR, emsg, "write(%s,%d)=%d: %s",
				  DCC_NM2PATH(db_hash_nm), want, got,
				  ERROR_STR());
			return 0;
		}
	}
	return 1;
}



/* Clear a new hash file with write() by building the entries in a
 * local array that is written to the file whenever it fills.
 *	db_hash_len and hash_fsize must have been set by the caller,
 *	and the file must be positioned at its start. */
static u_char				/* 0=failed, 1=ok */
clear_hash(DCC_EMSG emsg)
{
	DB_HADDR haddr;
	HASH_ENTRY *hash, hash_buf[512];
	int i;

	hash = hash_buf;
	for (haddr = 0; haddr < db_hash_len; ++haddr, ++hash) {
		/* flush the array before it overflows */
		if (hash >= LAST(hash_buf)) {
			if (!write_hash_buf(emsg, hash_buf, hash))
				return 0;
			hash = hash_buf;
		}

		clear_hash_entry(hash, haddr);
	}
	/* write what is left in the array */
	if (!write_hash_buf(emsg, hash_buf, hash))
		return 0;

	/* The hash table might not be an even number of pages,
	 * but the file must be.  We know the excess is less than
	 * one hash table entry. */
	i = hash_fsize - db_hash_len*sizeof(*hash);
	if (i > 0) {
		memset(hash_buf, 0, sizeof(hash_buf));
		/* the excess is a number of bytes, not of entries */
		if (!write_hash_buf(emsg, hash_buf, (char *)hash_buf + i))
			return 0;
	}

	return 1;
}
#endif


/* Create an empty hash table in the already open hash file.
 *	emsg		error message buffer
 *	new_hash_len	requested number of hash table entries
 * The request is limited to db_max_hash_entries and then increased so
 * that the file is a whole number of pages.  All DB_STATEs and unlocked
 * buffers are released before the file is truncated and cleared. */
static u_char				/* 0=failed, 1=ok */
make_new_hash(DCC_EMSG emsg, DB_HADDR new_hash_len)
{
	struct stat sb;
	u_int n;

	if (getuid() == 0) {
		/* if we are running as root,
		 * don't change the owner of the database */
		if (0 > fstat(db_fd, &sb)) {
			dcc_pemsg(EX_IOERR, emsg, "fstat(%s): %s",
				  DCC_NM2PATH(db_nm), ERROR_STR());
			return 0;
		}
		/* give the hash file the owner of the main file */
		if (0 > fchown(db_hash_fd, sb.st_uid, sb.st_gid)) {
			dcc_pemsg(EX_IOERR, emsg, "fchown(%s,%d,%d): %s",
				  DCC_NM2PATH(db_hash_nm),
				  (int)sb.st_uid, (int)sb.st_gid,
				  ERROR_STR());
			return 0;
		}
	}

	if (new_hash_len > db_max_hash_entries)
		new_hash_len = db_max_hash_entries;

	/* increase the requested hash table size to a multiple
	 * of the page size */
	hash_fsize = new_hash_len * sizeof(HASH_ENTRY);
	n = hash_fsize % db_page_size;
	if (n != 0) {
		hash_fsize += db_page_size - n;
		new_hash_len = hash_fsize/sizeof(HASH_ENTRY);
	}

	if (new_hash_len < MIN_HASH_ENTRIES) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "database size %d is too small", new_hash_len);
		return 0;
	}
	if (new_hash_len > MAX_HASH_ENTRIES) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "database size %d is too large", new_hash_len);
		return 0;
	}

	/* create the empty hash table file */
	rel_db_states(0);
	if (!db_unload(emsg))
		return 0;
	if (0 > ftruncate(db_hash_fd, 0)) {
		/* report the length that was actually requested */
		dcc_pemsg(EX_IOERR, emsg, "ftruncate(%s,0): %s",
			  DCC_NM2PATH(db_hash_nm),
			  ERROR_STR());
		return 0;
	}

	db_hash_len = new_hash_len;
	db_hash_used = DB_HADDR_MIN;
	return clear_hash(emsg);
}



/* Check that an existing hash file is plausible and fetch its sizes.
 *	The file size, the magic entry, the ends of the free list, the
 *	stored table length, the stored count of used entries, and the
 *	stored size of the main database file are checked in that order.
 *	db_hash_len, db_hash_used, hash_fsize, and db_csize_stored_hash
 *	are set along the way.  db_csize must already be set. */
static u_char				/* 0=bad hash file, 1=ok */
check_old_hash(DCC_EMSG emsg)
{
	DB_HADDR fwd, bak, stored_hash_len;
	struct stat sb;

	/* check the size of the existing hash file */
	if (0 > fstat(db_hash_fd, &sb)) {
		dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s",
			  DCC_NM2PATH(db_hash_nm), ERROR_STR());
		return 0;
	}
	hash_fsize = sb.st_size;
	if ((hash_fsize % sizeof(HASH_ENTRY)) != 0) {
		dcc_pemsg(EX_DATAERR, emsg, "%s has size "OFF_DPAT","
			  " not a multiple of %d",
			  DCC_NM2PATH(db_hash_nm), hash_fsize,
			  ISZ(HASH_ENTRY));
		return 0;
	}

	db_hash_len = hash_fsize/sizeof(HASH_ENTRY);
	if (db_hash_len < MIN_HASH_ENTRIES) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has too few records, "OFF_DPAT" bytes",
			  DCC_NM2PATH(db_hash_nm), hash_fsize);
		return 0;
	}

	/* check the magic number */
	if (!map_hash(emsg, DB_HADDR_MAGIC, &db_sts.hash_ctl))
		return 0;
	if (memcmp(db_sts.hash_ctl.d.h, &hash_magic, sizeof(HASH_ENTRY))) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has the wrong magic \"%.*s\"",
			  DCC_NM2PATH(db_hash_nm),
			  ISZ(HASH_ENTRY), db_sts.hash_ctl.d.c);
		return 0;
	}

	/* Check both ends of the free list.
	 * NOTE(review): db_hash_used is not loaded from this file until
	 * later in this function, so the comparisons with db_hash_len here
	 * see whatever value it had before; confirm that is intended. */
	if (!map_hash(emsg, DB_HADDR_FREE, &db_sts.hash_ctl))
		return 0;
	fwd = DB_HADDR_EX(db_sts.hash_ctl.d.h->fwd);
	if (DB_HADDR_INVALID(fwd)
	    && (fwd != DB_HADDR_FREE
		|| db_hash_len > db_hash_used)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has a broken free list head of %#x",
			  DCC_NM2PATH(db_hash_nm), fwd);
		return 0;
	}
	bak = DB_HADDR_EX(db_sts.hash_ctl.d.h->bak);
	if (DB_HADDR_INVALID(bak)
	    && (bak != DB_HADDR_FREE
		|| db_hash_len > db_hash_used)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has a broken free list tail of %#x",
			  DCC_NM2PATH(db_hash_nm), bak);
		return 0;
	}

	/* fetch number of hash table entries used in existing file */
	if (!map_hash(emsg, DB_HADDR_SIZES, &db_sts.hash)) {
		return 0;
	}
	stored_hash_len = DB_HADDR_EX(db_sts.hash.d.h->HASH_STORE_LEN);
	if (db_hash_len != stored_hash_len) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has %d entries but claims %d",
			  DCC_NM2PATH(db_hash_nm),
			  db_hash_len, stored_hash_len);
		return 0;
	}
	db_hash_used = DB_HADDR_EX(db_sts.hash.d.h->HASH_STORE_USED);
	if (db_hash_used < DB_HADDR_MIN) {
		/* db_make_dirty() stores 0 while the file is dirty */
		if (db_hash_used == 0)
			dcc_pemsg(EX_DATAERR, emsg,
				  "%s was not closed cleanly",
				  DCC_NM2PATH(db_hash_nm));
		else
			dcc_pemsg(EX_DATAERR, emsg,
				  "%s contains an impossible %d entries",
				  DCC_NM2PATH(db_hash_nm),
				  HASH_LEN_EXT(db_hash_used));
		return 0;
	}
	/* a completely full table is as useless as an overfull one */
	if (db_hash_used >= db_hash_len) {
		if (db_hash_used > db_hash_len)
			dcc_pemsg(EX_DATAERR, emsg,
				  "%s contains only %d entries but %d used",
				  DCC_NM2PATH(db_hash_nm),
				  HASH_LEN_EXT(db_hash_len),
				  HASH_LEN_EXT(db_hash_used));
		else
			dcc_pemsg(EX_DATAERR, emsg,
				  "%s is filled with %d entries",
				  DCC_NM2PATH(db_hash_nm),
				  HASH_LEN_EXT(db_hash_len));
		return 0;
	}
	/* the hash file must agree with the main file about its size */
	db_csize_stored_hash = DB_HPTR_EX(db_sts.hash.d.h->HASH_STORE_DB_CSIZE);
	if (db_csize_stored_hash != db_csize
	    && hash_fsize != 0) {
		/* NOTE(review): db_nm is passed without DCC_NM2PATH() here,
		 * presumably because the macro cannot be used twice in one
		 * call; confirm before changing it. */
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s contains %lld bytes"
			  " instead of the %lld that %s claims",
			  db_nm, db_csize,
			  db_csize_stored_hash, DCC_NM2PATH(db_hash_nm));
		return 0;
	}

	return 1;
}



/* open the files and generally get ready to work
 *	emsg		error message buffer
 *	new_db_nm	name of the main database file, or null to reuse
 *			the current name or to use the default name
 *	new_hash_len	0 to check and use the existing hash file, or the
 *			number of entries in a new, empty hash table
 *	mode		DB_OPEN_* bits
 * Any database that is already open is closed first.  On success both
 * files are open and the main file is marked as locked in db_locked.
 * On failure everything is closed again with db_close(0, -1). */
u_char					/* 0=failed, 1=ok */
db_open(DCC_EMSG emsg,
	const char *new_db_nm,
	DB_HADDR new_hash_len,		/* 0 or # of entries */
	u_char mode)			/* DB_OPEN_* */
{
	u_int cur_page_size;
	int hash_flags, db_open_flags;
	struct stat db_sb;
	DB_PTR window;

	/* start fresh, and assume the worst until finished */
	db_close(0, 1);
	db_failed = 1;

	memset(&db_stats, 0, sizeof(db_stats));

	/* build the names of both files, leaving room for the suffix */
	if (!new_db_nm && db_nm[0] == '\0')
		new_db_nm = grey_on ? DB_GREY_NAME : DB_DCC_NAME;
	if (new_db_nm) {
		if (strlen(new_db_nm) >= (sizeof(DCC_PATH)
						- sizeof(DB_HASH_SUFFIX))) {
			dcc_pemsg(EX_DATAERR, emsg,
				  "invalid DB nm \"%s\"", new_db_nm);
			return 0;
		}
		strcpy(db_nm, new_db_nm);
		strcpy(db_hash_nm, new_db_nm);
		strcat(db_hash_nm, DB_HASH_SUFFIX);
	}

	db_rdonly = (mode & DB_OPEN_RDONLY) != 0;
	db_no_mmap = !db_rdonly && (mode & DB_OPEN_NO_MMAP) != 0;

	/* reject impossible combinations of modes
	 * and choose the open() flags for both files */
	if (mode & DB_OPEN_LOCK_EXT) {
		if (new_hash_len) {
			dcc_pemsg(EX_SOFTWARE, emsg,
				  "extending db_open(%s) without locking",
				  DCC_NM2PATH(db_nm));
			return 0;
		}
		if (!db_rdonly) {
			dcc_pemsg(EX_SOFTWARE, emsg,
				  "db_open(%s) read/write without locking",
				  DCC_NM2PATH(db_nm));
			return 0;
		}
		db_open_flags = O_RDONLY;
		hash_flags = O_RDONLY;
	} else {
		db_open_flags = O_RDWR;
		if (new_hash_len) {
			if (db_rdonly) {
				dcc_pemsg(EX_SOFTWARE, emsg,
					  "db_open(%s) creating read-only",
					  DCC_NM2PATH(db_nm));
				return 0;
			}
			hash_flags = O_RDWR | O_CREAT;
		} else {
			/* must open the file read/write to lock it */
			hash_flags = O_RDWR;
		}
	}

	/* open and lock the main file */
	db_fd = dcc_lock_open(emsg, db_nm, db_open_flags,
			      (mode & DB_OPEN_LOCK_NOWAIT)
			      ? DCC_LOCK_OPEN_NOWAIT
			      : (mode & DB_OPEN_LOCK_EXT)
			      ? DCC_LOCK_OPEN_EXT
			      : DCC_LOCK_OPEN_WAIT,
			      DCC_LOCK_ALL_FILE, 0);
	if (db_fd == -1) {
		db_close(0, -1);
		return 0;
	}
	/* a non-zero time marks the database as locked */
	gettimeofday(&db_locked, 0);
	if (0 > fstat(db_fd, &db_sb)) {
		dcc_pemsg(EX_IOERR, emsg, "stat(%s): %s",
			  DCC_NM2PATH(db_nm), ERROR_STR());
		db_close(0, -1);
		return 0;
	}
	db_csize = db_fsize = db_sb.st_size;
	if (db_fsize < ISZ(DB_MAGIC)) {
		dcc_pemsg(EX_IOERR, emsg,
			  "%s with %d bytes is too small to be a DCC database",
			  DCC_NM2PATH(db_nm), (int)db_fsize);
		db_close(0, -1);
		return 0;
	}

	/* check the header of the database file */
	db_buf_init(0);
	if (!map_db(emsg, 0, sizeof(DB_MAGIC), &db_sts.rcd_magic)) {
		db_close(0, -1);
		return 0;
	}
	if (memcmp(db_sts.rcd_magic.d.magic->s.version, version_buf,
		   sizeof(version_buf))) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s contains the wrong magic string \"%.*s\"",
			  DCC_NM2PATH(db_nm),
			  ISZ(db_sts.rcd_magic.d.magic->s.version),
			  db_sts.rcd_magic.d.magic->s.version);
		db_close(0, -1);
		return 0;
	}
	/* the kind of database must be the kind the caller expects */
	db_flags = db_sts.rcd_magic.d.magic->s.flags;
	if (!(db_flags & DB_MAGIC_ST_GREY) != !grey_on) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s is%s a greylist database but must%s be",
			  DCC_NM2PATH(db_nm),
			  (db_flags & DB_MAGIC_ST_GREY) ? "" : " not",
			  grey_on ? "" : " not");
		db_close(0, -1);
		return 0;
	}
	/* copy the parameters stored in the header, keeping a second
	 * copy of the stored values of some of them */
	memcpy(db_sn, db_sts.rcd_magic.d.magic->s.sn, sizeof(db_sn));
	memcpy(db_sn_stored, db_sn, sizeof(db_sn_stored));
	memcpy(db_ex_ts, db_sts.rcd_magic.d.magic->s.ex_ts,
	       sizeof(db_ex_ts));
	memcpy(&db_ex_secs, &db_sts.rcd_magic.d.magic->s.ex_secs,
	       sizeof(db_ex_secs));
	memcpy(&db_ex_secs_stored, &db_sts.rcd_magic.d.magic->s.ex_secs,
	       sizeof(db_ex_secs_stored));
	db_nokeep_cks_stored = db_sts.rcd_magic.d.magic->s.nokeep_cks;
	db_nokeep_cks = db_nokeep_cks_stored;
	DB_SET_NOKEEP(db_nokeep_cks, DCC_CK_INVALID);
	DB_SET_NOKEEP(db_nokeep_cks, DCC_CK_FLOD_PATH);
	memcpy(db_flod_tholds_stored, db_sts.rcd_magic.d.magic->s.flod_tholds,
	       sizeof(db_flod_tholds_stored));
	memcpy(db_flod_tholds, db_flod_tholds_stored,
	       sizeof(db_flod_tholds));
	db_ck_fuzziness = grey_on ? grey_ck_fuzziness : dcc_ck_fuzziness;

	/* The size of the contents claimed by the header must be
	 * between the size of the header and the size of the file.
	 * NOTE(review): the two comments below say "if we are not
	 * rebuilding", but the conditions fail when new_hash_len is
	 * not 0, which is when a new hash table is made; confirm which
	 * is intended. */
	db_csize = db_csize_stored_db = db_sts.rcd_magic.d.magic->s.db_csize;
	if (db_csize < sizeof(DB_MAGIC)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s says it contains %lld bytes"
			  " or fewer than the minimum of %d",
			  DCC_NM2PATH(db_nm), db_csize, ISZ(DB_MAGIC));
		/* that is a fatal error if we are not rebuilding */
		if (new_hash_len != 0) {
			db_close(0, -1);
			return 0;
		}
	}
	if (db_csize > (DB_PTR)db_fsize) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s says it contains %lld bytes"
			  " or more than the actual size of "OFF_DPAT,
			  DCC_NM2PATH(db_nm), db_csize, db_fsize);
		/* that is a fatal error if we are not rebuilding */
		if (new_hash_len != 0) {
			db_close(0, -1);
			return 0;
		}
	}

	/* finished with the header */
	cur_page_size = db_sts.rcd_magic.d.magic->s.page_size;
	db_rel_state(&db_sts.rcd_magic);

	/* The buffer or page size we use must be the page size used to
	 * write the files.  Try to change our size to match the file */
	if (cur_page_size != db_page_size) {
		/* discard the buffers made with the wrong page size
		 * without writing to the files */
		db_invalidate = 1;
		rel_db_states(0);
		if (!db_unload(emsg)) {
			db_close(0, -1);
			return 0;
		}
		db_invalidate = 0;
		if (!db_buf_init(cur_page_size)) {
			dcc_error_msg("%s has page size %d"
				      " incompatible with %d in %s",
				      DCC_NM2PATH(db_nm),
				      cur_page_size, db_get_page_size(0, 0),
				      db_hash_nm);
			db_close(0, -1);
			return 0;
		}
	}

	/* open the hash file and then make or check its contents */
	db_csize_stored_hash = 0;
	db_hash_len = 0;
	db_hash_fd = open(db_hash_nm, hash_flags, 0666);
	if (db_hash_fd < 0) {
		dcc_pemsg(EX_IOERR, emsg, "open(%s): %s",
			  DCC_NM2PATH(db_hash_nm), ERROR_STR());
		db_close(0, -1);
		return 0;
	}
	if (0 > fcntl(db_hash_fd, F_SETFD, FD_CLOEXEC)) {
		dcc_pemsg(EX_IOERR, emsg, "fcntl(%s, FD_CLOEXEC): %s",
			  DCC_NM2PATH(db_hash_nm), ERROR_STR());
		db_close(0, -1);
		return 0;
	}

	if (new_hash_len != 0) {
		if (!make_new_hash(emsg, new_hash_len)) {
			db_close(0, -1);
			return 0;
		}
	} else {
		if (!check_old_hash(emsg)) {
			db_close(0, -1);
			return 0;
		}
	}

	/* the main file must be a whole number of pages */
	db_end_pg_num = db_fsize / db_page_size;
	if (db_fsize % db_page_size != 0) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has size "OFF_HPAT","
			  " not a multiple of its page size of %#x",
			  DCC_NM2PATH(db_nm), db_fsize, db_page_size);
		db_close(0, -1);
		return 0;
	}
	/* Fill the last page of the database with zeros in case
	 * the length was wrong.
	 * That is possible only if the length is wrong by less than a page. */
	if ((DB_PTR)db_fsize > db_csize + db_page_size
	    || db_csize > (DB_PTR)db_fsize) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "%s has size "OFF_HPAT" but claims %#llx",
			  DCC_NM2PATH(db_nm), db_fsize, db_csize);
		db_close(0, -1);
		return 0;
	}
	if (!db_rdonly && (DB_PTR)db_fsize > db_csize) {
		if (!map_db(emsg, db_csize, db_fsize - db_csize, &db_sts.rcd)) {
			db_close(0, -1);
			return 0;
		}
		memset(db_sts.rcd.d.r, 0, db_fsize - db_csize);
		db_sts.rcd.b->flags |= DB_BUF_FG_MSYNC;
	}

	/* write new sizes and other parameters.
	 * This should be a NOP if the file is read only,
	 * but try it just in case. */
	if (!db_flush_magic(emsg)) {
		db_close(0, -1);
		return 0;
	}

	/* Describe the size of the window for log messages.
	 * NOTE(review): the product is computed with the types of
	 * db_page_size and db_buf_total before it is widened to DB_PTR;
	 * confirm it cannot reach 4 GBytes. */
	window = db_page_size*db_buf_total;
	if (window >= (1024*1024)) {
		snprintf(db_window_size, sizeof(db_window_size),
			 "%d MByte window",
			 (int)(window / (1024*1024)));
	} else {
		snprintf(db_window_size, sizeof(db_window_size),
			 "%d KByte window",
			 (int)(window / 1024));
	}
	rel_db_states(0);
	db_failed = 0;
	return 1;
}



/* Recycle the oldest unlocked buffer for a page of either the hash table
 *	or the database file and put it at the head of hash chain *bh.
 *	A page still mapped in the buffer is unmapped first.
 *	Returns the buffer or 0 after an error. */
PSTATIC DB_BUF *
get_free_buf(DCC_EMSG emsg, DB_BUF **bh)
{
	DB_BUF *victim, *next, *prev, *head;

	/* Walk the MRU chain from its oldest end until an unlocked buffer
	 * turns up.  There should always be one because there are more
	 * buffers than can be locked at the same time. */
	for (victim = buf_oldest; victim != 0; victim = victim->newer) {
		if (victim->lock_cnt == 0)
			break;
	}
	if (victim == 0) {
		dcc_pemsg(EX_SOFTWARE, emsg,
			  "broken DB buffer MRU chain");
		return 0;
	}

	/* Detach the victim from whatever hash chain it is on. */
	next = victim->fwd;
	prev = victim->bak;
	if (next != 0)
		next->bak = prev;
	if (prev != 0)
		prev->fwd = next;
	else if (victim->hash != 0)
		*victim->hash = next;

	/* Give up its old page unless it was already free. */
	if (victim->type != DB_BUF_TYPE_FREE
	    && !buf_munmap(emsg, victim))
		return 0;

	/* Push it on the front of the requested hash chain.
	 * The chain head is read only now because unlinking the victim
	 * above can have changed it. */
	head = *bh;
	victim->bak = 0;
	victim->hash = bh;
	victim->fwd = head;
	*bh = victim;
	if (head != 0)
		head->bak = victim;

	return victim;
}



/* Find the buffer holding page pg_num of the given type, recycling a
 *	free buffer if the page has no buffer yet.
 *	The buffer is moved to the newest end of the MRU chain and is
 *	marked dirty unless the database is read-only.
 *	Returns the buffer or 0 if no buffer could be obtained. */
PSTATIC DB_BUF *
find_buf(DCC_EMSG emsg, DB_BUF_TYPE type, DB_PG_NUM pg_num)
{
	DB_BUF **chain, *buf;

	/* search the hash chain for the page */
	chain = DB_BUF_HASH(pg_num);
	for (buf = *chain; buf != 0; buf = buf->fwd) {
		if (buf->type == type
		    && buf->pg_num == pg_num)
			break;
	}

	if (buf == 0) {
		/* the page is not on the chain, so claim a buffer for it */
		buf = get_free_buf(emsg, chain);
		if (buf == 0)
			return 0;
		buf->type = type;
		buf->pg_num = pg_num;
	}

	/* move the buffer to the newest end of the MRU chain */
	if (buf != buf_newest) {
		buf->newer->older = buf->older;
		if (buf->older != 0)
			buf->older->newer = buf->newer;
		else
			buf_oldest = buf->newer;

		buf->newer = 0;
		buf->older = buf_newest;
		buf_newest->newer = buf;
		buf_newest = buf;
	}

	if (!db_rdonly)
		buf->flags |= DB_BUF_FG_DIRTY;
	return buf;
}



/* Point a state block at the buffer for a page, mapping the page if needed.
 *	Any buffer previously held by the state block is released unless it
 *	is already the wanted one.  The new buffer is locked by incrementing
 *	its lock count.  st->d.p is cleared; the caller sets the real pointer.
 *	Returns the buffer or 0 after an error. */
PSTATIC DB_BUF *
find_st_buf(DCC_EMSG emsg, DB_BUF_TYPE type, DB_STATE *st, DB_PG_NUM pg_num)
{
	DB_BUF *b;

	/* release previous buffer unless it is the right one */
	b = st->b;
	if (b) {
		if (b->pg_num == pg_num
		    && b->type == type)
			return b;	/* already have the target buffer */

		st->b = 0;
		st->d.p = 0;
		if (--b->lock_cnt == 0) {
			/* the last lock is gone; unmap the page at once
			 * unless the database is locked */
			if (!DB_IS_LOCKED()) {
				if (!buf_munmap(emsg, b))
					return 0;
			}
		} else if (b->lock_cnt < 0) {
			dcc_logbad(EX_SOFTWARE,
				   "negative database buffer lock");
		}
	}

	/* look for the buffer */
	b = find_buf(emsg, type, pg_num);
	if (!b)
		return 0;
	if (!b->buf.v) {
		/* fill it if it did not exist */
		if (!buf_mmap(emsg, b, pg_num)) {
			/* Return the buffer to the free state.  Clear the
			 * flags as buf_munmap() does, because find_buf() may
			 * have set DB_BUF_FG_DIRTY and buf_mmap() may have
			 * set DB_BUF_FG_NO_MMAP before failing, and neither
			 * describes the next page put in this buffer. */
			b->type = DB_BUF_TYPE_FREE;
			b->pg_num = -1;
			b->flags = 0;
			return 0;
		}
		if (type == DB_BUF_TYPE_DB)
			++db_stats.db_mmaps;
		else
			++db_stats.hash_mmaps;
	}
	++b->lock_cnt;
	st->b = b;
	st->d.p = 0;
	return b;
}



/* Push a dirty buffer toward the disk.
 *	Clean buffers are left alone.  While db_invalidate is set, the
 *	dirty marks are simply dropped and nothing is written.
 *	DB_BUF_FG_NO_MMAP buffers are written to their file with write().
 *	Other buffers are given to msync() when they are marked
 *	DB_BUF_FG_MSYNC or DB_IS_TIME() says the msync timer has expired
 *	(not compiled if HAVE_OLD_MSYNC is defined).
 *	Returns 1 on success, or 0 after putting a message in emsg and
 *	setting db_failed. */
PSTATIC u_char
buf_msync(DCC_EMSG emsg, DB_BUF *b)
{
#ifdef MADV_FREE
	static u_char madvise_ok = 1;
#endif
	off_t offset;
	char *p;
	int fd, i;

	if (!(b->flags & DB_BUF_FG_DIRTY))
		return 1;


	if (db_invalidate) {
		b->flags &= ~(DB_BUF_FG_MSYNC | DB_BUF_FG_DIRTY);
		return 1;
	}

	if (b->flags & DB_BUF_FG_NO_MMAP) {
		if (b->flags & DB_BUF_FG_ANON) {
			p = b->buf.v;
		} else {
			/* at least FreeBSD fails writing to the file that
			 * underlies a mmap() region from that region,
			 * so copy the page to a bounce buffer first */
			static char *wbuf;
			static u_int wbuf_len;

			if (wbuf_len != db_page_size || !wbuf) {
				/* Forget the old size until the new buffer
				 * exists so that a failed allocation is
				 * retried instead of leaving a null wbuf
				 * that looks valid. */
				free(wbuf);
				wbuf_len = 0;
				wbuf = malloc(db_page_size);
				if (!wbuf) {
					dcc_pemsg(EX_IOERR, emsg,
						  "malloc(%#x) for %s: %s",
						  db_page_size, B2PATH(b),
						  ERROR_STR());
					db_failed = 1;
					return 0;
				}
				wbuf_len = db_page_size;
			}
			p = wbuf;
			memcpy(p, b->buf.v, db_page_size);
		}

		fd = (b->type == DB_BUF_TYPE_DB) ? db_fd : db_hash_fd;
		offset = (off_t)b->pg_num * (off_t)db_page_size;
		if (offset != lseek(fd, offset, SEEK_SET)) {
			dcc_pemsg(EX_IOERR, emsg, "lseek(%s,"OFF_HPAT"): %s",
				  B2PATH(b), offset, ERROR_STR());
			db_failed = 1;
			return 0;
		}
		i = write(fd, p, db_page_size);
		if (i != (int)db_page_size) {
			dcc_pemsg(EX_IOERR, emsg, "write(%s)=%d: %s",
				  B2PATH(b), i, ERROR_STR());
			db_failed = 1;
			return 0;
		}
#ifdef MADV_FREE
		/* stop trying MADV_FREE after it fails once */
		if (madvise_ok
		    && 0 > madvise(b->buf.v, db_page_size, MADV_FREE)) {
			if (db_debug)
				dcc_trace_msg("madvise(MADV_FREE): %s",
					      ERROR_STR());
			madvise_ok = 0;
		}
#endif

#ifndef HAVE_OLD_MSYNC
	} else {
		if ((b->flags & DB_BUF_FG_MSYNC)
		    || DB_IS_TIME(msync_time, MSYNC_TIME)) {
			msync_time = db_time.tv_sec + MSYNC_TIME;
			if (0 > MSYNC(b->buf.v, db_page_size, MS_ASYNC)) {
				dcc_pemsg(EX_IOERR, emsg,
					  "msync(db buffer %#x,%#x): %s",
					  (int)b->buf.v, db_page_size,
					  ERROR_STR());
				db_failed = 1;
				return 0;
			}
		}
#endif
	}

	b->flags &= ~(DB_BUF_FG_MSYNC | DB_BUF_FG_DIRTY);
	return 1;
}



/* Flush and unmap the page in a buffer and mark the buffer free.
 *	The buffer is always left free, even after a failure.
 *	Returns 1 if both the flush and the munmap() worked, and 0 otherwise
 *	with db_failed set. */
PSTATIC u_char
buf_munmap(DCC_EMSG emsg, DB_BUF *b)
{
	u_char ok;

	if (b->lock_cnt != 0)
		dcc_logbad(EX_SOFTWARE, "unmapping locked DB buffer");

	ok = buf_msync(emsg, b);
	if (!ok) {
		/* keep the first error message by not passing the
		 * message buffer to anything that follows */
		emsg = 0;
		db_failed = 1;
	}

	if (munmap(b->buf.v, db_page_size) < 0) {
		dcc_pemsg(EX_IOERR, emsg, "munmap(%s,%d): %s",
			  B2PATH(b), db_page_size, ERROR_STR());
		db_failed = 1;
		ok = 0;
	}

	/* the buffer no longer describes any page */
	b->buf.v = 0;
	b->flags = 0;
	b->type = DB_BUF_TYPE_FREE;
	b->pg_num = -1;

	return ok;
}



/* Map page pg_num of the file selected by b->type into buffer b.
 *	When db_no_mmap is set, hash table pages and database pages at or
 *	past db_end_pg_num get a private mapping and are flagged
 *	DB_BUF_FG_NO_MMAP; buf_msync() pushes such buffers to the file
 *	with write().  All other pages get a shared mapping.
 *	Returns 1 with b->buf.v set, or 0 after a failed mmap(). */
PSTATIC u_char
buf_mmap(DCC_EMSG emsg, DB_BUF *b, DB_PG_NUM pg_num)
{
#ifndef HAVE_OLD_MSYNC
#ifdef MADV_RANDOM
	static u_char madv_random_ok = 1;
#endif
#ifdef MADV_WILLNEED
	static u_char madv_willneed_ok = 1;
#endif
#endif
	int map_flags, prot, fd;
	off_t offset;
	void *addr;


	/* start with an ordinary shared mapping */
#ifdef MAP_NOSYNC
	map_flags = (MAP_SHARED | MAP_NOSYNC);
#else
	map_flags = MAP_SHARED;
#endif
	if (db_no_mmap
	    && (b->type == DB_BUF_TYPE_HASH
		|| pg_num >= db_end_pg_num)) {
		/* Read and write entire buffers ourselves instead of
		 * letting the virtual memory system write dirty pages
		 * of the mapping. */
		b->flags |= DB_BUF_FG_NO_MMAP;
		map_flags = MAP_PRIVATE;
	}

	prot = db_rdonly ? PROT_READ : (PROT_READ | PROT_WRITE);
	fd = (b->type == DB_BUF_TYPE_DB) ? db_fd : db_hash_fd;
	offset = (off_t)pg_num * (off_t)db_page_size;

	addr = mmap(0, db_page_size, prot, map_flags, fd, offset);
	if (addr == MAP_FAILED) {
		dcc_pemsg(EX_IOERR, emsg, "mmap(%s,%#x,"OFF_HPAT"): %s",
			  B2PATH(b), db_page_size, offset, ERROR_STR());
		return 0;
	}

#ifndef HAVE_OLD_MSYNC
	if ((b->flags & DB_BUF_FG_NO_MMAP)
	    || (DB_PTR)(db_fsize+hash_fsize) < db_max_rss)  {
#ifdef MADV_WILLNEED
		/* Ask the kernel to bring in the whole buffer when both
		 * files together are smaller than db_max_rss or when we
		 * do the I/O ourselves.  Stop asking after one failure. */
		if (madv_willneed_ok
		    && 0 > madvise(addr, db_page_size, MADV_WILLNEED)) {
			if (db_debug)
				dcc_trace_msg("madvise(MADV_WILLNEED): %s",
					      ERROR_STR());
			madv_willneed_ok = 0;
		}
#endif

	} else if ((DB_PTR)db_fsize >= db_max_rss) {
#ifdef MADV_RANDOM
		/* The database file alone is at least db_max_rss, so ask
		 * the kernel to expect random references in the buffer.
		 * Stop asking after one failure. */
		if (madv_random_ok
		    && 0 > madvise(addr, db_page_size, MADV_RANDOM)) {
			if (db_debug)
				dcc_trace_msg("madvise(MADV_RANDOM): %s",
					      ERROR_STR());
			madv_random_ok = 0;
		}
#endif
	}
#endif /* !HAVE_OLD_MSYNC */

	b->buf.v = addr;
	return 1;
}



/* Make a hash table entry addressable
 *	On success st->s.haddr is the address of the entry and st->d.h
 *	points to it within its mapped page.
 *	Returns 0 for an address past the end of the hash table or if the
 *	page could not be obtained. */
PSTATIC u_char
map_hash(DCC_EMSG emsg,
	 DB_HADDR haddr,		/* this entry */
	 DB_STATE *st)			/* point this to the entry */
{
	DB_PG_NUM page;
	DB_PG_OFF slot;
	DB_BUF *buf;

	if (haddr >= db_hash_len) {
		dcc_pemsg(EX_DATAERR, emsg, "invalid hash address %#x",
			  haddr);
		return 0;
	}

	/* split the address into a page and an entry within the page */
	page = haddr / db_hash_page_len;
	slot = haddr % db_hash_page_len;

	buf = find_st_buf(emsg, DB_BUF_TYPE_HASH, st, page);
	if (buf == 0)
		return 0;

	st->s.haddr = haddr;
	st->d.h = &buf->buf.h[slot];
	return 1;
}



/* Take a hash table entry off the free list
 *	The entry's neighbors on the free list are pointed at each other,
 *	the entry's own links are set to DB_HADDR_NULL, and db_hash_used
 *	is incremented.
 *	Returns 0 if the entry or its neighbors do not look like a sound
 *	free list or a page could not be mapped. */
PSTATIC u_char
unlink_free_hash(DCC_EMSG emsg,
		 DB_STATE *hash_st,	/* remove this from the free list */
		 DB_STATE *tmp_st)
{
	DB_HADDR next_free, prev_free;

	if (!db_make_dirty(emsg))
		return 0;

	/* the entry must be free and both of its links must be either
	 * valid addresses or the free list head */
	next_free = DB_HADDR_EX(hash_st->d.h->fwd);
	prev_free = DB_HADDR_EX(hash_st->d.h->bak);
	if (!HE_IS_FREE(hash_st->d.h)
	    || (DB_HADDR_INVALID(next_free) && next_free != DB_HADDR_FREE)
	    || (DB_HADDR_INVALID(prev_free) && prev_free != DB_HADDR_FREE)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "bad hash free list entry at %#x", hash_st->s.haddr);
		return 0;
	}

	/* the following entry must point back to us before it is changed
	 * to point back to our predecessor */
	if (!map_hash(emsg, next_free, tmp_st))
		return 0;
	if (DB_HADDR_EX(tmp_st->d.h->bak) != hash_st->s.haddr) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "free %#x --> bad-free %#x", hash_st->s.haddr,
			  next_free);
		return 0;
	}
	DB_HADDR_CP(tmp_st->d.h->bak, prev_free);

	/* likewise the preceding entry must point forward to us before
	 * it is changed to point to our successor */
	if (!map_hash(emsg, prev_free, tmp_st))
		return 0;
	if (DB_HADDR_EX(tmp_st->d.h->fwd) != hash_st->s.haddr) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "bad free %#x <-- free %#x", prev_free,
			  hash_st->s.haddr);
		return 0;
	}
	DB_HADDR_CP(tmp_st->d.h->fwd, next_free);

	/* the entry is now on no list */
	DB_HADDR_CP(hash_st->d.h->fwd, DB_HADDR_NULL);
	DB_HADDR_CP(hash_st->d.h->bak, DB_HADDR_NULL);
	++db_hash_used;
	return 1;
}



/* Get a free hash table entry, unlink it from the free list,
 *	and leave db_sts.free pointing to it.
 *	Up to 50 entries on the same hash table page as the target are
 *	examined before the head of the free list is used. */
PSTATIC u_char				/* 0=failed, 1=got it */
get_free_hash(DCC_EMSG emsg,
	      DB_HADDR result)		/* try near here */
{
	DB_HADDR last_on_pg;
	int tries;

	if (db_hash_used >= db_hash_len) {
		dcc_pemsg(EX_SOFTWARE, emsg, "no free hash table entry;"
			  " %d of %d used", db_hash_used, db_hash_len);
		return 0;
	}

	/* Look near the target first.
	 * Going to another page is expensive enough to justify
	 * spending plenty of time here. */
	if (result != DB_HADDR_NULL) {
		last_on_pg = (result - (result % db_hash_page_len)
			      + db_hash_page_len-1);
		tries = 50;
		while (tries-- > 0) {
			if (!map_hash(emsg, result, &db_sts.free))
				return 0;
			if (HE_IS_FREE(db_sts.free.d.h))
				return unlink_free_hash(emsg, &db_sts.free,
							&db_sts.tmp);
			/* step to the next entry, moving back toward the
			 * start of the page on reaching the limit */
			++result;
			if (result >= last_on_pg)
				result -= db_hash_page_len-1-DB_HADDR_MIN;
		}
	}

	/* nothing nearby, so take the first entry on the free list */
	if (!map_hash(emsg, DB_HADDR_FREE, &db_sts.free))
		return 0;
	result = DB_HADDR_EX(db_sts.free.d.h->fwd);
	if (DB_HADDR_INVALID(result)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "broken hash free list head of %#x", result);
		return 0;
	}
	if (!map_hash(emsg, result, &db_sts.free))
		return 0;
	return unlink_free_hash(emsg, &db_sts.free, &db_sts.tmp);
}



/* Make part of the database file addressable
 *	The target must lie within db_fsize and within a single page,
 *	because no database entry is allowed to span buffers.
 *	We assume there are enough buffers to accommodate all possible
 *	concurrent requests.
 *	On success st->s.rptr is the address and st->d.r points to the
 *	target within its mapped page.  Bad addresses set db_failed. */
PSTATIC u_char
map_db(DCC_EMSG emsg,
       DB_PTR rptr,			/* address of the record */
       u_int tgt_len,			/* its length */
       DB_STATE *st)			/* point this to the record */
{
	DB_PG_NUM page;
	DB_PG_OFF start;
	DB_BUF *buf;

	/* the target must not run off the end of the file */
	if (rptr+tgt_len > (DB_PTR)db_fsize) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "invalid database address %#llx or length %d"
			  " past db_fsize "OFF_HPAT" in %s",
			  rptr, tgt_len, db_fsize, DCC_NM2PATH(db_nm));
		db_failed = 1;
		return 0;
	}

	page = rptr / db_page_size;
	start = rptr % db_page_size;

	/* nor may it run off the end of its page */
	if (tgt_len+start > db_page_size) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "invalid database address %#llx or length %#x in %s",
			  rptr, tgt_len, DCC_NM2PATH(db_nm));
		db_failed = 1;
		return 0;
	}

	buf = find_st_buf(emsg, DB_BUF_TYPE_DB, st, page);
	if (buf == 0)
		return 0;

	st->s.rptr = rptr;
	st->d.r = (DB_RCD *)&buf->buf.c[start];
	return 1;
}



/* Make a database record addressable and compute its length
 *	The fixed DB_RCD_PAD bytes at the start of the record are mapped
 *	first.  The record length is then computed from its checksum
 *	count and must not reach past the end of the buffer. */
u_char					/* 0=failed, 1=got it */
db_map_rcd(DCC_EMSG emsg,
	   DB_STATE *rcd_st,		/* point this to the record */
	   DB_PTR rptr,			/* that is here */
	   u_int *rcd_lenp)		/* put its length here */
{
	u_int len;

	if (DB_PTR_IS_BAD(rptr)) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "getting bogus record at %#llx, in %s",
			  rptr, DCC_NM2PATH(db_nm));
		return 0;
	}

	if (!map_db(emsg, rptr, DB_RCD_PAD, rcd_st))
		return 0;

	/* the record's own count says how long it is */
	len = (DB_RCD_PAD
	       + (sizeof(rcd_st->d.r->cks[0]) * DB_NUM_CKS(rcd_st->d.r)));

	if (&rcd_st->d.c[len] > &rcd_st->b->buf.c[db_page_size]) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "invalid checksum count %d at %#llx in %s",
			  DB_NUM_CKS(rcd_st->d.r), rptr, DCC_NM2PATH(db_nm));
		return 0;
	}

	/* the caller need not want the length */
	if (rcd_lenp != 0)
		*rcd_lenp = len;
	return 1;
}



/* Store the current db_csize in the control entry of the hash table
 *	and in the header of the database file.
 *	Each file is touched only if it is open and the value last stored
 *	in it differs from db_csize.  Both files are tried even if the
 *	first fails.
 *	Returns 1 if nothing failed. */
PSTATIC u_char
db_flush_len(DCC_EMSG emsg)
{
	u_char ok = 1;

	/* the copy kept in the hash table */
	if (db_hash_fd != -1
	    && db_csize_stored_hash != db_csize) {
		if (map_hash(emsg, DB_HADDR_SIZES, &db_sts.hash_ctl)) {
			DB_HPTR_CP(db_sts.hash_ctl.d.h->HASH_STORE_DB_CSIZE,
				   db_csize);
			db_csize_stored_hash = db_csize;
		} else {
			ok = 0;
		}
	}

	/* the copy kept in the database header, which is also marked
	 * to be sent to the disk */
	if (db_fd != -1
	    && db_csize_stored_db != db_csize) {
		if (map_db(emsg, 0, sizeof(DB_MAGIC), &db_sts.rcd_magic)) {
			db_sts.rcd_magic.d.magic->s.db_csize = db_csize;
			db_csize_stored_db = db_csize;
			db_sts.rcd_magic.b->flags |= DB_BUF_FG_MSYNC;
		} else {
			ok = 0;
		}
	}

	return ok;
}



/* Write the database parameters into the magic number headers of the files
 *	The sizes are flushed first.  The header of the database file is
 *	rewritten only if db_nokeep_cks, db_ex_secs, db_flod_tholds, or
 *	db_sn differs from the copy remembered as stored.
 *	Returns 0 only if a needed page could not be mapped. */
u_char
db_flush_magic(DCC_EMSG emsg)
{
	DB_STATE *st;

	if (!db_flush_len(emsg))
		return 0;

	if (db_fd == -1)
		return 1;

	/* nothing to do if the header already has all of the parameters */
	if (db_nokeep_cks == db_nokeep_cks_stored
	    && !memcmp(&db_ex_secs, &db_ex_secs_stored,
		       sizeof(db_ex_secs))
	    && !memcmp(db_flod_tholds, db_flod_tholds_stored,
		       sizeof(db_flod_tholds))
	    && !memcmp(db_sn, db_sn_stored, sizeof(db_sn)))
		return 1;

	st = &db_sts.rcd_magic;
	if (!map_db(emsg, 0, sizeof(DB_MAGIC), st))
		return 0;

	/* copy the current values into the header */
	st->d.magic->s.page_size = db_page_size;
	memcpy(&st->d.magic->s.ex_secs, &db_ex_secs,
	       sizeof(st->d.magic->s.ex_secs));
	st->d.magic->s.nokeep_cks = db_nokeep_cks;
	memcpy(st->d.magic->s.flod_tholds, db_flod_tholds,
	       sizeof(st->d.magic->s.flod_tholds));
	memcpy(st->d.magic->s.sn, db_sn,
	       sizeof(st->d.magic->s.sn));

	/* remember what the header now contains */
	memcpy(&db_ex_secs_stored, &db_ex_secs,
	       sizeof(db_ex_secs_stored));
	db_nokeep_cks_stored = db_nokeep_cks;
	memcpy(db_flod_tholds_stored, db_flod_tholds,
	       sizeof(db_flod_tholds_stored));
	memcpy(db_sn_stored, db_sn,
	       sizeof(db_sn_stored));

	/* send the header to the disk */
	st->b->flags |= DB_BUF_FG_MSYNC;

	return 1;
}



/* Map a record and find the checksum of a given type within it
 *	The checksum is expected to be present, so its absence is
 *	reported as a data error just like a bad record. */
DB_RCD_CK *				/* 0=it's not there */
db_map_rcd_ck(DCC_EMSG emsg,
	      DB_STATE *rcd_st,		/* point this to the record */
	      DB_PTR rptr,		/* that is here */
	      DCC_CK_TYPES type)	/* find this type of checksum */
{
	DB_RCD_CK *ck;
	int left;

	if (!db_map_rcd(emsg, rcd_st, rptr, 0))
		return 0;

	left = DB_NUM_CKS(rcd_st->d.r);
	if (left > DCC_NUM_CKS) {
		dcc_pemsg(EX_DATAERR, emsg,
			  "impossible %d checksums in %#llx in %s",
			  left, rptr, DCC_NM2PATH(db_nm));
		return 0;
	}

	/* scan the checksums in the record for the right type */
	ck = rcd_st->d.r->cks;
	while (left != 0) {
		if (DB_CK_TYPE(ck) == type)
			return ck;
		++ck;
		--left;
	}

	dcc_pemsg(EX_DATAERR, emsg, "missing \"%s\" checksum in %#llx in %s",
		  dcc_type2str_err(type, 0, 1),
		  rptr, DCC_NM2PATH(db_nm));
	return 0;
}



/* Compute the home hash table address of a checksum
 *	The checksum type and the first 16 bytes of the checksum taken as
 *	four big-endian 32-bit words are added and given to mhash() with
 *	the current hash table length.  Results below DB_HADDR_MIN are
 *	raised to DB_HADDR_MIN. */
DB_HADDR
db_hash(DCC_CK_TYPES type, const DCC_SUM sum)
{
	u_long accum;
	DB_HADDR haddr;
	u_int word;
	int i;

	accum = type;
	for (i = 0; i < 16; i += 4) {
		/* Assemble each word with unsigned shifts, because shifting
		 * a byte of 0x80 or more left by 24 bits as an int is
		 * undefined. */
		word = (((u_int)sum[i] << 24) | ((u_int)sum[i+1] << 16)
			| ((u_int)sum[i+2] << 8) | (u_int)sum[i+3]);
		/* Add the word as an int so that a u_long wider than int
		 * sees the same sign-extended value that the signed
		 * arithmetic formerly used here produced and computed
		 * addresses do not change. */
		accum += (int)word;
	}
	haddr = mhash(accum, db_hash_len);
	if (haddr < DB_HADDR_MIN)
		haddr = DB_HADDR_MIN;
	return haddr;
}



/* Look for a checksum in the hash table
 *	Returns
 *	    DB_FOUND_LATER if the home slot is outside the window lo...hi,
 *	    DB_FOUND_EMPTY with hash_st at the free home slot,
 *	    DB_FOUND_INTRUDER with hash_st at a home slot used by an entry
 *		that is in the middle of some other chain,
 *	    DB_FOUND_IT with hash_st at the entry, rcd_st at the record,
 *		and *prcd_ck at the checksum in the record,
 *	    DB_FOUND_CHAIN with hash_st at the last entry on the
 *		collision chain,
 *	    or DB_FOUND_SYSERR after an error. */
DB_FOUND
db_lookup(DCC_EMSG emsg, DCC_CK_TYPES type, const DCC_SUM sum,
	  DB_HADDR lo,			/* postpone if out of this window */
	  DB_HADDR hi,
	  DB_STATE *hash_st,		/* hash block for record or related */
	  DB_STATE *rcd_st,		/* put the record or garbage here */
	  DB_RCD_CK **prcd_ck)		/* point to cksum if found */
{
	DB_HADDR cur, next;
	DB_PTR rptr;
	DB_RCD_CK *ck;
	int budget;

	cur = db_hash(type, sum);
	if (cur < lo || cur > hi) {
		/* outside a restricted window only means not yet */
		if (lo != 0 || hi != MAX_HASH_ENTRIES)
			return DB_FOUND_LATER;
		dcc_pemsg(EX_DATAERR, emsg,
			  "out of range hash address");
		return DB_FOUND_SYSERR;
	}

	if (prcd_ck != 0)
		*prcd_ck = 0;

	if (!map_hash(emsg, cur, hash_st))
		return DB_FOUND_SYSERR;

	if (HE_IS_FREE(hash_st->d.h))
		return DB_FOUND_EMPTY;

	/* an entry with a predecessor is not the start of a chain and
	 * so is not in its home slot */
	if (!DB_HADDR_C_NULL(hash_st->d.h->bak))
		return DB_FOUND_INTRUDER;

	/* The entry is in its home slot.  Follow its chain, giving up
	 * after more steps than there are hash table entries. */
	budget = db_hash_len;
	while (budget >= 0) {
		if (HE_CMP(hash_st->d.h, type, sum)) {
			/* The entry could be for our checksum.  Only the
			 * record can say whether it really is or this is
			 * a hash collision. */
			rptr = DB_HPTR_EX(hash_st->d.h->rcd);
			ck = db_map_rcd_ck(emsg, rcd_st, rptr, type);
			if (ck == 0)
				return DB_FOUND_SYSERR;
			if (memcmp(sum, ck->sum, sizeof(DCC_SUM)) == 0) {
				if (prcd_ck != 0)
					*prcd_ck = ck;
				return DB_FOUND_IT;
			}
		}

		/* Not our checksum.  Stop at the end of the chain. */
		next = DB_HADDR_EX(hash_st->d.h->fwd);
		if (next == DB_HADDR_NULL)
			return DB_FOUND_CHAIN;

		if (DB_HADDR_INVALID(next)) {
			dcc_pemsg(EX_DATAERR, emsg,
				  "broken hash chain fwd-link %#x at %#x in %s",
				   next, cur, DCC_NM2PATH(db_hash_nm));
			return DB_FOUND_SYSERR;
		}

		if (!map_hash(emsg, next, hash_st))
			return DB_FOUND_SYSERR;

		/* the next entry must point back to where we came from */
		if (DB_HADDR_EX(hash_st->d.h->bak) != cur) {
			dcc_pemsg(EX_DATAERR, emsg,
				  "broken hash chain back-link"
				  " %#x<--%#x instead of %#x<--%#x in %s",
				  DB_HADDR_EX(hash_st->d.h->bak), next,
				  cur, next, DCC_NM2PATH(db_hash_nm));
			return DB_FOUND_SYSERR;
		}
		cur = next;
		--budget;
	}

	dcc_pemsg(EX_DATAERR, emsg, "infinite hash chain at %#x in %s",
		  cur, DCC_NM2PATH(db_hash_nm));
	return DB_FOUND_SYSERR;
}



/* Combine target counts
 *	This arithmetic must be commutative (after handling deleted
 *	values), because inter-server flooding causes records to appear
 *	in the database out of temporal order.
 *
 *	DCC_TGTS_TOO_MANY acts as a count of plus infinity,
 *	DCC_TGTS_OK as minus infinity, and DCC_TGTS_OK2 as half of minus
 *	infinity.  Either kind of minus infinity beats plus infinity,
 *	and DCC_TGTS_OK beats DCC_TGTS_OK2.
 *
 *	Claims of not-spam from all clients are discarded as they arrive
 *	and before here.  They can only come from the local white list. */
DCC_TGTS
db_sum_ck(DCC_TGTS prev,		/* previous sum */
	  DCC_TGTS new)			/* new value */
{
	DCC_TGTS total;

	if (prev == DCC_TGTS_OK || new == DCC_TGTS_OK)
		return DCC_TGTS_OK;

	if (prev == DCC_TGTS_OK2 || new == DCC_TGTS_OK2)
		return DCC_TGTS_OK2;

	/* a delete request adds nothing */
	if (new == DCC_TGTS_DEL)
		return prev;

	if (prev == DCC_TGTS_TOO_MANY || new == DCC_TGTS_TOO_MANY)
		return DCC_TGTS_TOO_MANY;

	/* ordinary counts saturate at plus infinity */
	total = prev+new;
	if (total >= DCC_TGTS_TOO_MANY)
		return DCC_TGTS_TOO_MANY;
	return total;
}



/* Apply a delete request to the predecessors of a checksum
 *	Walk the chain of older records for the checksum starting with
 *	the record in prev_st.  Reports older than the delete request,
 *	except white list entries, have their record and checksum target
 *	counts set to 0.  The target counts of the records that survive
 *	are summed into *res. */
static u_char				/* 1=done, 0=broken database */
del_ck(DCC_EMSG emsg,
       DCC_TGTS *res,			/* residual targets after deletion */
       const DB_RCD *new,		/* delete reports older than this one */
       DCC_CK_TYPES type,		/* delete this type of checksum */
       DB_RCD_CK *prev_ck,		/* starting with this one */
       DB_STATE *prev_st)		/* use this scratch state block */
{
	DB_PTR older;

	*res = 0;
	do {
		if (!DCC_TS_NEWER_TS(new->ts, prev_st->d.r->ts)
		    || DB_RCD_ID(prev_st->d.r) == DCC_ID_WHITE) {
			/* not deleted, so it counts toward the residue */
			*res = db_sum_ck(*res, DB_TGTS_RCD(prev_st->d.r));

		} else {
			/* older than the delete request */
			DB_TGTS_RCD_SET(prev_st->d.r, 0);
			DB_TGTS_CK_SET(prev_ck, 0);
		}

		/* finished at the oldest record */
		older = DB_PTR_EX(prev_ck->prev);
		if (older == DB_PTR_NULL)
			return 1;

		prev_ck = db_map_rcd_ck(emsg, prev_st, older, type);
	} while (prev_ck != 0);

	return 0;
}



/* Mark reports made obsolete by a spam report
 *	A new report of spam make sufficiently old reports obsolete.
 *	Sufficiently recent existing reports make a new report obsolete,
 *	or at least not worth spending bandwidth to flood.
 *
 *	On entry prev_st must point to the record containing prev_ck.
 *	Older records are mapped through prev_st as the chain of
 *	predecessors is followed. */
PSTATIC u_char				/* 1=done, 0=broken database */
db_obs_ck(DCC_EMSG emsg,
	  const DB_RCD *new,
	  DB_RCD_CK *new_ck,
	  DCC_CK_TYPES type,		/* check this type of checksum */
	  DB_RCD_CK *prev_ck,		/* starting with this one */
	  DCC_TGTS prev_ck_tgts,
	  DB_STATE *prev_st)		/* use this scratch state block */
{
	struct timeval tv;
	time_t secs;
	DCC_TS ts;
	int limit;
	DB_PTR prev;

	/* Compute the timestamp that is the smaller of the expiration
	 * for this type of checksum and DCC_NEW_SPAM_SECS seconds
	 * before the new report */
	secs = db_ex_secs[type].all;
	if (secs > DCC_NEW_SPAM_SECS)
		secs = DCC_NEW_SPAM_SECS;
	dcc_ts2timeval(&tv, new->ts);
	dcc_timeval2ts(ts, &tv, -secs);

	limit = 100;
	for (;;) {
		/* preceding white listed entries make new entries obsolete */
		if (DB_RCD_ID(prev_st->d.r) == DCC_ID_WHITE) {
			new_ck->type_fgs |= DB_CK_FG_OBS;
			return 1;
		}

		if (DB_CK_OBS(prev_ck)) {
			/* don't look forever for recent existing report */
			if (--limit == 0)
				return 1;

		} else if (prev_ck_tgts != DCC_TGTS_TOO_MANY) {
			/* mark this predecessor obsolete if it
			 * was before the checksum became spam */
			prev_ck->type_fgs |= DB_CK_FG_OBS;

		} else if (DCC_TS_OLDER_TS(prev_st->d.r->ts, &ts)) {
			/* this older predecessor is now obsolete */
			prev_ck->type_fgs |= DB_CK_FG_OBS;
			/* we're finished, because all older preceding reports
			 * were marked obsolete when it was inserted  */
			return 1;

		} else {
			/* this predecessor is recent, so it makes
			 * our new record obsolete. */
			new_ck->type_fgs |= DB_CK_FG_OBS;
			return 1;
		}

		prev = DB_PTR_EX(prev_ck->prev);
		if (prev == DB_PTR_NULL)
			return 1;	/* it is a new report of spam */

		/* Map the predecessor with the caller's state block,
		 * because the tests at the top of the loop look at the
		 * record in prev_st. */
		prev_ck = db_map_rcd_ck(emsg, prev_st, prev, type);
		if (!prev_ck)
			return 0;
		prev_ck_tgts = DB_TGTS_CK(prev_ck);
	}
}



/* Mark extra server-ID declarations obsolete
 *	Look through the predecessors of a server-ID checksum, starting
 *	with the record in prev_st, for a declaration from the same
 *	source as the new record.  Of the new declaration and the first
 *	such predecessor, the one with the older timestamp is marked
 *	obsolete. */
static u_char				/* 1=done, 0=broken database */
srvr_id_ck(DCC_EMSG emsg,
	   const DB_RCD *new,
	   DB_RCD_CK *new_ck,
	   DB_RCD_CK *prev_ck,		/* starting with this one */
	   DB_STATE *prev_st)		/* use this scratch state block */
{
	DB_PTR prev;

	for (;;) {
		if (DB_RCD_ID(prev_st->d.r) == DB_RCD_ID(new)) {
			/* Keep newest server-ID declaration.
			 * Compare the timestamps of the two records
			 * instead of handing the address of the whole
			 * predecessor to the timestamp comparison. */
			if (DCC_TS_NEWER_TS(prev_st->d.r->ts, new->ts))
				new_ck->type_fgs |= DB_CK_FG_OBS;
			else
				prev_ck->type_fgs |= DB_CK_FG_OBS;
			return 1;
		}

		prev = DB_PTR_EX(prev_ck->prev);
		if (prev == DB_PTR_NULL)
			return 1;

		prev_ck = db_map_rcd_ck(emsg, prev_st, prev, DCC_CK_SRVR_ID);
		if (!prev_ck)
			return 0;
	}
}



/* Install pointers in the hash table for a record and fix the accumulated
 *	counts in the record pointed to by db_sts.rcd
 *
 *	Each checksum in the record is looked up with db_lookup().
 *	Checksums whose home hash address is outside lo...hi are skipped.
 *	The others are linked into the hash table, chained to the previous
 *	record with the same checksum if there is one, and given their
 *	accumulated target count.
 *	The state blocks db_sts.hash, db_sts.rcd2, db_sts.free, and
 *	db_sts.tmp are used as scratch.
 *	The file lengths are flushed with db_flush_len() at the end. */
u_char					/* 0=failed, 1=done */
db_link_rcd(DCC_EMSG emsg, DB_HADDR lo, DB_HADDR hi)
{
	DCC_TGTS res;
	DB_RCD *rcd;
	DB_RCD_CK *prev_ck;
	DB_RCD_CK *rcd_ck;
	DCC_CK_TYPES rcd_type;
	DCC_TGTS rcd_tgts, prev_ck_tgts;
	int ck_num;
	DB_HADDR haddr;

	if (!db_make_dirty(emsg))
		return 0;

	rcd = db_sts.rcd.d.r;
	rcd_tgts = DB_TGTS_RCD_RAW(rcd);
	rcd_ck = rcd->cks;
	ck_num = DB_NUM_CKS(rcd);
	if (ck_num > DIM(rcd->cks)) {
		dcc_pemsg(EX_SOFTWARE, emsg,
			  "bogus checksum count %#x at %#llx in %s",
			  rcd->fgs_num_cks, db_sts.rcd.s.rptr,
			  DCC_NM2PATH(db_nm));
		return 0;
	}
	for (; ck_num > 0; --ck_num, ++rcd_ck) {
		/* start the total with the count in the record itself,
		 * counting a delete request as 0 */
		res = rcd_tgts;
		if (res == DCC_TGTS_DEL)
			res = 0;
		/* avoid dirtying a mapped page if not necessary */
		if (rcd_ck->prev != DB_PTR_CP(DB_PTR_NULL))
			rcd_ck->prev = DB_PTR_CP(DB_PTR_NULL);

		/* do not link or total some checksums unless they
		 * are whitelist entries */
		rcd_type = DB_CK_TYPE(rcd_ck);
		if (DB_TEST_NOKEEP(db_nokeep_cks, rcd_type)
		    && DB_RCD_ID(rcd) != DCC_ID_WHITE) {
			DB_TGTS_CK_SET(rcd_ck, 1);
			continue;
		}

		if (!DCC_CK_OK_DB(rcd_type)) {
			dcc_pemsg(EX_SOFTWARE, emsg,
				  "invalid checksum type %s at %#llx in %s",
				  dcc_type2str_err(rcd_type, 0, 1),
				  db_sts.rcd.s.rptr, DCC_NM2PATH(db_nm));
			return 0;
		}

		switch (db_lookup(emsg, rcd_type, rcd_ck->sum, lo, hi,
				  &db_sts.hash, &db_sts.rcd2, &prev_ck)) {
		case DB_FOUND_SYSERR:
			return 0;

		case DB_FOUND_LATER:
			/* the home slot is outside the window lo...hi,
			 * so leave this checksum unlinked */
			continue;

		case DB_FOUND_IT:
			/* We found the checksum
			 * Update the hash table to point to the new record */
			DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
			/* and chain the new record to the old one */
			rcd_ck->prev = DB_PTR_CP(db_sts.rcd2.s.rptr);
			if (rcd_tgts == DCC_TGTS_DEL) {
				/* delete predecessors to a delete request
				 * and compute the remaining sum */
				if (!del_ck(emsg, &res, rcd, rcd_type,
					    prev_ck, &db_sts.rcd2))
					return 0;
				/* delete requests are obsolete if the
				 * checksum is white-listed */
				if (res == DCC_TGTS_OK
				    || res == DCC_TGTS_OK2)
					rcd_ck->type_fgs |= DB_CK_FG_OBS;
			} else {
				/* Simple checksum with a predecessor
				 * This does not do the substantial extra work
				 * to notice delete requests that arrived early.
				 * That problem is handled by the incoming
				 * flooding duplicate report detection
				 * mechanism. */
				prev_ck_tgts = DB_TGTS_CK(prev_ck);
				/* a summary record takes the predecessor's
				 * total instead of adding to it */
				if (DB_RCD_SUMRY(rcd))
					res = prev_ck_tgts;
				else
					res = db_sum_ck(res, prev_ck_tgts);

				if (res == DCC_TGTS_OK || res == DCC_TGTS_OK2
				    || (DB_RCD_ID(db_sts.rcd2.d.r)
					== DCC_ID_WHITE)) {
					/* obsolete white-listed checksums */
					rcd_ck->type_fgs |= DB_CK_FG_OBS;

				} else if (res == DCC_TGTS_TOO_MANY
					   && !DB_CK_OBS(rcd_ck)) {
					/* suppress unneeded reports of spam */
					if (!db_obs_ck(emsg, rcd, rcd_ck,
						       rcd_type,
						       prev_ck, prev_ck_tgts,
						       &db_sts.rcd2))
					    return 0;

				} else if (rcd_type == DCC_CK_SRVR_ID) {
					/* keep only the newest server-ID
					 * declaration from each source */
					if (!srvr_id_ck(emsg, rcd, rcd_ck,
							prev_ck, &db_sts.rcd2))
					    return 0;
				}
			}
			break;

		case DB_FOUND_EMPTY:
			/* We found an empty hash table slot.
			 * Update the slot to point to our new record
			 * after removing it from the free list. */
			if (!unlink_free_hash(emsg, &db_sts.hash, &db_sts.tmp))
				return 0;
			DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
			HE_MERGE(db_sts.hash.d.h,rcd_type, rcd_ck->sum);
			break;

		case DB_FOUND_CHAIN:
			/* We found a hash collision, a chain of 1 or more
			 * records with the same hash value.
			 * Get a free slot, link it to the end of the chain,
			 * and point it to the record */
			if (!get_free_hash(emsg, db_sts.hash.s.haddr))
				return 0;
			DB_HADDR_CP(db_sts.free.d.h->bak, db_sts.hash.s.haddr);
			DB_HADDR_CP(db_sts.hash.d.h->fwd, db_sts.free.s.haddr);
			DB_HPTR_CP(db_sts.free.d.h->rcd, db_sts.rcd.s.rptr);
			HE_MERGE(db_sts.free.d.h,rcd_type, rcd_ck->sum);
			break;

		case DB_FOUND_INTRUDER:
			/* The home hash slot for our key contains an
			 * intruder.  Find a place to put it. */
			/* look for the new place near one of the
			 * neighbors of the intruder on its chain */
			haddr = DB_HADDR_EX(db_sts.hash.d.h->fwd);
			if (haddr == DB_HADDR_NULL)
				haddr = DB_HADDR_EX(db_sts.hash.d.h->bak);
			if (!get_free_hash(emsg, haddr))
				return 0;
			/* Move the intruder */
			*db_sts.free.d.h = *db_sts.hash.d.h;
			/* re-link the neighbors of the intruder */
			haddr = DB_HADDR_EX(db_sts.free.d.h->bak);
			/* an intruder always has a predecessor */
			if (haddr == DB_HADDR_NULL) {
				dcc_pemsg(EX_DATAERR, emsg,
					  "bad hash chain reverse link at %#x"
					  " in %s",
					  haddr, DCC_NM2PATH(db_hash_nm));
				return 0;
			}
			if (!map_hash(emsg, haddr, &db_sts.tmp))
				return 0;
			DB_HADDR_CP(db_sts.tmp.d.h->fwd, db_sts.free.s.haddr);
			haddr = DB_HADDR_EX(db_sts.hash.d.h->fwd);
			if (haddr != DB_HADDR_NULL) {
				if (!map_hash(emsg, haddr, &db_sts.tmp))
					return 0;
				DB_HADDR_CP(db_sts.tmp.d.h->bak,
					    db_sts.free.s.haddr);
			}
			/* install the new entry in its home slot */
			DB_HADDR_CP(db_sts.hash.d.h->fwd, DB_HADDR_NULL);
			DB_HADDR_CP(db_sts.hash.d.h->bak, DB_HADDR_NULL);
			DB_HPTR_CP(db_sts.hash.d.h->rcd, db_sts.rcd.s.rptr);
			HE_MERGE(db_sts.hash.d.h,rcd_type, rcd_ck->sum);
			break;
		}

		/* Fix the checksum in the report.  Try not to write
		 * in the buffer unless necessary to speed up dbclean */
		if (DB_TGTS_CK(rcd_ck) != res)
			DB_TGTS_CK_SET(rcd_ck, res);
	}

	return db_flush_len(emsg);
}



/* Append a record to the database file and link it into the hash table
 *	The record must be known to be valid.
 *	A record that would end on a later page than the current end of
 *	the data is instead placed at the start of that page after the
 *	file has been extended to the end of the page.
 *	Returns the address of the new record. */
DB_PTR					/* 0=failed */
db_add_rcd(DCC_EMSG emsg, DB_RCD *new_rcd)
{
	u_int rcd_len, fill;
	DB_PTR pos, end, end_pg, fsize;

	if (!db_make_dirty(emsg))
		return 0;

	/* only the checksums actually present are stored */
	rcd_len = (sizeof(*new_rcd)
		   - sizeof(new_rcd->cks)
		   + (DB_NUM_CKS(new_rcd) * sizeof(new_rcd->cks[0])));

	/* try the record immediately after the existing data */
	pos = db_csize;
	end = pos + rcd_len;
	end_pg = end / db_page_size;

	if (end_pg != db_csize/db_page_size) {
		/* That would end on another page.
		 * Skip to the start of that page, with the skipped
		 * distance rounded up to a multiple of DB_RCD_PAD,
		 * and make the file long enough to contain the page. */
		fill = end_pg*db_page_size - db_csize;
		fill = ((fill + DB_RCD_PAD-1) / DB_RCD_PAD) * DB_RCD_PAD;
		pos = db_csize + fill;
		end = pos + rcd_len;
		fsize = (end_pg+1)*db_page_size;
		db_extended = 1;
		if (!db_extend(emsg, db_fd, db_nm, fsize, db_fsize))
			return 0;
		db_fsize = fsize;
		db_end_pg_num = end_pg;
	}

	/* Copy the record into place and mark its buffer to be sent to
	 * the disk to keep the database as good as possible even if we
	 * crash.  Later changes to the hash links are not a worry,
	 * because dbclean will rebuild them if we crash. */
	if (!map_db(emsg, pos, rcd_len, &db_sts.rcd))
		return 0;
	memcpy(db_sts.rcd.d.r, new_rcd, rcd_len);
	db_sts.rcd.b->flags |= DB_BUF_FG_MSYNC;
	db_csize = end;

	/* install pointers in the hash table
	 * and update the total counts in the record */
	if (!db_link_rcd(emsg, 0, MAX_HASH_ENTRIES))
		return 0;

	++db_stats.adds;
	return pos;
}
