/* -*-Mode: C++;-*-
 * $Id: dbfs.cc 1.29 Wed, 16 May 2001 03:33:56 +0400 jmacd $
 *
 * Copyright (C) 1999, 2000, Joshua P. MacDonald <jmacd@CS.Berkeley.EDU>
 * and The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *    Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *
 *    Neither name of The University of California nor the names of
 *    its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <errno.h>

#include "xdfs_cpp.h"
#include "edsiostdio.h"

// Shared mutex attribute objects, one requesting the recursive mutex kind and
// one requesting the error-checking kind.
// NOTE(review): brace-initializing pthread_mutexattr_t depends on the C
// library's internal layout of that type (and on the non-portable _NP
// constants); pthread_mutexattr_init + pthread_mutexattr_settype is the
// portable form -- confirm which platforms this must build on.
pthread_mutexattr_t RECURSIVE_MUTEX_ATTR  = { PTHREAD_MUTEX_RECURSIVE_NP };
pthread_mutexattr_t ERRORCHECK_MUTEX_ATTR = { PTHREAD_MUTEX_ERRORCHECK_NP };

// Environment tuning constants.  The cache/log/transaction/lock values below
// are written into the DB_CONFIG file by DBFS::create_config.
const guint   DBFS_PAGE_SIZE        = sysconf (_SC_PAGESIZE); // queried during static initialization
const guint   DBFS_FS_SHORT_THRESH  = 1024;           // asserted to be < DBFS_PAGE_SIZE in the DBFS constructor
const guint   DBFS_LOGBSIZE         = (1 << 18);      // 256K
const guint   DBFS_LOGMAX           = 10 * (1 << 20); // 10M
const guint   DBFS_CACHESIZE        = (1 << 22);      // 4M
const guint   DBFS_TX_MAX           = (250);
const guint   DBFS_LK_MAX           = (1000);
const guint   DBFS_TRICKLE_PCT      = (25);           // percentage passed to memp_trickle
// Default flags DBFS::open starts from when opening the environment.
const guint   DBFS_ENV_OPEN_FLAGS   = (DB_INIT_LOCK |
				       DB_INIT_LOG |
				       DB_INIT_MPOOL |
				       DB_INIT_TXN |
				       DB_THREAD);

// Table sizes; the three SHARED_* values are handed to the BASIC_DBENV
// constructor by DBFS.
const guint   DBFS_MINOR_TABLE_SIZE     = 512; // @@ Need reuse
const guint   DBFS_MAJOR_TABLE_SIZE     = 256;
const guint   DBFS_SAREA_TABLE_SIZE     = 32;
const guint   DBFS_SHARED_DB_TABLE_SIZE = 256;
const guint   DBFS_SHARED_FD_TABLE_SIZE = 256;
const guint   DBFS_SHARED_PG_TABLE_SIZE = 512; // 512 * 8K = 4MB of page cache

// A file ID is split into DBFS_FID_BASE_BITS-wide groups, one per path
// component (see DBFS::append_fname).
const guint   DBFS_FID_BASE_BITS    = 8;
const guint   DBFS_FID_BASE_ALLOC   = (1 << DBFS_FID_BASE_BITS);
const guint   DBFS_FID_BASE_MASK    = (DBFS_FID_BASE_ALLOC - 1);

// Microseconds per second.
#define SECS (1000 * 1000)

// Intervals, in microseconds, for the periodic jobs scheduled by the
// maintenance and deadlock threads.
const guint   DBFS_CHECKPOINT_USECS = 300 * SECS;
const guint   DBFS_TRICKLE_USECS    = 60 * SECS;
const guint   DBFS_CLEANLOG_USECS   = 600 * SECS;
const guint   DBFS_DEADLOCK_USECS   = 1 * SECS;
const guint   DBFS_INCOMPLETE_USECS = 10000;  // pause between retries of an incomplete checkpoint

const guint   DBFS_CONT_ID_INTERVAL = 0x10;
const guint   DBFS_CONT_ID_MASK     = (DBFS_CONT_ID_INTERVAL - 1);

// Typed wrappers around the FileType enumerators.
const XTYPE   XTYPE_NOT_PRESENT  (FT_NotPresent);
const XTYPE   XTYPE_DIRBTREE     (FT_DirBtree);
const XTYPE   XTYPE_DIRHASH      (FT_DirHash);
const XTYPE   XTYPE_DIRSEQ       (FT_DirSeq);
const XTYPE   XTYPE_REFLINK      (FT_Reflink);
const XTYPE   XTYPE_AREALINK     (FT_Arealink);
const XTYPE   XTYPE_SHORTSEG     (FT_ShortSeg);
const XTYPE   XTYPE_LONGSEG      (FT_LongSeg);
const XTYPE   XTYPE_VIEWSEG      (FT_ViewSeg);

const XFID    XFID_NONE            (0);

const XLNK    DBFS_ZERO_REFS       (0);
const XLNK    DBFS_ONE_REF         (1);

// IDs of the two ID-allocation sequences (asserted in DBFS::create_root), and
// the root container ID, which is stored in network byte order.
const XSEQNO  DBFS_FID_SEQV        (1);
const XSEQNO  DBFS_SID_SEQV        (2);
const XSEQNO  DBFS_ROOT_CONT_ID    (XHTONL (XSEQNO (DBFS_CONT_ID_INTERVAL)));

const XAREA   DBFS_ROOT_AREA       (1);
const XNUM    DBFS_ROOT_DIRECTORY  (DBFS_ROOT_AREA, DBFS_ROOT_CONT_ID);

const XVID    __COMP_RECONSTRUCT_VIEW (67);

// Stack IDs.  These are not const because DBFS::catalog_builtins fills them in
// from the stack catalog when the file system is created or opened.
/*const*/ XSTCK   DBFS_DEF_STACK       (DBFS_DEF_STACK_VAL);
/*const*/ XSTCK   DBFS_INV_STACK;
/*const*/ XSTCK   __COMP_SRC_STCK;
/*const*/ XSTCK   __COMP_CONTROL_STCK;
/*const*/ XSTCK   __COMP_INDEX_STCK;
/*const*/ XSTCK   __COMP_STATE_STCK;
/*const*/ XSTCK   __COMP_SHAREDLNK_STCK;
/*const*/ XSTCK   __CLL_PREV_STCK;
/*const*/ XSTCK   __CLL_NEXT_STCK;

const DKEY    DBFS_NULL_DKEY       ((guint8*) 0, 0);
const MKEY    DBFS_DEF_MKEY        (DBFS_NULL_DKEY);

// Registries mapping numeric view/stack IDs to their definition objects.
// VIEWDEF and STACKDEF constructors insert themselves; dbfs_viewdef_find and
// dbfs_stackdef_find look entries up.
typedef hash_map <guint32, VIEWDEF*>  VdefHash;
typedef hash_map <guint16, STACKDEF*> SdefHash;

// The view registry is created lazily by the first VIEWDEF constructor; the
// stack registry is allocated during this file's static initialization.
static VdefHash *__vdef_hash = NULL;
static SdefHash *__sdef_hash = new SdefHash;

// Returns X with its key converted from host to network byte order.
XSEQNO XHTONL(const XSEQNO& x)
{
    XSEQNO net_order (g_htonl (x.key ()));
    return net_order;
}
// Returns X with its key converted from network to host byte order.
XSEQNO XNTOHL(const XSEQNO& x)
{
    XSEQNO host_order (g_ntohl (x.key ()));
    return host_order;
}

// Orders two nodes of the same storage area by container sequence number.
//
// The sequence numbers are converted to host byte order before comparing.
// Returns a negative value, zero, or a positive value when this node sorts
// before, equal to, or after NODE.
//
// The result is computed by comparison rather than subtraction: subtracting
// the keys and narrowing to int yields the wrong sign once the keys differ by
// more than INT_MAX.
int  MAJORC::compare_to (const MAJORC &node) const
{
    g_assert (node.sarea () == sarea ());

    XSEQNO my_seq   (XNTOHL (number ().cont_seq ()));
    XSEQNO node_seq (XNTOHL (node.number ().cont_seq ()));

    if (my_seq.key () < node_seq.key ()) {
	return -1;
    }

    if (my_seq.key () > node_seq.key ()) {
	return 1;
    }

    return 0;
}

// Builds a stack definition from a catalog key of the form
// "module:name:abbrev" and registers it in the stack registry under STCK_ID.
//
// The key data is scanned with strchr/strrchr, so it must be NUL-terminated
// (DBFS::catalog_stack_id appends a terminating zero byte when it builds
// the key).  The first ':' ends the module, the last ':' starts the
// abbreviation, and everything in between is the name.
STACKDEF::STACKDEF (const XSTCK &stck_id, const CKEY &key)
{
    const char* combined = (const char*) key.data ();
    const char* firstsep = strchr (combined, ':');
    const char* lastsep  = strrchr (combined, ':');

    g_assert (firstsep && lastsep);

    // With a single ':' both pointers coincide and the name length below
    // would be -1, i.e. a huge size_t.  Require two distinct separators.
    g_assert (firstsep < lastsep);

    _stck_id = stck_id;

    _module = string (combined, firstsep-combined);
    _name   = string (firstsep+1, lastsep-firstsep-1);
    _abbrev = string (lastsep+1);

    DEBUG_DBFS ("NEW stackdef: %s:%s:%s = %d", module (), name (), abbrev (), _stck_id.key ());

    (*__sdef_hash) [_stck_id.key ()] = this;
}

// Records the view ID and registers this definition in the view registry,
// creating the registry on first use.
VIEWDEF::VIEWDEF (XVID view_id)
    : _view_id (view_id)
{
    if (__vdef_hash == NULL) {
        __vdef_hash = new VdefHash;
    }

    VdefHash &registry = *__vdef_hash;

    registry [view_id.key ()] = this;
}

// Error callback installed on the Berkeley DB environment: forwards the
// message text to DB_ERROR.  The prefix argument is not used.
static void
dbfs_errcall_fcn (const char *errpfx, char *msg)
{
    (void) errpfx;

    DB_ERROR (msg);
}

// Panic callback installed on the Berkeley DB environment: reports the error
// value through DBFS_FATAL.  The environment argument is not used.
static void
dbfs_paniccall (DbEnv * /* env */, int errval)
{
    DBFS_FATAL (errval) ("Berkeley DB panic");
}

// Formats VAL as lower-case hexadecimal, zero-padded to at least two digits.
//
// Returns a pointer to a function-local static buffer: the result is
// overwritten by the next call and the function is not safe to call from
// several threads at once, so callers must copy the text immediately.
//
// The buffer holds the widest possible guint (two hex digits per byte plus
// the terminator) and the write is bounded, so a value above 0xff can no
// longer overrun the buffer as it did with the former 3-byte buffer.
static const char*
fmt_fid (guint val)
{
    static char buf[2 * sizeof (guint) + 1];
    snprintf (buf, sizeof (buf), "%02x", val);
    return buf;
}

// Validates the overwrite policy bits in FLAGS.
//
// Exactly one of DBFS_OVERWRITE and DBFS_NOOVERWRITE must be set.  Returns 0
// when that holds; otherwise logs an error and returns DBFS_INVAL.
int
dbfs_check_ow_flags (int flags)
{
    const bool overwrite   = (flags & DBFS_OVERWRITE) != 0;
    const bool nooverwrite = (flags & DBFS_NOOVERWRITE) != 0;

    if (overwrite != nooverwrite) {
        return 0;
    }

    DBFS_ERROR ("Must specify either DBFS_OVERWRITE or DBFS_NOOVERWRITE");
    return DBFS_INVAL;
}

int
dbfs_viewdef_find (const XVID &id, VIEWDEF **vdefp)
{
    if (__vdef_hash) {

	VdefHash::iterator l = __vdef_hash->find (id.key ());

	if (l != __vdef_hash->end ()) {
	    (*vdefp) = (*l).second;
	    return 0;
	}
    }

    DBFS_ERROR () << "unknown view ID: " << id.key ();
    return DBFS_VIEW_UNDEF;
}

int
dbfs_stackdef_find (const XSTCK &id, STACKDEF **sdefp)
{
    if (__sdef_hash) {

	SdefHash::iterator l = __sdef_hash->find (id.key ());

	if (l != __sdef_hash->end ()) {
	    (*sdefp) = (*l).second;
	    return 0;
	}
    }

    DBFS_ERROR () << "unknown stack ID: " << id.key ();
    return DBFS_STACK_UNDEF;
}

// Returns the description of TYPE by converting its key to a FileType and
// delegating to dbfs_type_to_string.
const char*
dbfs_xtype_to_string (XTYPE type)
{
    const FileType file_type = (FileType) type.key ();

    return dbfs_type_to_string (file_type);
}

// Returns a static, human-readable description of TYPE, or "unknown type"
// for a value that is not one of the enumerators handled here.
const char*
dbfs_type_to_string (FileType type)
{
    const char *label = "unknown type";

    switch (type) {
    case FT_NotPresent: label = "not present";    break;
    case FT_Reflink:    label = "ref link";       break;
    case FT_Arealink:   label = "area link";      break;
    case FT_ShortSeg:   label = "short segment";  break;
    case FT_LongSeg:    label = "long segment";   break;
    case FT_ViewSeg:    label = "view segment";   break;
    case FT_DirSeq:     label = "sequence index"; break;
    case FT_DirHash:    label = "hash index";     break;
    case FT_DirBtree:   label = "btree index";    break;
    default:                                      break;
    }

    return label;
}

// Sets up an unopened environment: no DbEnv handle yet, the open status
// marked as "not opened" (-1), an empty static-database list, and the three
// shared tables created with the requested sizes.
BASIC_DBENV::BASIC_DBENV (int dbtabsize, int fdtabsize, int pgtabsize)
    : _env         (NULL),
      _envopen_ret (-1),
      _static_list (NULL),
      _shared_dbs  (*this, "Shared DB", dbtabsize),
      _shared_fds  (*this, "Shared FD", fdtabsize),
      _shared_pgs  (*this, "Shared PG", pgtabsize)
{
    // Nothing else to do; the environment is opened by basic_db_open.
}

// Closes the environment if it is still open; the result is discarded.
BASIC_DBENV::~BASIC_DBENV ()
{
    BASIC_DBENV::close ();
}

// Shuts the file system down through DBFS::close (not BASIC_DBENV::close),
// so the recovery/admin state is closed before the environment.
DBFS::~DBFS ()
{
    DBFS::close ();
}

// Opens the Berkeley DB environment at NAME and then every statically
// registered database.
//
// A DbEnv is created here unless the caller has already installed one (as
// ADMIN_DB::open does).  The result of the environment open is remembered in
// _envopen_ret.  Returns 0 on success or the first error encountered.
int
BASIC_DBENV::basic_db_open (const char* name, int oflags, int env_open_flags)
{
    if (! _env) {
        _env = new DbEnv (DB_CXX_NO_EXCEPTIONS);
    }

    _env->set_errcall   (dbfs_errcall_fcn);
    _env->set_paniccall (dbfs_paniccall);

    _envopen_ret = _env->open (name, env_open_flags, 0666);

    int ret = _envopen_ret;

    if (ret != 0) {
        DB_ERROR (ret) ("dbenv_open: %s", name);
        return ret;
    }

    STATIC_DB *sdb = _static_list;

    while (sdb != NULL) {

        ret = sdb->open (*this, oflags);

        if (ret != 0) {
            PROP_ERROR (ret) ("static_db_open: ") () << sdb->_name;
            return ret;
        }

        sdb = sdb->_next;
    }

    return 0;
}

// Checkpoints and closes the environment.
//
// Does nothing (returns 0) if the environment was never created or its open
// did not succeed.  A failed checkpoint aborts the close and is returned
// immediately.  Otherwise the static databases, the shared DB/FD tables and
// the shared page table are shut down and finally the DbEnv is closed.
//
// Returns the FIRST error seen during shutdown.  Previously the result of
// DbEnv::close overwrote any error collected from the shared tables, and a
// failing DbEnv::close left _env set, so a later close() (e.g. from the
// destructor) would touch the handle again even though Berkeley DB forbids
// using it after close regardless of the return value.
int
BASIC_DBENV::close ()
{
    int ret = 0, t_ret;

    if (_env == NULL || _envopen_ret != 0) {
	return 0;
    }

    if ((ret = checkpoint ())) {
	PROP_ERROR (ret) ("close_checkpoint");
	return ret;
    }

    for (STATIC_DB *sdb = _static_list;
	 sdb;
	 sdb = sdb->_next) {
	// static_alloc copies one reference, this closes it
	sdb->close ();
    }

    if ((t_ret = _shared_dbs.close ())) {
	PROP_ERROR (t_ret) ("shared_dbs_close");
	if (ret == 0) {
	    ret = t_ret;
	}
    }

    if ((t_ret = _shared_fds.close ())) {
	PROP_ERROR (t_ret) ("shared_fds_close");
	if (ret == 0) {
	    ret = t_ret;
	}
    }

    if ((t_ret = _shared_pgs.assert_clear ())) {
	PROP_ERROR (t_ret) ("shared_pgs_assert_clear");
	if (ret == 0) {
	    ret = t_ret;
	}
    }

    t_ret = _env->close (0);

    // The handle is unusable after close whether or not it succeeded.
    _env = NULL;

    if (t_ret) {
	DB_ERROR (t_ret) ("dbenv_close");
	if (ret == 0) {
	    ret = t_ret;
	}
    }

    return ret;
}

// Builds the admin environment with a small shared-DB table (4 entries, no
// shared FD or page tables) and its two static databases: "pids" (btree) and
// "settings" (hash).
ADMIN_DB::ADMIN_DB ()
    : BASIC_DBENV (4, 0, 0),
      _pids     (*this, "pids",     DB_BTREE, -1),
      _settings (*this, "settings", DB_HASH,  -1)
{
    // The environment itself is opened by ADMIN_DB::open.
}

// Creates a DbEnv with reduced cache, lock, transaction and log limits, then
// opens it at NAME via basic_db_open.
//
// Must be called on an environment that has no DbEnv yet (asserted).  The
// limit setters are applied in order and stop at the first failure.  Returns
// 0 on success or the first error.
int
ADMIN_DB::open (const char* name, int oflags, int env_oflags)
{
    g_assert (_env == NULL);

    _env = new DbEnv (DB_CXX_NO_EXCEPTIONS);

    // Reduce the size of this env
    int ret = _env->set_cachesize (0, 1<<15, 1);

    if (ret == 0) { ret = _env->set_lk_max   (50); }
    if (ret == 0) { ret = _env->set_tx_max   (5); }
    if (ret == 0) { ret = _env->set_lg_bsize (1 << 13); }
    if (ret == 0) { ret = _env->set_lg_max   (1 << 15); }

    if (ret != 0) {
        PROP_ERROR (ret) ("admin_set_max");
        return ret;
    }

    ret = basic_db_open (name, oflags, env_oflags);

    if (ret != 0) {
        PROP_ERROR (ret) ("basic_db_open");
        return ret;
    }

    return 0;
}

// Constructs an unopened file system object: sizes the shared tables,
// declares the static databases and ID sequences, clears the thread IDs, and
// records the command line as the ARGV entries joined by single spaces.
DBFS::DBFS (int argc, char const* const* argv)
    : BASIC_DBENV   (DBFS_SHARED_DB_TABLE_SIZE,
                     DBFS_SHARED_FD_TABLE_SIZE,
                     DBFS_SHARED_PG_TABLE_SIZE),
      _area_db      (*this, "area", DB_QUEUE, sizeof (SAREA_REC)),
      _seqv_db      (*this, "seqv", DB_QUEUE, sizeof (XSEQNO)),
      _alloc_fids   (*this, "alloc", DB_QUEUE, sizeof (XFID)),
      _clean_fids   (*this, "clean", DB_QUEUE, sizeof (XFREEELT)),
      _stackcat_db  (*this, "stackcat", DB_HASH, -1),
      _fid_seqv     (*this, DBFS_FID_SEQV),
      _sid_seqv     (*this, DBFS_SID_SEQV),
      _maint_tid    (0),
      _deadlock_tid (0)
{
    g_assert (DBFS_FS_SHORT_THRESH < DBFS_PAGE_SIZE);

    if (argc > 0) {

        _cmdline += argv[0];

        for (int i = 1; i < argc; i += 1) {
            _cmdline += ' ';
            _cmdline += argv[i];
        }
    }
}

int
DBFS::open (const char* fsdir, const char* logdir, int oflags)
{
    int ret;
    int env_open_flags = DBFS_ENV_OPEN_FLAGS;

    _openflags = oflags;

    if ((ret = dbfs_async_start (oflags))) {
	PROP_ERROR (ret) ("dbfs_async_start");
	return ret;
    }

    if ((ret = absolute_path (fsdir, _fsdir))) {
	PROP_ERROR (ret) ("absolute_path");
	return ret;
    }

    if ((ret = absolute_path (logdir, _logdir))) {
	PROP_ERROR (ret) ("absolute_path");
	return ret;
    }

    _filesdir = _fsdir + "/files";
    _admindir = _fsdir + "/admin";

    if (oflags & DBFS_CLEAR) {

	if (! (oflags & DBFS_CREATE)) {
	    DBFS_ERROR ("dbfs_open: DBFS_CLEAR requires DBFS_CREATE");
	    return DBFS_INVAL;
	}

	if ((ret = clear_dir (_fsdir))) {
	    PROP_ERROR (ret) ("clear_dir");
	    return ret;
	}

	if (has_logdir () && (ret = clear_dir (_logdir))) {
	    PROP_ERROR (ret) ("clear_dir");
	    return ret;
	}
    }

    if (oflags & DBFS_CREATE) {

	if (oflags & DBFS_RECOVER) {
	    DBFS_ERROR ("DBFS_RECOVER and DBFS_CREATE are incompatible");
	    return DBFS_INVAL;
	}

	if ((ret = create_dir (_fsdir))) {
	    PROP_ERROR (ret) ("create_dir");
	    return ret;
	}

	if (has_logdir () && (ret = create_dir (_logdir))) {
	    PROP_ERROR (ret) ("create_dir");
	    return ret;
	}

	if ((ret = create_dir (_filesdir)) ||
	    (ret = create_dir (_admindir))) {
	    PROP_ERROR (ret) ("create_dir");
	    return ret;
	}

	if ((ret = create_config ())) {
	    PROP_ERROR (ret) ("create_config");
	    return ret;
	}
    } else {

	if ((ret = check_dir (_fsdir))) {
	    PROP_ERROR (ret) ("check_dir");
	    return ret;
	}

	// Logdir is not checked, since it only needs to be passed at
	// creation.  Berkeley DB has rules for checking that.
    }

    if ((ret = check_recovery (oflags, env_open_flags))) {
	PROP_ERROR (ret) ("check_recovery");
	return ret;
    }

    if ((ret = basic_db_open (_fsdir.c_str (), oflags, env_open_flags))) {
	PROP_ERROR (ret) ("basic_db_open");
	return ret;
    }

    if (oflags & DBFS_CREATE) {

	if ((ret = create_root (oflags))) {
	    PROP_ERROR (ret) ("create_root");
	    return ret;
	}

    } else {

	if ((ret = catalog_read ())) {
	    PROP_ERROR (ret) ("catalog_read");
	    return ret;
	}
    }

    // Close the ADMIN_DB unless the process wants to use admin_tool()
    if (! (oflags & DBFS_ADMIN_TOOL) && (ret = finish_recovery ())) {
	PROP_ERROR (ret) ("finish_recovery");
	return ret;
    }

    maint_start    (oflags);
    deadlock_start (oflags);

    DEBUG_DBFS ("ready: ") (_fsdir);

    return 0;
}

// Closes the recovery/admin state and then the underlying environment.
//
// A failure of close_recovery or finish_recovery is logged but does not stop
// the shutdown; the value returned is that of BASIC_DBENV::close.
int
DBFS::close ()
{
    int ret = close_recovery ();

    if (ret == 0) {
        ret = finish_recovery ();
    }

    if (ret != 0) {
        DBFS_ERROR (ret) ("admin_close");
    }

    return BASIC_DBENV::close ();
}
// Converts PATH to an absolute path stored in ABS.
//
// A null or empty PATH yields an empty string; a PATH starting with '/' is
// copied unchanged; anything else is prefixed with the current working
// directory.  Returns 0 on success or the errno value from getcwd.
//
// errno is captured before the error is logged, because the logging call may
// itself modify errno and the function would then return the wrong code
// (possibly 0).
int
DBFS::absolute_path (const char* path, string &abs)
{
    if (! path || ! path[0]) {
	abs = "";
	return 0;
    }

    if (path[0] == '/') {
	abs = path;
	return 0;
    }

    char buf[1024];

    if (! getcwd (buf, sizeof (buf))) {
	int err = errno;
	SYS_ERROR (err) ("getcwd");
	return err;
    }

    abs  = buf;
    abs += "/";
    abs += path;

    return 0;
}

int
DBFS::clear_dir (const string &dir)
{
    int    ret;
    string s ("rm -rf ");

    s += dir;

    if ((ret = system (s.c_str ()))) {
	SYS_ERROR (errno) ("system: ") () << s;
	return errno;
    }

    return 0;
}

// Creates directory DIR with mode 0777 (subject to the umask).
//
// Returns 0 on success, otherwise logs and returns the errno value from
// mkdir.  An already-existing directory is reported as an error like any
// other failure.
int
DBFS::create_dir (const string &dir)
{
    if (mkdir (dir.c_str (), 0777) == 0) {
        return 0;
    }

    // mkdir failed; a zero errno is still treated as success.
    if (errno == 0) {
        return 0;
    }

    int err = errno;

    SYS_ERROR (err) () << "mkdir: " << dir;
    return err;
}

// Verifies that DIR exists and is a directory.
//
// Returns 0 if so, the errno value from stat if DIR cannot be examined, or
// ENOTDIR if it exists but is not a directory.
//
// errno is saved before logging (as create_dir does), since the logging call
// may change it and the function would then return the wrong code.
int
DBFS::check_dir (const string &dir)
{
    struct stat sbuf;

    if (stat (dir.c_str (), & sbuf) != 0) {
	int err = errno;
	SYS_ERROR (err) () << "stat: " << dir;
	return err;
    }

    if (! S_ISDIR (sbuf.st_mode)) {
	DBFS_ERROR (ENOTDIR) () << "check_dir: " << dir;
	return ENOTDIR;
    }

    return 0;
}

// Writes the DB_CONFIG file in the file system directory with the cache,
// log, transaction and lock limits, plus the log directory when one is
// configured.
//
// Returns 0 on success.  Any failure of the file-handle layer (open, write
// or close) is logged with a generic message and reported as -1, because
// those errors carry no error code here.
int
DBFS::create_config ()
{
    string config_path = _fsdir;

    config_path += "/DB_CONFIG";

    FileHandle *fh = handle_write_file (config_path.c_str ());

    bool ok = (fh != NULL)
        && handle_printf (fh, "set_cachesize 0 %d 1\n", DBFS_CACHESIZE)
        && handle_printf (fh, "set_lg_bsize  %d\n",     DBFS_LOGBSIZE)
        && handle_printf (fh, "set_lg_max    %d\n",     DBFS_LOGMAX)
        && handle_printf (fh, "set_tx_max    %d\n",     DBFS_TX_MAX)
        && handle_printf (fh, "set_lk_max    %d\n",     DBFS_LK_MAX);

    if (ok && has_logdir ()) {
        if (! handle_printf (fh, "set_lg_dir    %s\n", _logdir.c_str ())) {
            ok = false;
        }
    }

    if (ok && ! handle_close (fh)) {
        ok = false;
    }

    // The handle is released on both the success and the failure path.
    if (fh) {
        handle_free (fh);
    }

    if (ok) {
        return 0;
    }

    DBFS_ERROR ("handle errors do not propagate");
    return -1;
}

// Loads the stack catalog: walks every record of the stack catalog database
// inside one transaction, creating a STACKDEF (which registers itself) for
// each, then makes sure the built-in stacks are catalogued and commits.
//
// Returns 0 on success or the first error.
int
DBFS::catalog_read  ()
{
    TXN    txn;
    DBCREF ccurs;
    int    ret;

    ret = txn.begin (*this, DBFS_NOROOT_SPECIAL);
    if (ret != 0) {
        PROP_ERROR (ret) ("txn_begin");
        return ret;
    }

    ret = _stackcat_db.cursor (txn, ccurs);
    if (ret != 0) {
        PROP_ERROR (ret) ("stackcat_cursor");
        return ret;
    }

    CKEY key;
    XSTCK stck_id;
    RECDBT<XSTCK> stck_dbt (stck_id);

    for (;;) {

        ret = ccurs.move_pos (key, stck_dbt, DB_NEXT, DBFS_NORMW);

        if (ret != 0) {
            break;
        }

        new STACKDEF (stck_id, key);
    }

    // Running off the end of the catalog is the normal way out of the loop;
    // only then is the cursor closed here.
    if (ret == DBFS_NOTFOUND) {
        ret = ccurs.close ();
    }

    if (ret != 0) {
        PROP_ERROR (ret) ("stackcat_movepos");
        return ret;
    }

    ret = catalog_builtins (txn);
    if (ret != 0) {
        PROP_ERROR (ret) ("catalog_builtins");
        return ret;
    }

    ret = txn.commit ();
    if (ret != 0) {
        PROP_ERROR (ret) ("txn_commit");
        return ret;
    }

    return 0;
}

// Resolves the stack "MODULE:NAME:ABBREV" to its numeric ID in STCKVAL.
//
// If the stack is already in the catalog its stored ID is read into STCKVAL.
// Otherwise a new ID is taken from the stack-ID sequence, stored in the
// catalog without overwriting, and a STACKDEF is created for it.  Returns 0
// on success or the first error.
int
DBFS::catalog_stack_id  (TXN        &txn,
                         const char *module,
                         const char *name,
                         const char *abbrev,
                         XSTCK      &stckval)
{
    CKEY          key;
    RECDBT<XSTCK> stck_dbt (stckval);

    // @@ Sigh... this is serialization
    // The key is the NUL-terminated text "module:name:abbrev".
    key.append_str (module);
    key.append_str (":");
    key.append_str (name);
    key.append_str (":");
    key.append_str (abbrev);
    key.append_byte (0);

    int ret = _stackcat_db.get_or_notfound (txn, key, stck_dbt, DBFS_NORMW);

    if (ret == 0) {
        // Already catalogued.
        return 0;
    }

    if (ret != DBFS_NOTFOUND) {
        PROP_ERROR (ret) ("get_or_notfound");
        return ret;
    }

    guint32 val;

    ret = _sid_seqv.next (val);
    if (ret != 0) {
        PROP_ERROR (ret) ("sid_seqv_next");
        return ret;
    }

    stckval = XSTCK (val);

    ret = _stackcat_db.put_no_overwrite (txn, key, stck_dbt);
    if (ret != 0) {
        PROP_ERROR (ret) ("sid_put");
        return ret;
    }

    new STACKDEF (stckval, key);

    return 0;
}

// Ensures every built-in stack is present in the catalog and loads its ID
// into the corresponding global.
//
// Returns 0 on success or the error of the first registration that fails.
//
// The calls are chained with `||`.  They were previously chained with the
// comma operator, which evaluated all of them but tested only the result of
// the last one, silently dropping every earlier error.
//
// NOTE(review): the "next" stack is loaded into __CLL_PREV_STCK and the
// "prev" stack into __CLL_NEXT_STCK.  This looks swapped, but it is left
// unchanged because it determines which catalogued ID each global receives;
// confirm against the linked-list code and existing data before changing it.
int
DBFS::catalog_builtins (TXN &txn)
{
    int ret;

    if ((ret = catalog_stack_id (txn, "builtin",        "default", "def", DBFS_DEF_STACK)) ||
	(ret = catalog_stack_id (txn, "builtin",        "inverse", "inv", DBFS_INV_STACK)) ||
	(ret = catalog_stack_id (txn, "linked list",    "next",    "nxt", __CLL_PREV_STCK)) ||
	(ret = catalog_stack_id (txn, "linked list",    "prev",    "prv", __CLL_NEXT_STCK)) ||
	(ret = catalog_stack_id (txn, "xdelta archive", "source",  "xsrc", __COMP_SRC_STCK)) ||
	(ret = catalog_stack_id (txn, "xdelta archive", "index",   "xidx", __COMP_INDEX_STCK)) ||
	(ret = catalog_stack_id (txn, "xdelta archive", "control", "xctrl", __COMP_CONTROL_STCK)) ||
	(ret = catalog_stack_id (txn, "xdelta archive", "state",   "xsvec", __COMP_STATE_STCK)) ||
	(ret = catalog_stack_id (txn, "xdelta archive", "archive", "xalnk", __COMP_SHAREDLNK_STCK))) {
	PROP_ERROR (ret) ("CATALOG builtins");
	return ret;
    }

    return 0;
}

// Populates a freshly created file system inside one transaction: creates
// the two ID sequences, catalogues the built-in stacks, creates the root
// storage area, allocates the root node in it, turns it into a directory
// (hash if DBFS_CREATEROOT_HASH is set in FLAGS, btree otherwise), gives it
// one reference and commits.
//
// The assertions check that the new sequences, area and node received the
// well-known root identifiers.  Returns 0 on success or the first error.
int
DBFS::create_root (int flags)
{
    TXN    txn;
    MAJORC root;
    SAREA  area;
    int    ret;

    ret = txn.begin (*this, DBFS_NOROOT_SPECIAL);
    if (ret != 0) {
        PROP_ERROR (ret) ("txn_begin");
        return ret;
    }

    ret = _fid_seqv.create ();
    if (ret == 0) {
        ret = _sid_seqv.create ();
    }
    if (ret != 0) {
        PROP_ERROR (ret) ("ID sequence create");
        return ret;
    }

    g_assert (_fid_seqv.id () == DBFS_FID_SEQV);
    g_assert (_sid_seqv.id () == DBFS_SID_SEQV);

    ret = catalog_builtins (txn);
    if (ret != 0) {
        PROP_ERROR (ret) ("catalog_builtins");
        return ret;
    }

    ret = txn.create_area (area);
    if (ret != 0) {
        PROP_ERROR (ret) ("txn_create_area");
        return ret;
    }

    g_assert (area.area_id () == DBFS_ROOT_AREA);

    ret = area.allocate (root, DBFS_ALLOC_ASCEND);
    if (ret != 0) {
        PROP_ERROR (ret) ("area_allocate");
        return ret;
    }

    g_assert (root.number () == DBFS_ROOT_DIRECTORY);

    ret = root.mk_directory ((flags & DBFS_CREATEROOT_HASH) ? XTYPE_DIRHASH : XTYPE_DIRBTREE,
                             DBFS_NOOVERWRITE);
    if (ret != 0) {
        PROP_ERROR (ret) (root) ("mk_directory");
        return ret;
    }

    root.incr_refcount (1);

    ret = txn.commit ();
    if (ret != 0) {
        PROP_ERROR (ret) ("txn_commit");
        return ret;
    }

    return 0;
}

void
DBFS::relative_fname (const XFID   &fid,
		     string       &fname)
{
    fname = "";
    append_fname (fid, fname, false);
}

// Sets FNAME to the path of the file for FID, prefixed with the file system
// directory.
void
DBFS::absolute_fname (const XFID &fid, string &fname)
{
    fname = _fsdir + "/";

    append_fname (fid, fname, false);
}

void
DBFS::relative_dname (const XFID   &fid,
		     string       &fname)
{
    fname = "";
    append_fname (fid, fname, true);
}

// Sets FNAME to the directory that holds the file for FID, prefixed with the
// file system directory.
void
DBFS::absolute_dname (const XFID &fid, string &fname)
{
    fname = _fsdir + "/";

    append_fname (fid, fname, true);
}

void
DBFS::append_fname (const XFID   &fid,
		    string       &fname,
		    bool          only_dir)
{
    guint    split_buf[16];
    int      split_i = 0;
    guint32  key     = fid.key ();

    fname += "files";

    do {
	split_buf[split_i++] = key & DBFS_FID_BASE_MASK;
	key >>= DBFS_FID_BASE_BITS;
    }
    while (key != 0);

    for (; --split_i >= 1;) {
	fname += "/d";
	fname += fmt_fid (split_buf[split_i]);
    }

    if (! only_dir) {
	fname += "/f";
	fname += fmt_fid (split_buf[0]);
    }
}

// Allocates a new block of file IDs and creates its directory.
//
// Takes the next value of the FID sequence, shifts it left by
// DBFS_FID_BASE_BITS to form FID_BASE, and creates the directory that will
// hold the files of that block.  Returns 0 on success or the first error.
int
DBFS::create_base (XFID &fid_base)
{
    guint32 next_base;

    int ret = _fid_seqv.next (next_base);

    if (ret != 0) {
        PROP_ERROR (ret) ("FID sequence next");
        return ret;
    }

    fid_base = XFID (next_base << DBFS_FID_BASE_BITS);

    string dname;

    absolute_dname (fid_base, dname);

    ret = create_dir (dname);

    if (ret != 0) {
        PROP_ERROR (ret) ("create_dir: ") () << dname;
        return ret;
    }

    return 0;
}
// Forces a transaction checkpoint.
//
// A DB_INCOMPLETE result is retried after a DBFS_INCOMPLETE_USECS pause; after
// the third incomplete attempt the checkpoint is skipped and 0 is returned.
// Any other error is logged and returned.  On success the shared caches are
// dumped when the CACHE debug tag is enabled.
int
BASIC_DBENV::checkpoint ()
{
    for (int attempt = 1; ; attempt += 1) {

        int ret = _env->txn_checkpoint (0, 0, 0);

        if (ret == 0) {
            break;
        }

        if (ret != DB_INCOMPLETE) {
            DB_ERROR (ret) ("txn_checkpoint");
            return ret;
        }

        if (attempt == 3) {
            DBFS_ERROR ("checkpoint incomplete 3 times: skip");
            return 0;
        }

        usleep (DBFS_INCOMPLETE_USECS);
    }

    DEBUG_MAINT ("checkpoint success");

    if (DEBUG_TAG (CACHE)) {
        _shared_fds.debug_cache ();
        _shared_dbs.debug_cache ();
        _shared_pgs.debug_cache ();
    }

    return 0;
}

// Runs one pass of the lock-conflict detector with the default policy and
// logs how many transactions it aborted, if any.
//
// Returns 0 on success or the error from lock_detect.
int
DBFS::deadlock ()
{
    int aborted = 0;
    int ret     = _env->lock_detect (0, DB_LOCK_DEFAULT, & aborted);

    if (ret != 0) {
        DB_ERROR (ret) ("lock_detect");
        return ret;
    }

    if (aborted != 0) {
        DEBUG_MAINT ("deadlock: aborted %d transactions", aborted);
    }

    return 0;
}

// Removes log files that Berkeley DB reports as no longer needed.
//
// log_archive returns a NULL-terminated array of file names (or NULL when
// there is nothing to remove); each name is joined to the file system
// directory and unlinked.  Returns 0 on success, the log_archive error, or
// the errno value of the first unlink that fails.
//
// The array is allocated by the library and must be released by the caller;
// it is now freed on the unlink error path too, where it was leaked before.
int
DBFS::clean_log ()
{
    int ret;
    char **list = NULL;

    if ((ret = _env->log_archive (& list, 0))) {
	DB_ERROR (ret) ("log_archive");
	return ret;
    }

    if (list) {

	for (char **p = list; *p; p += 1) {

	    string  full (_fsdir);
	    char   *base = *p;

	    full += '/';
	    full += base;

	    if ((ret = unlink (full.c_str ()))) {
		ret = errno;
		SYS_ERROR (ret) ("unlink: ") () << full;
		free (list);
		return ret;
	    }

	    DEBUG_MAINT ("clean log file %s", base);
	}

	free (list);
    }

    return 0;
}

// Asks the memory pool to write out dirty pages until DBFS_TRICKLE_PCT
// percent of the cache is clean, and logs how many pages were written.
//
// Returns 0 on success or the error from memp_trickle.
int
DBFS::trickle ()
{
    int nwrote = 0;
    int ret    = _env->memp_trickle (DBFS_TRICKLE_PCT, & nwrote);

    if (ret != 0) {
        DB_ERROR (ret) ("memp_trickle");
        return ret;
    }

    if (nwrote != 0) {
        DEBUG_MAINT ("memp trickle (%d%%) wrote %d pages", DBFS_TRICKLE_PCT, nwrote);
    }

    return 0;
}

void
DBFS::maint_start (int oflags)
{
    if (oflags & DBFS_MAINT_INIT) {
	pthread_create (& _maint_tid, NULL, & maint_thread_start, this);
	pthread_detach (_maint_tid);
    }
}

void
DBFS::deadlock_start (int oflags)
{
    if (oflags & DBFS_DEADLOCK_INIT) {
	pthread_create (& _deadlock_tid, NULL, & deadlock_thread_start, this);
	pthread_detach (_deadlock_tid);
    }
}

// Thread entry point for the maintenance thread.  Runs the maintenance loop
// on DBFS and reports a fatal error if the loop returns a non-zero status.
void*
DBFS::maint_thread_start (DBFS *dbfs)
{
    const int status = dbfs->maint_thread ();

    if (status != 0) {
        DBFS_FATAL (status) ("maint thread returned");
    }

    return NULL;
}

// Thread entry point for the deadlock thread.  Runs the deadlock loop on
// DBFS and reports a fatal error if the loop returns a non-zero status.
void*
DBFS::deadlock_thread_start (DBFS *dbfs)
{
    const int status = dbfs->deadlock_thread ();

    if (status != 0) {
        DBFS_FATAL (status) ("deadlock thread returned");
    }

    return NULL;
}

// Describes one periodic job run by a USEC_TIMER: how often to run it, a
// name used in log messages, and the DBFS member function to invoke.
class USEC_PROCESS
{
public:

    // A job is a DBFS member function taking no arguments and returning an
    // error code (non-zero stops the timer loop, see USEC_TIMER::start).
    typedef int (DBFS:: *ACTIVATE) (void);

    USEC_PROCESS (guint interval, const char* name, ACTIVATE activate)
	: _interval (interval),
	  _name     (name),
	  _activate (activate) { }

    guint                _interval;  // microseconds between runs
    const char          *_name;      // label for log messages; the pointer is stored, not copied
    ACTIVATE             _activate;  // job to call
    elink<USEC_PROCESS>  _link;      // NOTE(review): not referenced in this file -- presumably an intrusive list link
};

// Runs a set of USEC_PROCESS jobs against one DBFS, each at its own interval.
// Jobs are kept in a table keyed by the time (in microseconds) at which they
// are next due.
class USEC_TIMER
{
public:

    // TYPE is a label used in log messages; the pointer is stored, not copied.
    USEC_TIMER (DBFS &dbfs, const char *type)
	: _dbfs (dbfs),
	  _type (type) { }

    // Current wall-clock time in microseconds.
    guint64 now_usecs ();

    // Queues UP to run one interval from now.
    void schedule (USEC_PROCESS *up)
    {
	schedule1 (now_usecs (), up);
    }

    // Queues UP to run one interval after NOW.
    void schedule1 (guint64 now, USEC_PROCESS *up);

    // Runs the scheduling loop; returns only when a job fails.
    int  start ();

    // Due time -> job.  NOTE(review): presumably a skip list with unique
    // keys (schedule1 retries with a later key when insertion fails) --
    // confirm against the Slp template.
    typedef Slp<guint64,USEC_PROCESS*,UINT64_MAX,8,0> UP_TABLE;

    DBFS      &_dbfs;
    const char *_type;
    UP_TABLE   _table;
};

// Returns the current time of day as microseconds since the epoch.
guint64
USEC_TIMER::now_usecs ()
{
    struct timeval tv;

    gettimeofday (&tv, NULL);

    guint64 usecs = tv.tv_sec * 1000ULL * 1000ULL;

    usecs += tv.tv_usec;

    return usecs;
}

int
USEC_TIMER::start ()
{
    guint64       first;
    guint64       now;
    USEC_PROCESS *up;
    int           ret;

    now = now_usecs ();

  loop:

    bool z = _table.slp_remove_min (& first, & up);

    g_assert (z);

    if (now < first) {
	int diff = first - now;

	DEBUG_MAINT ("%s thread sleep %d now %qu", _type, diff, now);

	usleep (diff);
    } else {
	int diff = now - first;

	DEBUG_MAINT ("%s thread behind %d now %qu", _type, diff, now);
    }

    DEBUG_MAINT ("%s thread run %s", _type, up->_name);

    if ((ret = (_dbfs.* (up->_activate)) ())) {
	DBFS_ERROR (ret) ("maintenence thread: %s", up->_name);
	return ret;
    }

    now = now_usecs ();

    schedule1 (now, up);

    goto loop;
}

// Queues UP to run at NOW plus its interval.  If the table refuses that key,
// the due time is bumped by one microsecond at a time until insertion
// succeeds.
void
USEC_TIMER::schedule1 (guint64 now, USEC_PROCESS *up)
{
    guint64 due = now + up->_interval;

    DEBUG_MAINT ("%s thread schedule %s now %qu + %d", _type, up->_name, now, up->_interval);

    for (;;) {

        if (_table.slp_insert (due, up)) {
            break;
        }

        due += 1;
    }
}

int
DBFS::maint_thread ()
{
    USEC_TIMER timer (*this, "maint");

    DEBUG_DBFS ("checkpoint/maintenence started");

    timer.schedule (new USEC_PROCESS (DBFS_CHECKPOINT_USECS, "checkpoint", & DBFS::checkpoint));
    timer.schedule (new USEC_PROCESS (DBFS_TRICKLE_USECS,    "trickle",    & DBFS::trickle));
    timer.schedule (new USEC_PROCESS (DBFS_CLEANLOG_USECS,   "clean_log",  & DBFS::clean_log));

    return timer.start ();
}

int
DBFS::deadlock_thread ()
{
    USEC_TIMER timer (*this, "deadlock");

    DEBUG_DBFS ("deadlock avoidance started");

    timer.schedule (new USEC_PROCESS (DBFS_DEADLOCK_USECS, "deadlock", & DBFS::deadlock));

    return timer.start ();
}
