/* -*-Mode: C++;-*-
 * $Id: admin.cc 1.2 Fri, 29 Jun 2001 16:59:43 +0400 jmacd $
 *
 * Copyright (C) 1999, 2000, Joshua P. MacDonald <jmacd@CS.Berkeley.EDU>
 * and The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *    Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *    Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *
 *    Neither name of The University of California nor the names of
 *    its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <errno.h>
#include <signal.h>

#include "xdfs_cpp.h"
#include "edsiostdio.h"

// Report whether a process with the given PID appears to exist.
//
// The process is probed with kill (pid, 0).  A successful probe counts
// as "exists", and so does a probe that fails with errno == EPERM; any
// other failure counts as "does not exist".
static bool
pid_exists (pid_t pid)
{
    DEBUG_SIGNAL ("kill (pid=%d,0)", pid);

    if (kill (pid, 0) == 0) {
	return true;
    }

    // The probe was refused: only EPERM is treated as "still there".
    return errno == EPERM;
}

// Undo this process's registration in the admin database: decrement
// the "refcount" setting and delete this process's PID record, both
// under _admintxn.  Unless DBFS_ADMIN_TOOL is set, the admin database
// is opened and the transaction begun here first.  The transaction is
// not committed by this function.
//
// Returns 0 on success and on the early-out paths that merely abort
// the admin transaction, DBFS_NEEDS_ATTENTION when the stored refcount
// is lower than 1, or the error code propagated from the failing call.
int
DBFS::close_recovery ()
{
    int ret;
    bool admin = (_openflags & DBFS_ADMIN_TOOL);

    // If the transaction is still active, it means check_recovery failed,
    // so we don't have to re-open it here.  We don't leave the admin DB
    // open during normal operation, otherwise it will need recovery just
    // as often as the regular DB, which would destroy the benefit.
    if (! admin && _admintxn.active ()) {
	_admintxn.abort ();
	return 0;
    }

    // No environment, or a non-zero recorded open result: nothing is
    // updated, the admin transaction is simply aborted.
    if (_env == NULL || _envopen_ret != 0) {
	_admintxn.abort ();
	return 0;
    }

    // The key must be built exactly as check_recovery () builds it when
    // inserting the record, i.e. with the host-to-network conversion.
    XSEQNO         mypid = XHTONL (XSEQNO (getpid ()));
    int            refs  = 0;
    RECDBT<XSEQNO> mypid_key  (mypid);
    DKEY           refs_key   ("refcount");
    RECDBT<int>    refs_data  (refs);

    if (! admin && (ret = _admindb.open (_admindir.c_str (), 0, DBFS_ENV_OPEN_FLAGS))) {
	PROP_ERROR (ret) ("admin_open");
	return ret;
    }

    if (! admin && (ret = _admintxn.begin (_admindb, DBFS_TXN_SYNC))) {
	PROP_ERROR (ret) ("admin_txn_begin");
	return ret;
    }

    // A missing refcount record is tolerated here; refs then keeps its
    // initial value of 0 and is rejected by the check below.
    if ((ret = _admindb._settings.get_or_notfound (_admintxn,
						   refs_key,
						   refs_data,
						   DBFS_RMW)) && (ret != DBFS_NOTFOUND)) {
	PROP_ERROR (ret) ("get_refcount");
	return ret;
    }

    // check_recovery () added one reference for this process, so the
    // stored count has to be at least 1.  Accepting 0 here would write
    // back a refcount of -1.
    if (refs < 1) {
	ret = DBFS_NEEDS_ATTENTION;
	DBFS_WARN (ret) ("ADMIN DATABASE inconsistency");
	return ret;
    }

    refs -= 1;

    if ((ret = _admindb._settings.put_overwrite (_admintxn, refs_key, refs_data))) {
	PROP_ERROR (ret) ("put_refcount");
	return ret;
    }

    if ((ret = _admindb._pids.delete_must_exist (_admintxn, mypid_key))) {
	PROP_ERROR (ret) ("delete_pid");
	return ret;
    }

    return 0;
}

// Finish the admin-database phase: commit _admintxn if it is still
// active, then close the admin database.
//
// Returns 0 on success, otherwise the error code of the step that
// failed (the close is not attempted when the commit fails).
int
DBFS::finish_recovery ()
{
    int err = 0;

    if (_admintxn.active ()) {

	err = _admintxn.commit ();

	if (err != 0) {
	    PROP_ERROR (err) ("admin txn_commit");
	    return err;
	}
    }

    err = _admindb.close ();

    if (err != 0) {
	PROP_ERROR (err) ("admin dbenv_close");
	return err;
    }

    return 0;
}

// Register this process in the admin database and decide how the main
// DB environment may be opened.
//
// Steps, all under _admintxn:
//   1. open the admin database and begin the transaction;
//   2. read the "refcount" setting and add one for this process;
//   3. walk the PID table: clear it under DBFS_RECOVER_ADMIN, otherwise
//      probe every recorded PID with pid_exists ();
//   4. store the new refcount and insert this process's PID record;
//   5. choose between the create / recover / plain-open paths.
//
// oflags:       DBFS_* open flags (by value; adjusted locally).
// dbenv_oflags: in/out flags for the main environment; DB_CREATE
//               and/or DB_RECOVER are OR-ed in as required.
//
// Returns 0 on success, DBFS_NEEDS_ATTENTION when recovery is requested
// while other references remain or when a crashed process was detected
// on a plain open by a non-admin caller, or a propagated error code.
// The transaction is not committed here on any path.
int
DBFS::check_recovery (int oflags, int &dbenv_oflags)
{
    int ret;
    int aenv_oflags = DBFS_ENV_OPEN_FLAGS;

    // Note: I want PIDS to sort so use big endian--fix this later
    XSEQNO         mypid = XHTONL (XSEQNO (getpid ()));
    int            refs  = 0;
    RECDBT<XSEQNO> mypid_key  (mypid);
    DKEY           mypid_data (_cmdline.c_str ());
    DKEY           refs_key   ("refcount");
    DKEY           attention_key ("attention");
    // NOTE(review): refs_data presumably aliases refs, since refs is
    // modified below and refs_data is what gets stored -- confirm
    // against RECDBT.
    RECDBT<int>    refs_data  (refs);
    bool           needs_attention = false;

    // Creation applies to both the main and the admin environment.
    if (oflags & DBFS_CREATE) {
	dbenv_oflags |= DB_CREATE;
	aenv_oflags  |= DB_CREATE;
    }

    // Admin recovery recovers the admin environment itself and also
    // implies DBFS_RECOVER for the rest of this function.
    if (oflags & DBFS_RECOVER_ADMIN) {
	DBFS_WARN ("ADMIN RECOVERY assuming no active processes");
	aenv_oflags |= DB_RECOVER | DB_CREATE;
	oflags      |= DBFS_RECOVER;
    }

    // Here, we have to second-guess whatever the application says,
    // because recovery is UNSAFE--it must be single-threaded--if you
    // run recovery while other processes are still active, their
    // behavior is basically undefined.  Eventually, DB is expected to
    // return DB_RUNRECOVERY for those processes--and if they do we're
    // in trouble.
    //
    // Secondly, we need a way to pre-set DB environment variables
    // prior to DBENV->open, such as cache-size.  This mechanism
    // should maintain the DB_CONFIG file as well as any FS-specific
    // variables, and other persistent DBENV settings that are needed
    // prior to DBENV->open.

    if ((ret = _admindb.open (_admindir.c_str (), oflags, aenv_oflags))) {
	PROP_ERROR (ret) ("admin_open");
	return ret;
    }

    if ((ret = _admintxn.begin (_admindb, DBFS_TXN_NOWAIT))) {
	PROP_ERROR (ret) ("admin_txn_begin");
	return ret;
    }

    // A missing refcount record is not an error; refs then stays 0.
    if ((ret = _admindb._settings.get_or_notfound (_admintxn, refs_key, refs_data, DBFS_RMW)) && (ret != DBFS_NOTFOUND)) {
	PROP_ERROR (ret) ("get_refcount");
	return ret;
    }

    // Admin recover: reset refcount
    if (oflags & DBFS_RECOVER_ADMIN) {
	refs = 0;
    }

    // Count this process.
    refs += 1;

    // Admin recover: clear PIDS, otherwise Open: check for dead procs
    DBCREF         dbc;
    NULLDBT        null;
    XSEQNO         pid_be;
    RECDBT<XSEQNO> pid_key (pid_be);
    DKEY           pid_data;
    int            count = 0;   // number of PID records deleted below

    if ((ret = _admindb._pids.cursor (_admintxn, dbc))) {
	PROP_ERROR (ret) ("pids_cursor");
	return ret;
    }

    while ((ret = dbc.move_pos (pid_key, pid_data, DB_NEXT, DBFS_RMW)) == 0) {

	// Keys are stored converted with XHTONL; convert back before use.
	XSEQNO pid_nat = XNTOHL (pid_be);
	string pid_desc;
	pid_t  pid = pid_nat.key ();

	pid_data.get_string (pid_desc);

	if (oflags & DBFS_RECOVER_ADMIN) {

	    // Every record is dropped without probing the process; refs
	    // was already reset above, so it is not adjusted here.
	    if ((ret = dbc.delete_current ())) {
		PROP_ERROR (ret) ("delete_pid");
		return ret;
	    }

	    count += 1;

	} else if (pid_exists (pid)) {
	    // An active process
	} else {
	    // A dead process

	    if (oflags & DBFS_RECOVER_KILLPROC) {

		DBFS_ERROR ("deleting crashed process PID %d: ", pid) (pid_desc);

		if ((ret = dbc.delete_current ())) {
		    PROP_ERROR (ret) ("delete_pid");
		    return ret;
		}

		// Give back the reference held by the dead process.
		refs  -= 1;
		count += 1;

	    } else {

		// Report only: the record is kept and the process is
		// still included in refs.
		DBFS_ERROR ("crashed process PID %d: ", pid) (pid_desc);
		needs_attention = true;
	    }
	}
    }

    // The scan must have ended by running off the end of the table.
    if (ret != DBFS_NOTFOUND) {
	PROP_ERROR (ret) ("pids_next");
	return ret;
    }

    if (count > 0) {
	DBFS_ERROR ("deleted %d active process records during recovery", count);
    }

    // Update refs
    if ((ret = _admindb._settings.put_overwrite (_admintxn, refs_key, refs_data))) {
	PROP_ERROR (ret) ("put_refcount");
	return ret;
    }

    // Place new PID in table
    if ((ret = _admindb._pids.put_no_overwrite (_admintxn, mypid_key, mypid_data))) {
	PROP_ERROR (ret) ("put_pid");
	return ret;
    }

    // At this point we have exclusive access to DBENV->open, the
    // transaction and ADMIN_DB remain open until finish_recovery()
    if (oflags & DBFS_CREATE) {
	// The creator is asserted to hold the only reference.
	g_assert (refs == 1);
	DEBUG_DBFS ("create: ") (_fsdir);
    } else if (oflags & (DBFS_RECOVER|DBFS_RECOVER_KILLPROC)) {

	// See if there are live processes
	if (refs > 1) {
	    DBFS_ERROR ("%d active processes still exist: stop these first", refs - 1);
	    return DBFS_NEEDS_ATTENTION;
	}

	// killproc left this to prevent open until recovery, remove now
	if ((ret = _admindb._settings.delete_or_notfound (_admintxn, attention_key)) && (ret != DBFS_NOTFOUND)) {
	    PROP_ERROR (ret) ("delete_attention");
	    return ret;
	}

	dbenv_oflags |= DB_RECOVER | DB_CREATE;

	INFO_DBFS ("recover: ") (_fsdir);

    } else {

	// See if killproc has left the attention flag
	if ((ret = _admindb._settings.get_or_notfound (_admintxn, attention_key, null, DBFS_NORMW)) != DBFS_NOTFOUND) {

	    if (ret != 0) {
		PROP_ERROR (ret) ("get_attention");
		return ret;
	    }

	    needs_attention = true;
	}

	// The admin tool is still allowed to open so that it can list
	// or kill the recorded processes.
	if (needs_attention && ! (_openflags & DBFS_ADMIN_TOOL)) {
	    DBFS_ERROR ("at least one process has crashed: please killproc and run recovery");
	    return DBFS_NEEDS_ATTENTION;
	}

	DEBUG_DBFS ("open: ") (_fsdir);
    }

    return 0;
}

// Enumerate the processes recorded in the admin PID table.
//
// Every record except the caller's own is passed to data.add () as
// (pid, description string, result of pid_exists (pid)).  Asserts that
// the file system was opened with DBFS_ADMIN_TOOL.
//
// Returns 0 once the whole table has been visited, otherwise the error
// code propagated from the cursor operation that failed.
int
DBFS::admin_listproc (LISTPROC_DATA &data)
{
    int            ret;
    DBCREF         cursor;
    XSEQNO         key_be;
    RECDBT<XSEQNO> key_dbt (key_be);
    DKEY           desc_dbt;
    pid_t          self = getpid ();

    g_assert (_openflags & DBFS_ADMIN_TOOL);

    ret = _admindb._pids.cursor (_admintxn, cursor);

    if (ret != 0) {
	PROP_ERROR (ret) ("pids_cursor");
	return ret;
    }

    for (;;) {

	ret = cursor.move_pos (key_dbt, desc_dbt, DB_NEXT, DBFS_NORMW);

	if (ret != 0) {
	    break;
	}

	// Convert the stored key back before using it as a PID.
	XSEQNO key_nat = XNTOHL (key_be);
	pid_t  pid     = key_nat.key ();

	// The calling process itself is not reported.
	if (pid != self) {

	    string desc;

	    desc_dbt.get_string (desc);

	    data.add (pid, desc, pid_exists (pid));

	    DEBUG_DBFS ("ADMIN LISTPROC process %d: ", pid) (desc);
	}
    }

    // Anything other than "no more records" is a real failure.
    if (ret != DBFS_NOTFOUND) {
	PROP_ERROR (ret) ("pids_next");
	return ret;
    }

    return 0;
}

// Signal every process recorded in the admin PID table (except the
// caller) and clean up the records of those that are gone.
//
// Each process is sent SIGTERM and, if it still exists 10ms later,
// SIGKILL.  The outcome is reported through data.add () as a
// KILLPROC_STATUS.  Unless the outcome is KILLPROC_NODEATH the PID
// record is deleted and counted; the "refcount" setting is then reduced
// by that count and, if anything was deleted, the "attention" setting
// is written.  Asserts that the file system was opened with
// DBFS_ADMIN_TOOL and that the resulting refcount stays positive.
//
// Returns 0 on success or the propagated error code of the failing
// database operation.
int
DBFS::admin_killproc (KILLPROC_DATA &data)
{
    int            ret;
    DBCREF         dbc;
    XSEQNO         pid_be;
    RECDBT<XSEQNO> pid_key (pid_be);
    DKEY           pid_data;
    pid_t          mypid = getpid ();
    int            deadcount = 0;
    int            refs  = 0;
    DKEY           refs_key   ("refcount");
    RECDBT<int>    refs_data  (refs);
    DKEY           attention_key ("attention");
    NULLDBT        null;

    // Signals to try, in order of escalation.
    const int      sigs[2] = { SIGTERM, SIGKILL };

    g_assert (_openflags & DBFS_ADMIN_TOOL);

    if ((ret = _admindb._pids.cursor (_admintxn, dbc))) {
	PROP_ERROR (ret) ("pids_cursor");
	return ret;
    }

    while ((ret = dbc.move_pos (pid_key, pid_data, DB_NEXT, DBFS_NORMW)) == 0) {

	string pid_desc;
	XSEQNO pid_nat = XNTOHL (pid_be);
	pid_t  pid     = pid_nat.key ();

	// Start from "did not die": if kill () fails with an errno
	// other than ESRCH/EPERM the status would otherwise be read
	// uninitialized below; this way the record is simply kept.
	KILLPROC_STATUS status = KILLPROC_NODEATH;

	// Never signal ourselves.
	if (pid == mypid) {
	    continue;
	}

	pid_data.get_string (pid_desc);

	for (uint i = 0; i < ARRAY_SIZE (sigs); i += 1) {

	    DEBUG_SIGNAL ("kill (pid=%d,%d)", pid, sigs[i]);
	    if ((ret = kill (pid, sigs[i]))) {

		if (errno == ESRCH) {
		    status = KILLPROC_NOTFOUND;
		} else if (errno == EPERM) {
		    status = KILLPROC_PERMISSION;
		}

		break;
	    }

	    // Give the process a moment to react before probing it.
	    usleep (10000);

	    if (pid_exists (pid)) {
		// Still there: escalate to the next signal, if any.
		status = KILLPROC_NODEATH;
	    } else if (sigs[i] == SIGTERM) {
		status = KILLPROC_SIGTERM;
		break;
	    } else {
		status = KILLPROC_SIGKILL;
		break;
	    }
	}

	// NOTE(review): KILLPROC_PERMISSION also lands here, so the
	// record of a process we were not allowed to signal is deleted
	// although that process is presumably still running -- confirm
	// this is intended.
	if (status != KILLPROC_NODEATH) {

	    if ((ret = dbc.delete_current ())) {
		PROP_ERROR (ret) ("delete_current");
		return ret;
	    }

	    deadcount += 1;
	}

	data.add (pid, pid_desc, status);

	DEBUG_DBFS ("ADMIN KILLPROC process %d: ", pid) (pid_desc);
    }

    if (ret != DBFS_NOTFOUND) {
	PROP_ERROR (ret) ("pids_next");
	return ret;
    }

    // Unlike the other readers of "refcount", a missing record is an
    // error here: any non-zero result is propagated.
    if ((ret = _admindb._settings.get_or_notfound (_admintxn, refs_key, refs_data, DBFS_RMW))) {
	PROP_ERROR (ret) ("get_refcount");
	return ret;
    }

    refs -= deadcount;

    // This process still has a ref...
    g_assert (refs > 0);

    if ((ret = _admindb._settings.put_overwrite (_admintxn, refs_key, refs_data))) {
	PROP_ERROR (ret) ("put_refcount");
	return ret;
    }

    // Leave the attention flag behind whenever a record was removed;
    // check_recovery () refuses a plain open while it is present.
    if (deadcount > 0 && (ret = _admindb._settings.put_overwrite (_admintxn, attention_key, null))) {
	PROP_ERROR (ret) ("put_attention");
	return ret;
    }

    return 0;
}
