/*
 * salinfo_decode_all.c - start and monitor the salinfo_decode tasks.
 *
 * Copyright (c) 2005 Silicon Graphics, Inc
 *	Keith Owens <kaos@sgi.com>
 * 2005-12-14 Initial release.
 *	      Keith Owens <kaos@sgi.com>
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */

/* This program takes no arguments, instead everything is passed via
 * environment variables to make it easier to set data in /etc/sysconfig.
 * All variables come in two forms, global (applies to all record types) and
 * per record (only applies to that record type).  The per record variables
 * have a prefix of 'CMC_', 'CPE_', 'INIT_' or 'MCA_', global settings have no
 * prefix.  The global value is used if there is no record specific variable in
 * the environment.
 *
 * Required variables are :-
 *
 * DIRECTORY		The value passed as parameter -D to salinfo_decode.
 *
 * RETRIES		How many times a version of salinfo_decode is restarted
 *			before we give up and log the failure.
 *
 * Optional variables are :-
 *
 * INODE_PCT		Passed as -i <value> to salinfo_decode.
 *
 * SPACE_PCT		Passed as -s <value> to salinfo_decode.
 *
 * RATE_LIMIT		Passed as -l <value> to salinfo_decode.
 *
 * TRIGGER		Passed as -T <value> to salinfo_decode.
 */

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

/* Character constant for the string terminator (distinct from the NULL
 * pointer).  Only defined here if nothing earlier defined it.
 */
#ifndef NUL
#define NUL '\0'
#endif

/* Number of elements in a true array.  The argument must be an array, not a
 * pointer, otherwise the result is meaningless.
 */
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
#endif

/* Program name (argv[0], set in main), used as the prefix of every message
 * written to stderr and as the syslog identity.
 */
static char *prefix;

/* Record types.  Each name is used as the prefix of the per record
 * environment variables and, lower-cased, as the -t argument given to
 * salinfo_decode.
 */
static const char *type[] = { "CMC", "CPE", "INIT", "MCA" };
/* Number of record types.  Also the index of the extra "global" (unprefixed)
 * column in value[][].
 */
#define GLOBAL (ARRAY_SIZE(type))

/* Book-keeping for one salinfo_decode task; child[] has one entry per record
 * type, indexed the same way as type[].
 */
struct child {
	int pid;		/* result of fork(): task pid in the parent */
	int status;		/* wait status stored by sig_chld when it reaps the task */
	int died;		/* 0 = believed running, 1 = gone and not yet
				 * handled by monitor_all, 2 = gone and will
				 * not be respawned */
	int retries;		/* respawn attempts counted by monitor_all */
	int max_retries;	/* limit parsed from RETRIES by get_all_env */
	time_t start_time;	/* time() recorded by fork_one after a successful fork */
};
static struct child child[GLOBAL];

/* Index of each configuration variable; used as the first subscript of
 * var[], parm[] and value[][].  Note that this also declares a variable named
 * varnum, which is not referenced anywhere in the code visible here.
 */
enum {
	DIRECTORY,
	RETRIES,
	INODE_PCT,
	SPACE_PCT,
	RATE_LIMIT,
	TRIGGER,
} varnum;

/* Environment variable base name for each setting.  The per record form is
 * "<TYPE>_<name>", the global form is just "<name>".
 */
static const char *var[] = {
	[ DIRECTORY ]	= "DIRECTORY",
	[ RETRIES ]	= "RETRIES",
	[ INODE_PCT ]	= "INODE_PCT",
	[ SPACE_PCT ]	= "SPACE_PCT",
	[ RATE_LIMIT ]	= "RATE_LIMIT",
	[ TRIGGER ]	= "TRIGGER",
};

/* Option letter used to pass each setting to salinfo_decode.  NUL means the
 * setting is consumed by this program only and fork_one does not pass it on.
 */
static const char parm[] = {
	[ DIRECTORY ]	= 'D',
	[ RETRIES ]	= NUL,
	[ INODE_PCT ]	= 'i',
	[ SPACE_PCT ]	= 's',
	[ RATE_LIMIT ]	= 'l',
	[ TRIGGER ]	= 'T',
};

/* value[v][t] is the setting of variable v for record type t, or NULL if it
 * is unset.  Column GLOBAL holds the unprefixed (global) variable;
 * get_all_env copies it into any per record slot that has no value of its
 * own.  The strings are the pointers returned by getenv().
 */
static char *value[ARRAY_SIZE(var)][GLOBAL + 1];

/* Count of configuration and start up errors; incremented by get_all_env and
 * fork_one, tested by get_all_env and fork_all.
 */
static int errors;

/* Load every configuration variable from the environment into value[][] and
 * parse RETRIES into child[].max_retries.
 *
 * For each variable the per record form ("<TYPE>_<VAR>") and the global form
 * ("<VAR>") are read; an empty string is treated as unset, and a per record
 * slot without a value inherits the global one.  DIRECTORY and RETRIES must
 * end up with a value for every record type.  RETRIES must be a complete
 * integer in any base accepted by strtol(..., 0) and must fit in an int.
 *
 * All problems are reported on stderr and counted in errors; if any were
 * found the program exits with status 1, so this function only returns with
 * a usable configuration.
 */
static void
get_all_env(void)
{
	static const int required[] = { DIRECTORY, RETRIES };
	char name[200], *end;
	size_t v, t, r;
	long retries;

	for (v = 0; v < ARRAY_SIZE(var); ++v) {
		/* The global slot (t == GLOBAL) is read last ... */
		for (t = 0; t <= GLOBAL; ++t) {
			if (t == GLOBAL)
				snprintf(name, sizeof(name), "%s", var[v]);
			else
				snprintf(name, sizeof(name), "%s_%s", type[t], var[v]);
			value[v][t] = getenv(name);
			if (value[v][t] && !*value[v][t])
				value[v][t] = NULL;
		}
		/* ... so it can now be used as the default for this variable. */
		for (t = 0; t < GLOBAL; ++t) {
			if (!value[v][t])
				value[v][t] = value[v][GLOBAL];
		}
	}

	for (r = 0; r < ARRAY_SIZE(required); ++r) {
		v = required[r];
		for (t = 0; t < GLOBAL; ++t) {
			if (value[v][t])
				continue;
			fprintf(stderr,
				"%s: no value for environment variable %s_%s nor %s\n",
				prefix, type[t], var[v], var[v]);
			++errors;
		}
	}

	for (t = 0; t < GLOBAL; ++t) {
		if (!value[RETRIES][t])
			continue;	/* already reported as missing above */
		errno = 0;
		retries = strtol(value[RETRIES][t], &end, 0);
		if (*end) {
			/* Empty strings were mapped to NULL earlier, so a
			 * string with no digits at all also lands here.
			 */
			fprintf(stderr,
				"%s: non-numeric value for %s retries (%s)\n",
				prefix, type[t], value[RETRIES][t]);
			++errors;
		} else if (errno == ERANGE ||
			   retries > INT_MAX || retries < INT_MIN) {
			/* Storing this in an int would silently change the
			 * limit the administrator asked for.
			 */
			fprintf(stderr,
				"%s: out of range value for %s retries (%s)\n",
				prefix, type[t], value[RETRIES][t]);
			++errors;
		} else {
			child[t].max_retries = (int)retries;
		}
	}

	if (errors)
		exit(1);
}

/* Start the salinfo_decode task for record type t (an index into type[]).
 *
 * The command line is "salinfo_decode -t <type in lower case>" followed by
 * "-<letter> <value>" for every variable that has a value for this type and
 * a non-NUL option letter in parm[].
 *
 * In the parent, child[t].pid receives the result of fork().  On success
 * child[t].start_time is set to the current time; on failure a message is
 * written to stderr and errors is incremented.
 *
 * In the child, a failed exec is reported on stderr and the child then
 * terminates with status 127.  It must never return from this function: it
 * is a copy of the supervisor and would otherwise carry on running the
 * caller's code (forking further tasks, signalling its siblings).
 */
static void
fork_one(int t)
{
	char *argv[2 * (ARRAY_SIZE(var) + 3)];
	char type_arg[16];		/* type[] names are at most 4 characters */
	char flag[ARRAY_SIZE(var)][3];	/* "-X" for each option actually used */
	char *p;
	size_t v;
	int i = 0;

	snprintf(type_arg, sizeof(type_arg), "%s", type[t]);
	for (p = type_arg; *p; ++p)
		*p = (char)tolower((unsigned char)*p);

	argv[i++] = "salinfo_decode";
	argv[i++] = "-t";
	argv[i++] = type_arg;
	for (v = 0; v < ARRAY_SIZE(var); ++v) {
		if (!value[v][t] || parm[v] == NUL)
			continue;
		flag[v][0] = '-';
		flag[v][1] = parm[v];
		flag[v][2] = NUL;
		argv[i++] = flag[v];
		argv[i++] = value[v][t];
	}
	argv[i] = NULL;

	child[t].pid = fork();
	if (child[t].pid == 0) {
		execvp(argv[0], argv);
		/* Only reached if the exec failed; errno is still the one
		 * set by execvp, which is what %m prints.
		 */
		fprintf(stderr, "%s: exec of %s for %s task failed (%m)\n",
			prefix, argv[0], type[t]);
		/* _exit rather than exit: do not run the parent's exit
		 * processing a second time in this copy of the process.
		 */
		_exit(127);
	} else if (child[t].pid < 0) {
		fprintf(stderr, "%s: fork for %s task failed (%m)\n",
			prefix, type[t]);
		++errors;
	} else {
		child[t].start_time = time(NULL);
	}
}

/* Start one task per record type.  If anything was counted in errors by the
 * time all of them have been attempted, send SIGKILL to every task that has
 * a positive pid recorded and exit with status 1.
 */
static void
fork_all(void)
{
	int idx;

	for (idx = 0; idx < GLOBAL; idx++)
		fork_one(idx);

	if (!errors)
		return;

	for (idx = 0; idx < GLOBAL; idx++) {
		if (child[idx].pid <= 0)
			continue;	/* nothing was started for this type */
		kill(child[idx].pid, SIGKILL);
	}
	exit(1);
}

static void
sig_chld (int sig)
{
	int t, p, status, moretodo = 1;
	while (moretodo) {
		moretodo = 0;
		for (t = 0; t < GLOBAL; ++t) {
			if (child[t].died)
				continue;
			p = waitpid(child[t].pid, &status, WNOHANG);
			if (p > 0) {
				child[t].died = 1;
				child[t].status = status;
				moretodo = 1;
			}
		}
	}
	signal(SIGCHLD, sig_chld);
}

/* Shutdown typically only kills this program, not its children.  Catch all
 * signals and kill the kids!
 *
 * The signal that was received is forwarded to every task that is believed
 * to be running, then the program terminates with status 0.
 *
 * Only positive pids of tasks not marked as died are signalled:
 *   - pid 0 (task not forked yet) would make kill() signal the whole
 *     process group;
 *   - pid -1 (fork failed) would make kill() signal every process this
 *     program is allowed to signal;
 *   - the pid of a task that has already been reaped may have been reused
 *     by an unrelated process.
 *
 * _exit() is used because exit() is not async-signal-safe and this code runs
 * inside a signal handler.
 */
static void
sig_all (int sig)
{
	size_t t;
	for (t = 0; t < GLOBAL; ++t) {
		if (child[t].pid > 0 && !child[t].died)
			kill(child[t].pid, sig);
	}
	_exit(0);
}

/* Loop for as long as at least one child is running or being respawned.
 * Each pass sleeps for 10*60*60 seconds (ten hours), or less if the sleep is
 * interrupted by a signal such as SIGCHLD, and then checks the status of the
 * children.  Respawn unless we hit the retry limit.  If a child is found dead
 * within 5 seconds (arbitrary) of start up then do not respawn it, it will
 * almost certainly do exactly the same thing again.
 *
 * NOTE(review): this comment used to say "every few minutes", which does not
 * match the ten hour sleep in the code - confirm which one was intended.
 *
 * If the fork for a respawn fails, the task is put back into the "died"
 * state so that the attempt is repeated (and counted against the retry
 * limit) on a later pass.  Leaving pid == -1 marked as running would make the
 * liveness probe call kill(-1, 0) and sig_chld call waitpid(-1, ...), which
 * refer to "any process" rather than to this task.
 *
 * Returns once every task has been given up on (died == 2).
 */

static void
monitor_all(void)
{
	size_t t;
	int moretodo = 1;
	while (moretodo) {
		moretodo = 0;
		sleep(10*60*60);
		for (t = 0; t < GLOBAL; ++t) {
			if (!child[t].died) {
				if (kill(child[t].pid, 0)) {
					/* Strange, the child went away without
					 * us noticing.
					 */
					child[t].died = 1;
					signal(SIGCHLD, sig_chld);
				} else {
					moretodo = 1;
					continue;
				}
			}
			if (child[t].died == 2)
				continue;	/* already given up on this one */
			if (time(NULL) - child[t].start_time <= 5) {
				child[t].died = 2;
				syslog(LOG_ERR,
				       "Type %s died very quickly, no respawn, last status was %d",
				       type[t], child[t].status);
			} else if (++child[t].retries > child[t].max_retries) {
				child[t].died = 2;
				syslog(LOG_ERR,
				       "Retries for type %s exceeded, last status was %d",
				       type[t], child[t].status);
			} else {
				child[t].died = 0;
				syslog(LOG_WARNING,
				       "Retry %d for type %s, previous status was %d",
				       child[t].retries, type[t], child[t].status);
				moretodo = 1;
				fork_one((int)t);
				if (child[t].pid < 0) {
					/* fork failed, there is no task to
					 * watch; try again on a later pass.
					 */
					child[t].died = 1;
					syslog(LOG_ERR,
					       "Respawn of type %s failed, will try again",
					       type[t]);
				}
			}
		}
	}
}


/* Entry point.  Takes no arguments; all configuration comes from the
 * environment (see get_all_env).
 *
 * After validating the configuration the program forks: the original process
 * returns 0 at once and the new process carries on as the supervisor.  If
 * that fork fails, the failure is reported on stderr and the exit status is
 * 1, so the caller is not told that a supervisor is running when none is.
 *
 * The supervisor opens syslog, routes signals 1..31 (except SIGSTOP and
 * SIGCONT) to sig_all, then points SIGCHLD at sig_chld instead, starts all
 * the tasks and monitors them.  It only gets past monitor_all when every
 * task has been given up on, in which case it logs that and returns 1.
 */
int main(int argc, char **argv)
{
	int i;
	pid_t pid;

	prefix = argv[0];
	if (argc != 1) {
		fprintf(stderr, "%s takes no parameters\n", prefix);
		exit(1);
	}
	get_all_env();

	pid = fork();
	if (pid < 0) {
		fprintf(stderr, "%s: fork to background failed (%m)\n", prefix);
		return 1;
	}
	if (pid > 0)
		return 0;	/* original process: supervisor is running */

	openlog(prefix, LOG_PID, LOG_DAEMON);
	for (i = 1; i < 32; ++i) {
		/* The result of signal() is deliberately not checked; some
		 * signals in this range cannot be caught.
		 */
		if (i != SIGSTOP && i != SIGCONT)
			signal(i, sig_all);
	}
	signal(SIGCHLD, sig_chld);
	fork_all();
	monitor_all();
	syslog(LOG_ERR, "All children have died, giving up");
	return 1;
}
