/*
 *   (C) Copyright IBM Corp. 2004
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * evms_mpathd
 *
 * Daemon to monitor multipath devices for failed paths and test for path
 * recovery.
 */

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <malloc.h>
#include <getopt.h>
#include <libgen.h>
#include <libdevmapper.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <common.h>
#include <locale.h>

/* Character that remove_slashes() substitutes for every '/' in a name. */
#define SLASH_REPLACEMENT	'|'
/* The multipath device name is appended to these prefixes to build the
 * lock-file name and the default log-file name (the latter also gets a
 * ".log" suffix).
 */
#define LOCK_FILE_PREFIX	"/var/lock/evms-mpathd-"
#define LOG_FILE_PREFIX		"/var/log/evms-mpathd-"
/* Size in bytes of the static buffer used to format log messages. */
#define LOG_BUF_SIZE		1024
/* Sector size assumed for a path when the BLKSSZGET ioctl fails. */
#define HARDSECTOR_SIZE		512
/* Seconds to sleep between test passes while paths are still failed. */
#define SLEEP_INTERVAL		10

/* If the headers do not provide O_DIRECT, define it as 0 so that OR-ing it
 * into the open() flags has no effect.
 */
#ifndef O_DIRECT
#define O_DIRECT		0
#endif

/* ioctl request that get_hardsector_size() issues on each path. */
#define BLKSSZGET		_IO(0x12,104)

static char *prog_name;				/* basename of argv[0]; set in main() */
/* TRUE when the -d/--debug option was given; enables log_debug(). */
static int debug = FALSE;
/* Log-file name: from -l/--log-file, or a default built in open_log_file(). */
static char *log_file = NULL;
/* Descriptor of the open log file; 0 means "no log file open". */
static int log_file_fd = 0;
/* Scratch buffer in which log_debug() formats each message. */
static char log_buf[LOG_BUF_SIZE];
/* Version triple of the DM "multipath" target, filled in by
 * get_dm_multipath_version().
 */
static int dm_multipath_version[3];

/* One child path of the multipath device. */
struct path {
	char name[128];		/* name from the command line; the device node
				 * is EVMS_OBJECT_NODE_DIR/<name> */
	int major;		/* device major number, from stat() */
	int minor;		/* device minor number, from stat() */
	int fd;			/* descriptor used for test reads (open_path) */
	int failed;		/* TRUE when DM status reports the path failed */
	int hardsector_size;	/* from BLKSSZGET, or HARDSECTOR_SIZE */
};

/* The multipath device being monitored and all of its paths. */
struct multipath {
	char name[128];		/* device name with '/' replaced (remove_slashes) */
	u_int64_t size;		/* from the command line; used as the target
				 * length when the table is reloaded
				 * (presumably in sectors - TODO confirm) */

	struct path *paths;	/* array of num_paths entries */
	int num_paths;
	int num_failed;		/* failed paths counted by get_status() and
				 * decremented by test_path() on recovery */

	int event_nr;		/* DM event number passed to the wait-event task */

	char *buffer;		/* aligned buffer for test reads */
	int max_hardsector_size; /* largest hardsector_size among the paths */

	int lock_file_fd;	/* descriptor of the lock file (open_lock_file) */
};

/**
 * log_debug
 *
 * If the "debug" option is on and the log file is open, format a message
 * and write it to the log file. Messages that do not fit in the
 * LOG_BUF_SIZE-byte buffer are truncated instead of overflowing it.
 *
 * The caller's errno is preserved, so this can safely be called between a
 * failing system call and the code that inspects errno.
 *
 * Returns 0 on success or when logging is disabled, EINVAL if the message
 * could not be formatted, or the errno value of a failed write().
 **/
static int log_debug(char *fmt, ...)
{
	va_list args;
	int saved_errno = errno;
	int len, rc = 0;

	if (!debug || log_file_fd <= 0) {
		return 0;
	}

	va_start(args, fmt);
	len = vsnprintf(log_buf, sizeof(log_buf), fmt, args);
	va_end(args);

	if (len < 0) {
		rc = EINVAL;
	} else {
		/* vsnprintf() returns the untruncated length. */
		if (len >= (int)sizeof(log_buf)) {
			len = (int)sizeof(log_buf) - 1;
		}

		if (write(log_file_fd, log_buf, len) < 0) {
			rc = errno;
		}
	}

	errno = saved_errno;
	return rc;
}

/**
 * open_log_file
 *
 * If the "debug" option is on, open the log file. If the user doesn't
 * specify a log-file name, the log-file will be called
 * /var/log/evms-mpathd-<device_name>.log
 *
 * Returns 0 on success or when debugging is off, otherwise the errno value
 * from open(). On failure log_file_fd is reset to 0 so that log_debug()
 * stays disabled.
 **/
static int open_log_file(struct multipath *mp)
{
	/* Static because the global log_file keeps pointing at this buffer
	 * after we return. Sized like the lock-file name so that a
	 * maximum-length device name is not truncated.
	 */
	static char name[256];
	int rc;

	if (!debug) {
		return 0;
	}

	if (!log_file) {
		snprintf(name, sizeof(name), "%s%s.log",
			 LOG_FILE_PREFIX, mp->name);
		log_file = name;
	}

	log_file_fd = open(log_file, O_CREAT | O_TRUNC | O_WRONLY, 0664);
	if (log_file_fd < 0) {
		rc = errno;
		log_file_fd = 0;
		return rc;
	}

	log_debug("Log file %s opened.\n", log_file);

	return 0;
}

/**
 * open_lock_file
 *
 * We use this lock-file to coordinate with the engine multipath plugin. When
 * this file is open and locked, the engine will know the daemon is running,
 * and can get the process-ID in case it needs to stop the daemon. If the
 * engine is able to open the lock-file, it knows the daemon is not running
 * and can start it.
 *
 * Returns 0 once the file is open and write-locked (the descriptor is kept
 * in mp->lock_file_fd), otherwise the errno value from open() or fcntl().
 * When locking fails the descriptor is closed again.
 **/
static int open_lock_file(struct multipath *mp)
{
	struct flock lockinfo;
	char lock_file_name[256];
	int rc;

	snprintf(lock_file_name, sizeof(lock_file_name), "%s%s",
		 LOCK_FILE_PREFIX, mp->name);

	log_debug("Opening lock-file %s\n", lock_file_name);
	mp->lock_file_fd = open(lock_file_name, O_RDWR | O_CREAT, 0660);
	if (mp->lock_file_fd < 0) {
		rc = errno;
		log_debug("Error opening lock-file %s: %s\n",
			  lock_file_name, strerror(rc));
		return rc;
	}

	/* Write-lock the whole file (l_len == 0 means "to end of file"). */
	memset(&lockinfo, 0, sizeof(lockinfo));
	lockinfo.l_whence = SEEK_SET;
	lockinfo.l_start = 0;
	lockinfo.l_len = 0;
	lockinfo.l_type = F_WRLCK;

	log_debug("Locking lock-file %s\n", lock_file_name);
	if (fcntl(mp->lock_file_fd, F_SETLK, &lockinfo) == -1) {
		/* Lock file is already locked. This means another instance
		 * of the daemon is already running on this device.
		 * Save errno before logging, which may overwrite it.
		 */
		rc = errno;
		log_debug("Lock-file %s is already locked.\n", lock_file_name);
		close(mp->lock_file_fd);
		mp->lock_file_fd = -1;
		return rc;
	}

	return 0;
}

/**
 * show_help
 *
 * Print the usage line followed by the list of supported options on
 * standard output.
 **/
static void show_help(void)
{
	printf(_("Usage: %s [options] <multipath_name> <size> <child_name>+\n"), prog_name);

	/* The option summary takes no arguments, so emit it verbatim. */
	fputs(_("Monitor a multipath device for path failures.\n\n"
		"  [-d, --debug] write debug information to a log file\n"
		"  [-l log_file_name, --log-file log_file_name]\n"
		"  [-h | -? | --help] display this help\n"), stdout);
}

/**
 * parse_options
 *
 * Parse command-line options. See show_help() for allowable options.
 *
 * Sets the "debug" flag for -d and stores a private copy of the log-file
 * name for -l (a repeated -l replaces the earlier name).
 *
 * Returns 0 on success, EINVAL if help was requested or an option was not
 * recognized (the caller then displays the help), or ENOMEM if the log-file
 * name could not be copied.
 **/
static int parse_options(int argc, char **argv)
{
	static const char short_opts[] = "dhl:?";
	static const struct option long_opts[] = {
		{ "debug",    no_argument,       NULL, 'd'},
		{ "help",     no_argument,       NULL, 'h'},
		{ "log-file", required_argument, NULL, 'l'},
		{ NULL,       0,                 NULL,  0} };
	char *name;
	int c, rc = 0;

	/* getopt_long() returns -1 once all options have been consumed. */
	while ((c = getopt_long(argc, argv, short_opts,
				long_opts, NULL)) != -1) {
		switch (c) {
		case 'd':
			debug = TRUE;
			break;

		case 'l':
			name = strdup(optarg);
			if (!name) {
				return ENOMEM;
			}
			/* Only this function assigns log_file before the log
			 * is opened, so any previous value came from strdup().
			 */
			free(log_file);
			log_file = name;
			break;

		case 'h':
		case '?':
			/* Display the help. */
			rc = EINVAL;
			break;

		default:
			fprintf(stderr, _("%s -- unrecognized option \"%c\"\n\n"),
				prog_name, c);
			/* Display the help. */
			rc = EINVAL;
			break;
		}
	}

	return rc;
}

/**
 * find_proc_number
 *
 * Scan a /proc file made of "<number> <name>" lines for the entry called
 * "name" and store its number in *number. Lines that do not have this
 * shape are skipped.
 *
 * Returns 0 if the entry was found, the errno value from fopen() if the
 * file could not be opened, or ENODEV if no line matched.
 **/
static int find_proc_number(const char *proc_file, const char *name,
			    int *number)
{
	char line[256];
	char entry[64];
	int value, rc = ENODEV;
	FILE *file;

	file = fopen(proc_file, "r");
	if (!file) {
		return errno;
	}

	while (fgets(line, sizeof(line), file) != NULL) {
		/* The field width keeps long names from overflowing entry[]. */
		if (sscanf(line, "%d %63s", &value, entry) == 2 &&
		    strcmp(entry, name) == 0) {
			*number = value;
			rc = 0;
			break;
		}
	}

	fclose(file);
	return rc;
}

/**
 * create_dm_control_node
 *
 * The DM control node may not already exist, which will cause DM commands to
 * fail. If we can't find the control file, create it.
 *
 * The major number is that of the "misc" driver in /proc/devices and the
 * minor number is that of "device-mapper" in /proc/misc.
 *
 * Returns 0 if the node exists or was created, ENODEV if either number
 * could not be found, or an errno value from fopen(), mkdir() or mknod().
 **/
static int create_dm_control_node(void)
{
	struct stat st;
	const char *control_dir = dm_dir();
	char control_file[128];
	int dm_control_major = 0;
	int dm_control_minor = 0;
	int rc;

	snprintf(control_file, sizeof(control_file), "%s/control", control_dir);

	if (stat(control_file, &st) == 0) {
		/* Control file already exists. */
		return 0;
	}

	/* Find the control-node major-number. */
	rc = find_proc_number("/proc/devices", "misc", &dm_control_major);
	if (rc) {
		return rc;
	}

	/* Find the control-node minor-number. */
	rc = find_proc_number("/proc/misc", "device-mapper", &dm_control_minor);
	if (rc) {
		return rc;
	}

	/* Create the directory if needed. EEXIST is not an error: another
	 * process may have created it since the stat() call.
	 */
	if (stat(control_dir, &st) != 0 &&
	    mkdir(control_dir, (S_IRWXU | S_IRGRP | S_IXGRP |
				S_IROTH | S_IXOTH)) != 0 &&
	    errno != EEXIST) {
		return errno;
	}

	/* Make the control node; again tolerate losing a creation race. */
	if (mknod(control_file, (S_IFCHR | S_IRUSR | S_IWUSR |
				 S_IRGRP | S_IWGRP),
		  makedev(dm_control_major, dm_control_minor)) != 0 &&
	    errno != EEXIST) {
		return errno;
	}

	return 0;
}

/**
 * remove_slashes
 *
 * Overwrite every '/' in a NUL-terminated string, in place, with
 * SLASH_REPLACEMENT.
 **/
static void remove_slashes(char *string)
{
	char *slash = strchr(string, '/');

	while (slash) {
		*slash = SLASH_REPLACEMENT;
		slash = strchr(slash + 1, '/');
	}
}

/**
 * free_multipath
 *
 * Release the path array, the test buffer and the multipath structure
 * itself. A NULL pointer is accepted and ignored.
 **/
static void free_multipath(struct multipath *mp)
{
	if (!mp) {
		return;
	}

	/* free(NULL) is a no-op, so the members need no individual checks. */
	free(mp->paths);
	free(mp->buffer);
	free(mp);
}

/**
 * alloc_test_buffer
 *
 * Allocate the zero-filled buffer used for test I/O. Its size and its
 * alignment are both mp->max_hardsector_size.
 *
 * Returns 0 on success or ENOMEM if the allocation fails.
 **/
static int alloc_test_buffer(struct multipath *mp)
{
	int size = mp->max_hardsector_size;

	mp->buffer = memalign(size, size);
	if (mp->buffer) {
		memset(mp->buffer, 0, size);
		return 0;
	}

	return ENOMEM;
}

/**
 * alloc_multipath
 *
 * Allocate a zeroed multipath structure together with an array of
 * num_paths zeroed path entries. The device name is copied in (at most
 * 127 characters) with its slashes replaced.
 *
 * Returns the new structure, or NULL if either allocation fails.
 **/
static struct multipath *alloc_multipath(char *name, int num_paths)
{
	struct multipath *mp = calloc(1, sizeof(*mp));

	if (mp) {
		/* calloc() zeroed the buffer, so the copy stays terminated. */
		strncpy(mp->name, name, sizeof(mp->name) - 1);
		remove_slashes(mp->name);
		mp->num_paths = num_paths;

		mp->paths = calloc(num_paths, sizeof(struct path));
		if (mp->paths) {
			return mp;
		}
	}

	free_multipath(mp);
	return NULL;
}

/**
 * get_path_filename
 *
 * Build the device-node filename for this path:
 * EVMS_OBJECT_NODE_DIR/<path name>.
 *
 * The result lives in a static buffer that the next call overwrites.
 **/
static char *get_path_filename(struct path *path)
{
	static char filename[256];

	snprintf(filename, sizeof(filename), "%s/%s",
		 EVMS_OBJECT_NODE_DIR, path->name);

	return filename;
}

/**
 * get_devnum
 *
 * stat() the path's device node and record its major and minor numbers
 * in the path structure.
 *
 * Returns 0 on success or the errno value from stat().
 **/
static int get_devnum(struct path *path)
{
	struct stat node;

	if (stat(get_path_filename(path), &node) != 0) {
		return errno;
	}

	path->major = major(node.st_rdev);
	path->minor = minor(node.st_rdev);

	return 0;
}

/**
 * open_path
 *
 * Open the path's device node read-only with O_DIRECT and O_SYNC and
 * store the descriptor in path->fd.
 *
 * Returns 0 on success or the errno value from open().
 **/
static int open_path(struct path *path)
{
	const int flags = O_RDONLY | O_DIRECT | O_SYNC;

	path->fd = open(get_path_filename(path), flags);

	return (path->fd < 0) ? errno : 0;
}

/**
 * get_hardsector_size
 *
 * Ask the path's device for its sector size with the BLKSSZGET ioctl and
 * store it in path->hardsector_size. If the ioctl fails, HARDSECTOR_SIZE
 * (512) is stored instead.
 **/
static void get_hardsector_size(struct path *path)
{
	u_int32_t sector_size;

	if (ioctl(path->fd, BLKSSZGET, &sector_size) != 0) {
		sector_size = HARDSECTOR_SIZE;
	}

	path->hardsector_size = sector_size;
}

/**
 * add_path
 *
 * Initialize the appropriate path entry for this multipath device.
 **/
static int add_path(struct multipath *mp, int index, char *name)
{
	int rc;

	strncpy(mp->paths[index].name, name, 127);

	rc = get_devnum(&mp->paths[index]);
	if (rc) {
		return rc;
	}

	rc = open_path(&mp->paths[index]);
	if (rc) {
		return rc;
	}

	get_hardsector_size(&mp->paths[index]);

	if (mp->paths[index].hardsector_size > mp->max_hardsector_size) {
		mp->max_hardsector_size = mp->paths[index].hardsector_size;
	}

	return rc;
}

/**
 * parse_device
 *
 * Parse the name of the multipath device, its size and its children.
 * Allocate and initialize structures to represent this device.
 *
 * argv[argi] is the device name, argv[argi + 1] the size and every
 * remaining argument names one child path.
 *
 * On success *mpath receives the new structure and 0 is returned. On
 * failure *mpath is left untouched, any path descriptors opened so far are
 * closed, and EINVAL (too few arguments or a malformed size), ENOMEM or the
 * error from add_path() is returned.
 **/
static int parse_device(int argc, char **argv, int argi,
			struct multipath **mpath)
{
	struct multipath *mp;
	int num_paths = argc - argi - 2;
	int opened = 0;
	char *size_str, *end;
	int rc;

	if (num_paths <= 0) {
		/* Not enough parameters. */
		show_help();
		return EINVAL;
	}

	mp = alloc_multipath(argv[argi], num_paths);
	if (!mp) {
		return ENOMEM;
	}

	/* Reject an empty, non-numeric or out-of-range size rather than
	 * silently using whatever prefix strtoull() managed to convert.
	 */
	size_str = argv[argi + 1];
	errno = 0;
	mp->size = strtoull(size_str, &end, 0);
	if (errno || end == size_str || *end != '\0') {
		show_help();
		rc = EINVAL;
		goto error;
	}

	for (opened = 0; opened < num_paths; opened++) {
		rc = add_path(mp, opened, argv[argi + 2 + opened]);
		if (rc) {
			goto error;
		}
	}

	rc = alloc_test_buffer(mp);
	if (rc) {
		goto error;
	}

	*mpath = mp;
	return 0;

error:
	/* Only entries below "opened" hold a descriptor from open_path(). */
	while (opened > 0) {
		close(mp->paths[--opened].fd);
	}
	free_multipath(mp);
	return rc;
}

/**
 * parse_command_line
 *
 * Process the option switches first, then hand the remaining arguments
 * (starting at optind) to parse_device(). If option parsing fails, the
 * help text is shown and the error is returned.
 **/
static int parse_command_line(int argc, char **argv, struct multipath **mp)
{
	int rc = parse_options(argc, argv);

	if (rc) {
		show_help();
		return rc;
	}

	return parse_device(argc, argv, optind, mp);
}

/**
 * alloc_dm_task
 *
 * Create a DM task of the given type and, if mp is not NULL, set the task's
 * device name to mp->name.
 *
 * Returns 0 with the new task in *dmt, or ENOMEM if the task cannot be
 * created or named. On failure no task is left allocated and *dmt is NULL,
 * so callers may simply return the error.
 **/
static int alloc_dm_task(struct multipath *mp, struct dm_task **dmt, int type)
{
	*dmt = dm_task_create(type);
	if (!*dmt) {
		return ENOMEM;
	}

	/* dm_task_set_name() returns 0 on failure. */
	if (mp && !dm_task_set_name(*dmt, mp->name)) {
		dm_task_destroy(*dmt);
		*dmt = NULL;
		return ENOMEM;
	}

	return 0;
}

/**
 * get_dm_multipath_version
 *
 * Run a list-versions task and walk the returned target list looking for
 * the "multipath" target. Its three version numbers are stored in
 * dm_multipath_version.
 *
 * Returns 0 if the target was found, EINVAL if the task could not be run,
 * ENOENT if no version list or no multipath target was found, or the error
 * from alloc_dm_task().
 **/
static int get_dm_multipath_version(void)
{
	struct dm_versions *entry;
	struct dm_task *task;
	int i, rc;

	rc = alloc_dm_task(NULL, &task, DM_DEVICE_LIST_VERSIONS);
	if (rc) {
		return rc;
	}

	if (!dm_task_run(task)) {
		log_debug("Error running list-versions command.\n");
		rc = EINVAL;
		goto out;
	}

	rc = ENOENT;
	entry = dm_task_get_versions(task);
	if (!entry) {
		log_debug("Error getting versions array.\n");
		goto out;
	}

	for (;;) {
		if (strcmp(entry->name, "multipath") == 0) {
			for (i = 0; i < 3; i++) {
				dm_multipath_version[i] = entry->version[i];
			}
			log_debug("Found multipath target: version %d.%d.%d\n",
				  dm_multipath_version[0],
				  dm_multipath_version[1],
				  dm_multipath_version[2]);
			rc = 0;
			break;
		}

		/* "next" is a byte offset to the following entry; an offset
		 * of zero marks the last one.
		 */
		if (!entry->next) {
			break;
		}
		entry = (struct dm_versions *)((char *)entry + entry->next);
	}

	if (rc) {
		log_debug("No multipath target found.\n");
	}

out:
	dm_task_destroy(task);
	return rc;
}

/**
 * get_info
 *
 * The the basic DM info about this multipath device. This will determine
 * if the device exists and update the event number.
 **/
static int get_info(struct multipath *mp)
{
	struct dm_task *task;
	struct dm_info info;
	int rc;

	log_debug("Getting info for device %s\n", mp->name);

	rc = alloc_dm_task(mp, &task, DM_DEVICE_INFO);
	if (rc) {
		return rc;
	}

	rc = dm_task_run(task);
	if (!rc) {
		log_debug("Error running info command.\n");
		rc = EINVAL;
		goto out;
	}

	rc = dm_task_get_info(task, &info);
	if (!rc) {
		rc = EINVAL;
		goto out;
	}

	if (!info.exists) {
		log_debug("Device is not active.\n");
		rc = ENODEV;
		goto out;
	}

	log_debug("Current event number is %d\n", info.event_nr);
	mp->event_nr = info.event_nr;
	rc = 0;

out:
	dm_task_destroy(task);
	return rc;
}

/**
 * wait_for_path_failure
 *
 * Run a wait-event task on the DM device, passing mp->event_nr as the
 * event number to wait on.
 *
 * Returns 0 if the task ran successfully, EINVAL if it failed, or the
 * error from alloc_dm_task().
 **/
static int wait_for_path_failure(struct multipath *mp)
{
	struct dm_task *waiter;
	int ok, rc;

	log_debug("Waiting for path failure on device %s (event %d)\n",
		  mp->name, mp->event_nr);

	rc = alloc_dm_task(mp, &waiter, DM_DEVICE_WAITEVENT);
	if (rc) {
		return rc;
	}

	dm_task_set_event_nr(waiter, mp->event_nr);
	ok = dm_task_run(waiter);
	dm_task_destroy(waiter);

	if (!ok) {
		return EINVAL;
	}

	return 0;
}

/**
 * find_path_state
 *
 * Look for "devnum" ("<major>:<minor>") as a complete whitespace-delimited
 * word in the status string and read the first non-blank character that
 * follows it into *state. Matching whole words keeps "8:1" from being
 * found inside "8:16" or "68:1".
 *
 * Assumes the status string separates its fields with blanks and puts the
 * state letter right after the device number - TODO confirm for every
 * multipath target version.
 *
 * Returns 1 if a state character was found, 0 otherwise.
 **/
static int find_path_state(const char *params, const char *devnum, char *state)
{
	size_t len = strlen(devnum);
	const char *hit = params;

	while ((hit = strstr(hit, devnum)) != NULL) {
		int word_start = (hit == params ||
				  hit[-1] == ' ' || hit[-1] == '\t');
		int word_end = (hit[len] == ' ' || hit[len] == '\t');

		if (word_start && word_end &&
		    sscanf(hit + len, " %c", state) == 1) {
			return 1;
		}
		hit++;
	}

	return 0;
}

/**
 * get_status
 *
 * Get the status of the DM device. Update each path with info from the
 * status command: a state of 'F'/'f' marks the path failed, 'A'/'a' marks
 * it active, anything else leaves the path unchanged. mp->num_failed is set
 * to the number of paths reported as failed.
 *
 * Returns 0 on success, EINVAL if the status task fails or yields no
 * status string (the path states and mp->num_failed are then left
 * untouched), or the error from alloc_dm_task().
 **/
static int get_status(struct multipath *mp)
{
	struct dm_task *task;
	u_int64_t start, length;
	char *type = NULL, *params = NULL;
	char devnum[25], state;
	int i, rc, num_failed = 0;

	log_debug("Getting status for device %s\n", mp->name);

	rc = alloc_dm_task(mp, &task, DM_DEVICE_STATUS);
	if (rc) {
		return rc;
	}

	/* Run the status ioctl. */
	if (!dm_task_run(task)) {
		log_debug("Error running status command.\n");
		rc = EINVAL;
		goto out;
	}

	/* Get the status string from DM. */
	dm_get_next_target(task, NULL, &start, &length, &type, &params);
	if (!params) {
		log_debug("No status returned for device %s\n", mp->name);
		rc = EINVAL;
		goto out;
	}

	/* For each path, check the status string to see if that path is
	 * active or failed.
	 */
	for (i = 0; i < mp->num_paths; i++) {
		snprintf(devnum, sizeof(devnum), "%u:%u",
			 mp->paths[i].major, mp->paths[i].minor);

		if (!find_path_state(params, devnum, &state)) {
			continue;
		}

		if (state == 'F' || state == 'f') {
			mp->paths[i].failed = TRUE;
			num_failed++;
		} else if (state == 'A' || state == 'a') {
			mp->paths[i].failed = FALSE;
		}

		log_debug("Path %s is currently %s\n",
			  mp->paths[i].name,
			  mp->paths[i].failed ? "failed" : "active");
	}

	log_debug("%d paths currently failed\n", num_failed);
	mp->num_failed = num_failed;
	rc = 0;

out:
	dm_task_destroy(task);
	return rc;
}

/**
 * test_path
 *
 * If this path is marked "failed", send a test-I/O. If the I/O succeeds, we
 * can makr the path "active" and reload the device table.
 **/
static int test_path(struct multipath *mp, int path)
{
	int rc = EINVAL;

	if (mp->paths[path].failed) {
		log_debug("Testing failed path %s\n", mp->paths[path].name);
		lseek(mp->paths[path].fd, 0, SEEK_SET);
		rc = read(mp->paths[path].fd, mp->buffer,
			  mp->paths[path].hardsector_size);
		if (rc > 0) {
			log_debug("Path has recovered, marking active.\n");
			mp->paths[path].failed = FALSE;
			mp->num_failed--;
			rc = 0;
		} else {
			log_debug("Path has not recovered\n");
			rc = errno;
		}
	}

	return rc;
}

/**
 * append_table_params
 *
 * Append printf-style formatted text to buf at offset *used. buf holds
 * maxlen bytes in total. *used is advanced only when the text fits.
 *
 * Returns 0 on success or E2BIG if the text does not fit (or cannot be
 * formatted); the buffer is still NUL-terminated in that case.
 **/
static int append_table_params(char *buf, int maxlen, int *used,
			       const char *fmt, ...)
{
	va_list args;
	int room = maxlen - *used;
	int n;

	va_start(args, fmt);
	n = vsnprintf(buf + *used, room, fmt, args);
	va_end(args);

	if (n < 0 || n >= room) {
		return E2BIG;
	}

	*used += n;
	return 0;
}

/**
 * reload_device_table
 *
 * When we detect that a failed path has been reactivated, we need to
 * reload the device table so DM knows that path is active again.
 *
 * For now we'll blindly reload all paths as active. We should change
 * this to place the still-failed paths in a secondary group so DM
 * doesn't actually try to use them until all active paths have failed.
 *
 * The parameter string starts with a header chosen by the multipath target
 * version (one form for 1.0.0 through 1.0.3, another otherwise), followed
 * by " <major>:<minor>" for every path.
 *
 * Returns 0 on success, E2BIG if the parameter string does not fit in the
 * local buffer, EINVAL if a DM call fails, or the error from
 * alloc_dm_task().
 **/
static int reload_device_table(struct multipath *mp)
{
	struct dm_task *reload_task, *resume_task;
	char params[256];
	int sz = 0, maxlen = sizeof(params);
	int rc, i;

	log_debug("Reloading table for device %s\n", mp->name);

	rc = alloc_dm_task(mp, &reload_task, DM_DEVICE_RELOAD);
	if (rc) {
		return rc;
	}

	rc = alloc_dm_task(mp, &resume_task, DM_DEVICE_RESUME);
	if (rc) {
		dm_task_destroy(reload_task);
		return rc;
	}

	params[0] = '\0';
	if (dm_multipath_version[0] == 1 &&
	    dm_multipath_version[1] == 0 &&
	    dm_multipath_version[2] <= 3) {
		rc = append_table_params(params, maxlen, &sz,
					 "1 round-robin %u 0", mp->num_paths);
	} else {
		rc = append_table_params(params, maxlen, &sz,
					 "0 0 1 1 round-robin 0 %u 0",
					 mp->num_paths);
	}

	for (i = 0; !rc && i < mp->num_paths; i++) {
		rc = append_table_params(params, maxlen, &sz, " %u:%u",
					 mp->paths[i].major,
					 mp->paths[i].minor);
	}

	if (rc) {
		/* Loading a truncated table would silently drop paths. */
		log_debug("Table parameters for device %s are too long.\n",
			  mp->name);
		goto out;
	}

	log_debug("Creating table parameters: %s\n", params);
	if (!dm_task_add_target(reload_task, 0, mp->size, "multipath", params)) {
		rc = EINVAL;
		goto out;
	}

	if (!dm_task_run(reload_task)) {
		log_debug("Error running reload command.\n");
		rc = EINVAL;
		goto out;
	}

	if (!dm_task_run(resume_task)) {
		log_debug("Error running resume command.\n");
		rc = EINVAL;
		goto out;
	}

	rc = 0;

out:
	dm_task_destroy(reload_task);
	dm_task_destroy(resume_task);
	return rc;
}

/**
 * test_multipath
 *
 * Loop, waiting for an event on the multipath device. When one
 * occurs, get the status of the device. Then test each failed path to
 * see if any have recovered. If any paths have recovered, reload the
 * device table to reactivate those paths. If failed paths still remain,
 * wait a specified interval before testing again. If all paths are now
 * active, go back to waiting on the device.
 *
 * Returns only when the device cannot be queried: either at start-up, or
 * after a failed wait when get_info() also fails. If the wait fails but the
 * device still exists, the event number is re-read and the loop resumes
 * after SLEEP_INTERVAL seconds instead of spinning.
 **/
static void test_multipath(struct multipath *mp)
{
	int i, rc, reload_table;

	rc = get_info(mp);
	if (rc) {
		/* Device doesn't exist, or other bad error. */
		return;
	}

	while (1) {
		do {
			reload_table = FALSE;
			get_status(mp);

			for (i = 0; i < mp->num_paths; i++) {
				/* Zero means a failed path has recovered. */
				if (!test_path(mp, i)) {
					reload_table = TRUE;
				}
			}

			if (reload_table) {
				reload_device_table(mp);
			}

			if (mp->num_failed) {
				log_debug("Waiting %d seconds for next test\n",
					  SLEEP_INTERVAL);
				sleep(SLEEP_INTERVAL);
			}
		} while (mp->num_failed);

		rc = wait_for_path_failure(mp);
		if (rc) {
			log_debug("Error waiting for event on device %s\n",
				  mp->name);

			/* Resynchronize the event number; give up if the
			 * device can no longer be queried.
			 */
			if (get_info(mp)) {
				return;
			}
			sleep(SLEEP_INTERVAL);
			continue;
		}

		mp->event_nr++;
	}
}

/**
 * main
 *
 * Parse the command line, make sure the DM control node exists, become a
 * daemon, open the log and lock files, lock the process memory, look up
 * the multipath target version and then monitor the device.
 *
 * Returns the error code of the first step that fails, or 0 once
 * test_multipath() returns.
 **/
int main(int argc, char **argv)
{
	struct multipath *mp = NULL;
	int rc;

	setlocale(LC_MESSAGES, "");
	bindtextdomain(PACKAGE, LOCALEDIR);
	textdomain(PACKAGE);

	prog_name = basename(argv[0]);

	rc = parse_command_line(argc, argv, &mp);
	if (rc) {
		goto done;
	}

	rc = create_dm_control_node();
	if (rc) {
		goto done;
	}

	if (daemon(0, 0)) {
		rc = errno;
		goto done;
	}

	rc = open_log_file(mp);
	if (rc) {
		goto done;
	}

	rc = open_lock_file(mp);
	if (rc) {
		goto done;
	}

	if (mlockall(MCL_CURRENT)) {
		rc = errno;
		goto done;
	}

	rc = get_dm_multipath_version();
	if (rc) {
		goto done;
	}

	test_multipath(mp);

	/* Should only get here on a bad error. */
	rc = 0;

done:
	/* mp is still NULL if the command line could not be parsed. */
	free_multipath(mp);
	return rc;
}

