/*
 * pfmon_task.c : handles per-task measurements
 *
 * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
 * Contributed by Stephane Eranian <eranian@hpl.hp.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 * 02111-1307 USA
 */
#include "pfmon.h"

#include <fcntl.h>
#include <regex.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/poll.h>
#include <sys/ptrace.h>
#include <sys/time.h>
#include <sys/wait.h>

/*
 * Fallback definitions of the ptrace option/event interface, used only
 * when the C library headers do not already provide PTRACE_SETOPTIONS.
 * (These normally belong to the LIBC header files for 2.6.)
 */
#ifndef PTRACE_SETOPTIONS

/* 0x4200-0x4300 are reserved for architecture-independent additions.  */
#define PTRACE_SETOPTIONS	0x4200
#define PTRACE_GETEVENTMSG	0x4201
#define PTRACE_GETSIGINFO	0x4202
#define PTRACE_SETSIGINFO	0x4203

/* options set using PTRACE_SETOPTIONS */
#define PTRACE_O_TRACESYSGOOD	0x00000001
#define PTRACE_O_TRACEFORK	0x00000002
#define PTRACE_O_TRACEVFORK	0x00000004
#define PTRACE_O_TRACECLONE	0x00000008
#define PTRACE_O_TRACEEXEC	0x00000010
#define PTRACE_O_TRACEVFORKDONE	0x00000020
#define PTRACE_O_TRACEEXIT	0x00000040

/* Wait extended result codes for the above trace options.  */
#define PTRACE_EVENT_FORK	1
#define PTRACE_EVENT_VFORK	2
#define PTRACE_EVENT_CLONE	3
#define PTRACE_EVENT_EXEC	4
#define PTRACE_EVENT_VFORK_DONE	5
#define PTRACE_EVENT_EXIT	6
#endif /* PTRACE_SETOPTIONS */

/*
 * Hash tables for session descriptors, keyed by thread id and by
 * descriptor. The slot is computed by masking with (SIZE-1), so each
 * SIZE must remain a power of two.
 */
#define PFMON_SDESC_PID_HASH_SIZE	256
#define PFMON_SDESC_PID_HASH(x)		((x) & (PFMON_SDESC_PID_HASH_SIZE-1))

#define PFMON_SDESC_FD_HASH_SIZE	256
#define PFMON_SDESC_FD_HASH(x)		((x) & (PFMON_SDESC_FD_HASH_SIZE-1))

/*
 * Per-worker state. One entry of the workers[] array is selected by CPU
 * index when the master sends a message.
 *
 * NOTE(review): ideally this structure would be cache-line-size aligned.
 */
typedef struct {
	pthread_t	thread_id;	/* worker's thread id */
	unsigned int	cpu_id;		/* worker's assigned CPU */
	int		to_worker[2];	/* worker's 1-way communication from master (master writes to [1]) */
	int		from_worker[2];	/* worker's 1-way communication back to master (master reads from [0]) */

	pfmon_sdesc_t	*fd_hash[PFMON_SDESC_FD_HASH_SIZE];	/* hash table for sdesc managed by worker */
} task_worker_t;

/*
 * Reason why the session is terminating. The signal handlers only record
 * a reason while the value is still QUIT_NOTYET, so the first cause wins.
 */
typedef enum { 
	QUIT_NOTYET,	/* default value */
	QUIT_ALARM, 	/* quit because of alarm (session-timeout) */
	QUIT_ABORT,	/* quit because of user abort (CTRL-C) */
	QUIT_ERROR	/* quit because of error */
} task_quit_t;

/*
 * Types of messages sent by the master to a worker thread.
 */
typedef enum { 
	PFMON_TASK_MSG_QUIT,		/* time to quit */
	PFMON_TASK_MSG_ADD_TASK,	/* new task to handle */
	PFMON_TASK_MSG_REM_TASK,	/* task to stop handling (remove from worker) */
	PFMON_TASK_MSG_RESET		/* reset perfmon state (used for exec-split) */
} pfmon_worker_msg_type_t;

/*
 * Message exchanged between master and worker. The same layout is used
 * for the dummy acknowledgment read back by the master.
 */
typedef struct {
	pfmon_worker_msg_type_t	type;	/* what the worker is asked to do */
	void			*data;	/* payload; the sdesc pointer for ADD_TASK/REM_TASK */
} task_worker_msg_t;

/*
 * Session-wide bookkeeping of session descriptors (sdesc).
 * Updated under task_info_lock (LOCK_TASK_INFO/UNLOCK_TASK_INFO).
 */
typedef struct {
	unsigned long num_sdesc;	/* number of sdesc allocated at a particular time */
	unsigned long max_sdesc;	/* max number of allocated sdesc at a particular time */
	unsigned long num_active_sdesc; /* number of sdesc which are actively monitoring */
	unsigned long max_active_sdesc; /* max number of sdesc which are actively monitoring at a particular time */
	unsigned long total_sdesc;	/* total number of sdesc created for the entire session */
} task_info_t;

/* thread-specific data key (its users are not visible in this part of the file) */
static pthread_key_t		arg_key;
/* serializes every access to sdesc_pid_hash made through the hash helpers */
static pthread_mutex_t		pfmon_hash_pid_lock  = PTHREAD_MUTEX_INITIALIZER;
/* protects task_info, see LOCK_TASK_INFO()/UNLOCK_TASK_INFO() */
static pthread_mutex_t		task_info_lock = PTHREAD_MUTEX_INITIALIZER;
/* held while results are folded into sdesc_task_aggr */
static pthread_mutex_t		task_aggr_lock = PTHREAD_MUTEX_INITIALIZER;
/* descriptor accumulating the aggregated results of all tasks */
static pfmon_sdesc_t		sdesc_task_aggr;
/* compiled pattern matched against the command line on exec */
static regex_t 			follow_exec_preg;
/* worker table, indexed by CPU number */
static task_worker_t		*workers;
/* tid of the master thread; SIGINT is only honored on that thread */
static pid_t 			master_tid;
/* all known session descriptors, hashed by tid */
static pfmon_sdesc_t 		*sdesc_pid_hash[PFMON_SDESC_PID_HASH_SIZE];
/* set to 1 by the SIGALRM/SIGINT handlers */
static int volatile 		time_to_quit;
/* first recorded reason for terminating the session */
static task_quit_t		quit_reason;
/* main loop keeps running while non-zero; cleared when the last sdesc goes away */
static int 			work_todo;
/* session-wide sdesc counters */
static task_info_t		task_info;
/* posted by the signal handlers and on last sdesc removal to wake up the master */
static sem_t			master_work_sem;

#define LOCK_TASK_INFO()	pthread_mutex_lock(&task_info_lock)
#define UNLOCK_TASK_INFO()	pthread_mutex_unlock(&task_info_lock)

/*
 * Add the counter values of every event set of sdesc into the matching
 * event set of the global aggregation descriptor (sdesc_task_aggr).
 *
 * Both set lists are walked in lockstep; the walk is driven by the
 * aggregate list and its per-set event count.
 *
 * must be called with aggr_lock held
 */
static inline void
task_aggregate_results(pfmon_sdesc_t *sdesc)
{
	pfmon_event_set_t *dst = sdesc_task_aggr.sets;
	pfmon_event_set_t *src = sdesc->sets;

	while (dst) {
		unsigned int nevents = dst->event_count;
		unsigned int k;

		for (k = 0; k < nevents; k++) {
			dst->master_pd[k].reg_value += src->master_pd[k].reg_value;
		}

		dst = dst->next;
		src = src->next;
	}
}

/*
 * SIGALRM handler (session timeout): records the quit reason unless one
 * is already set, raises time_to_quit and wakes up the master.
 */
static void
task_sigalarm_handler(int n, struct siginfo *info, void *sc)
{
	/* the first recorded reason is kept */
	if (quit_reason == QUIT_NOTYET) {
		quit_reason = QUIT_ALARM;
	}

	time_to_quit = 1;

	sem_post(&master_work_sem);
}

/*
 * SIGINT handler (user abort): only acts when running on the master
 * thread. Records the quit reason unless one is already set, raises
 * time_to_quit and wakes up the master.
 */
static void
task_sigint_handler(int n, struct siginfo *info, void *sc)
{
	if (gettid() == master_tid) {
		/* the first recorded reason is kept */
		if (quit_reason == QUIT_NOTYET) {
			quit_reason = QUIT_ABORT;
		}

		time_to_quit = 1;

		sem_post(&master_work_sem);
	}
}

/*
 * SIGCHLD handler: a child changed state, simply wake up the master
 * which will collect the status itself.
 */
static void
task_sigchild_handler(int n, struct siginfo *info, void *sc)
{
	(void)sem_post(&master_work_sem);
}

/*
 * for debug only
 *
 * Dumps every entry of the pid hash table (slot, pid, tid, refcnt) on
 * stdout and then terminates the program with exit code 1.
 */
static void
task_sigusr_handler(int n, struct siginfo *info, void *sc)
{
	pfmon_sdesc_t *cur;
	int slot;

	printf("dumping sdesc hash table\n");

	for (slot = 0; slot < PFMON_SDESC_PID_HASH_SIZE; slot++) {
		for (cur = sdesc_pid_hash[slot]; cur; cur = cur->next) {
			printf("%d> pid=%d tid=%d refcnt=%d\n", slot, cur->pid, cur->tid, cur->refcnt);
		}
	}

	exit(1);
}

/*
 * Block SIGINT, SIGCHLD and SIGALRM for the calling thread.
 */
static void
mask_global_signals(void)
{
	sigset_t blocked;

	sigemptyset(&blocked);
	sigaddset(&blocked, SIGALRM);
	sigaddset(&blocked, SIGCHLD);
	sigaddset(&blocked, SIGINT);

	/* pthread_sigmask() only changes the mask of the calling thread */
	pthread_sigmask(SIG_BLOCK, &blocked, NULL);
}

/*
 * Unblock SIGINT, SIGCHLD and SIGALRM for the calling thread.
 */
static void
unmask_global_signals(void)
{
	sigset_t unblocked;

	sigemptyset(&unblocked);
	sigaddset(&unblocked, SIGALRM);
	sigaddset(&unblocked, SIGCHLD);
	sigaddset(&unblocked, SIGINT);

	/* pthread_sigmask() only changes the mask of the calling thread */
	pthread_sigmask(SIG_UNBLOCK, &unblocked, NULL);
}

static void
setup_sigchild(void)
{
	struct sigaction act;
	sigset_t my_set;

	memset(&act, 0, sizeof(act));
	sigemptyset(&my_set);
	sigaddset(&my_set, SIGINT);
	sigaddset(&my_set, SIGALRM);
	act.sa_mask    = my_set;
	act.sa_flags   = SA_SIGINFO;
	act.sa_handler = (__sighandler_t)task_sigchild_handler;
	sigaction (SIGCHLD, &act, 0);

}


/*
 * Install the process-wide handlers used by the per-task session:
 *   SIGALRM : session timeout, SIGINT blocked while the handler runs
 *   SIGINT  : user abort, SIGALRM blocked while the handler runs
 *   SIGUSR1 : debug dump of the sdesc hash table, no extra signal blocked
 */
static void
setup_global_signals(void)
{
	struct sigaction act;
	sigset_t my_set;

	memset(&act,0,sizeof(act));
	sigemptyset(&my_set);
	sigaddset(&my_set, SIGINT);
	act.sa_mask    = my_set;
	act.sa_flags   = SA_SIGINFO;
	act.sa_handler = (__sighandler_t)task_sigalarm_handler;
	sigaction (SIGALRM, &act, 0);

	memset(&act,0,sizeof(act));
	sigemptyset(&my_set);
	sigaddset(&my_set, SIGALRM);
	/*
	 * the mask was built but never installed, which left SIGALRM
	 * deliverable while the SIGINT handler updates quit_reason
	 */
	act.sa_mask    = my_set;
	act.sa_handler = (__sighandler_t)task_sigint_handler;
	act.sa_flags   = SA_SIGINFO;
	sigaction (SIGINT, &act, 0);

	memset(&act,0,sizeof(act));

	act.sa_handler = (__sighandler_t)task_sigusr_handler;
	act.sa_flags   = SA_SIGINFO;
	sigaction (SIGUSR1, &act, 0);
}

/*
 * Resume a ptrace-stopped task, delivering signal sig (0 for none).
 * Returns the ptrace() result; a warning is printed when it is -1.
 */
static inline int
pfmon_continue(pid_t pid, unsigned long sig)
{
	int rc = ptrace(PTRACE_CONT, pid, NULL, (void *)sig);

	if (rc != -1)
		return rc;

	warning("cannot restart [%d]: %s\n", pid, strerror(errno));
	return -1;
}

/*
 * Detach from a traced task.
 * Returns the ptrace() result; a warning is printed when it is -1.
 */
static inline int
pfmon_detach(pid_t pid)
{
	int rc = ptrace(PTRACE_DETACH, pid, NULL, NULL);

	if (rc != -1)
		return rc;

	warning("cannot detach [%d]: %s\n", pid, strerror(errno));
	return -1;
}

/*
 * Assign a breakpoint index to every code trigger of sdesc and install
 * the corresponding code breakpoints in the task.
 *
 * A breakpoint is installed for start triggers and for triggers that are
 * not function-type; the remaining ones (stop side of a function trigger)
 * only get their index assigned here.
 *
 * return:
 * 	-1 : a breakpoint could not be installed
 * 	 0 : ok
 */
static int
install_code_triggers(pfmon_sdesc_t *sdesc)
{
	pfmon_trigger_t *t;
	unsigned int idx, ntrg;
	pid_t tid;
	int rc;

	ntrg = sdesc->num_code_triggers;
	tid  = sdesc->tid;

	for (idx = 0; idx < ntrg; idx++) {
		t = &sdesc->code_triggers[idx];

		t->br_idx = idx;

		/* stop breakpoint of a function trigger: nothing to install now */
		if (!t->trg_attr_start && t->trg_attr_func != 0)
			continue;

		rc = pfmon_set_code_breakpoint(tid, idx, t->brk_address);
		if (rc) {
			warning("cannot install code breakpoints\n");
			return -1;
		}
		vbprintf("[%d] installed %-5s code breakpoint at %p\n", 
			tid, 
			t->trg_attr_start ? "start" : "stop",
			t->brk_address);
	}
	return 0;
}

/*
 * Assign a breakpoint index to every data trigger of sdesc and install
 * the corresponding data breakpoints in the task.
 *
 * return:
 * 	-1 : a breakpoint could not be installed
 * 	 0 : ok
 */
static int
install_data_triggers(pfmon_sdesc_t *sdesc)
{
	pfmon_trigger_t *t;
	unsigned int idx, ntrg;
	pid_t tid;
	int access, rc;

	ntrg = sdesc->num_data_triggers;
	tid  = sdesc->tid;

	for (idx = 0; idx < ntrg; idx++) {
		t = &sdesc->data_triggers[idx];

		access    = t->trg_attr_rw;
		t->br_idx = idx;

		rc = pfmon_set_data_breakpoint(tid, idx, t->brk_address, access);
		if (rc) {
			warning("cannot install data breakpoints\n");
			return -1;
		}
		vbprintf("[%d] installed %-5s data breakpoint at %p\n", 
			tid, 
			t->trg_attr_start ? "start" : "stop",
			t->brk_address);
	}
	return 0;
}

/*
 * Look up the code trigger of sdesc whose breakpoint address is addr.
 * Returns the first match or NULL when there is none.
 */
static pfmon_trigger_t *
find_code_trigger(pfmon_sdesc_t *sdesc, unsigned long addr)
{
	pfmon_trigger_t *t   = sdesc->code_triggers;
	pfmon_trigger_t *end = t + sdesc->num_code_triggers;

	for (; t < end; t++) {
		if (t->brk_address == addr)
			return t;
	}
	return NULL;
}

/*
 * Look up the data trigger of sdesc whose breakpoint address is addr.
 * Returns the first match or NULL when there is none.
 */
static pfmon_trigger_t *
find_data_trigger(pfmon_sdesc_t *sdesc, unsigned long addr)
{
	pfmon_trigger_t *t   = sdesc->data_triggers;
	pfmon_trigger_t *end = t + sdesc->num_data_triggers;

	for (; t < end; t++) {
		if (t->brk_address == addr)
			return t;
	}
	return NULL;
}

/*
 * Full perfmon setup for a task: clone the event sets, create the
 * context, open the results, set up sampling output (if sampling),
 * program the event sets, load the context onto the task and install
 * code/data triggers when there are any.
 *
 * return:
 * 	-1 : error (a warning is printed for the failures detected here)
 * 	 0 : ok
 */
static int 
task_setup_pfm_context(pfmon_sdesc_t *sdesc, pfmon_ctx_t *ctx)
{
	pfmon_smpl_desc_t *smpl = &sdesc->csmpl;
	int need_enable = 0;
	int rc;

	pfmon_clone_sets(options.sets, sdesc);

	memset(smpl, 0, sizeof(pfmon_smpl_desc_t));

	if (pfmon_create_context(ctx, &smpl->smpl_hdr, &sdesc->ctxid) == -1 ) {
		if (errno != EBUSY) {
			warning("can't create perfmon context: %s(%d)\n", strerror(errno), errno);
		} else {
			warning("concurrent conflicting monitoring session is present in your system\n");
		}
		return -1;
	}

	/*
	 * the context descriptor must not leak into exec'ed programs
	 */
	rc = fcntl(sdesc->ctxid, F_SETFD, FD_CLOEXEC);
	if (rc) {
		warning("cannot set CLOEXEC: %s\n", strerror(errno));
		return -1;
	}

	if (open_results(sdesc) == -1)
		return -1;

	if (options.opt_use_smpl && pfmon_setup_sampling_output(sdesc, &sdesc_task_aggr.csmpl) == -1)
		return -1;

	if (install_event_sets(sdesc) == -1)
		return -1;

	if (pfmon_load_context(sdesc->ctxid, sdesc->tid) == -1)
		return -1;

	if (sdesc->num_code_triggers) {
		rc = install_code_triggers(sdesc);
		if (rc)
			return rc;
		need_enable = 1;
	}

	if (sdesc->num_data_triggers) {
		rc = install_data_triggers(sdesc);
		if (rc)
			return rc;
		need_enable = 1;
	}

	/* breakpoints are only armed once all of them are installed */
	if (need_enable)
		pfmon_enable_all_breakpoints(sdesc->tid);

	return 0;
}

/*
 * Reset the perfmon state of a task that already owns a context (used
 * for exec-split): the counters are reloaded with their long reset
 * values, the context is unloaded, the event sets reprogrammed and the
 * context loaded again. When sampling, the sampling state and output
 * are reset as well.
 *
 * Monitoring is always stopped when this function returns successfully
 * because of the unload/reload.
 *
 * return:
 * 	-1 : error
 * 	 0 : ok
 */
static int 
task_reset_pfm_context(pfmon_sdesc_t *sdesc)
{
	pfmon_event_set_t *set;
	pfmon_pmd_t *pd;
	unsigned int i, count;

	vbprintf("[%d] resetting perfmon state\n", sdesc->tid);

	for (set = sdesc->sets; set; set = set->next) {
		pd = set->master_pd;
		count = set->event_count;
		for(i=0; i < count; i++) {
			pd[i].reg_value = set->long_rates[i].value;
		}
	}

	/*
	 * task is stopped but we need to unload because we reprogram
	 * the event sets
	 */
	if (pfmon_unload_context(sdesc->ctxid) == -1) return -1;

	/*
	 * do not reload a context whose event sets could not be
	 * programmed (same check as in task_setup_pfm_context())
	 */
	if (install_event_sets(sdesc) == -1) return -1;

	if (pfmon_load_context(sdesc->ctxid, sdesc->tid) == -1) return -1;

	/* monitoring is always stopped on reload */

	if (options.opt_use_smpl) {
		if (pfmon_reset_sampling(sdesc) == -1) return -1;
		if (pfmon_setup_sampling_output(sdesc, &sdesc_task_aggr.csmpl) == -1) return -1;
		DPRINT(("reset setup sampling buffer for [%d]\n", sdesc->tid));
	}
	return 0;
}

/*
 * Collect the final results of a task.
 *
 * Counters are read unless sampling is used without printing counts.
 * In aggregation mode the counts are folded into the aggregate
 * descriptor and the sampling buffer is processed, both under
 * task_aggr_lock. Otherwise the sampling buffer is processed, results
 * are shown, the result output is closed and, when sampling, the
 * sampling output is closed too.
 *
 * return:
 * 	-1 : counters could not be read
 * 	 0 : ok, or nothing to do because no context is attached
 */
static int
task_collect_results(pfmon_sdesc_t *sdesc)
{
	int want_counts;

	/* without a context there is nothing to collect */
	if (sdesc->ctxid == -1)
		return 0;

	/*
	 * fetch the last known counter values
	 */
	want_counts = options.opt_use_smpl == 0 || options.opt_smpl_print_counts;
	if (want_counts && read_results(sdesc) == -1) {
		warning("read_results error\n");
		return -1;
	}

	if (options.opt_aggr) {
		pthread_mutex_lock(&task_aggr_lock);

		task_aggregate_results(sdesc);

		if (options.opt_use_smpl)
			pfmon_process_smpl_buf(sdesc, 1);

		pthread_mutex_unlock(&task_aggr_lock);

		return 0;
	}

	/* per-task results */
	if (options.opt_use_smpl)
		pfmon_process_smpl_buf(sdesc, 1);

	show_results(sdesc, 0, PFMON_RESULTS_FINAL);

	close_results(sdesc);

	if (options.opt_use_smpl)
		pfmon_close_sampling_output(sdesc, &sdesc->csmpl, sdesc->tid, 0);

	return 0;
}


/*
 * Allocate a session descriptor together with the context argument area
 * (options.ctx_arg_size bytes) that trails it.
 *
 * The descriptor itself is zeroed and its mutex initialized; the trailing
 * context area is left uninitialized because it is filled by copy later.
 *
 * Never returns NULL: allocation failure is reported via fatal_error().
 */
static pfmon_sdesc_t *
pfmon_sdesc_alloc(void)
{
	pfmon_sdesc_t *tmp;

	tmp = malloc(sizeof(*tmp) + options.ctx_arg_size);
	if (tmp == NULL) fatal_error("cannot allocate sdesc\n");

	/* don't need to initialize ctx area, will be done via copy */
	memset(tmp, 0, sizeof(*tmp));

	/*
	 * pthread_mutex_init() takes a pointer to an attribute object,
	 * not a mutex kind. NULL selects the default attributes.
	 */
	pthread_mutex_init(&tmp->lock, NULL);

	return tmp;
}

/*
 * Release the memory of a session descriptor obtained from
 * pfmon_sdesc_alloc(). Only the memory is freed here: the caller is
 * responsible for having removed the descriptor from any hash table.
 */
static void
pfmon_sdesc_free(pfmon_sdesc_t *t)
{
	free(t);
}

/*
 * Insert t at the head of the hash chain selected by t->tid.
 * The insertion is done under pfmon_hash_pid_lock.
 */
static void
pfmon_sdesc_pid_hash_add(pfmon_sdesc_t **hash, pfmon_sdesc_t *t)
{
	pfmon_sdesc_t **head = &hash[PFMON_SDESC_PID_HASH(t->tid)];

	pthread_mutex_lock(&pfmon_hash_pid_lock);

	t->next = *head;
	*head   = t;

	pthread_mutex_unlock(&pfmon_hash_pid_lock);
}

/*
 * Find the session descriptor whose tid equals pid.
 * The chain is walked under pfmon_hash_pid_lock.
 * Returns the descriptor or NULL when it is not in the table.
 */
static pfmon_sdesc_t *
pfmon_sdesc_pid_hash_find(pfmon_sdesc_t **hash, pid_t pid)
{
	pfmon_sdesc_t *cur;

	pthread_mutex_lock(&pfmon_hash_pid_lock);

	for (cur = hash[PFMON_SDESC_PID_HASH(pid)]; cur; cur = cur->next) {
		if (cur->tid == pid)
			break;
	}

	pthread_mutex_unlock(&pfmon_hash_pid_lock);

	return cur;
}

/*
 * Unlink t from the hash chain selected by t->tid.
 * The chain is modified under pfmon_hash_pid_lock.
 *
 * return:
 * 	 0 : t was found and unlinked
 * 	-1 : t is not in the table (reported via fatal_error() first)
 */
static int
pfmon_sdesc_pid_hash_remove(pfmon_sdesc_t **hash, pfmon_sdesc_t *t)
{
	pfmon_sdesc_t **link;
	int slot = PFMON_SDESC_PID_HASH(t->tid);

	pthread_mutex_lock(&pfmon_hash_pid_lock);

	/* link always points to the pointer that references the current entry */
	for (link = &hash[slot]; *link; link = &(*link)->next) {
		if (*link != t)
			continue;

		*link = t->next;

		pthread_mutex_unlock(&pfmon_hash_pid_lock);

		return 0;
	}

	pthread_mutex_unlock(&pfmon_hash_pid_lock);

	fatal_error("cannot find [%d] in hash queue\n", t->tid);
	return -1;
}
	
/*
 * Program the ptrace options of a stopped, traced task according to
 * the follow-* command line options.
 *
 * Exec notification is always requested: it is needed to stop
 * monitoring on exec when no "follow" option is specified.
 *
 * Returns the result of the PTRACE_SETOPTIONS request (-1 on error,
 * after printing a warning).
 */
static int
pfmon_setup_ptrace(pid_t pid)
{
	unsigned long flags = 0UL;
	int rc;

	vbprintf("follow_exec=%c follow_vfork=%c follow_fork=%c follow_pthread=%c\n",
		options.opt_follow_exec  ? 'y' : 'n',
		options.opt_follow_vfork ? 'y' : 'n',
		options.opt_follow_fork  ? 'y' : 'n',
		options.opt_follow_pthread ? 'y' : 'n');

	flags |= PTRACE_O_TRACEEXEC;

	if (options.opt_follow_vfork) {
		flags |= PTRACE_O_TRACEVFORK;
	}
	if (options.opt_follow_fork) {
		flags |= PTRACE_O_TRACEFORK;
	}
	if (options.opt_follow_pthread) {
		flags |= PTRACE_O_TRACECLONE;
	}

	if (flags == 0UL)
		return 0;

	/*
	 * push the options to the kernel
	 */
	rc = ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *)flags);
	if (rc == -1) {
		warning("cannot set ptrace options on [%d], check PTRACE_SETOPTIONS support: %s\n", pid, strerror(errno));
	}
	return rc;
}

/*
 * Drop one reference on a session descriptor.
 *
 * When the last reference goes away the descriptor is removed from the
 * pid hash table, the session counters are updated, its perfmon context
 * (if any) is closed and the descriptor is freed. If it was the last
 * descriptor of the session, work_todo is cleared and the master is
 * woken up.
 *
 * Otherwise the removal is simply deferred until the remaining
 * reference(s) are dropped. A reference count below 1 at that point is
 * a fatal error.
 */
static void
pfmon_sdesc_exit(pfmon_sdesc_t *sdesc)
{
	pid_t tid;

	tid = sdesc->tid;

	LOCK_SDESC(sdesc);

	sdesc->refcnt--;

	if (sdesc->refcnt == 0) {

		pfmon_sdesc_pid_hash_remove(sdesc_pid_hash, sdesc);

		LOCK_TASK_INFO();

		task_info.num_sdesc--;

		if (sdesc->ctxid != -1) task_info.num_active_sdesc--;

		vbprintf("[%d] detached\n", tid);

		if (task_info.num_sdesc == 0) {
			work_todo = 0;
			sem_post(&master_work_sem);
			DPRINT(("posted master_work_sem\n"));
		}
		DPRINT(("tid=%d removed active=%lu todo=%d\n", tid, task_info.num_active_sdesc, work_todo));

		UNLOCK_TASK_INFO();

		if (sdesc->ctxid != -1) close(sdesc->ctxid);

		/*
		 * nobody else holds a reference anymore: release the lock
		 * before the memory (and the mutex it contains) goes away,
		 * instead of freeing a mutex that is still locked
		 */
		UNLOCK_SDESC(sdesc);

		pfmon_sdesc_free(sdesc);

	} else {
		if (sdesc->refcnt < 1) { 
			fatal_error("invalid refcnt=%d for [%d]\n", sdesc->refcnt, tid); 
		}
		DPRINT(("deferring remove tid=%d refcnt=%d\n", tid, sdesc->refcnt));

		UNLOCK_SDESC(sdesc);
	}
}

/*
 * Printable names of the session descriptor types, indexed by the type
 * value passed to pfmon_sdesc_new().
 * NOTE(review): the order presumably mirrors the PFMON_SDESC_* constants
 * declared elsewhere -- confirm before adding or reordering entries.
 */
static const char *sdesc_type_str[]= {
	"attached",
	"fork",
	"vfork",
	"clone"
};

/*
 * Create and initialize a session descriptor.
 *
 * type    : how the task came to existence (PFMON_SDESC_*)
 * parent  : descriptor of the parent task, NULL for the first task
 * new_pid : tid of the new task, 0 if not known yet (the descriptor is
 *           then not inserted into the pid hash table, see
 *           pfmon_sdesc_set_pid())
 *
 * The following rules apply for flags inheritance:
 * 	fl_monitoring	: inherited
 * 	fl_seen_stopsig	: not inherited
 * 	fl_detaching	: not inherited
 * 	fl_dispatched	: not inherited
 * 	fl_attached	: inherited
 *
 * The new descriptor starts with one reference and no perfmon context.
 * The session counters are updated.
 */
static pfmon_sdesc_t *
pfmon_sdesc_new(int type, pfmon_sdesc_t *parent, pid_t new_pid)
{
	pfmon_sdesc_t *sdesc = pfmon_sdesc_alloc();
	unsigned int ntrg;

	sdesc->type = type;
	sdesc->tid  = new_pid;

	/*
	 * a pure clone shares the process id of its parent, new_pid is
	 * then the new thread id
	 */
	if (type == PFMON_SDESC_CLONE && parent) {
		sdesc->pid = parent->pid;
	} else {
		sdesc->pid = new_pid;
	}

	if (parent == NULL) {
		sdesc->ppid = -1;
		sdesc->ptid = -1;
	} else {
		strcpy(sdesc->cmdline, parent->cmdline);

		sdesc->ppid = parent->pid;
		sdesc->ptid = parent->tid;

		if (parent->fl_attached) {
			sdesc->fl_attached = 1;
		}
		if (parent->fl_monitoring) {
			sdesc->fl_monitoring = 1;
		}
	}

	if (type == PFMON_SDESC_ATTACH) {
		sdesc->fl_attached = 1;
	}

	if (new_pid) {
		pfmon_sdesc_pid_hash_add(sdesc_pid_hash, sdesc);
	}

	/* no context yet, one reference for the creator */
	sdesc->ctxid  = -1;
	sdesc->refcnt = 1;

	/*
	 * code triggers: always for the first task (no parent), for the
	 * others only when the triggers are followed
	 */
	ntrg = options.num_code_triggers;
	if (ntrg && (parent == NULL || options.opt_code_trigger_follow)) {
		memcpy(sdesc->code_triggers, options.code_triggers, ntrg*sizeof(pfmon_trigger_t));
		sdesc->num_code_triggers = ntrg;
	}

	/*
	 * data triggers: same rule
	 */
	ntrg = options.num_data_triggers;
	if (ntrg && (parent == NULL || options.opt_data_trigger_follow)) {
		memcpy(sdesc->data_triggers, options.data_triggers, ntrg*sizeof(pfmon_trigger_t));
		sdesc->num_data_triggers = ntrg;
	}

	DPRINT(("%s parent=%d pid=%lu tid=%d flags=0x%lx cmd: %.64s\n", 
		sdesc_type_str[type],
		sdesc->ppid,
		sdesc->pid,
		sdesc->tid,
		sdesc->flags,
		sdesc->cmdline));

	LOCK_TASK_INFO();

	task_info.total_sdesc++;
	task_info.num_sdesc++;

	if (task_info.max_sdesc < task_info.num_sdesc) {
		task_info.max_sdesc = task_info.num_sdesc;
	}

	UNLOCK_TASK_INFO();

	return sdesc;
}

/*
 * Late binding of a descriptor created with new_pid == 0: record the
 * tid, derive the pid and insert the descriptor into the pid hash table.
 */
static void
pfmon_sdesc_set_pid(pfmon_sdesc_t *sdesc, pid_t new_pid)
{
	sdesc->tid = new_pid;

	/*
	 * a pure clone keeps the process id recorded as its parent pid,
	 * new_pid is then the new thread id
	 */
	if (sdesc->type == PFMON_SDESC_CLONE) {
		sdesc->pid = sdesc->ppid;
	} else {
		sdesc->pid = new_pid;
	}

	pfmon_sdesc_pid_hash_add(sdesc_pid_hash, sdesc);
}

/*
 * Decide whether the command a task just exec'ed (new_cmdline) must be
 * monitored. Without a follow-exec pattern every command is interesting.
 * With a pattern, the command is interesting when it matches, or when it
 * does not match if the exclusion option is set.
 *
 * return:
 * 	0 : not interested
 * 	1 : interested
 */
static inline int
pfmon_sdesc_interesting(pfmon_sdesc_t *sdesc)
{
	int nomatch;

	if (!options.fexec_pattern)
		return 1;

	/* regexec() returns 0 on match */
	nomatch = regexec(&follow_exec_preg, sdesc->new_cmdline, 0, NULL, 0);

	if (options.opt_follow_exec_excl) {
		nomatch = !nomatch;
	}

	return nomatch == 0;
}

/*
 * Record the command line of the program the task is now running into
 * sdesc->new_cmdline.
 *
 * Symbol resolution is globally turned off as soon as the exec is not
 * the first one of the first task (the task has a parent or has already
 * exec'ed).
 */
static void
pfmon_sdesc_exec(pfmon_sdesc_t *sdesc)
{
	int first_exec;

	pfmon_extract_cmdline(sdesc->pid, sdesc->new_cmdline, PFMON_MAX_CMDLINE_LEN);

	if (!options.opt_addr2sym)
		return;

	first_exec = sdesc->ppid == -1 && !sdesc->exec_count;
	if (first_exec)
		return;

	options.opt_addr2sym = 0;
	vbprintf("[%d] deactivated symbol resolution because of multiple exec()\n", sdesc->tid);
}

/*
 * Send a message to the worker managing CPU cpu and optionally wait for
 * its acknowledgment.
 *
 * cpu  : index into workers[]
 * msg  : message to send
 * wait : when non-zero, block until the worker answers with a dummy
 *        message (used only for synchronization)
 *
 * Both transfers are restarted when interrupted by a signal (the signal
 * handlers of this file are installed without SA_RESTART).
 *
 * return:
 * 	-1 : the message could not be sent completely, or reading the
 * 	     acknowledgment failed
 * 	otherwise the byte count of the last transfer, as before
 */
static int
task_worker_send_msg(unsigned int cpu, task_worker_msg_t *msg, int wait)
{
	task_worker_msg_t fake;
	int r;

	DPRINT(("sending msg.type=%d to wCPU%u\n", msg->type, cpu));

	do {
		r = write(workers[cpu].to_worker[1], msg, sizeof(*msg));
	} while (r == -1 && errno == EINTR);

	/*
	 * never wait for an answer to a message the worker did not get,
	 * that would block the master forever
	 */
	if (r != (int)sizeof(*msg)) {
		warning("cannot send msg.type=%d to worker on CPU%u: %s\n",
			msg->type,
			cpu,
			r == -1 ? strerror(errno) : "short write");
		return -1;
	}

	/*
	 * dummy response, just used for synchronization
	 */
	if (wait) {
		do {
			r = read(workers[cpu].from_worker[0], &fake, sizeof(fake));
		} while (r == -1 && errno == EINTR);
	}

	return r;
}


/*
 * Start the command described by argv as a traced child and return the
 * session descriptor bound to it.
 *
 * The child requests tracing (PTRACE_TRACEME) before exec'ing, so it is
 * stopped when the new program is about to run for the first time. On
 * return the task is stopped, its ptrace options are set, the descriptor
 * is in the pid hash table and its new command line is recorded.
 *
 * Returns NULL on error (a warning is printed).
 */
static pfmon_sdesc_t *
task_create(char **argv)
{
	pfmon_sdesc_t *sdesc;
	pid_t pid = 0;
	int status, ret, fd;

	sdesc = pfmon_sdesc_new(PFMON_SDESC_VFORK, NULL, 0);
	if (sdesc == NULL) return NULL;

	if ((pid=vfork()) == -1) {
		warning("cannot vfork process\n");
		pfmon_sdesc_free(sdesc);
		return NULL;
	}

	if (pid == 0) {		 
		/*
		 * The use of ptrace() allows us to actually start monitoring after the exec()
		 * is done, i.e., when the new program is ready to go back to user mode for the
		 * "first time". Using this technique we ensure that the overhead of 
		 * exec'ing is not captured in the results. This can be important for 
		 * short running programs.
		 *
		 * The child of vfork() shares the address space of the parent, so it
		 * must leave with _exit(): exit() would run the atexit handlers and
		 * flush the stdio buffers that belong to the parent.
		 */
		ret = ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		if (ret == -1) {
			warning("cannot ptrace self: %s\n", strerror(errno));
			_exit(1);
		}
		if (options.opt_cmd_no_verbose) {
			/*
			 * one descriptor is enough for both stdout and stderr, and it
			 * must not stay open in the exec'ed program
			 */
			fd = open("/dev/null", O_WRONLY);
			if (fd != -1) {
				dup2(fd, 1);
				dup2(fd, 2);
				if (fd > 2) close(fd);
			}
		}	

		execvp(argv[0], argv);

		warning("cannot exec %s: %s\n", argv[0], strerror(errno));

		_exit(1);
		/* NOT REACHED */
	}
	/* 
	 * wait for the child to exec 
	 */
	waitpid(pid, &status, WUNTRACED);

	if (options.opt_verbose) {
		char **p = argv;
		vbprintf("[%d] started task: ", pid);
		while (*p) vbprintf("%s ", *p++);
		vbprintf("\n");
	}

	/*
	 * process is stopped at this point, unless it already exited
	 * (ptrace or exec failure in the child)
	 */
	if (WIFEXITED(status)) {
		warning("error cannot monitor task %s(%d): exit status %d\n", argv[0], pid, WEXITSTATUS(status));
		pfmon_sdesc_free(sdesc);
		return NULL;
	}

	if (pfmon_setup_ptrace(pid)) {
		/* get rid of the task, we cannot proceed */
		status = ptrace(PTRACE_KILL, pid, NULL, NULL);
		if (status != 0) warning("cannot kill task %d: %s\n", pid, strerror(errno));
		pfmon_sdesc_free(sdesc);
		return NULL;
	}
	pfmon_sdesc_set_pid(sdesc, pid);
	pfmon_sdesc_exec(sdesc);

	return sdesc;
}

static pfmon_sdesc_t *
task_attach(char **argv)
{
	pfmon_sdesc_t *sdesc;
	pid_t pid = 0;
	int status;

	sdesc = pfmon_sdesc_new(PFMON_SDESC_ATTACH, NULL, 0);
	if (sdesc == NULL) return NULL;

	pid = options.attach_pid;

	status = ptrace(PTRACE_ATTACH, pid, NULL, NULL);
	if (status == -1) {
		warning("cannot attach to %d: %s\n", pid, strerror(errno));
		pfmon_sdesc_free(sdesc);
		return NULL;
	}

	waitpid(pid, &status, WUNTRACED);

	/*
	 * process is stopped at this point
	 */
	if (WIFEXITED(status)) {
		warning("error command already terminated, exit code %d\n", WEXITSTATUS(status));
		pfmon_sdesc_free(sdesc);
		return NULL;
	}

	if (pfmon_setup_ptrace(pid)) {
		/* cannot proceed, just detach */
		status = ptrace(PTRACE_DETACH, pid, NULL, NULL);
		if (status != 0) warning("cannot detach task %d: %s\n", pid, strerror(errno));
		pfmon_sdesc_free(sdesc);
		return NULL;
	}
	pfmon_sdesc_set_pid(sdesc, pid);
	pfmon_sdesc_exec(sdesc);

	vbprintf("attached to [%d] %.16s...\n", pid, sdesc->cmdline);

	return sdesc;
}



/*
 * Hand a session descriptor over to a worker thread, chosen round-robin
 * among the online CPUs. The worker gets its own reference on the
 * descriptor. Dispatching the same descriptor twice is a fatal error.
 */
static void
task_dispatch_sdesc(pfmon_sdesc_t *sdesc)
{
	static unsigned int next_cpu;	/* CPU of the worker that gets the next task */
	task_worker_msg_t msg;
	unsigned int target = next_cpu;

	/* sanity check */
	if (sdesc->fl_dispatched) {
		fatal_error("[%d] already dispatched error\n", sdesc->tid);
	}

	msg.data = sdesc;
	msg.type = PFMON_TASK_MSG_ADD_TASK;	

	/* reference held on behalf of the worker */
	sdesc->refcnt++;
	sdesc->cpu = target;
	sdesc->fl_dispatched = 1;

	DPRINT(("[%d] dispatched to worker on CPU%u\n", sdesc->tid, target));

	task_worker_send_msg(target, &msg, 0);

	/*
	 * simple round-robin over the online CPUs
	 */
	next_cpu = (target + 1) % options.online_cpus;
}

/*
 * Decide whether a task must be monitored and, if so, bring its perfmon
 * state up.
 *
 * sdesc     : session descriptor of the (stopped) task
 * from_exec : non-zero when called because the task exec'ed, zero when
 *             the task was just created by fork/vfork/clone
 * ctx       : context description used when a new context is created
 *
 * Without exec (from_exec == 0) the task simply inherits from its
 * parent: it is set up if the parent was monitoring, left alone
 * otherwise.
 *
 * On exec, the new command is monitored if it is the first exec of the
 * first task, or, in follow-exec mode, if it is interesting (see
 * pfmon_sdesc_interesting()). A command that is not monitored has its
 * monitoring stopped; in split-exec mode its results are collected and
 * its context is destroyed.
 *
 * return:
 * 	-1 : error
 * 	 0 : ok
 */
static int
task_pfm_init(pfmon_sdesc_t *sdesc, int from_exec, pfmon_ctx_t *ctx)
{
	task_worker_msg_t msg;
	pid_t tid;
	int has_ctxid, was_monitoring;
	int ret;

	tid = sdesc->tid;

	/*
	 * we only take the long path if we are coming from exec, otherwise we inherited
	 * from the parent task. 
	 */
	if (from_exec == 0) {
		/*
		 * parent was active, we need to create our context
		 */
		if (sdesc->fl_monitoring) goto init_pfm;
		/*
		 * parent was not active
		 */
		DPRINT(("keep inactive task [%d] monitoring=%d: %s\n", tid, sdesc->fl_monitoring, sdesc->cmdline));
		return 0;
	} 
	/*
	 * we are coming for an exec event
	 */
	DPRINT((" in: [%d] ctxid=%d monitoring=%d refcnt=%d: %s\n", 
		tid, sdesc->ctxid, sdesc->fl_monitoring, sdesc->refcnt,sdesc->cmdline));

	/*
	 * in case we do not follow exec, we have to stop right here
	 * sdesc->ppid=-1 denotes the first process. In case we do not follow exec (pattern), 
	 * we always monitor the first process until it exec's.
	 */
	if (options.opt_follow_exec == 0) {
		ret = sdesc->ppid != -1 || sdesc->exec_count ? 0 : 1;
	} else {
		ret = pfmon_sdesc_interesting(sdesc);
	}
	if (ret == 0) {
		vbprintf("[%d] not monitoring %.55s...\n", sdesc->tid, sdesc->new_cmdline);

		/*
		 * if there was a context attached to the session, then clean up
		 * when split-exec is used. Otherwise, we just stop monitoring
		 * but keep the context around
		 */
		if (sdesc->ctxid != -1) {

			vbprintf("[%d] stopping monitoring at exec\n", tid);

			if (options.opt_split_exec) {
				if (sdesc->fl_monitoring) {
					vbprintf("[%d] collecting results at exec\n", tid);
					task_collect_results(sdesc);
				}

				/*
				 * the worker must have let go of the context
				 * (acknowledged removal) before it is closed
				 */
				if (sdesc->fl_dispatched) {
					msg.type = PFMON_TASK_MSG_REM_TASK;
					msg.data = sdesc;

					task_worker_send_msg(sdesc->cpu, &msg, 1);

					sdesc->fl_dispatched = 0;
				}
				close(sdesc->ctxid);
				sdesc->ctxid = -1;
			} else {
				/*
				 * only stop monitoring
				 *
				 * code/data triggers are automatically cleared 
				 * by the kernel on exec()
				 */
				pfmon_stop(sdesc->ctxid);
			}
			/*
			 * monitoring is deactivated
			 */
			sdesc->fl_monitoring = 0;

			LOCK_TASK_INFO();
			task_info.num_active_sdesc--;
			UNLOCK_TASK_INFO();

		}
		/* 
		 * cannot be done before we save results
		 */
		sdesc->exec_count++;
		return 0;
	}
	if (options.opt_split_exec && sdesc->ctxid != -1 && sdesc->fl_monitoring) {
		vbprintf("[%d] collecting results at exec\n", tid);
		task_collect_results(sdesc);
	}

	/* the new command becomes the current one */
	strcpy(sdesc->cmdline, sdesc->new_cmdline);

	sdesc->exec_count++;

	/*
	 * necessarily in follow-exec mode at this point
	 */

init_pfm:

	vbprintf("[%d] monitoring %.58s...\n", sdesc->tid, sdesc->cmdline);

	was_monitoring = sdesc->fl_monitoring;
	has_ctxid      = sdesc->ctxid != -1;

	/*
	 * we want to monitor this task
	 */
	sdesc->fl_monitoring = 1;


	/* 
	 * if the sdesc was not connected to a perfmon context, then
	 * we do the whole setup procedure. 
	 */
	if (has_ctxid == 0) {

		DPRINT(("setup perfmon ctx for [%d] monitoring=%d refcnt=%d: %s\n", 
			tid, sdesc->fl_monitoring, sdesc->refcnt, sdesc->cmdline));

		ret = task_setup_pfm_context(sdesc, ctx);
		if (ret == -1) return -1;

		/*
		 * we may defer actual activation until later
		 */
		if (options.opt_dont_start == 0) {
			pfmon_start(sdesc->ctxid);
			vbprintf("[%d] activating monitoring\n", tid);
		} else {
			vbprintf("[%d] monitoring not activated\n", tid);
		}

	} else {
		/*
		 * we already have a context here
		 */

		/*
		 * in split-exec mode, we need to reset our context
		 * before we proceed further. We also need to reopen
		 * the output file because it was closed in
		 * task_collect_results()
		 */
		if (options.opt_split_exec) {
			/*
			 * a failed reset may leave the context unloaded,
			 * do not restart monitoring on it
			 */
			if (task_reset_pfm_context(sdesc) == -1) {
				warning("[%d] cannot reset perfmon state at exec\n", tid);
				return -1;
			}
			if (open_results(sdesc) == -1) return -1;

			/* monitoring is stopped in task_reset_pfm_context() because of
			 * unload/reload
			 */
			was_monitoring = 0;
		}
		/*
		 * context was not actively monitoring, then we just
		 * need to restart now
		 */
		if (was_monitoring == 0 && options.opt_dont_start == 0) {
			pfmon_start(sdesc->ctxid);
			vbprintf("[%d] restarting monitoring\n", tid);
		}
	}
	if (was_monitoring == 0 || has_ctxid == 0) {
		LOCK_TASK_INFO();
		task_info.num_active_sdesc++;
		if (task_info.num_active_sdesc > task_info.max_active_sdesc) 
				task_info.max_active_sdesc = task_info.num_active_sdesc;
		UNLOCK_TASK_INFO();
	}

	DPRINT(("out: [%d] fl_monitoring=%d ctxid=%d was_monitoring=%d has_ctxid=%d\n",
			tid,
			sdesc->fl_monitoring,
			sdesc->ctxid,
			was_monitoring,
			has_ctxid));
	/*
	 * pick a worker thread to manage perfmon notifications, if necessary.
	 */
	if (has_ctxid == 0 && options.opt_use_smpl) task_dispatch_sdesc(sdesc);

	if (options.opt_show_rusage) gettimeofday(&sdesc->tv_start, NULL);

	return 0;
}

/*
 * Called when a task is gone: collect its results if it still has a
 * perfmon context, then drop the reference on its descriptor.
 */
static void
task_pfm_exit(pfmon_sdesc_t *sdesc)
{
	/*
	 * a descriptor without a perfmon context has no results,
	 * it is simply released
	 */
	if (sdesc->ctxid != -1) {
		task_collect_results(sdesc);
	}

	pfmon_sdesc_exit(sdesc);
}

/*
 * Handle a task stopped on one of its code or data breakpoints.
 *
 * The breakpoint address is mapped back to its trigger, monitoring is
 * started (start trigger) or stopped (stop trigger), and the breakpoint
 * is either cleared (one-shot trigger) or stepped over (repeat trigger).
 * For a start trigger of type "func", the stop breakpoint is installed
 * dynamically on the return address of the function.
 *
 * return:
 * 	-1 : no trigger matches the address, or the dynamic stop
 * 	     breakpoint could not be installed
 * 	 0 : ok
 */
static int
task_handle_trigger(pfmon_sdesc_t *sdesc)
{
	pfmon_trigger_t *trg, *stop_trg;
	unsigned long addr, rp;
	int is_start, is_repeat, is_data = 0;
	pid_t tid;
	int ret;
	
	tid = sdesc->tid;

	/* NOTE(review): the result of this call is not checked; addr is used as returned */
	pfmon_get_breakpoint_addr(tid, &addr, &is_data);

	if (is_data) {
		trg = find_data_trigger(sdesc, addr);
	} else {
		trg = find_code_trigger(sdesc, addr);
	}

	if (trg == NULL) {
		warning("task [%d] interrupted @%p for no reason\n", tid, addr);
		return -1;
	}

	is_start  = trg->trg_attr_start;
	is_repeat = trg->trg_attr_repeat;

	vbprintf("[%d] reached %-5s %s breakpoint @%p\n", 
		tid, 
		is_start ? "start" : "stop",
		is_data ? "data" : "code",
		addr);

	/*
	 * dynamically insert the stop trigger when the start trigger is of type "func"
	 */
	if (trg->trg_attr_start && trg->trg_attr_func) {
		pfmon_get_return_pointer(tid, &rp);

		/* a non-zero validation result means no stop breakpoint is installed */
		if (pfmon_validate_code_trigger_address(rp)) goto skip_stop;

		/* the paired stop trigger is retargeted to the return address */
		stop_trg = sdesc->code_triggers+trg->stop_trg_idx;
		stop_trg->brk_address = rp;

		ret = pfmon_set_code_breakpoint(tid, stop_trg->br_idx, rp);
		if (ret) {
			warning("cannot set dynamic stop breakpoint\n");
			return -1;
		}

		vbprintf("[%d] install br.ret breakpoint @%p\n", tid, rp);
	}
skip_stop:
	if (is_start) {
		pfmon_start(sdesc->ctxid);
		vbprintf("[%d] activating monitoring at trigger start\n", tid);
	} else {
		pfmon_stop(sdesc->ctxid);
		vbprintf("[%d] stopping monitoring at trigger stop\n", tid);
	}

	/* one-shot triggers are removed, repeat triggers are stepped over */
	if (is_repeat == 0) {
		if (is_data)
			pfmon_clear_data_breakpoint(tid, trg->br_idx, trg->brk_address);
		else
			pfmon_clear_code_breakpoint(tid, trg->br_idx, trg->brk_address);

		vbprintf("[%d] clearing %s breakpoint @%p\n", 
			tid, 
			is_data? "data" : "code", 
			trg->brk_address);

	} else  {
		if (is_data)
			pfmon_resume_after_data_breakpoint(tid);
		else
			pfmon_resume_after_code_breakpoint(tid);

		vbprintf("[%d] resume after %s breakpoint\n", tid, is_data ? "data" : "code");
	}
	return 0;
}

/*
 * Stop monitoring a task and detach from it.
 *
 * If the task has a perfmon context, its worker (if any) is asked to
 * remove it and the results are collected. The task is then detached
 * from ptrace and the reference on its descriptor is dropped.
 *
 * task must be stopped when calling
 *
 * Always returns 0.
 */
static int
task_detach(pfmon_sdesc_t *sdesc)
{
	task_worker_msg_t msg;
	int was_dispatched = 0;
	pid_t pid;

	/* saved now: sdesc may be freed by pfmon_sdesc_exit() below */
	pid = sdesc->tid;


	vbprintf("detaching from [%d]\n", pid);
	if (sdesc->ctxid != -1) {
		
		if (sdesc->fl_dispatched) {
			msg.type = PFMON_TASK_MSG_REM_TASK;
			msg.data = sdesc;

			/* wait for ack */
			task_worker_send_msg(sdesc->cpu, &msg, 1);
			was_dispatched = 1;
		}
		task_collect_results(sdesc);
		/* drops the extra reference taken in task_dispatch_sdesc() */
		if (was_dispatched) pfmon_sdesc_exit(sdesc);
	}

	/* note: this message is printed before the actual ptrace detach */
	vbprintf("detached from [%d]\n", pid);

	pfmon_detach(pid);

	pfmon_sdesc_exit(sdesc);

	return 0;
}

/*
 * Force every known task out of the session: tasks we attached to are
 * marked as detaching and sent SIGSTOP, tasks we created are sent
 * SIGKILL.
 */
static void
task_force_exit(void)
{
	pfmon_sdesc_t *cur;
	unsigned int slot;
	int signum;

	for (slot = 0; slot < PFMON_SDESC_PID_HASH_SIZE; slot++) {
		for (cur = sdesc_pid_hash[slot]; cur; cur = cur->next) {
			if (!cur->fl_attached) {
				signum = SIGKILL;
			} else {
				signum = SIGSTOP;
				cur->fl_detaching = 1;
			}

			vbprintf("sending signal %d to [%d]\n", signum, cur->tid);
			kill(cur->tid, signum);
		}
	}
}

/*
 * task_mainloop: master loop of a per-task measurement session.
 *
 * Creates (or attaches to, when options.opt_attach is set) the initial task,
 * initializes monitoring on it, lets it run, and then processes wait4()
 * notifications for all tracked tasks until work_todo becomes zero.
 *
 * ctx  : monitoring context description passed through to task_pfm_init()
 * argv : passed to task_create()/task_attach() to identify the initial task
 *
 * Returns -1 if the initial task cannot be set up or task_pfm_init() reports
 * non-zero for it, 0 once the loop terminates.
 */
static int
task_mainloop(pfmon_ctx_t *ctx, char **argv)
{	
	pfmon_sdesc_t *sdesc;
	time_t start_time;
	unsigned long sig;
	struct rusage rusage;
	struct timeval tv;
	long new_pid; /* must be long */
	pid_t tid = -1;
	int status, event, wait_type;
	int r, has_workers, needs_time, cleaning_up = 0;

	has_workers = options.opt_use_smpl    ? 1 : 0;
	needs_time  = options.opt_show_rusage ? 1 : 0;

	sdesc = options.opt_attach == 0 ? task_create(argv) : task_attach(argv);
	if (sdesc == NULL) return -1;

	setup_sigchild();

	/*
	 * NOTE(review): the trailing comment below describes 1 as "task is
	 * interesting", yet any non-zero value is treated as a failure here.
	 * Confirm against task_pfm_init().
	 */
	r = task_pfm_init(sdesc, 1, ctx);
	if (r) return -1; /* return 1, if task is interesting, 0 otherwise, -1 if error */

	time(&start_time);
	vbprintf("measurements started at %s\n", asctime(localtime(&start_time)));

	/* actually start the task */
	pfmon_continue(sdesc->tid, 0);

	if (options.session_timeout) {
		alarm(options.session_timeout);
		vbprintf("arming session alarm to %u seconds\n", options.session_timeout);
	}
	/* WNOHANG: wait4() is only used to drain already-pending notifications */
	wait_type = WUNTRACED|WNOHANG;
	if (options.opt_follow_pthread) wait_type |= __WALL;

	work_todo = 1;

	//while (task_info.num_active_sdesc) {
	for(;work_todo;) {

		/*
		 * global signals are only deliverable while we block on the
		 * semaphore; they are masked again before events are processed.
		 * Presumably the semaphore is posted from the signal handlers
		 * (e.g. the one installed by setup_sigchild()) -- TODO confirm.
		 */
		unmask_global_signals();

		sem_wait(&master_work_sem);

		mask_global_signals();

		/* drain every notification that is currently pending */
		while (work_todo && (tid = wait4(-1, &status, wait_type, &rusage)) > 0) {

			if (needs_time) gettimeofday(&tv, NULL);

			sdesc = pfmon_sdesc_pid_hash_find(sdesc_pid_hash, tid);

			DPRINT(("tid=%d errno=%d exited=%d stopped=%d signaled=%d stopsig=%-2d "
					"ppid=%-6d ctxid=%-3d mon=%d att=%d det=%d quit=%d clean=%d cmd: %.16s\n",
					tid, errno, 
					WIFEXITED(status), 
					WIFSTOPPED(status), 
					WIFSIGNALED(status), 
					WSTOPSIG(status), 
					sdesc ? sdesc->ppid : -1,
					sdesc ? sdesc->ctxid: -1,
					sdesc ? sdesc->fl_monitoring: 0,
					sdesc ? sdesc->fl_attached: 0,
					sdesc ? sdesc->fl_detaching: 0,
					time_to_quit, cleaning_up,
					sdesc ? sdesc->cmdline : ""));

			/* notification for a task we have no session descriptor for */
			if (sdesc == NULL) { 
				warning("unknown task [%d]\n", tid); 
				task_sigusr_handler(0, NULL, NULL);
				continue; 
			}

			/* task is gone: release its session, nothing to resume */
			if (WIFEXITED(status) || WIFSIGNALED(status)) {
				struct timeval start;

				/* copy the start time before the exit calls below get hold of sdesc */
				start = sdesc->tv_start;

				vbprintf("[%d] task exited\n", tid);
				if (has_workers)
					pfmon_sdesc_exit(sdesc);
				else
					task_pfm_exit(sdesc);

				if (needs_time) show_task_rusage(&start, &tv, &rusage);

				continue;
			}

			/* 
			 * task is stopped
			 */
			sig = WSTOPSIG(status);
			if (sig == SIGTRAP) {
				/*
				 * do not propagate the signal, it was for us
				 */
				sig = 0;

				/*
				 * extract event code from status (should be in some macro)
				 */
				event = status >> 16;
				switch(event) {
					/*
					 * fork/clone/vfork: fetch the id of the new task and
					 * either start tracking it or, when shutting down,
					 * detach from it right away.
					 * NOTE(review): the ptrace() result stored in r is not
					 * checked, so new_pid may be used uninitialized if the
					 * request fails.
					 */
					case PTRACE_EVENT_FORK:
						r = ptrace(PTRACE_GETEVENTMSG, tid, NULL, (void *)&new_pid);
						vbprintf("[%d] forked [%ld]\n", sdesc->tid, new_pid);
						if (cleaning_up == 0) {
							pfmon_sdesc_new(PFMON_SDESC_FORK, sdesc, new_pid);
						} else {
							pfmon_detach(new_pid);
						}
						break;
					case PTRACE_EVENT_CLONE:
						/* new pid is really new tid */
						r = ptrace(PTRACE_GETEVENTMSG, tid, NULL, (void *)&new_pid);
						vbprintf("[%d] clone [%ld]\n", sdesc->tid, new_pid);
						if (cleaning_up == 0) {
							pfmon_sdesc_new(PFMON_SDESC_CLONE, sdesc, new_pid);
						} else {
							pfmon_detach(new_pid);
						}
						break;
					case PTRACE_EVENT_VFORK:
						r = ptrace(PTRACE_GETEVENTMSG, tid, NULL, (void *)&new_pid);
						vbprintf("[%d] vfork [%ld]\n", sdesc->tid, new_pid);
						if (cleaning_up == 0) {
							pfmon_sdesc_new(PFMON_SDESC_VFORK, sdesc, new_pid);
						} else {
							pfmon_detach(new_pid);
						}
						break;
					case PTRACE_EVENT_EXEC:
						pfmon_sdesc_exec(sdesc);
						vbprintf("[%d] exec %.64s...\n", sdesc->tid, sdesc->new_cmdline);

						if (cleaning_up)  break;
						/* set up monitoring again for the new image; failure ends the session */
						r = task_pfm_init(sdesc, 1, ctx);
						if (r) {
							time_to_quit = 1;
							quit_reason  = QUIT_ERROR;
						}
						break;
					case  0:
						/* plain SIGTRAP, no ptrace event code attached */
						if (cleaning_up) break;

						r = task_handle_trigger(sdesc);
						/* we detached the task, no need for PTRACE_CONT */
						/* (continue applies to the wait4 loop: pfmon_continue() is skipped) */
						if (r == 1) continue;
						/* need a cont */
						/*
						 * fallthrough
						 * NOTE(review): with opt_follow_exec == 1 this also
						 * prints "unknown ptrace event 0" after a handled
						 * trigger -- confirm this is intended.
						 */
					default: 
						DPRINT((">>got unknown event %d\n", event));
						/*
						 * when a task is ptraced' and executes execve:
						 * 	- if PTRACE_O_TRACEEXEC is set, then we get PTRACE_EVENT_EXEC event
						 * 	- if PTRACE_O_TRACEEXEC is not set, then we just receive a SIGTRAP
						 */
						if (options.opt_follow_exec == 1) 
							printf("unknown ptrace event %d\n", event);
				}
			} else if (sig == SIGSTOP) {
				/* 
				 * cancel signal, it was for us
				 *
				 * XXX: is that always the case?
				 */
				sig = 0;

				/*
				 * we need to wait until a newly created task reaches the stopped
				 * state to ensure that perfmon will see the task actually stopped
				 * and not just cloned. We do get two events: fork/vfork/clone and
				 * the first STOPPED signal when the task reaches its first 
				 * notification point.
				 */
				/* SIGSTOP sent by task_force_exit() to an attached task: let it go */
				if (sdesc->fl_detaching) {
					task_detach(sdesc);
					continue;
				}
				//if (sdesc->fl_seen_stopsig == 0 && (sdesc->fl_active)) {
				/* first SIGSTOP of a monitored task: finish its monitoring setup */
				if (sdesc->fl_seen_stopsig == 0 && (sdesc->fl_monitoring)) {
					sdesc->fl_seen_stopsig = 1;
					r = task_pfm_init(sdesc, 0, ctx);
					if (r) {
						time_to_quit = 1;
						quit_reason  = QUIT_ERROR;
					}
				}
			} else {
				/* any other signal is handed back to the task when it is resumed */
				DPRINT(("forward signal %lu to [%d]\n", sig, tid));
			}
			/* resume the task; sig is 0 when the stop was consumed by us */
			pfmon_continue(tid, sig);
		}
		DPRINT(("tid=%d errno=%d time_to_quit=%d cleaning_up=%d todo=%d active=%lu\n", 
			tid, errno, time_to_quit, cleaning_up, work_todo,task_info.num_active_sdesc));
		/*
		 * we check for interruption only when we are done processing pending ptrace events
		 */
		if (time_to_quit && cleaning_up == 0) {
			switch(quit_reason) {
				case QUIT_ALARM:
					printf("session timed out\n");
					break;
				case QUIT_ABORT:
					printf("session interrupted by user\n");
					break;
				case QUIT_ERROR:
					printf("session interrupted by error\n");
					break;
				default:
					printf("session interrupted for unknown reason!\n");
			}
			/* signal every tracked task once; later iterations only reap them */
			task_force_exit();

			cleaning_up  = 1;
		}
	}

	if (options.opt_aggr) {
		print_results(&sdesc_task_aggr);
		if (options.opt_use_smpl)
			pfmon_close_aggr_sampling_output(&sdesc_task_aggr, &sdesc_task_aggr.csmpl);
	}

	vbprintf("created tasks        : %lu\n"
		 "maximum tasks        : %lu\n"
		 "maximum active tasks : %lu\n", 
		task_info.total_sdesc, 
		task_info.max_sdesc,
		task_info.max_active_sdesc);

	return 0;
}

/*
 * Destructor registered with the thread argument key: releases the
 * per-thread argument block. free() accepts NULL, so no guard is needed.
 */
static void
pfmon_thread_arg_destroy(void *data)
{
	free(data);
}

/*
 * Exit hook for per-task mode: the master thread terminates the whole
 * process with status i, any other thread only terminates itself and
 * hands i back as its thread exit value.
 */
static void
exit_per_task(int i)
{
	if (gettid() != master_tid)
		pthread_exit((void *)((unsigned long)i));

	exit(i);
}

/*
 * Insert session t at the head of the chain of the bucket selected by
 * its context file descriptor (t->ctxid).
 */
static void
pfmon_sdesc_fd_hash_add(pfmon_sdesc_t **hash, pfmon_sdesc_t *t)
{
	pfmon_sdesc_t **bucket = &hash[PFMON_SDESC_FD_HASH(t->ctxid)];

	t->fd_next = *bucket;
	*bucket    = t;
}

/*
 * Look up the session whose context file descriptor equals fd.
 * Returns the session, or NULL when the bucket chain does not contain it.
 */
static pfmon_sdesc_t *
pfmon_sdesc_fd_hash_find(pfmon_sdesc_t **hash, int fd)
{
	pfmon_sdesc_t *cur;

	for (cur = hash[PFMON_SDESC_FD_HASH(fd)]; cur != NULL; cur = cur->fd_next) {
		if (cur->ctxid == fd)
			return cur;
	}

	/* should not happen */
	return NULL;
}

/*
 * Unlink session t from the bucket chain selected by t->ctxid.
 * Returns 0 when t was found and unlinked, -1 when it is not in the chain.
 * t->fd_next itself is left untouched.
 */
static int
pfmon_sdesc_fd_hash_remove(pfmon_sdesc_t **hash, pfmon_sdesc_t *t)
{
	/* link always points at the pointer that leads to the current element */
	pfmon_sdesc_t **link = &hash[PFMON_SDESC_FD_HASH(t->ctxid)];

	while (*link != NULL) {
		if (*link == t) {
			*link = t->fd_next;
			return 0;
		}
		link = &(*link)->fd_next;
	}
	return -1;
}

/*
 * task_worker_mainloop: body of a per-CPU worker thread.
 *
 * data points to this worker's task_worker_t entry. The worker pins itself
 * to its CPU and then loops forever in select() over:
 *   - its control pipe (to_worker[0]), on which the master sends
 *     task_worker_msg_t requests to add or remove a session;
 *   - the context file descriptor (ctxid) of every session it manages,
 *     on which perfmon messages (overflow, end) are read.
 *
 * The function never returns.
 */
static void
task_worker_mainloop(void *data)
{
	task_worker_t *mywork = (task_worker_t *)data;
	fd_set master_fds, fds;
	pfmon_sdesc_t **myhash, *sdesc;
	pfm_msg_t msg;
	task_worker_msg_t pfmon_msg;
	pid_t mytid;
	unsigned int mycpu;
	unsigned int myjobs = 0;
	int i, ret;
	int ctrl_fd;
	int max_fd;
	int ndesc, msg_type;

	/*
	 * msg doubles as the payload of the removal ack below, which can be
	 * sent before any perfmon message was ever read into it: make sure we
	 * never write indeterminate stack bytes into the pipe.
	 */
	memset(&msg, 0, sizeof(msg));

	/*
	 * POSIX threads:
	 * The signal state of the new thread is initialised as follows:
	 *    - the signal mask is inherited from the creating thread.
	 *    - the set of signals pending for the new thread is empty.
	 *
	 * we want to let the master handle the global signals, therefore
	 * we mask them here.
	 */
	mask_global_signals();

	ctrl_fd = mywork->to_worker[0];
	mycpu   = mywork->cpu_id;
	mytid   = gettid();
	myhash  = mywork->fd_hash;

	/*
	 * some NPTL sanity checks
	 */
	if (mytid == master_tid) 
		fatal_error("pfmon is not compiled/linked with the correct pthread library,"
			"the program is linked with NPTL when it should not. Check Makefile.\n");

	pfmon_pin_self(mycpu);

	FD_ZERO(&master_fds);
	FD_SET(ctrl_fd, &master_fds);

	max_fd = ctrl_fd;

	DPRINT(("worker [%d] on CPU%u ctrl_fd=%d\n", mytid, mycpu, ctrl_fd));

	for(;;) {
		/* select() modifies its set: always work on a copy */
		memcpy(&fds, &master_fds, sizeof(fds));

		ndesc = select(max_fd+1, &fds, NULL, NULL, NULL);
		if (ndesc == -1) {
			if (errno == EINTR) continue;
			fatal_error("select returned %d\n", errno);
		}

		DPRINT(("worker on CPU%u max_fd=%d select=%d ctrl_fd=%d\n", mycpu, max_fd, ndesc, ctrl_fd));

		/* visit descriptors in increasing order until all ready ones are handled */
		for(i=0; ndesc; i++) {

			if (FD_ISSET(i, &fds) == 0) continue;

			DPRINT(("worker on CPU%u activity on fd=%d\n", mycpu, i));

			ndesc--;

			if (i != ctrl_fd) {
				/* activity on the context descriptor of a managed session */
				sdesc = pfmon_sdesc_fd_hash_find(myhash, i);
				if (sdesc == NULL) 
					fatal_error("wCPU%u cannot find context for fd=%d\n", mycpu, i);

				ret = read(i, &msg, sizeof(msg));
				if (ret != sizeof(msg)) {
					warning("[%d] error reading on %d: ret=%d msg=%p errno=%s\n", mytid, i, ret, &msg, strerror(errno));
					continue;
				}

				msg_type = msg.type;

				DPRINT(("wCPU%u read=%d type=%d pmds=0x%lx\n", 
							mycpu, ret,
							msg_type,
							msg.pfm_ovfl_msg.msg_ovfl_pmds[0]));

				if (msg_type == PFM_MSG_OVFL) {
					pfmon_process_smpl_buf(sdesc, 0);
					continue;
				}

				if (msg_type != PFM_MSG_END) 
					fatal_error("wCPU%u unknown message type %d\n", mycpu, msg_type);

				/*
				 * remove from fd hash table
				 */
				pfmon_sdesc_fd_hash_remove(myhash, sdesc);

				/*
				 * remove from list of descriptors of interest
				 */
				FD_CLR(sdesc->ctxid, &master_fds);

				/* XXX: approximation */
				if (sdesc->ctxid == max_fd) max_fd--;

				myjobs--;

				DPRINT(("wCPU%u end_msg ctxid=%d tid=%d\n",
					mycpu,
					sdesc->ctxid,
					sdesc->tid));

				task_pfm_exit(sdesc);

				continue;
			} 

			/* activity on the control pipe: request from the master */
			ret = read(ctrl_fd, &pfmon_msg, sizeof(pfmon_msg));
			if (ret != sizeof(pfmon_msg)) {
				warning("error reading ctrl_fd(%d) on CPU%u: ret=%d errno=%d\n", ctrl_fd, mycpu, ret, errno);
				continue;
			}

			sdesc = (pfmon_sdesc_t *)pfmon_msg.data;

			switch(pfmon_msg.type) {

				case PFMON_TASK_MSG_ADD_TASK:
					myjobs++;
					DPRINT(("wCPU%u managing [tid=%d:fd=%d] jobs=%u\n", mycpu, sdesc->tid, sdesc->ctxid, myjobs));

					FD_SET(sdesc->ctxid, &master_fds);
					pfmon_sdesc_fd_hash_add(myhash, sdesc);

					if (sdesc->ctxid > max_fd) max_fd = sdesc->ctxid;
					break;

				case PFMON_TASK_MSG_REM_TASK:
					myjobs--;
					vbprintf("wCPU%u removing [%d:%d]\n", mycpu, sdesc->tid, sdesc->ctxid);

					FD_CLR(sdesc->ctxid, &master_fds);
					pfmon_sdesc_fd_hash_remove(myhash, sdesc);

					/* XXX: approximation */
					if (sdesc->ctxid == max_fd) max_fd--;

					/*
					 * ack the removal.
					 *
					 * The reply must go out on THIS worker's own pipe:
					 * the worker table is filled by compact index, not
					 * by CPU number, so indexing it with mycpu selects
					 * the wrong entry (or runs past the table) whenever
					 * the CPU mask is sparse.
					 *
					 * NOTE(review): payload size is kept at sizeof(msg)
					 * as before; confirm it matches what the master
					 * reads on from_worker[0].
					 */
					if (write(mywork->from_worker[1], &msg, sizeof(msg)) == -1)
						warning("wCPU%u cannot ack task removal: %s\n", mycpu, strerror(errno));
					break;

				case PFMON_TASK_MSG_QUIT:
				default:
					warning("wCPU%u unexpected message %d, size=%d\n", mycpu, pfmon_msg.type, ret);
			}
		}
	}
}

/*
 * task_create_workers: allocate the worker table and start one worker
 * thread per CPU that is set in options.phys_cpu_mask.
 *
 * The table is indexed by a compact worker index (0..ncpus-1); the CPU a
 * worker serves is recorded in its cpu_id field. Each worker gets a pair of
 * pipes (to_worker / from_worker) for talking to the master.
 *
 * Any failure is reported through fatal_error().
 */
static void
task_create_workers(void)
{
	int i, j, ncpus = 0;
	long nfiles;
	unsigned long fd_hash_size, fd_hash_entries;
	unsigned long last_bit;
	int ret;

	/*
	 * compute number of accessible CPUs
	 */
	for (i = 0; i < options.online_cpus; i++) {
		if (PFMON_CPUMASK_ISSET(options.phys_cpu_mask, i)) ncpus++;
	}

	/* ncpus is used as a divisor below and as the table size */
	if (ncpus == 0)
		fatal_error("no accessible CPU to create worker threads on\n");

	/*
	 * sysconf() returns -1 when the limit is indeterminate or on error.
	 * Workers multiplex with select(), so FD_SETSIZE is a sensible bound
	 * to fall back on.
	 */
	nfiles = sysconf(_SC_OPEN_MAX);
	if (nfiles <= 0) nfiles = FD_SETSIZE;

	fd_hash_entries = (unsigned long)nfiles / (unsigned long)ncpus;

	/* round the per-worker entry count up to a power of two */
	last_bit = find_last_bit_set(fd_hash_entries);

	if (fd_hash_entries & ((1UL << last_bit)-1)) last_bit++;

	DPRINT(("ncpus=%d nfiles=%ld last_bit=%lu (entries=%lu)\n", ncpus, nfiles, last_bit, 1UL<<last_bit));

	fd_hash_entries = 1UL<<last_bit;
	fd_hash_size    = fd_hash_entries * sizeof(pfmon_sdesc_t *);

	/*
	 * Zero-filled allocation: the worker threads read mywork->fd_hash,
	 * which is not initialized anywhere in this function, so it must not
	 * be left holding indeterminate heap contents. calloc() also checks
	 * the count * size multiplication for overflow.
	 *
	 * The fd_hash_size bytes per worker that follow the table are reserved
	 * here as before, but no pointer into that area is stored by this
	 * function.
	 */
	workers = calloc((size_t)ncpus, fd_hash_size + sizeof(task_worker_t));
	if (workers == NULL) fatal_error("cannot allocate worker table\n");

	for (i=0, j=0; i < options.online_cpus; i++) {

		if (PFMON_CPUMASK_ISSET(options.phys_cpu_mask, i) == 0) continue;

		/* everything the thread reads must be set before it is created */
		workers[j].cpu_id = i;

		if (pipe(workers[j].to_worker) == -1 || pipe(workers[j].from_worker) == -1)
			fatal_error("cannot create control channels for worker for CPU%d\n", i);

		ret = pthread_create(&workers[j].thread_id, NULL, (void *(*)(void *))task_worker_mainloop, workers+j);
		if (ret != 0) 
			fatal_error("cannot create worker thread for CPU%u\n", i);

		j++;
	}
}

/*
 * pfmon_task_init: one-time setup for a per-task measurement session.
 *
 * Records the master thread id, initializes the master work semaphore,
 * prepares aggregation state and worker threads when the corresponding
 * options are set, creates the thread argument key, installs the per-task
 * exit hook and the global signal handlers, and compiles the exec-follow
 * pattern if one was given.
 *
 * Returns 0 on success, -1 on failure (a warning is printed for the
 * failures detected here).
 */
static int
pfmon_task_init(void)
{
	int ret;

	master_tid = gettid();

	/* the main loop blocks on this semaphore: it must be valid */
	if (sem_init(&master_work_sem, 0, 0) == -1) {
		warning("cannot initialize master semaphore: %s\n", strerror(errno));
		return -1;
	}

	if (options.opt_aggr) {
		pfmon_clone_sets(options.sets, &sdesc_task_aggr);
		if (pfmon_setup_aggr_sampling_output(&sdesc_task_aggr, &sdesc_task_aggr.csmpl) == -1) return -1;
	}

	if (options.opt_use_smpl) task_create_workers();

	/*
	 * create thread argument key
	 * (pthread_key_create() returns an error number, it does not set errno)
	 */
	ret = pthread_key_create(&arg_key, pfmon_thread_arg_destroy);
	if (ret != 0) {
		warning("cannot create thread argument key: %s\n", strerror(ret));
		return -1;
	}

	register_exit_function(exit_per_task);

	setup_global_signals();

	/*
	 * compile regex once and for all
	 */
	if (options.fexec_pattern) {
		if (regcomp(&follow_exec_preg, options.fexec_pattern, REG_ICASE|REG_NOSUB)) {
			warning("error in regular expression for event \"%s\"\n", options.fexec_pattern);
			return -1;
		}
	}
	vbprintf("exec-pattern=%s\n", options.fexec_pattern ? options.fexec_pattern : "*");
	return 0;
}

/*
 * Undo the session-wide hook installed by pfmon_task_init(): unregister
 * the per-task exit function by passing NULL.
 */
static void task_cleanup(void)
{
	register_exit_function(NULL);
}

/*
 * measure_task: entry point for a per-task measurement session.
 *
 * Runs the one-time initialization, then the master loop. When
 * initialization fails its status is returned immediately. Otherwise the
 * status of the master loop is returned after cleanup; the completion time
 * is reported only when the loop succeeded.
 */
int
measure_task(pfmon_ctx_t *ctx, char **argv)
{
	time_t end_time;
	int rc;

	rc = pfmon_task_init();
	if (rc != 0)
		return rc;

	rc = task_mainloop(ctx, argv);
	if (rc != 0) {
		task_cleanup();
		return rc;
	}

	time(&end_time);
	vbprintf("measurements completed at %s\n", asctime(localtime(&end_time)));

	task_cleanup();

	return 0;
}
