/*
 * Linux proc/<pid>/{stat,statm,status,...} Clusters
 *
 * Copyright (c) 2013-2021 Red Hat.
 * Copyright (c) 2000,2004,2006 Silicon Graphics, Inc.  All Rights Reserved.
 * Copyright (c) 2010 Aconex.  All Rights Reserved.
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 */

#include "pmapi.h"
#include "libpcp.h"
#include "pmda.h"
#include <ctype.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <pwd.h>
#include <grp.h>
#include "proc_pid.h"
#include "indom.h"
#include "cgroups.h"
#include "hotproc.h"

/*
 * Shared buffer pointer and its length.
 * NOTE(review): no user of these is visible in this part of the file;
 * presumably a scratch buffer for reading proc files - confirm.
 */
static size_t	procbuflen;
static char	*procbuf;

static proc_pid_list_t procpids; /* previous pids list that the proc pmda uses */

/* Forward declarations for routines used before their definition */
static void refresh_proc_pidlist(proc_pid_t *, proc_pid_list_t *, proc_runq_t *);
static int refresh_proc_pid_stat(proc_pid_entry_t *);
static int refresh_proc_pid_status(proc_pid_entry_t *);
static int refresh_proc_pid_io(proc_pid_entry_t *);
static int refresh_proc_pid_schedstat(proc_pid_entry_t *);

/* Hotproc variables */

/*
 * PIDs that we are keeping track of as POSSIBLE "hot" candidates.
 * This needs a separate list (distinct from procpids) since it is
 * generated by the timer update.
 */
static proc_pid_list_t hotpids;
/* Hold a pointer to this since we need it for the timer */
static proc_pid_t *hotproc_poss_pid;

/* Initial capacity (in elements) of each of the hotproc arrays below */
#define INIT_HOTPROC_MAX 200

/*
 * Actual processes that are hot based on the current configuration.
 * Filled in by hotproc_eval_procs (via add_hot_active_list).
 */
static pid_t *hot_active_list;

/* number of entries used in, and allocated capacity of, hot_active_list */
static int hot_numactive;
static int hot_maxactive = INIT_HOTPROC_MAX;

/* array size allocated */
static int hot_maxprocs[2] = {INIT_HOTPROC_MAX, INIT_HOTPROC_MAX};

/* number of procs used in list (<= hot_maxprocs) */
static int hot_numprocs[2] = {0, 0};

/*
 * Current and previous list of processes that we are considering for
 * "hot" inclusion.  Updated by the timer callback; keeps the stats
 * that are used for that determination.
 */
static process_t *hotproc_list[2] = {NULL, NULL};

/*
 * Indices into hotproc_list, hot_numprocs and hot_maxprocs; the two
 * values are swapped at the start of each hotproc_eval_procs call.
 */
static int current;
static int previous = 1;

/* various cpu time totals, only meaningful once hot_have_totals is set */
static int hot_have_totals;
static double hot_total_transient;
static double hot_total_cpuidle;
static double hot_total_active;
static double hot_total_inactive;

/* Timer period and registration id (-1 until a timer is registered) */
struct timeval   hotproc_update_interval;
int     hotproc_timer_id = -1;

/*
 * Find the per-process entry keyed by "id" in the pid hash table of
 * proc_pid.  Returns the entry, or NULL when the hash has no such key.
 */
proc_pid_entry_t *
proc_pid_entry_lookup(int id, proc_pid_t *proc_pid)
{
    __pmHashNode	*hit;

    hit = __pmHashSearch(id, &proc_pid->pidhash);
    if (hit == NULL)
	return NULL;
    return (proc_pid_entry_t *)hit->data;
}

/*
 * Report the cpu time totals computed by the most recent hotproc
 * evaluation: active (*ta), inactive (*ti), transient (*tt) and
 * cpu idle (*tci).
 *
 * Returns 1 and fills in all four values when totals are available,
 * otherwise returns 0 and leaves the output arguments untouched.
 */
int 
get_hot_totals(double * ta, double * ti, double * tt, double * tci )
{
    if (!hot_have_totals)
	return 0;

    *tci = hot_total_cpuidle;
    *tt = hot_total_transient;
    *ti = hot_total_inactive;
    *ta = hot_total_active;
    return 1;
}

/*
 * qsort(3) comparator for an array of int pids, ascending order.
 *
 * Uses comparisons rather than subtraction: "a - b" overflows (which
 * is undefined behaviour for signed int) when the operands are far
 * apart, e.g. INT_MAX and a negative value.
 */
static int
compare_pid(const void *pa, const void *pb)
{
    int a = *(const int *)pa;
    int b = *(const int *)pb;

    return (a > b) - (a < b);
}

/*
 * Append one pid to the end of a pid list, growing the array in
 * steps of 64 entries when it is full.
 *
 * If memory cannot be obtained the list is emptied (count and size
 * reset to zero) and the pid is dropped; the previous array is
 * released rather than leaked, which is what assigning the realloc()
 * result straight back to pids->pids used to do.
 */
static void
pidlist_append_pid(int pid, proc_pid_list_t *pids)
{
    if (pids->count >= pids->size) {
	int	*grown;

	grown = (int *)realloc(pids->pids, (pids->size + 64) * sizeof(int));
	if (grown == NULL) {
	    /* report first, free() is allowed to disturb errno */
	    perror("pidlist_append_pid: out of memory");
	    free(pids->pids);
	    pids->pids = NULL;
	    pids->size = pids->count = 0;
	    return;	/* soldier on bravely */
	}
	pids->pids = grown;
	pids->size += 64;
    }
    pids->pids[pids->count++] = pid;
}

/*
 * Append a pid given in its textual (directory name) form to a pid
 * list; the text is converted with atoi().
 */
static void
pidlist_append(const char *pidname, proc_pid_list_t *pids)
{
    int		pid = atoi(pidname);

    pidlist_append_pid(pid, pids);
}

/*
 * Append the ids of all tasks (threads) of process "pid" to a pid
 * list by scanning <proc_statspath>/proc/<pid>/task.  The entry whose
 * name equals "pid" itself is skipped, as are non-numeric entries.
 * Failure to open the task directory is only reported in debug mode.
 */
static void
tasklist_append(const char *pid, proc_pid_list_t *pids)
{
    DIR			*taskdirp;
    struct dirent	*tdp;
    char		taskpath[1024];

    pmsprintf(taskpath, sizeof(taskpath), "%s/proc/%s/task", proc_statspath, pid);
    if ((taskdirp = opendir(taskpath)) != NULL) {
	while ((tdp = readdir(taskdirp)) != NULL) {
	    /*
	     * isdigit() requires a value representable as unsigned char
	     * (or EOF); a plain char may be negative, so convert first.
	     */
	    if (!isdigit((unsigned char)tdp->d_name[0]) ||
		strcmp(pid, tdp->d_name) == 0)
		continue;
	    pidlist_append(tdp->d_name, pids);
	}
	closedir(taskdirp);
    } else if (pmDebugOptions.appl1 && pmDebugOptions.desperate) {
	fprintf(stderr, "%s: opendir(\"%s\") failed: %s\n",
			"tasklist_append", taskpath, pmErrStr(-oserror()));
    }
}

/*
 * Rebuild a pid list from the membership file of one cgroup.
 *
 * This is the cgroups mode of operation, where only a subset of the
 * processes is returned, selected by the cgroup that was specified
 * earlier via a store into proc.control.{all,perclient}.cgroups.
 *
 * The file that is read depends on want_threads and cgroup_version:
 * "tasks" for threads on version 1, "cgroup.threads" for threads on
 * later versions, and "cgroup.procs" in every other case.  The list
 * is used in the order read from the file (no sort is done here).
 *
 * Always returns 0; a file that cannot be opened yields an empty list.
 */
static int
refresh_cgroup_pidlist(int want_threads, proc_pid_list_t *pids, const char *cgroup)
{
    char		path[MAXPATHLEN];
    FILE		*fp;
    int			pid;

    pids->threads = want_threads;
    pids->count = 0;

    /* the cgroup version is discovered lazily */
    if (cgroup_version == 0)
	refresh_cgroup_filesys();

    if (want_threads && cgroup_version >= 1) {
	if (cgroup_version == 1)
	    pmsprintf(path, sizeof(path), "%s%s/tasks", proc_statspath, cgroup);
	else
	    pmsprintf(path, sizeof(path), "%s%s/container/cgroup.threads", proc_statspath, cgroup);
    } else {
	pmsprintf(path, sizeof(path), "%s%s/container/cgroup.procs", proc_statspath, cgroup);
    }

    fp = fopen(path, "r");
    if (fp == NULL) {
	if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
	    fprintf(stderr, "%s: fopen(\"%s\", \"r\") failed: %s\n",
			    "refresh_cgroup_pidlist", path, pmErrStr(-oserror()));
	return 0;
    }

    /* one decimal id per line */
    while (fscanf(fp, "%d\n", &pid) == 1)
	pidlist_append_pid(pid, pids);
    fclose(fp);
    return 0;
}

/*
 * Rebuild a pid list from every numeric entry of <proc_statspath>/proc
 * and, when want_threads is set, from each process's task directory.
 * The resulting list is sorted into ascending pid order.
 *
 * Returns 0 on success, or a negated errno value if the proc directory
 * cannot be opened (the list is left empty in that case).
 */
static int
refresh_global_pidlist(int want_threads, proc_pid_list_t *pids)
{
    DIR			*dirp;
    struct dirent	*dp;
    char		path[MAXPATHLEN];

    pids->count = 0;
    pids->threads = want_threads;

    pmsprintf(path, sizeof(path), "%s/proc", proc_statspath);
    if ((dirp = opendir(path)) == NULL) {
	/* capture the error now, fprintf() below may change errno */
	int		sts = -oserror();

	if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
	    fprintf(stderr, "%s: opendir(\"%s\") failed: %s\n",
		    "refresh_global_pidlist", path, pmErrStr(sts));
	return sts;
    }

    /* note: readdir on /proc ignores threads */
    while ((dp = readdir(dirp)) != NULL) {
	/* isdigit() needs an unsigned char value, plain char may be negative */
	if (isdigit((unsigned char)dp->d_name[0])) {
	    pidlist_append(dp->d_name, pids);
	    if (want_threads)
		tasklist_append(dp->d_name, pids);
	}
    }
    closedir(dirp);

    /* nothing to sort (and possibly no array at all) for 0 or 1 entries */
    if (pids->count > 1)
	qsort(pids->pids, pids->count, sizeof(int), compare_pid);
    return 0;
}

/*
 * Linear search of the active ("hot") pid list.
 * Returns 1 if pid is present, 0 otherwise.
 */
static int
in_hot_active_list(pid_t pid)
{
    int			idx = 0;
    int			found = 0;

    while (!found && idx < hot_numactive) {
	found = (hot_active_list[idx] == pid);
	idx++;
    }
    return found;
}

/*
 * Decide whether the process named by the decimal string cpid is in
 * the active hot list.  Returns 1 if so, 0 otherwise (including when
 * cpid does not start with a number).
 */
static int
check_if_hot(char *cpid)
{
    int			mypid;

    /*
     * sscanf() returns EOF (not 0) for an empty string, so anything
     * other than exactly one conversion means mypid was never set.
     */
    if (sscanf(cpid, "%d", &mypid) != 1)
	return 0;
    return in_hot_active_list(mypid) ? 1 : 0;
}

/*
 * Append to a pid list every process found in the proc directory that
 * is currently in the active hot list and, when pids->threads is set,
 * the tasks of each such process.  The list is then sorted ascending.
 * The caller is expected to have reset pids->count beforehand.
 *
 * The directory scanned is <proc_statspath>/proc, the same location
 * that refresh_global_pidlist() and tasklist_append() use, instead of
 * a hard-coded "/proc".
 *
 * Returns 0 on success or a negated errno value if the directory
 * cannot be opened.
 */
static int
refresh_hotproc_pidlist(proc_pid_list_t *pids)
{
    DIR			*dirp;
    struct dirent	*dp;
    char		path[MAXPATHLEN];

    pmsprintf(path, sizeof(path), "%s/proc", proc_statspath);
    if ((dirp = opendir(path)) == NULL)
	return -oserror();

    /* note: readdir on /proc ignores threads */
    while ((dp = readdir(dirp)) != NULL) {
	/* isdigit() needs an unsigned char value, plain char may be negative */
	if (!isdigit((unsigned char)dp->d_name[0]))
	    continue;
	if (check_if_hot(dp->d_name)) {
	    pidlist_append(dp->d_name, pids);
	    if (pids->threads)
		tasklist_append(dp->d_name, pids);
	}
    }
    closedir(dirp);

    /* nothing to sort (and possibly no array at all) for 0 or 1 entries */
    if (pids->count > 1)
	qsort(pids->pids, pids->count, sizeof(int), compare_pid);
    return 0;
}

/*
 * Allocate the active pid list and both (current/previous) candidate
 * process lists, each with room for INIT_HOTPROC_MAX entries.
 *
 * Returns 0 on success.  If any allocation fails, the ones that did
 * succeed are released again, all three pointers are left NULL and a
 * negated errno value is returned.
 */
static int
init_hotproc_list(void)
{
    hot_active_list = (pid_t*)malloc(INIT_HOTPROC_MAX * sizeof(pid_t));
    hotproc_list[0] = (process_t*)malloc(INIT_HOTPROC_MAX * sizeof(process_t));
    hotproc_list[1] = (process_t*)malloc(INIT_HOTPROC_MAX * sizeof(process_t));
    if (hotproc_list[0] == NULL || hotproc_list[1] == NULL || hot_active_list == NULL) {
	/* capture the error before free() gets a chance to alter errno */
	int		sts = -oserror();

	free(hot_active_list);
	free(hotproc_list[0]);
	free(hotproc_list[1]);
	hot_active_list = NULL;
	hotproc_list[0] = hotproc_list[1] = NULL;
	return sts;
    }
    return 0;
}

/*
 * Empty the active ("hot") pid list.  Only the element count is
 * reset, the array itself is kept for reuse.
 */
static void
init_hot_active_list(void)
{
    hot_numactive = 0;
}

/*
 * add_hot_active_list:
 * Evaluate the hotproc configuration against "vars" and, if it
 * matches, append node->pid to the active list.
 * - If unsuccessful in add - due to memory - then return neg status.
 * - If member of active list return 1
 * - If non-member of active list return 0
 *
 * The recorded capacity (hot_maxactive) is only updated once the
 * larger array has really been obtained; updating it before a failed
 * realloc() would leave it overstating the array size, and later
 * appends would then write past the end of the old array.
 */
static int
add_hot_active_list(process_t *node, config_vars *vars)
{
    pid_t		*res;

    if (eval_tree(vars) == 0)
        return 0;

    if (hot_numactive >= hot_maxactive) {
	int		newmax;

	/* double the capacity, restarting from the default if it is zero */
	newmax = (hot_maxactive > 0) ? hot_maxactive * 2 : INIT_HOTPROC_MAX;
	res = (pid_t *)realloc(hot_active_list, newmax * sizeof(pid_t));
	if (res == NULL)
	    return -1;
	hot_active_list = res;
	hot_maxactive = newmax;
    }
    hot_active_list[hot_numactive++] = node->pid;
    return 1;
}

static int
compare_pids(const void *n1, const void *n2)
{
    return ((process_t*)n2)->pid - ((process_t*)n1)->pid;
}

/*
 * Binary search one of the two hotproc lists (selected by curr_prev)
 * for the node belonging to pid.  Returns the node, or NULL if the
 * list is empty or holds no such pid.
 */
static process_t *
lookup_node(int curr_prev, pid_t pid)
{
    process_t		key;
    int			nprocs = hot_numprocs[curr_prev];

    if (nprocs <= 0)
	return NULL;

    /* only the pid field of the key is examined by compare_pids */
    key.pid = pid;
    return bsearch(&key, hotproc_list[curr_prev], nprocs,
		   sizeof(process_t), compare_pids);
}

/*
 * Look pid up in the current hotproc list; NULL if it is not there.
 */
static process_t *
lookup_curr_node(pid_t pid)
{
    process_t		*node = lookup_node(current, pid);

    return node;
}

/*
 * Difference between two samples of a counter, allowing for wrap.
 *
 * When the plain difference is negative the counter is taken to have
 * wrapped, and the range of the counter type is added back:
 * UINT_MAX+1 for the 32-bit types and ULONGLONG_MAX+1 for the 64-bit
 * types.  Other types are returned unadjusted.
 */
static double
diff_counter(double current, double previous, int pmtype)
{
    double		delta = current - previous;

    /* written as a negated "<" so that a NaN is passed through as-is */
    if (!(delta < 0.0))
	return delta;

    if (pmtype == PM_TYPE_32 || pmtype == PM_TYPE_U32)
	delta += (double)UINT_MAX+1;
    else if (pmtype == PM_TYPE_64 || pmtype == PM_TYPE_U64)
	delta += (double)ULONGLONG_MAX+1;

    return delta;
}

/*
 * Fetch the current hotproc node for pid.
 *
 * *getnode is set to the node when pid is in the active hot list and
 * has a node in the current list, otherwise it is set to NULL.
 * Returns 1 when a node was found, 0 when not.
 */
int
get_hotproc_node(pid_t pid, process_t **getnode)
{
    process_t		*found = NULL;

    if (in_hot_active_list(pid))
	found = lookup_curr_node(pid);

    *getnode = found;
    return (found != NULL);
}

/*
 * Read the aggregate idle time (4th value of the "cpu" line) from
 * <proc_statspath>/proc/stat.
 * The idea of this is copied from linux/proc_stat.c
 *
 * Returns the idle time, or 0 if the file cannot be opened or the
 * line cannot be parsed.  The return type is unsigned, so a negated
 * errno value cannot be returned to signal failure - it would be seen
 * by the caller as an enormous idle time.
 */
static unsigned long long
get_idle_time(void)
{
    FILE		*fp = NULL;
    unsigned long long	idle_time = 0;
    int			n;
    char		buf[MAXPATHLEN];

    pmsprintf(buf, sizeof(buf), "%s/proc/stat", proc_statspath);
    if ((fp = fopen(buf, "r")) == NULL)
	return 0;
    n = fscanf(fp, "cpu %*u %*u %*u %llu %*u %*u %*u %*u %*u", &idle_time);
    if (n != 1)
	idle_time = 0;
    fclose(fp);

    return idle_time;
}

/*
 * For each pid, compute stats and store in hotpid array
 * (called by the timer)
 *
 * Each call swaps the current/previous list indices, rescans all
 * processes, records raw counters for every process whose stat and
 * status files could be read, and - for processes also present in the
 * previous list - derives per-second rates.  Every process is then
 * run through the hotproc configuration (add_hot_active_list) to
 * rebuild the active list.  From the second call onwards the overall
 * active/inactive/transient/idle totals are also updated.
 *
 * Returns 0 on success, or a negative value when one of the lists
 * could not be grown.
 */
static int
hotproc_eval_procs(void)
{
    struct timeval	ts;
    char                *name;
    process_t		*oldnode = NULL, *newnode = NULL;
    struct timeval	timestamp;
    config_vars		vars;
    proc_pid_entry_t    *entry;
    pid_t		pid;
    int			np = 0, i, sts;

    static double	refresh_time[2];  /* timestamp after refresh */
    static time_t	sysidle[2];       /* sys idle from /proc/stat */
    static int		num_cpus;
    static unsigned int	hot_refresh_count;

    /* Still need to compute some of these */
    double	sysidle_delta,	/* system idle delta time since last refresh */
		actual_delta,	/* actual delta time since last refresh */
		transient_delta,/* calculated delta time of transient procs */
		cputime_delta,	/* delta cpu time for a process */
		vctx_delta,	/* delta num of vol ctx switches for a process */
		ictx_delta,	/* delta num of invol ctx switches for a process */
		bread_delta,	/* delta num of bytes read */
		bwrit_delta,	/* delta num of bytes written */
		bwtime_delta,	/* delta num of microsec for waiting for blocked io */
		qwtime_delta,	/* delta num of nanosec waiting on run queue */
		timestamp_delta,/* real time delta b/w refreshes for process */
		total_cputime,		/* total of cputime_deltas for each process */
		total_activetime,	/* total of cputime_deltas for active processes */
		total_inactivetime;	/* total of cputime_deltas for inactive processes */

    total_cputime = total_activetime = total_inactivetime = 0;

    if (num_cpus == 0)
	num_cpus = sysconf(_SC_NPROCESSORS_ONLN);

    /* swap the roles of the two lists */
    if (current == 0) {
        current = 1; previous = 0;
    } else {
        current = 0; previous = 1;
    }

    init_hot_active_list();

    memset(&vars, 0, sizeof(config_vars));

    hotpids.count = 0;
    hotpids.threads = 0;

    /* Whats running right now */
    refresh_global_pidlist(0, &hotpids);
    refresh_proc_pidlist(hotproc_poss_pid, &hotpids, NULL);

    pmtimevalNow(&timestamp);

    for (i=0; i < hotpids.count; i++) {

	pid = hotpids.pids[i];

	entry = proc_pid_entry_lookup(pid, hotproc_poss_pid);
	if (entry == NULL) {
	    /* report the pid that is missing, not the loop index */
	    fprintf(stderr, "%s: hash search failed for process %d\n",
			    "hotproc_eval_procs", (int)pid);
	    continue;
	}

	/* Collect all the stat/status/statm info */
	refresh_proc_pid_stat(entry);
	refresh_proc_pid_status(entry);
	refresh_proc_pid_io(entry);
	refresh_proc_pid_schedstat(entry);

        /* Note: /proc/pid/schedstat and /proc/pid/io not on all platforms */
	if (!(entry->flags & PROC_PID_FLAG_STAT_SUCCESS) ||
	    !(entry->flags & PROC_PID_FLAG_STATUS_SUCCESS))
	    continue;

	if (np == hot_maxprocs[current]) {
	    int		newmax = np*2;
	    process_t	*res;

	    res = (process_t *)realloc(hotproc_list[current],
		    newmax * sizeof(process_t));
	    if (res == NULL)
		return -oserror();
	    /*
	     * Only record the new capacity once it really exists, else a
	     * failed realloc leaves the capacity overstated for later calls.
	     */
	    hotproc_list[current] = res;
	    hot_maxprocs[current] = newmax;
	}

	newnode = &hotproc_list[current][np++];
        newnode->pid = pid;
	/* floating point divide, so the microseconds are not thrown away */
	newnode->r_cputimestamp = timestamp.tv_sec + timestamp.tv_usec / 1000000.0;

	/* Calculate the stats we will need */

	/* CPU time is user and system time */
	newnode->r_cputime = (double)entry->stat.utime + entry->stat.stime;
	newnode->r_cputime /= (double)_pm_hertz;

	/* Context Switches : voluntary and involuntary */
	newnode->r_vctx = entry->status.vctxsw;
	newnode->r_ictx = entry->status.nvctxsw;

	/* IO demand: read and write - not available from all kernels */
	if (!(entry->flags & PROC_PID_FLAG_IO_SUCCESS)) {
	    newnode->r_bread = 0;
	    newnode->r_bwrit = 0;
	} else {
	    newnode->r_bread = entry->io.readb;
	    newnode->r_bwrit = entry->io.writeb;
	}

	/* Block IO wait (delayacct_blkio_ticks) */
	newnode->r_bwtime = (double)entry->stat.delayacct_blkio_time;
	newnode->r_bwtime /= (double)_pm_hertz;

	/* Schedwait (run_delay) - not available from all kernels */
	if (!(entry->flags & PROC_PID_FLAG_SCHEDSTAT_SUCCESS))
	    newnode->r_qwtime = 0;
        else
	    newnode->r_qwtime = entry->schedstat.rundelay;

	/*
	 * Rates need a previous sample for this pid and a non-zero time
	 * interval between the samples (every rate is divided by it).
	 */
	oldnode = lookup_node(previous, pid);
	timestamp_delta = 0;
	if (oldnode != NULL)
	    timestamp_delta = diff_counter(newnode->r_cputimestamp,
				oldnode->r_cputimestamp, PM_TYPE_64);

	/* This is not the first time through, so we can generate rate stats */
	if (oldnode != NULL && timestamp_delta > 0) {

	    /* CPU */
	    cputime_delta = diff_counter(newnode->r_cputime, oldnode->r_cputime, PM_TYPE_64);

	    newnode->r_cpuburn = cputime_delta / timestamp_delta;
	    vars.cpuburn = newnode->r_cpuburn;

	    /* IO */
	    bread_delta = diff_counter((double)newnode->r_bread,
                                   (double)oldnode->r_bread, PM_TYPE_64);
	    bwrit_delta = diff_counter((double)newnode->r_bwrit,
                                    (double)oldnode->r_bwrit, PM_TYPE_64);
	    vars.preds.iodemand = (
                                 (double)bread_delta  +
                                 (double)bwrit_delta ) /
                                timestamp_delta;

	    /* ctx switches */
	    vctx_delta = diff_counter((double)newnode->r_vctx,
                                    (double)oldnode->r_vctx, PM_TYPE_64);
	    ictx_delta = diff_counter((double)newnode->r_ictx,
                                    (double)oldnode->r_ictx, PM_TYPE_64);
	    vars.preds.ctxswitch = (vctx_delta + ictx_delta) / timestamp_delta;

	    /* IO wait */
	    bwtime_delta = diff_counter((double)newnode->r_bwtime,
                                    (double)oldnode->r_bwtime, PM_TYPE_64);

	    vars.preds.iowait = bwtime_delta / timestamp_delta;

	    /* schedwait */
	    qwtime_delta = diff_counter((double)newnode->r_qwtime,
		    (double)oldnode->r_qwtime, PM_TYPE_64);
	    /* run_delay in nsec */
	    vars.preds.schedwait = qwtime_delta / (timestamp_delta * 1000000000);
	}
        else {
	    newnode->r_cpuburn = 0;
	    memset(&newnode->preds, 0, sizeof(newnode->preds));
	    vars.cpuburn = 0;
	    vars.preds.ctxswitch = 0;
	    vars.preds.iowait = 0;
	    vars.preds.schedwait = 0;
	    vars.preds.iodemand = 0;
	    cputime_delta = 0;
        }

        total_cputime += cputime_delta;

	/* Command */
	if (entry->stat.cmd == NULL)
	    strcpy(vars.fname, "Unknown");
	else {
	    char *cmd = entry->stat.cmd;
	    size_t len = strlen(cmd);
	    int parens = 0;

	    if (cmd[0] == '(') { /* skip enclosing parentheses */
		parens = 1;
		cmd++;
		len--;
	    }

	    strncpy(vars.fname, cmd, sizeof(vars.fname)-1);
	    /* len may be zero here (cmd was just "("), so guard cmd[len-1] */
	    if (len > 0 && len < sizeof(vars.fname) && parens && cmd[len-1] == ')')
		vars.fname[len-1] = '\0'; /* skip closing parenthesis */
	    vars.fname[sizeof(vars.fname) - 1] = '\0';
	}

	/*
	 * PS Args: everything after the pid and the blank that follows it
	 * in the entry name.  The pid prefix is formatted with "%06d ", so
	 * it is longer than 7 characters for pids above 999999 - locate
	 * the blank rather than assuming a fixed offset.
	 */
	{
	    const char	*psargs = NULL;

	    if (entry->name != NULL)
		psargs = strchr(entry->name, ' ');
	    psargs = (psargs != NULL) ? psargs + 1 : "";
	    strncpy(vars.psargs, psargs, sizeof(vars.psargs));
	    vars.psargs[sizeof(vars.psargs)-1]='\0';
	}

	/* UID and GID */
	vars.uid = entry->status.uid;
	vars.gid = entry->status.gid;

	/* uname and gname */
	if ((name = proc_uidname_lookup(vars.uid)) != NULL) {
	    strncpy(vars.uname, name, sizeof(vars.uname));
	    vars.uname[sizeof(vars.uname)-1] = '\0';
	} else {
	    strcpy(vars.uname, "UNKNOWN");
	}
	if ((name = proc_gidname_lookup(vars.gid)) != NULL) {
	    strncpy(vars.gname, name, sizeof(vars.gname));
	    vars.gname[sizeof(vars.gname)-1] = '\0';
	} else {
	    strcpy(vars.gname, "UNKNOWN");
	}

	/* VSIZE from stat */
	vars.preds.virtualsize = entry->stat.vsize / 1024;

	/* RSS from stat */
	vars.preds.residentsize = entry->stat.rss * _pm_system_pagesize / 1024;

	/* Struct copy - copy should be after rss and vm calcs. */
	newnode->preds = vars.preds;

	if ((sts = add_hot_active_list(newnode, &vars)) < 0)
	    return sts;

	if (sts == 0)
	    total_inactivetime += cputime_delta;
	else
	    total_activetime += cputime_delta;
    }

    hot_numprocs[current] = np;

    pmtimevalNow(&ts);
    /* floating point divide, so the microseconds are not thrown away */
    refresh_time[current] = ts.tv_sec + ts.tv_usec / 1000000.0;

    if (pmDebugOptions.appl1) {
	double hptime = (ts.tv_sec - timestamp.tv_sec) +
			(ts.tv_usec - timestamp.tv_usec) / 1000000.0;
	fprintf(stderr, "%s: update took %f time\n",
			"hotproc_eval_procs", hptime);
    }

    /* Idle */
    sysidle[current] = get_idle_time();

    /* Handle rollover */
    if (++hot_refresh_count == 0)
	hot_refresh_count = 2;

    if (hot_refresh_count > 1) {
	/*
	 * Idle time is in clock ticks, like utime/stime above, so it is
	 * scaled by the same _pm_hertz value.
	 */
	sysidle_delta = diff_counter(sysidle[current], sysidle[previous], PM_TYPE_64) / (double)_pm_hertz;
	actual_delta = diff_counter(refresh_time[current], refresh_time[previous], PM_TYPE_64);

	/* all of the totals are rates over actual_delta, which must be non-zero */
	if (actual_delta > 0) {
	    transient_delta = num_cpus * actual_delta - (total_cputime + sysidle_delta);
	    if (transient_delta < 0) /* sanity check */
		transient_delta = 0;

	    hot_have_totals = 1;
	    hot_total_transient = transient_delta / actual_delta;
	    hot_total_cpuidle = sysidle_delta / actual_delta;
	    hot_total_active = total_activetime / actual_delta;
	    hot_total_inactive = total_inactivetime / actual_delta;
	}
    }

    /* lookup_node() uses bsearch, so the list must be left sorted */
    qsort(hotproc_list[current], hot_numprocs[current],
          sizeof(process_t), compare_pids);

    return 0;
}

/*
 * Timer callback: re-evaluate the set of hot processes.  Neither
 * argument is used, and the evaluation status is not examined.
 */
static void
hotproc_timer(int sig, void *ptr)
{
    (void)sig;
    (void)ptr;

    hotproc_eval_procs();
}

/*
 * One-time hotproc setup: remember the proc_pid_t used by the timer
 * driven evaluation, set the update interval to 10 seconds, allocate
 * the hotproc lists and (re)arm the timer.
 *
 * If the lists cannot be allocated the failure is logged and the
 * timer is not armed here, since the timer callback works on those
 * lists.
 */
void
init_hotproc_pid(proc_pid_t *_hotproc_poss_pid)
{
    int			sts;

    hotproc_poss_pid = _hotproc_poss_pid;
    hotproc_update_interval.tv_sec = 10;

    if ((sts = init_hotproc_list()) < 0) {
	pmNotifyErr(LOG_ERR, "error allocating hotproc lists: %s",
			pmErrStr(sts));
	return;
    }
    reset_hotproc_timer();
}

/*
 * Drop any existing hotproc timer and register a new one using the
 * current hotproc_update_interval.  Does nothing unless a valid
 * configuration is present (conf_gen non-zero).  A registration
 * failure is logged and terminates the process.
 */
void
reset_hotproc_timer(void)
{
    int			id;

    /* Only reset/enable timer when a valid configuration is present. */
    if (conf_gen) {
	__pmAFunregister(hotproc_timer_id);
	id = __pmAFregister(&hotproc_update_interval, NULL, hotproc_timer);
	if (id < 0) {
	    pmNotifyErr(LOG_ERR, "error registering hotproc timer: %s",
			    pmErrStr(id));
	    exit(1);
	}
	hotproc_timer_id = id;
    }
}

void
disable_hotproc(void)
{
    /* Clear out the hotlist */
    init_hot_active_list();
    /* Disable the timer */
    __pmAFunregister(hotproc_timer_id);
    conf_gen = 0;
}

/*
 * Fill slot idx of the instance domain table from a process entry.
 * The internal instance id is the pid; the name pointer is shared
 * with the entry and so must not be freed through the table.
 */
static void
refresh_proc_indom_entry(proc_pid_entry_t *ep, pmdaIndom *indomp, int idx)
{
    pmdaInstid		*slot = &indomp->it_set[idx];

    slot->i_name = ep->instname; /* ptr ref, do not free */
    slot->i_inst = ep->id;
}

/*
 * Account one process in the run queue summary, based on the first
 * character of its state:
 * - stat not successfully read     -> unknown
 * - 'Z'                            -> defunct
 * - virtual size of zero           -> swapped (kernel process)
 * - 'R' runnable, 'S' sleeping, 't'/'T' stopped, 'P'/'D' blocked
 * - anything else                  -> unknown (reported in debug mode)
 */
static void
refresh_proc_runq(proc_pid_entry_t *ep, proc_runq_t *runq)
{
    int			state;

    if (!(ep->flags & PROC_PID_FLAG_STAT_SUCCESS)) {
	runq->unknown++;
	return;
    }

    state = ep->stat.state[0];
    if (state == 'Z') {
	runq->defunct++;
	return;
    }
    if (ep->stat.vsize == 0) {
	/* kernel process (not defunct and virtual size is zero) */
	runq->swapped++;
	return;
    }

    /* all other states :- fs/proc/array.c::task_state_array */
    switch (state) {
    case 'R':
	runq->runnable++;
	break;
    case 'S':
	runq->sleeping++;
	break;
    case 'T':
    case 't':
	runq->stopped++;
	break;
    case 'D':
    case 'P':
	runq->blocked++;
	break;
    default:
	/* 'Z' never gets here, it was counted above */
	if (pmDebugOptions.appl1)
	    fprintf(stderr, "%s: UNKNOWN process state %c on pid %d\n",
			"refresh_proc_runq", ep->stat.state[0], ep->id);
	runq->unknown++;
	break;
    }
}

/*
 * Bring the pid hash table and the instance domain of proc_pid into
 * line with a freshly gathered pid list:
 * - entries for pids no longer in the list are freed and unlinked;
 * - new pids get an entry whose name is "<pid> <cmdline>", or
 *   "<pid> (<name from status>)" when the command line is empty, or
 *   "<pid> <exiting>" when neither can be read;
 * - the instance domain table is rebuilt from the surviving entries;
 * - when runq is non-NULL it is zeroed and the process state of every
 *   surviving entry is accumulated into it.
 *
 * A pid for which memory cannot be obtained is simply left out of the
 * hash table and instance domain for this refresh.
 */
static void
refresh_proc_pidlist(proc_pid_t *proc_pid, proc_pid_list_t *pids, proc_runq_t *runq)
{
    int			i, fd, numinst, idx = 0;
    char		*p, buf[MAXPATHLEN];
    __pmHashNode	*node, *next, *prev;
    proc_pid_entry_t	*ep;
    pmdaIndom		*indomp = proc_pid->indom;
    pmdaInstid		*set = NULL;

    /*
     * invalidate all entries so we can harvest pids that have exited
     */
    for (i=0; i < proc_pid->pidhash.hsize; i++) {
	for (node=proc_pid->pidhash.hash[i]; node != NULL; node = node->next) {
	    ep = (proc_pid_entry_t *)node->data;
	    ep->flags = 0;
	}
    }

    /*
     * walk pid list and add new pids to the hash table,
     * marking entries valid as we go ...
     */
    for (i=0; i < pids->count; i++) {
	node = __pmHashSearch(pids->pids[i], &proc_pid->pidhash);
	if (node)
	    ep = (proc_pid_entry_t *)node->data;
	else {
	    int k = 0;

	    ep = (proc_pid_entry_t *)calloc(1, sizeof(proc_pid_entry_t));
	    if (ep == NULL) {
		fprintf(stderr, "%s: out of memory for process %d\n",
			"refresh_proc_pidlist", pids->pids[i]);
		continue;
	    }

	    ep->id = pids->pids[i];

	    pmsprintf(buf, sizeof(buf), "%s/proc/%d/cmdline", proc_statspath, pids->pids[i]);
	    if ((fd = open(buf, O_RDONLY)) >= 0) {
		int numlen = pmsprintf(buf, sizeof(buf), "%06d ", pids->pids[i]);
		if ((k = read(fd, buf+numlen, sizeof(buf)-numlen)) > 0) {
		    p = buf + k + numlen;
		    if (p - buf >= sizeof(buf))
			p--;
		    *p-- = '\0';
		    /* Skip trailing nils, i.e. don't replace them */
		    while (buf+numlen < p) {
			if (*p-- != '\0') {
				break;
			}
		    }
		    /* Remove NULL terminators from cmdline string array */
		    while (buf+numlen < p) {
			if (*p == '\0') *p = ' ';
			p--;
		    }
		}
		close(fd);
	    }
	    else if (pmDebugOptions.appl1 && pmDebugOptions.desperate) {
		fprintf(stderr, "%s: open(\"%s\", O_RDONLY) failed: %s\n",
			"refresh_proc_pidlist", buf, pmErrStr(-oserror()));
	    }
	    if (k == 0) {
		/*
		 * If a process is swapped out, /proc/<pid>/cmdline
		 * returns an empty string so we have to get it
		 * from /proc/<pid>/status or /proc/<pid>/stat
		 */
		pmsprintf(buf, sizeof(buf), "%s/proc/%d/status", proc_statspath, pids->pids[i]);
		if ((fd = open(buf, O_RDONLY)) >= 0) {
		    /*
		     * The first line of the status file looks like
		     * "Name:\tname\n" and the instance name should look
		     * like "123456 (name)".  The line is read into its
		     * own buffer and the name formatted from there, so
		     * the result is right whatever the width of the pid
		     * (the "%06d" prefix is wider than six characters
		     * for pids above 999999).
		     */
		    char	status[256];

		    if ((k = read(fd, status, sizeof(status)-1)) > 0) {
			char	*comm = status;

			/* read() does not terminate what it returns */
			status[k] = '\0';
			if ((p = strchr(status, '\n')) != NULL)
			    *p = '\0';
			if (strncmp(status, "Name:\t", 6) == 0)
			    comm += 6;
			pmsprintf(buf, sizeof(buf), "%06d (%s)", pids->pids[i], comm);
		    }
		    close(fd);
		}
		else if (pmDebugOptions.appl1 && pmDebugOptions.desperate) {
		    fprintf(stderr, "%s: open(\"%s\", O_RDONLY) failed: %s\n",
			    "refresh_proc_pidlist", buf, pmErrStr(-oserror()));
		}
	    }

	    if (k <= 0) {
		/* hmm .. must be exiting */
		pmsprintf(buf, sizeof(buf), "%06d <exiting>", pids->pids[i]);
	    }

	    /* without a name or a hash slot the entry cannot be used */
	    if ((ep->name = strdup(buf)) == NULL ||
		__pmHashAdd(pids->pids[i], (void *)ep, &proc_pid->pidhash) < 0) {
		fprintf(stderr, "%s: out of memory for process %d\n",
			"refresh_proc_pidlist", pids->pids[i]);
		free(ep->name);
		free(ep);
		continue;
	    }
	}

	if (ep->instname == NULL) {
	    /*
	     * The external instance name is the pid followed by
	     * a copy of the psargs truncated at the first space.
	     * e.g. "012345 /path/to/command". Command line args,
	     * if any, are truncated. The full command line is
	     * available in the proc.psinfo.psargs metric.
	     */
	    if ((p = strchr(ep->name, ' ')) != NULL) {
		if ((p = strchr(p+1, ' ')) != NULL) {
		    int len = p - ep->name;
		    if (len > PROC_PID_STAT_CMD_MAXLEN)
			len = PROC_PID_STAT_CMD_MAXLEN;
		    ep->instname = strndup(ep->name, len);
		}
	    }
	    if (ep->instname == NULL) /* no spaces found, so use the full name */
		ep->instname = strndup(ep->name, PROC_PID_STAT_CMD_MAXLEN);
	    if (ep->instname == NULL) {
		/*
		 * Out of memory: leave the entry unmarked, so that it is
		 * harvested below instead of reaching the instance
		 * domain without a name.
		 */
		continue;
	    }
	}

	/* mark pid as valid (new or still running) */
	ep->flags |= PROC_PID_FLAG_VALID;
    }

    /*
     * harvest pids that have exit'ed
     */
    numinst = 0;
    for (i=0; i < proc_pid->pidhash.hsize; i++) {
	for (prev=NULL, node=proc_pid->pidhash.hash[i]; node != NULL; node = next) {
	    /* node may be freed below, so pick up its successor first */
	    next = node->next;
	    ep = (proc_pid_entry_t *)node->data;
	    if (ep->flags & PROC_PID_FLAG_VALID) {
		numinst++;
		prev = node;
	    }
	    else {
		/* This process has exited (free(NULL) is a no-op) */
		free(ep->instname);
		free(ep->name);
		free(ep->stat.cmd);
		free(ep->maps_buf);
		free(ep->wchan_buf);
		free(ep->environ_buf);
		if (prev == NULL)
		    proc_pid->pidhash.hash[i] = node->next;
		else
		    prev->next = node->next;
		free(ep);
		free(node);
	    }
	}
    }

    /* Reset accounting of the runqueue metrics, initially all zeroes */
    if (runq)
	memset(runq, 0, sizeof(proc_runq_t));

    /*
     * At this point, the hash table contains only valid pids.  Finally:
     * - refresh the indom table, based on the updated process hash table.
     *   (indom table instance names are shared with the hash table entry,
     *    so must not be freed).
     * - if runq metrics are being gathered, sample stat files now for all
     *   active processes and accumulate the values.
     *
     * An empty table is represented by a NULL it_set, and so is a table
     * that could not be resized: the old one may reference names that
     * were freed by the harvest above, so it cannot be kept.
     */
    if (numinst > 0) {
	set = (pmdaInstid *)realloc(indomp->it_set, numinst * sizeof(pmdaInstid));
	if (set == NULL) {
	    fprintf(stderr, "%s: out of memory for %d instances\n",
		    "refresh_proc_pidlist", numinst);
	    numinst = 0;
	}
    }
    if (numinst == 0) {
	free(indomp->it_set);
	set = NULL;
    }
    indomp->it_set = set;
    indomp->it_numinst = numinst;

    for (i=0; i < proc_pid->pidhash.hsize; i++) {
	for (node=proc_pid->pidhash.hash[i]; node != NULL; node=node->next) {
	    ep = (proc_pid_entry_t *)node->data;
	    if (runq) {
		refresh_proc_pid_stat(ep);
		refresh_proc_runq(ep, runq);
	    }
	    if (set != NULL)
		refresh_proc_indom_entry(ep, indomp, idx++);
	}
    }
}

/*
 * Refresh the process instance domain and hash table of proc_pid.
 *
 * The pid list comes from a cgroup when a container is given or when
 * cgroups is a non-empty string, otherwise from the whole proc
 * directory.  For a container, the cgroup path is obtained from
 * cgroup_container_path().  proc_runq, if non-NULL, receives the run
 * queue summary.  namelen is not used here.
 *
 * Returns 0 on success or the negative status of the pid list refresh.
 */
int
refresh_proc_pid(proc_pid_t *proc_pid, proc_runq_t *proc_runq,
		 int want_threads, const char *cgroups,
		 const char *container, int namelen)
{
    char		path[MAXPATHLEN];
    const char		*filter = cgroups;
    int			want_cgroups;
    int			sts;

    want_cgroups = (container != NULL) ||
		   (cgroups != NULL && cgroups[0] != '\0');

    /*
     * For containers we asked pmdaroot for a cgroup name for the container;
     * next find a matching filesystem path we can use to look up processes.
     */
    if (container)
	filter = cgroup_container_path(path, sizeof(path), container);

    if (want_cgroups)
	sts = refresh_cgroup_pidlist(want_threads, &procpids, filter);
    else
	sts = refresh_global_pidlist(want_threads, &procpids);
    if (sts < 0)
	return sts;

    if (pmDebugOptions.appl1) {
	fprintf(stderr, "%s: %d pids (threads=%d, %s=\"%s\")\n",
		"refresh_proc_pid", procpids.count, procpids.threads,
		container ? "container" : "cgroups", filter ? filter : "");
    }

    refresh_proc_pidlist(proc_pid, &procpids, proc_runq);
    return 0;
}

/*
 * Refresh the hotproc instance domain and hash table of proc_pid
 * from the processes currently in the active hot list.  The cgroups
 * argument is not used here.
 *
 * Returns 0 on success or the negative status of the pid list refresh.
 */
int
refresh_hotproc_pid(proc_pid_t *proc_pid, int threads, const char *cgroups)
{
    int			sts;

    hotpids.threads = threads;
    hotpids.count = 0;

    sts = refresh_hotproc_pidlist(&hotpids);
    if (sts < 0)
	return sts;

    refresh_proc_pidlist(proc_pid, &hotpids, NULL);
    return 0;
}


/*
 * Open a proc file, taking into account that we may want thread info
 * rather than process information.
 *
 * We make (ab)use of some obscure Linux procfs mechanisms here!
 * Even though readdir(/proc) does not contain tasks, we can still open
 * taskid directory files; on top of that, the tasks sub-directory in a
 * task group has all (peer) tasks in that group, even for "children".
 *
 * base is the file name below the per-process directory and ep is the
 * process entry.  When procpids.threads is set the per-task path is
 * tried first, with a fallback to the per-process path.
 * Returns an open read-only file descriptor, or a negative value when
 * the file could not be opened.
 *
 * The path buffer is MAXPATHLEN bytes, like the other path buffers in
 * this file, so that a long proc_statspath prefix is not truncated.
 */
static int
proc_open(const char *base, proc_pid_entry_t *ep)
{
    int			fd;
    char		buf[MAXPATHLEN];

    if (procpids.threads) {
	pmsprintf(buf, sizeof(buf), "%s/proc/%d/task/%d/%s",
			proc_statspath, ep->id, ep->id, base);
	fd = open(buf, O_RDONLY);
	if (fd < 0) {
	    if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
		fprintf(stderr, "%s: open(\"%s\", O_RDONLY) failed: %s\n",
				"proc_open", buf, pmErrStr(-oserror()));
	    /* fallback to /proc path if task path open fails */
	} else {
	    if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
		fprintf(stderr, "%s: thread: %s -> fd=%d\n",
				"proc_open", buf, fd);
	    return fd;
	}
    }
    pmsprintf(buf, sizeof(buf), "%s/proc/%d/%s", proc_statspath, ep->id, base);
    fd = open(buf, O_RDONLY);
    if (fd < 0) {
	if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
	    fprintf(stderr, "%s: open(\"%s\", O_RDONLY) failed: %s\n",
			    "proc_open", buf, pmErrStr(-oserror()));
    }
    if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
	fprintf(stderr, "%s: %s -> fd=%d\n", "proc_open", buf, fd);
    return fd;
}

/*
 * Directory counterpart of proc_open(): open the directory "base"
 * below the per-task path (when procpids.threads is set), falling
 * back to the per-process path.  Returns the open directory stream
 * or NULL on failure.
 *
 * The path buffer is MAXPATHLEN bytes, like the other path buffers in
 * this file, so that a long proc_statspath prefix is not truncated.
 */
static DIR *
proc_opendir(const char *base, proc_pid_entry_t *ep)
{
    DIR			*dir;
    char		buf[MAXPATHLEN];

    if (procpids.threads) {
	pmsprintf(buf, sizeof(buf), "%s/proc/%d/task/%d/%s", proc_statspath, ep->id, ep->id, base);
	if ((dir = opendir(buf)) != NULL) {
	    return dir;
	}
	else {
	    if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
		fprintf(stderr, "%s: opendir(\"%s\") failed: %s\n",
				"proc_opendir", buf, pmErrStr(-oserror()));
	}
	/* fallback to /proc path if task path opendir fails */
    }
    pmsprintf(buf, sizeof(buf), "%s/proc/%d/%s", proc_statspath, ep->id, base);
    dir = opendir(buf);
    if (dir == NULL) {
	if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
	    fprintf(stderr, "%s: opendir(\"%s\") failed: %s\n",
			    "proc_opendir", buf, pmErrStr(-oserror()));
    }
    return dir;
}

/*
 * Read the symbolic link <proc_statspath>/proc/<id>/<base> into the
 * caller-managed buffer *bufp, whose recorded capacity is *lenp.  The
 * buffer is grown first if its capacity is below MAXPATHLEN.
 *
 * Returns the length of the link text on success, 0 (with *bufp set to
 * an empty string) when the link is empty or cannot be read, and
 * -ENOMEM when the buffer could not be grown; in that last case *bufp
 * and *lenp are left exactly as they were.
 */
static int
proc_readlink(const char *base, proc_pid_entry_t *ep, size_t *lenp, char **bufp)
{
    char		buf[1024];
    char		*tmp;
    int			sts;

    if (*lenp < MAXPATHLEN) {
	/*
	 * Allocate one byte more than the recorded capacity, matching
	 * read_proc_entry which may place a terminator at index *lenp.
	 * The old pointer is only replaced once realloc has succeeded,
	 * so it is neither leaked nor lost on failure.
	 */
	if ((tmp = (char *)realloc(*bufp, MAXPATHLEN + 1)) == NULL)
	    return -ENOMEM;
	*bufp = tmp;
	*lenp = MAXPATHLEN;
    }
    pmsprintf(buf, sizeof(buf), "%s/proc/%d/%s", proc_statspath, ep->id, base);
    /* readlink(2) does not terminate the result: keep a byte spare */
    if ((sts = readlink(buf, *bufp, *lenp - 1)) <= 0) {
	if (sts < 0)	/* expected for kernel threads */
	    sts = 0;
	if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
	    fprintf(stderr, "%s: readlink(\"%s\") failed: %s\n",
			    "proc_readlink", buf, pmErrStr(-oserror()));
	(*bufp)[0] = '\0';
	return sts;
    }
    (*bufp)[sts] = '\0';
    return sts;
}

/*
 * Map the current OS error (oserror()) to a status for the fetch
 * routines:
 *   EACCES, EINVAL => 0, i.e. no values (don't disclose anything else)
 *   ENOENT         => PM_ERR_APPVERSION
 *   anything else  => the negated OS error code
 */
static int
maperr(void)
{
    int			err = oserror();

    switch (err) {
    case EACCES:
    case EINVAL:
	return 0;
    case ENOENT:
	return PM_ERR_APPVERSION;
    default:
	break;
    }
    return -err;
}

/*
 * Read everything available from fd into the caller-managed buffer
 * *bufp and terminate it with a NUL byte.
 *
 * *lenp records the largest amount of data the buffer has had to hold;
 * whenever the buffer is grown here it is allocated with one extra byte
 * so the terminator always fits.  *lenp never shrinks, so it is not the
 * length of the data read by this particular call.
 *
 * Returns 0 when data was read, -ENODATA when the file was empty, the
 * maperr() status when the first read failed, or -ENOMEM when the
 * buffer could not be grown (the buffer stays valid, *lenp unchanged).
 *
 * NOTE(review): maperr() turns EACCES/EINVAL into 0, so a failed first
 * read can also return 0 while *bufp still holds whatever it contained
 * before the call - callers that parse on a non-negative status will
 * then see that earlier content.
 */
static int
read_proc_entry(int fd, size_t *lenp, char **bufp)
{
    size_t		len = 0;
    char		*p = *bufp, *tmp, buf[1024];
    int			n, sts = 0;

    for (;;) {
	if ((n = read(fd, buf, sizeof(buf))) <= 0)
	    break;
	len += n;
	if (*lenp < len) {
	    /* grow via a temporary so the old buffer survives failure */
	    if ((tmp = (char *)realloc(*bufp, len+1)) == NULL) {
		if (*bufp != NULL)
		    *p = '\0';	/* terminate what was copied so far */
		return -ENOMEM;
	    }
	    *bufp = tmp;
	    *lenp = len;
	    /* the buffer may have moved, recompute the append position */
	    p = *bufp + len - n;
	}
	memcpy(p, buf, n);
	p += n;
    }

    if (len > 0)
	*p = '\0';
    else {
	/* invalid read */
	if (n < 0)
	    sts = maperr();
	else if (n == 0) {
	    sts = -ENODATA;
	    if (pmDebugOptions.appl1 && pmDebugOptions.desperate)
		fprintf(stderr, "%s: fd=%d: no data\n", "read_proc_entry", fd);
	}
    }

    return sts;
}

/*
 * Parse the single-line contents of a proc/<pid>/stat file held in buf
 * into ep->stat.  Fields are consumed strictly in file order (see
 * proc(5) for the numbering used in the comments below).  The command
 * name is bracketed by the first '(' and the last ')' in the line, so
 * names containing spaces or parentheses are handled.
 *
 * A buffer without the expected "pid (cmd) ..." shape is ignored and
 * ep->stat is left untouched.  buflen is not used.
 */
static void
parse_proc_stat(proc_pid_entry_t *ep, size_t buflen, char *buf)
{
    char		*p, *end;

    /* skip PID */
    if ((p = strchr(buf, ' ')) == NULL || p[1] != '(')
	return;
    p += 2;

    /* cmd (%s) */
    if ((end = strrchr(p, ')')) == NULL)
	return;
    if (ep->stat.cmd != NULL)
	free(ep->stat.cmd);
    ep->stat.cmd = strndup(p, end - p);
    p = end + 2;

    /* state (char) */
    memset(ep->stat.state, 0, sizeof(ep->stat.state));
    ep->stat.state[0] = p[0];

    /*
     * the rest are numeric values; each ++p steps over the single
     * separator that the previous conversion stopped at
     */
    ep->stat.ppid = strtoul(++p, &p, 10);
    ep->stat.pgrp = strtoul(++p, &p, 10);
    ep->stat.session = strtoul(++p, &p, 10);
    ep->stat.tty = strtoul(++p, &p, 10);
    ep->stat.tty_pgrp = strtol(++p, &p, 10);
    ep->stat.flags = strtoul(++p, &p, 10);
    ep->stat.minflt = strtoul(++p, &p, 10);
    ep->stat.cminflt = strtoul(++p, &p, 10);
    ep->stat.majflt = strtoul(++p, &p, 10);
    ep->stat.cmajflt = strtoul(++p, &p, 10);
    ep->stat.utime = strtoull(++p, &p, 10);
    ep->stat.stime = strtoull(++p, &p, 10);
    ep->stat.cutime = strtoull(++p, &p, 10);
    ep->stat.cstime = strtoull(++p, &p, 10);
    ep->stat.priority = strtol(++p, &p, 10);		/* field 18 */
    ep->stat.nice = strtol(++p, &p, 10);
    strtoul(++p, &p, 10); /* threads, we use /proc/pid/status */
    ep->stat.it_real_value = strtoul(++p, &p, 10);
    ep->stat.start_time = strtoull(++p, &p, 10);
    ep->stat.vsize = strtoull(++p, &p, 10);
    ep->stat.rss = strtoull(++p, &p, 10);
    ep->stat.rss_rlim = strtoull(++p, &p, 10);
    ep->stat.start_code = strtoul(++p, &p, 10);
    ep->stat.end_code = strtoul(++p, &p, 10);
    ep->stat.start_stack = strtoul(++p, &p, 10);
    ep->stat.esp = strtoul(++p, &p, 10);
    ep->stat.eip = strtoul(++p, &p, 10);
    ep->stat.signal = strtoul(++p, &p, 10);
    ep->stat.blocked = strtoul(++p, &p, 10);
    ep->stat.sigignore = strtoul(++p, &p, 10);
    ep->stat.sigcatch = strtoul(++p, &p, 10);
    ep->stat.wchan = strtoul(++p, &p, 10);
    ep->stat.nswap = strtoul(++p, &p, 10);
    ep->stat.cnswap = strtoul(++p, &p, 10);
    ep->stat.exit_signal = strtoul(++p, &p, 10);
    ep->stat.processor = strtoul(++p, &p, 10);		/* field 39 */
    /*
     * rt_priority (field 40) directly follows processor; priority was
     * already taken from field 18 and must not be read again here, or
     * every later value is shifted by one field.
     */
    ep->stat.rtpriority = strtoul(++p, &p, 10);
    ep->stat.policy = strtoul(++p, &p, 10);
    ep->stat.delayacct_blkio_time = strtoull(++p, &p, 10);
    ep->stat.guest_time = strtoull(++p, &p, 10);
    ep->stat.cguest_time = strtoull(++p, &p, 10);
}

/*
 * Read and parse the "stat" file for ep unless an earlier refresh has
 * already succeeded (PROC_PID_FLAG_STAT_SUCCESS).  Returns 0 in that
 * case, maperr() if the file cannot be opened, otherwise the status
 * from read_proc_entry.
 */
static int
refresh_proc_pid_stat(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_STAT_SUCCESS) != 0)
	return 0;

    fd = proc_open("stat", ep);
    if (fd < 0)
	return maperr();

    sts = read_proc_entry(fd, &procbuflen, &procbuf);
    if (sts >= 0) {
	parse_proc_stat(ep, procbuflen, procbuf);
	ep->flags |= PROC_PID_FLAG_STAT_SUCCESS;
    }
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/stat entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_STAT_FETCHED is clear;
 * a negative refresh status is passed back in *sts with a NULL return.
 * Once the flag is set the entry is returned with *sts zero.
 */
proc_pid_entry_t *
fetch_proc_pid_stat(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_STAT_FETCHED) == 0) {
	*sts = refresh_proc_pid_stat(entry);
	entry->flags |= PROC_PID_FLAG_STAT_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Read the "wchan" file for ep into its private wchan buffer.  The
 * buffer is reset to an empty string first; a file that cannot be
 * opened is not an error (returns 0), otherwise the status from
 * read_proc_entry is returned.
 */
static int
refresh_proc_pid_wchan(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if (ep->wchan_buflen > 0)
	ep->wchan_buf[0] = '\0';

    fd = proc_open("wchan", ep);
    if (fd < 0)
	return 0;	/* ignore failure here, backwards compat */

    sts = read_proc_entry(fd, &ep->wchan_buflen, &ep->wchan_buf);
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/wchan entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_WCHAN_FETCHED is clear;
 * a negative refresh status is passed back in *sts with a NULL return.
 */
proc_pid_entry_t *
fetch_proc_pid_wchan(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_WCHAN_FETCHED) == 0) {
	*sts = refresh_proc_pid_wchan(entry);
	entry->flags |= PROC_PID_FLAG_WCHAN_FETCHED;
    }
    return (*sts < 0) ? NULL : entry;
}

/*
 * Read the "environ" file for ep into its private environ buffer and
 * turn it into one printable string: the NUL bytes separating the
 * variables become spaces and the final byte of the buffer becomes
 * the terminator.
 *
 * Always returns 0.  When the file cannot be opened or read,
 * environ_buflen is reset to 0 instead of reporting an error.
 */
static int
refresh_proc_pid_environ(proc_pid_entry_t *ep)
{
    char		*p, *end;
    int			fd, sts;

    if (ep->environ_buflen > 0)
	ep->environ_buf[0] = '\0';

    if ((fd = proc_open("environ", ep)) < 0) {
	/* have seen EPERM errors from open - report no value */
	ep->environ_buflen = 0;
	return 0;
    }

    sts = read_proc_entry(fd, &ep->environ_buflen, &ep->environ_buf);
    close(fd);
    if (sts != 0) {
	/* probably EOF on first read */
	ep->environ_buflen = 0;
	return 0;	/* clear -ENODATA */
    }

    /*
     * A zero status does not guarantee data: read_proc_entry also
     * returns 0 for read errors that maperr() hides, and the length
     * may still be zero from an earlier failure.  Only touch the
     * buffer when it really has room, otherwise the terminator below
     * would be written in front of the allocation.
     */
    if (ep->environ_buf != NULL && ep->environ_buflen > 0) {
	end = ep->environ_buf + ep->environ_buflen;
	/* replace nulls with spaces */
	for (p = ep->environ_buf; p < end; p++) {
	    if (*p == '\0')
		*p = ' ';
	}
	end[-1] = '\0';
    }
    return 0;
}

/*
 * fetch a proc/<pid>/environ entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_ENVIRON_FETCHED is
 * clear; a negative refresh status is passed back in *sts with a NULL
 * return.
 */
proc_pid_entry_t *
fetch_proc_pid_environ(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_ENVIRON_FETCHED) == 0) {
	*sts = refresh_proc_pid_environ(entry);
	entry->flags |= PROC_PID_FLAG_ENVIRON_FETCHED;
    }
    return (*sts < 0) ? NULL : entry;
}

/*
 * Skip an initial identifying header and any whitespace, comma-separate
 * the remainder of the line by overwriting any whitespace (optionally),
 * then insert resulting string into the strings cache.
 *
 * On entry *buf points at the start of the line and length is the
 * number of header characters to step over.  The line is modified in
 * place: its newline is overwritten with a terminator and *buf is left
 * pointing at that terminator.  If the line has no newline, *buf is
 * left at the first character of the value instead.
 *
 * Returns the result of proc_strings_insert for the value.
 */
static int
parse_string_value(char **buf, size_t length, int commasep)
{
    char		*p, *start;

    *buf += length;
    /*
     * The <ctype.h> classifiers are only defined for unsigned char
     * values (and EOF), so convert before testing each character.
     */
    for (p = *buf; *p && isspace((unsigned char)*p); p++);	/* skip initial whitespace */
    start = *buf = p;
    while (*p) {
	if (*p == '\n') {
	    *p = '\0';	/* replace end of line */
	    *buf = p;
	    break;
	}
	if (commasep && isspace((unsigned char)*p))
	    *p = ',';	/* replace whitespace */
	p++;
    }
    return proc_strings_insert(start);
}

/*
 * Parse the multi-line contents of a proc/<pid>/status file held in
 * buf into ep->status.  Lines are recognised by their leading tag;
 * numeric values are converted in place, string values are handed to
 * parse_string_value (which modifies buf).  ep->status.flags is reset
 * and then records which of the optional fields were seen.  Lines that
 * are not recognised are skipped.  buflen is not used.
 */
static void
parse_proc_status(proc_pid_entry_t *ep, size_t buflen, char *buf)
{
    char		*curline = buf;

    /*
     * Expecting something like ...
     *
     * Name:	bash
     * State:	S (sleeping)
     * Tgid:	21374
     * Pid:	21374
     * PPid:	21373
     * TracerPid:	0
     * Uid:	1000	1000	1000	1000
     * Gid:	1000	1000	1000	1000
     * FDSize:	256
     * Groups:	24 25 27 29 30 44 46 105 110 112 1000 
     * VmPeak:	   22388 kB
     * VmSize:	   22324 kB
     * VmLck:	       0 kB
     * VmPin:	       0 kB
     * VmHWM:	    5200 kB
     * VmRSS:	    5200 kB
     * VmData:	    3280 kB
     * VmStk:	     136 kB
     * VmExe:	     916 kB
     * VmLib:	    2024 kB
     * VmPTE:	      60 kB
     * VmSwap:	       0 kB
     * Threads:	1
     * SigQ:	0/47779
     * SigPnd:	0000000000000000
     * ShdPnd:	0000000000000000
     * SigBlk:	0000000000010000
     * SigIgn:	0000000000384004
     * SigCgt:	000000004b813efb
     * CapInh:	0000000000000000
     * CapPrm:	0000000000000000
     * CapEff:	0000000000000000
     * CapBnd:	ffffffffffffffff
     * Cpus_allowed:	3
     * Cpus_allowed_list:	0-1
     * Mems_allowed:	00000000,00000001
     * Mems_allowed_list:	0
     * voluntary_ctxt_switches:	225
     * nonvoluntary_ctxt_switches:	56
     */
    ep->status.flags = 0;
    /*
     * Each pass handles whatever curline points at.  A conversion
     * leaves curline on the character that ended the value and the
     * increment at the bottom of the loop steps over it; any text left
     * on the line (such as a "kB" suffix) is then discarded by the
     * nomatch path on the following pass.
     */
    while (curline) {
	switch (*curline) {
	case 'C':
	    if (strncmp(curline, "Cpus_allowed_list:", 18) == 0) {
		ep->status.cpusallowed = parse_string_value(&curline, 19, 0);
		ep->status.flags |= PROC_STATUS_FLAG_CPUSALLOWED;
	    } else
		goto nomatch;
	    break;
	case 'e':
	    if (strncmp(curline, "envID:", 6) == 0) {
		ep->status.envid = strtoul(curline + 7, &curline, 0);
		ep->status.flags |= PROC_STATUS_FLAG_ENVID;
	    } else
		goto nomatch;
	    break;
	case 'G':
	    if (strncmp(curline, "Gid:", 4) == 0) {
		/* four ids on one line, in file order */
		ep->status.gid = strtoul(curline + 5, &curline, 0);
		ep->status.egid = strtoul(++curline, &curline, 10);
		ep->status.sgid = strtoul(++curline, &curline, 10);
		ep->status.fsgid = strtoul(++curline, &curline, 10);
	    } else
		goto nomatch;
	    break;
	case 'N':
	    /* the NS* values are stored comma-separated (commasep=1) */
	    if (strncmp(curline, "Ngid:", 5) == 0) {
		ep->status.ngid = parse_string_value(&curline, 6, 0);
		ep->status.flags |= PROC_STATUS_FLAG_NGID;
	    } else if (strncmp(curline, "NStgid:", 7) == 0) {
		ep->status.nstgid = parse_string_value(&curline, 8, 1);
		ep->status.flags |= PROC_STATUS_FLAG_NSTGID;
	    } else if (strncmp(curline, "NSpid:", 6) == 0) {
		ep->status.nspid = parse_string_value(&curline, 7, 1);
		ep->status.flags |= PROC_STATUS_FLAG_NSPID;
	    } else if (strncmp(curline, "NSpgid:", 7) == 0) {
		ep->status.nspgid = parse_string_value(&curline, 8, 1);
		ep->status.flags |= PROC_STATUS_FLAG_NSPGID;
	    } else if (strncmp(curline, "NSsid:", 6) == 0) {
		ep->status.nssid = parse_string_value(&curline, 7, 1);
		ep->status.flags |= PROC_STATUS_FLAG_NSSID;
	    } else
		goto nomatch;
	    break;
	case 'n':
	    if (strncmp(curline, "nonvoluntary_ctxt_switches:", 27) == 0)
		ep->status.nvctxsw = strtoul(curline + 28, &curline, 0);
	    else
		goto nomatch;
	    break;
	case 'S':
	    /* signal masks are kept as strings, not converted */
	    if (strncmp(curline, "SigPnd:", 7) == 0)
		ep->status.sigpnd = parse_string_value(&curline, 8, 0);
	    else if (strncmp(curline, "SigBlk:", 7) == 0)
		ep->status.sigblk = parse_string_value(&curline, 8, 0);
	    else if (strncmp(curline, "SigIgn:", 7) == 0)
		ep->status.sigign = parse_string_value(&curline, 8, 0);
	    else if (strncmp(curline, "SigCgt:", 7) == 0)
		ep->status.sigcgt = parse_string_value(&curline, 8, 0);
	    else
		goto nomatch;
	    break;
	case 'T':
	    if (strncmp(curline, "Threads:", 8) == 0)
		ep->status.threads = strtoul(curline + 9, &curline, 0);
	    else if (strncmp(curline, "Tgid:", 5) == 0) {
		ep->status.tgid = strtoul(curline + 6, &curline, 0);
		ep->status.flags |= PROC_STATUS_FLAG_TGID;
	    } else
		goto nomatch;
	    break;
	case 'U':
	    if (strncmp(curline, "Uid:", 4) == 0) {
		/* four ids on one line, in file order */
		ep->status.uid = strtoul(curline + 5, &curline, 0);
		ep->status.euid = strtoul(++curline, &curline, 10);
		ep->status.suid = strtoul(++curline, &curline, 10);
		ep->status.fsuid = strtoul(++curline, &curline, 10);
	    } else
		goto nomatch;
	    break;
	case 'V':
	    if (strncmp(curline, "VmPeak:", 7) == 0)
		ep->status.vmpeak = strtoul(curline + 8, &curline, 0);
	    else if (strncmp(curline, "VmSize:", 7) == 0)
		ep->status.vmsize = strtoul(curline + 8, &curline, 0);
	    else if (strncmp(curline, "VmLck:", 6) == 0)
		ep->status.vmlck = strtoul(curline + 7, &curline, 0);
	    else if (strncmp(curline, "VmPin:", 6) == 0)
		ep->status.vmpin = strtoul(curline + 7, &curline, 0);
	    else if (strncmp(curline, "VmHWM:", 6) == 0)
		ep->status.vmhwm = strtoul(curline + 7, &curline, 0);
	    else if (strncmp(curline, "VmRSS:", 6) == 0)
		ep->status.vmrss = strtoul(curline + 7, &curline, 0);
	    else if (strncmp(curline, "VmData:", 7) == 0)
		ep->status.vmdata = strtoul(curline + 8, &curline, 0);
	    else if (strncmp(curline, "VmStk:", 6) == 0)
		ep->status.vmstk = strtoul(curline + 7, &curline, 0);
	    else if (strncmp(curline, "VmExe:", 6) == 0)
		ep->status.vmexe = strtoul(curline + 7, &curline, 0);
	    else if (strncmp(curline, "VmLib:", 6) == 0)
		ep->status.vmlib = strtoul(curline + 7, &curline, 0);
	    else if (strncmp(curline, "VmPTE:", 6) == 0)
		ep->status.vmpte = strtoul(curline + 7, &curline, 0);
	    else if (strncmp(curline, "VmSwap:", 7) == 0)
		/*
		 * offset 7 lands on the separator right after the colon
		 * rather than one past it as for the other tags; strtoul
		 * skips leading whitespace so the value is the same
		 */
		ep->status.vmswap = strtoul(curline + 7, &curline, 0);
	    else
		goto nomatch;
	    break;
	case 'v':
	    if (strncmp(curline, "voluntary_ctxt_switches:", 24) == 0)
		ep->status.vctxsw = strtoul(curline + 25, &curline, 0);
	    else
		goto nomatch;
	    break;

	default:
	nomatch:
		/* unrecognised line: optionally log it, then skip it */
		if (pmDebugOptions.appl1 && pmDebugOptions.desperate) {
		    char	*p;
		    fprintf(stderr, "%s: skip ", "fetch_proc_pid_status");
		    for (p = curline; *p && *p != '\n'; p++)
			fputc(*p, stderr);
		    fputc('\n', stderr);
		}
		/* NULL here (no further newline) ends the outer loop */
		curline = index(curline, '\n');
	}
	if (curline != NULL) curline++;
    }
}

/*
 * Read and parse the "status" file for ep unless an earlier refresh has
 * already succeeded (PROC_PID_FLAG_STATUS_SUCCESS).  Returns 0 in that
 * case, maperr() if the file cannot be opened, otherwise the status
 * from read_proc_entry.
 */
static int
refresh_proc_pid_status(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_STATUS_SUCCESS) != 0)
	return 0;

    fd = proc_open("status", ep);
    if (fd < 0)
	return maperr();

    sts = read_proc_entry(fd, &procbuflen, &procbuf);
    if (sts == 0) {
	parse_proc_status(ep, procbuflen, procbuf);
	ep->flags |= PROC_PID_FLAG_STATUS_SUCCESS;
    }
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/status entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_STATUS_FETCHED is
 * clear; a negative refresh status is passed back in *sts with a NULL
 * return.
 */
proc_pid_entry_t *
fetch_proc_pid_status(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_STATUS_FETCHED) == 0) {
	*sts = refresh_proc_pid_status(entry);
	entry->flags |= PROC_PID_FLAG_STATUS_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Parse the contents of a proc/<pid>/statm file held in buf: seven
 * decimal values stored, in file order, into the size, rss, share,
 * textrs, librs, datrs and dirty members of ep->statm.  One separator
 * character is stepped over between values.  buflen is not used.
 */
static void
parse_proc_statm(proc_pid_entry_t *ep, size_t buflen, char *buf)
{
    char	*cursor = buf;

    ep->statm.size = strtoul(cursor, &cursor, 10);
    cursor++;
    ep->statm.rss = strtoul(cursor, &cursor, 10);
    cursor++;
    ep->statm.share = strtoul(cursor, &cursor, 10);
    cursor++;
    ep->statm.textrs = strtoul(cursor, &cursor, 10);
    cursor++;
    ep->statm.librs = strtol(cursor, &cursor, 10);
    cursor++;
    ep->statm.datrs = strtoul(cursor, &cursor, 10);
    cursor++;
    ep->statm.dirty = strtoul(cursor, &cursor, 10);
}

/*
 * Read and parse the "statm" file for ep unless an earlier refresh has
 * already succeeded (PROC_PID_FLAG_STATM_SUCCESS).  Returns 0 in that
 * case, maperr() if the file cannot be opened, otherwise the status
 * from read_proc_entry.
 */
static int
refresh_proc_pid_statm(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_STATM_SUCCESS) != 0)
	return 0;

    fd = proc_open("statm", ep);
    if (fd < 0)
	return maperr();

    sts = read_proc_entry(fd, &procbuflen, &procbuf);
    if (sts == 0) {
	parse_proc_statm(ep, procbuflen, procbuf);
	ep->flags |= PROC_PID_FLAG_STATM_SUCCESS;
    }
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/statm entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_STATM_FETCHED is clear;
 * a negative refresh status is passed back in *sts with a NULL return.
 */
proc_pid_entry_t *
fetch_proc_pid_statm(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_STATM_FETCHED) == 0) {
	*sts = refresh_proc_pid_statm(entry);
	entry->flags |= PROC_PID_FLAG_STATM_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Read the "maps" file for ep into its private maps buffer unless an
 * earlier refresh has already succeeded (PROC_PID_FLAG_MAPS_SUCCESS).
 *
 * Returns maperr() if the file cannot be opened.  Otherwise, as long
 * as a buffer exists afterwards, its last byte is set to NUL, the
 * success flag is raised and 0 is returned whatever the read status
 * was; with no buffer the read status is returned.
 */
static int
refresh_proc_pid_maps(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_MAPS_SUCCESS) != 0)
	return 0;
    if (ep->maps_buflen > 0)
	ep->maps_buf[0] = '\0';

    fd = proc_open("maps", ep);
    if (fd < 0)
	return maperr();
    sts = read_proc_entry(fd, &ep->maps_buflen, &ep->maps_buf);
    close(fd);

    /* If there are no maps, make maps_buf a zero length string. */
    if (ep->maps_buflen == 0) {
	ep->maps_buflen = 1;
	ep->maps_buf = (char *)malloc(1);
    }

    if (ep->maps_buf == NULL) {
	ep->maps_buflen = 0;
	return sts;
    }
    ep->maps_buf[ep->maps_buflen - 1] = '\0';
    ep->flags |= PROC_PID_FLAG_MAPS_SUCCESS;
    return 0;	/* clear -ENODATA */
}

/*
 * fetch a proc/<pid>/maps entry for pid
 *
 * Values are large and access *must* be protected (have_access).
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_MAPS_FETCHED is clear;
 * a negative refresh status is passed back in *sts with a NULL return.
 */
proc_pid_entry_t *
fetch_proc_pid_maps(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_MAPS_FETCHED) == 0) {
	*sts = refresh_proc_pid_maps(entry);
	entry->flags |= PROC_PID_FLAG_MAPS_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Parse the contents of a proc/<pid>/schedstat file held in buf: three
 * decimal values stored, in file order, into the cputime, rundelay and
 * count members of ep->schedstat.  buflen is not used.
 */
static void
parse_proc_schedstat(proc_pid_entry_t *ep, size_t buflen, char *buf)
{
    char	*cursor = buf;

    ep->schedstat.cputime = strtoull(cursor, &cursor, 10);
    cursor++;
    ep->schedstat.rundelay = strtoull(cursor, &cursor, 10);
    cursor++;
    ep->schedstat.count = strtoull(cursor, &cursor, 10);
}

/*
 * Read and parse the "schedstat" file for ep unless an earlier refresh
 * has already succeeded (PROC_PID_FLAG_SCHEDSTAT_SUCCESS).  Returns 0
 * in that case, maperr() if the file cannot be opened, otherwise the
 * status from read_proc_entry.
 */
static int
refresh_proc_pid_schedstat(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_SCHEDSTAT_SUCCESS) != 0)
	return 0;

    fd = proc_open("schedstat", ep);
    if (fd < 0)
	return maperr();

    sts = read_proc_entry(fd, &procbuflen, &procbuf);
    if (sts >= 0) {
	parse_proc_schedstat(ep, procbuflen, procbuf);
	ep->flags |= PROC_PID_FLAG_SCHEDSTAT_SUCCESS;
    }
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/schedstat entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_SCHEDSTAT_FETCHED is
 * clear; a negative refresh status is passed back in *sts with a NULL
 * return.
 */
proc_pid_entry_t *
fetch_proc_pid_schedstat(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_SCHEDSTAT_FETCHED) == 0) {
	*sts = refresh_proc_pid_schedstat(entry);
	entry->flags |= PROC_PID_FLAG_SCHEDSTAT_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Parse the contents of a proc/<pid>/io file held in buf into ep->io.
 * The file is a list of "tag: value" lines, for example:
 *
 * rchar: 714415843
 * wchar: 101078796
 * syscr: 780339
 * syscw: 493583
 * read_bytes: 209099776
 * write_bytes: 118263808
 * cancelled_write_bytes: 102301696
 *
 * Lines with any other tag are skipped.  buflen is not used.
 */
static void
parse_proc_io(proc_pid_entry_t *ep, size_t buflen, char *buf)
{
    char	*line;

    /*
     * Every conversion leaves "line" on the character that ended the
     * number; the loop increment then steps over it.
     */
    for (line = buf; line != NULL; line++) {
	if (strncmp(line, "rchar:", 6) == 0) {
	    ep->io.rchar = strtoull(line + 7, &line, 0);
	} else if (strncmp(line, "wchar:", 6) == 0) {
	    ep->io.wchar = strtoull(line + 7, &line, 0);
	} else if (strncmp(line, "syscr:", 6) == 0) {
	    ep->io.syscr = strtoull(line + 7, &line, 0);
	} else if (strncmp(line, "syscw:", 6) == 0) {
	    ep->io.syscw = strtoull(line + 7, &line, 0);
	} else if (strncmp(line, "read_bytes:", 11) == 0) {
	    ep->io.readb = strtoull(line + 12, &line, 0);
	} else if (strncmp(line, "write_bytes:", 12) == 0) {
	    ep->io.writeb = strtoull(line + 13, &line, 0);
	} else if (strncmp(line, "cancelled_write_bytes:", 22) == 0) {
	    ep->io.cancel = strtoull(line + 23, &line, 0);
	} else {
	    if (pmDebugOptions.appl1 && pmDebugOptions.desperate) {
		char	*c;

		fprintf(stderr, "%s: skip ", "fetch_proc_pid_io");
		for (c = line; *c && *c != '\n'; c++)
		    fputc(*c, stderr);
		fputc('\n', stderr);
	    }
	    /* move to the end of this line, or stop if it is the last */
	    line = index(line, '\n');
	    if (line == NULL)
		break;
	}
    }
}

/*
 * Read and parse the "io" file for ep unless an earlier refresh has
 * already succeeded (PROC_PID_FLAG_IO_SUCCESS).  Returns 0 in that
 * case, maperr() if the file cannot be opened, otherwise the status
 * from read_proc_entry.
 */
static int
refresh_proc_pid_io(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_IO_SUCCESS) != 0)
	return 0;

    fd = proc_open("io", ep);
    if (fd < 0)
	return maperr();

    sts = read_proc_entry(fd, &procbuflen, &procbuf);
    if (sts >= 0) {
	parse_proc_io(ep, procbuflen, procbuf);
	ep->flags |= PROC_PID_FLAG_IO_SUCCESS;
    }
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/io entry for pid
 *
 * Depends on kernel built with CONFIG_TASK_IO_ACCOUNTING=y
 * which means the following must also be set:
 * CONFIG_TASKSTATS=y
 * CONFIG_TASK_DELAY_ACCT=y
 * CONFIG_TASK_XACCT=y
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_IO_FETCHED is clear;
 * a negative refresh status is passed back in *sts with a NULL return.
 */
proc_pid_entry_t *
fetch_proc_pid_io(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_IO_FETCHED) == 0) {
	*sts = refresh_proc_pid_io(entry);
	entry->flags |= PROC_PID_FLAG_IO_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * If the text at *linep begins with tag, convert the number found one
 * character beyond the tag into *valuep, leave *linep on the character
 * that ended the number and return 1.  Otherwise return 0 and leave
 * both untouched.
 */
static int
smaps_field_value(char **linep, const char *tag, unsigned long long *valuep)
{
    size_t	taglen = strlen(tag);

    if (strncmp(*linep, tag, taglen) != 0)
	return 0;
    *valuep = strtoull(*linep + taglen + 1, linep, 0);
    return 1;
}

/*
 * Parse the contents of a proc/<pid>/smaps_rollup file held in buf
 * into ep->smaps.  The file is a list of "tag: value kB" lines such as
 *
 * Rss:                1860 kB
 * Pss:                 354 kB
 * Pss_Anon:             92 kB
 * Pss_File:            262 kB
 *  [...]
 * Locked:                0 kB
 *
 * Lines with an unknown tag are skipped.  buflen is not used.
 */
static void
parse_proc_smaps(proc_pid_entry_t *ep, size_t buflen, char *buf)
{
    unsigned long long	value;
    char		*line = buf;

    while (line != NULL) {
	if (smaps_field_value(&line, "AnonHugePages:", &value))
	    ep->smaps.anonhugepages = value;
	else if (smaps_field_value(&line, "Anonymous:", &value))
	    ep->smaps.anonymous = value;
	else if (smaps_field_value(&line, "FilePmdMapped:", &value))
	    ep->smaps.filepmdmapped = value;
	else if (smaps_field_value(&line, "LazyFree:", &value))
	    ep->smaps.lazyfree = value;
	else if (smaps_field_value(&line, "Locked:", &value))
	    ep->smaps.locked = value;
	else if (smaps_field_value(&line, "Pss:", &value))
	    ep->smaps.pss = value;
	else if (smaps_field_value(&line, "Pss_Anon:", &value))
	    ep->smaps.pss_anon = value;
	else if (smaps_field_value(&line, "Pss_File:", &value))
	    ep->smaps.pss_file = value;
	else if (smaps_field_value(&line, "Pss_Shmem:", &value))
	    ep->smaps.pss_shmem = value;
	else if (smaps_field_value(&line, "Private_Clean:", &value))
	    ep->smaps.private_clean = value;
	else if (smaps_field_value(&line, "Private_Dirty:", &value))
	    ep->smaps.private_dirty = value;
	else if (smaps_field_value(&line, "Private_Hugetlb:", &value))
	    ep->smaps.private_hugetlb = value;
	else if (smaps_field_value(&line, "Rss:", &value))
	    ep->smaps.rss = value;
	else if (smaps_field_value(&line, "Referenced:", &value))
	    ep->smaps.referenced = value;
	else if (smaps_field_value(&line, "Shared_Clean:", &value))
	    ep->smaps.shared_clean = value;
	else if (smaps_field_value(&line, "Shared_Dirty:", &value))
	    ep->smaps.shared_dirty = value;
	else if (smaps_field_value(&line, "ShmemPmdMapped:", &value))
	    ep->smaps.shmempmdmapped = value;
	else if (smaps_field_value(&line, "Shared_Hugetlb:", &value))
	    ep->smaps.shared_hugetlb = value;
	else if (smaps_field_value(&line, "Swap:", &value))
	    ep->smaps.swap = value;
	else if (smaps_field_value(&line, "SwapPss:", &value))
	    ep->smaps.swappss = value;
	else if (pmDebugOptions.appl1 && pmDebugOptions.desperate) {
	    char	*c;

	    fprintf(stderr, "%s: skip ", "fetch_proc_pid_smaps");
	    for (c = line; *c && *c != '\n'; c++)
		fputc(*c, stderr);
	    fputc('\n', stderr);
	}

	/* on to the next line; this also skips any kB suffix */
	line = index(line, '\n');
	if (line != NULL)
	    line++;
    }
}

/*
 * Read and parse the "smaps_rollup" file for ep unless an earlier
 * refresh has already succeeded (PROC_PID_FLAG_SMAPS_SUCCESS).  Returns
 * 0 in that case, maperr() if the file cannot be opened, otherwise the
 * status from read_proc_entry.
 */
static int
refresh_proc_pid_smaps(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_SMAPS_SUCCESS) != 0)
	return 0;

    fd = proc_open("smaps_rollup", ep);
    if (fd < 0)
	return maperr();

    sts = read_proc_entry(fd, &procbuflen, &procbuf);
    if (sts >= 0) {
	parse_proc_smaps(ep, procbuflen, procbuf);
	ep->flags |= PROC_PID_FLAG_SMAPS_SUCCESS;
    }
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/smaps_rollup entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_SMAPS_FETCHED is clear;
 * a negative refresh status is passed back in *sts with a NULL return.
 */
proc_pid_entry_t *
fetch_proc_pid_smaps(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_SMAPS_FETCHED) == 0) {
	*sts = refresh_proc_pid_smaps(entry);
	entry->flags |= PROC_PID_FLAG_SMAPS_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Count the entries in the "fd" directory for ep and store the result
 * in ep->fd_count, unless an earlier refresh has already succeeded
 * (PROC_PID_FLAG_FD_SUCCESS).  Returns 0, or maperr() if the directory
 * cannot be opened.
 */
static int
refresh_proc_pid_fd(proc_pid_entry_t *ep)
{
    struct dirent	*dp;
    uint32_t		de_count = 0;
    DIR			*dir;

    if (ep->flags & PROC_PID_FLAG_FD_SUCCESS)
	return 0;
    if ((dir = proc_opendir("fd", ep)) == NULL)
	return maperr();
    while ((dp = readdir(dir)) != NULL) {
	/*
	 * Leave out the "." and ".." entries by name.  Subtracting two
	 * from the total afterwards would wrap the unsigned count if
	 * readdir stopped before returning them (for instance because
	 * the task went away after the directory was opened).
	 */
	if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0)
	    continue;
	de_count++;
    }
    closedir(dir);
    ep->fd_count = de_count;
    ep->flags |= PROC_PID_FLAG_FD_SUCCESS;
    return 0;
}

/*
 * fetch a proc/<pid>/fd entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_FD_FETCHED is clear;
 * a negative refresh status is passed back in *sts with a NULL return.
 */
proc_pid_entry_t *
fetch_proc_pid_fd(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_FD_FETCHED) == 0) {
	*sts = refresh_proc_pid_fd(entry);
	entry->flags |= PROC_PID_FLAG_FD_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * From a kernel proc cgroups file entry attempt to extract a
 * container ID using the cgroup_container_search routine.
 *
 * Only entries that begin with "cpuset:" are considered; the text
 * after that prefix is passed on together with cid/cidlen and the
 * search result is returned.  Any other entry yields NULL.  buflen
 * is not used.
 */
static char *
proc_container_search(const char *buf, int buflen, char *cid, int cidlen)
{
    static const char	prefix[] = "cpuset:";
    const size_t	skip = sizeof(prefix) - 1;

    if (strncmp(buf, prefix, skip) == 0)
	return cgroup_container_search(buf + skip, cid, cidlen);
    return NULL;
}

/*
 * From the kernel format for a single process cgroup set:
 *     2:cpu:/
 *     1:cpuset:/
 *
 * Produce the same one-line format string that "ps" uses:
 *     "cpu:/;cpuset:/"
 *
 * buf/buflen is the kernel text; scanning stops at buflen or at the
 * first NUL byte.  For each complete line the text after the first
 * ':' is appended to fmt, with ';' between entries.  Output stops
 * before the first entry that would not fit in fmtlen bytes including
 * the terminator, so fmt is never written beyond fmtlen.
 *
 * cid/cidlen is handed to proc_container_search for each entry until
 * a search returns non-NULL; cid starts out as an empty string.
 */
static void
proc_cgroup_reformat(char *buf, int buflen, char *fmt, int fmtlen, char *cid, int cidlen)
{
    char	*target = fmt, *p, *s = NULL, *c = NULL;
    int		off, len, sep;

    *target = *cid = '\0';
    for (p = buf; p - buf < buflen; p++) {
	if (*p == '\0')
	    break;
	if (*p == ':' && !s)	/* position "s" at start */
	    s = p + 1;
	if (*p != '\n' || !s)	/* find end of this line */
	    continue;
	/* have a complete cgroup line now, copy it over */
	/* (but first try out container name heuristics) */
	off = target - fmt;
	len = p - s;
	sep = (target != fmt);	/* not the first cgroup? */
	/* the separator takes up room too, count it before writing */
	if (off + sep + len >= fmtlen)
	    break;
	if (!c)
	    c = proc_container_search(s, len, cid, cidlen);
	if (sep)
	    *target++ = ';';
	memcpy(target, s, len);
	/* target always rests on the terminator of the output so far */
	target += len;
	*target = '\0';
	s = NULL;		/* reset it for new line */
    }
}

/*
 * Read the "cgroup" file for ep, reformat it with proc_cgroup_reformat
 * and store the string-cache identifiers for the reformatted cgroup
 * text and the container id found in it, unless an earlier refresh has
 * already succeeded (PROC_PID_FLAG_CGROUP_SUCCESS).
 *
 * Two function-static buffers are reused across calls: cbuf1 receives
 * the raw file contents, cbuf2 the reformatted text.
 *
 * Returns 0 if already refreshed, maperr() if the file cannot be
 * opened, -ENOMEM if no output buffer could ever be allocated,
 * otherwise the status from read_proc_entry.
 */
static int
refresh_proc_pid_cgroup(proc_pid_entry_t *ep)
{
    static size_t	clen1, clen2;
    static char		*cbuf1, *cbuf2;
    char		cid[72], *tmp;
    int			fd, sts;

    if (ep->flags & PROC_PID_FLAG_CGROUP_SUCCESS)
	return 0;
    if ((fd = proc_open("cgroup", ep)) < 0)
	return maperr();
    if ((sts = read_proc_entry(fd, &clen1, &cbuf1)) >= 0) {
	if (clen1 > clen2) {
	    /*
	     * If growing fails but an older, smaller cbuf2 exists we
	     * carry on with it: proc_cgroup_reformat is told its real
	     * size (clen2) and limits its output accordingly.
	     */
	    if ((tmp = realloc(cbuf2, clen1)) != NULL) {
		clen2 = clen1;
		cbuf2 = tmp;
	    }
	}
	if (cbuf1 == NULL) {
	    /*
	     * Nothing has ever been read: a non-negative status here
	     * can only be a read error that maperr() mapped to zero.
	     * There are no values to record.
	     */
	} else if (cbuf2 == NULL) {
	    /* no output buffer at all, it must not be dereferenced */
	    sts = -ENOMEM;
	} else {
	    /* reformat the buffer to match "ps" output format and */
	    /* try any container name heuristics, then hash (both) */
	    proc_cgroup_reformat(cbuf1, clen1, cbuf2, clen2, cid, sizeof(cid));
	    ep->container_id = proc_strings_insert(cid);
	    ep->cgroup_id = proc_strings_insert(cbuf2);
	    ep->flags |= PROC_PID_FLAG_CGROUP_SUCCESS;
	}
    }
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/cgroup entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_CGROUP_FETCHED is
 * clear; a negative refresh status is passed back in *sts with a NULL
 * return.
 */
proc_pid_entry_t *
fetch_proc_pid_cgroup(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_CGROUP_FETCHED) == 0) {
	*sts = refresh_proc_pid_cgroup(entry);
	entry->flags |= PROC_PID_FLAG_CGROUP_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Read the "attr/current" file for ep and store the string-cache
 * identifier of its contents in ep->label_id, unless an earlier
 * refresh has already succeeded (PROC_PID_FLAG_LABEL_SUCCESS).
 *
 * A single read(2) is issued straight into the shared procbuf, limited
 * to procbuflen bytes, and the last byte read is replaced by a
 * terminator.  Returns 0 on success, -ENODATA if nothing was read, or
 * maperr() if the open or the read failed.
 *
 * NOTE(review): the buffer is not grown here, so with procbuflen still
 * zero the read returns nothing and -ENODATA results - confirm that
 * some other refresh always sizes procbuf before this is reached.
 */
static int
refresh_proc_pid_label(proc_pid_entry_t *ep)
{
    ssize_t		nbytes;
    int			sts = 0;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_LABEL_SUCCESS) != 0)
	return 0;

    fd = proc_open("attr/current", ep);
    if (fd < 0)
	return maperr();

    nbytes = read(fd, procbuf, procbuflen);
    if (nbytes > 0) {
	/* buffer matches "ps" output format, direct hash */
	procbuf[nbytes-1] = '\0';
	ep->label_id = proc_strings_insert(procbuf);
	ep->flags |= PROC_PID_FLAG_LABEL_SUCCESS;
    } else if (nbytes == 0) {
	sts = -ENODATA;
    } else {
	sts = maperr();
    }
    close(fd);
    return sts;
}

/*
 * fetch a proc/<pid>/attr/current entry for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_LABEL_FETCHED is clear;
 * a negative refresh status is passed back in *sts with a NULL return.
 */
proc_pid_entry_t *
fetch_proc_pid_label(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_LABEL_FETCHED) == 0) {
	*sts = refresh_proc_pid_label(entry);
	entry->flags |= PROC_PID_FLAG_LABEL_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Read the "oom_score" file for ep and store its numeric value in
 * ep->oom_score, unless an earlier refresh has already succeeded
 * (PROC_PID_FLAG_OOM_SCORE_SUCCESS).  Once the file has been opened
 * the stored value is reset to 0 before the read.  Returns 0 if
 * already refreshed, maperr() if the file cannot be opened, otherwise
 * the status from read_proc_entry.
 */
static int
refresh_proc_pid_oom_score(proc_pid_entry_t *ep)
{
    int			sts;
    int			fd;

    if ((ep->flags & PROC_PID_FLAG_OOM_SCORE_SUCCESS) != 0)
	return 0;

    fd = proc_open("oom_score", ep);
    if (fd < 0)
	return maperr();

    ep->oom_score = 0;
    sts = read_proc_entry(fd, &procbuflen, &procbuf);
    if (sts >= 0) {
	ep->oom_score = (__uint32_t)strtoul(procbuf, NULL, 0);
	ep->flags |= PROC_PID_FLAG_OOM_SCORE_SUCCESS;
    }
    close(fd);
    return sts;
}

/*
 * fetch the proc/<pid>/oom_score value for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_OOM_SCORE_FETCHED is
 * clear; a negative refresh status is passed back in *sts with a NULL
 * return.
 */
proc_pid_entry_t *
fetch_proc_pid_oom_score(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_OOM_SCORE_FETCHED) == 0) {
	*sts = refresh_proc_pid_oom_score(entry);
	entry->flags |= PROC_PID_FLAG_OOM_SCORE_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Resolve the "cwd" link for ep via proc_readlink and store the
 * string-cache identifier of the result in ep->cwd_id, unless an
 * earlier refresh has already succeeded (PROC_PID_FLAG_CWD_SUCCESS).
 * Returns 0 if already refreshed, otherwise the proc_readlink status.
 */
static int
refresh_proc_pid_cwd(proc_pid_entry_t *ep)
{
    int			sts;

    if ((ep->flags & PROC_PID_FLAG_CWD_SUCCESS) != 0)
	return 0;

    sts = proc_readlink("cwd", ep, &procbuflen, &procbuf);
    if (sts < 0)
	return sts;

    ep->cwd_id = proc_strings_insert(procbuf);
    ep->flags |= PROC_PID_FLAG_CWD_SUCCESS;
    return sts;
}

/*
 * fetch a proc/<pid>/cwd value for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_CWD_FETCHED is clear;
 * its status is passed back in *sts, and NULL is returned if that
 * status is negative.
 */
proc_pid_entry_t *
fetch_proc_pid_cwd(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_CWD_FETCHED) == 0) {
	*sts = refresh_proc_pid_cwd(entry);
	entry->flags |= PROC_PID_FLAG_CWD_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}

/*
 * Resolve the "exe" link for ep via proc_readlink and store the
 * string-cache identifier of the result in ep->exe_id, unless an
 * earlier refresh has already succeeded (PROC_PID_FLAG_EXE_SUCCESS).
 * Returns 0 if already refreshed, otherwise the proc_readlink status.
 */
static int
refresh_proc_pid_exe(proc_pid_entry_t *ep)
{
    int			sts;

    if ((ep->flags & PROC_PID_FLAG_EXE_SUCCESS) != 0)
	return 0;

    sts = proc_readlink("exe", ep, &procbuflen, &procbuf);
    if (sts < 0)
	return sts;

    ep->exe_id = proc_strings_insert(procbuf);
    ep->flags |= PROC_PID_FLAG_EXE_SUCCESS;
    return sts;
}

/*
 * fetch a proc/<pid>/exe value for pid
 *
 * Returns NULL (with *sts zero) when id has no entry in proc_pid.  The
 * refresh is attempted only while PROC_PID_FLAG_EXE_FETCHED is clear;
 * its status is passed back in *sts, and NULL is returned if that
 * status is negative.
 */
proc_pid_entry_t *
fetch_proc_pid_exe(int id, proc_pid_t *proc_pid, int *sts)
{
    proc_pid_entry_t	*entry = proc_pid_entry_lookup(id, proc_pid);

    *sts = 0;
    if (entry == NULL)
	return NULL;

    if ((entry->flags & PROC_PID_FLAG_EXE_FETCHED) == 0) {
	*sts = refresh_proc_pid_exe(entry);
	entry->flags |= PROC_PID_FLAG_EXE_FETCHED;
	if (*sts < 0)
	    return NULL;
    }
    return entry;
}
