[PATCH] per task delay accounting taskstats interface: documentation fix

Change documentation and example program to reflect the flow control issues being addressed by the cpumask changes. Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-14 00:24:45 -07:00
parent ad4ecbcba7
commit 9e06d3f9f6
2 changed files with 361 additions and 301 deletions
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@ -5,6 +5,7 @@
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2005
 * Copyright (C) Balbir Singh, IBM Corp. 2006
+ * Copyright (c) Jay Lan, SGI. 2006
 *
 */

@ -36,13 +37,37 @@

 #define err(code, fmt, arg...) do { printf(fmt, ##arg); exit(code); } while (0)
 int done = 0;
+int rcvbufsz=0;
+
+    char name[100];
+int dbg=0, print_delays=0;
+__u64 stime, utime;
+#define PRINTF(fmt, arg...) {			\
+	    if (dbg) {				\
+		printf(fmt, ##arg);		\
+	    }					\
+	}
+
+/* Maximum size of response requested or message sent */
+#define MAX_MSG_SIZE	256
+/* Maximum number of cpus expected to be specified in a cpumask */
+#define MAX_CPUS	32
+/* Maximum length of pathname to log file */
+#define MAX_FILENAME	256
+
+struct msgtemplate {
+	struct nlmsghdr n;
+	struct genlmsghdr g;
+	char buf[MAX_MSG_SIZE];
+};
+
+char cpumask[100+6*MAX_CPUS];

 /*
 * Create a raw netlink socket and bind
 */
-static int create_nl_socket(int protocol, int groups)
+static int create_nl_socket(int protocol)
 {
-    socklen_t addr_len;
 	int fd;
 	struct sockaddr_nl local;

@ -50,9 +75,16 @@ static int create_nl_socket(int protocol, int groups)
 	if (fd < 0)
 		return -1;

+	if (rcvbufsz)
+		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
+				&rcvbufsz, sizeof(rcvbufsz)) < 0) {
+			printf("Unable to set socket rcv buf size to %d\n",
+			       rcvbufsz);
+			return -1;
+		}
+
 	memset(&local, 0, sizeof(local));
 	local.nl_family = AF_NETLINK;
-    local.nl_groups = groups;

 	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
 		goto error;
@ -63,78 +95,72 @@ static int create_nl_socket(int protocol, int groups)
 	return -1;
 }

-int sendto_fd(int s, const char *buf, int bufLen)
-{
-    struct sockaddr_nl nladdr;
-    int r;

+int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+	     __u8 genl_cmd, __u16 nla_type,
+	     void *nla_data, int nla_len)
+{
+	struct nlattr *na;
+	struct sockaddr_nl nladdr;
+	int r, buflen;
+	char *buf;
+
+	struct msgtemplate msg;
+
+	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+	msg.n.nlmsg_type = nlmsg_type;
+	msg.n.nlmsg_flags = NLM_F_REQUEST;
+	msg.n.nlmsg_seq = 0;
+	msg.n.nlmsg_pid = nlmsg_pid;
+	msg.g.cmd = genl_cmd;
+	msg.g.version = 0x1;
+	na = (struct nlattr *) GENLMSG_DATA(&msg);
+	na->nla_type = nla_type;
+	na->nla_len = nla_len + 1 + NLA_HDRLEN;
+	memcpy(NLA_DATA(na), nla_data, nla_len);
+	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+	buf = (char *) &msg;
+	buflen = msg.n.nlmsg_len ;
 	memset(&nladdr, 0, sizeof(nladdr));
 	nladdr.nl_family = AF_NETLINK;
-
-    while ((r = sendto(s, buf, bufLen, 0, (struct sockaddr *) &nladdr,
-		       sizeof(nladdr))) < bufLen) {
+	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+			   sizeof(nladdr))) < buflen) {
 		if (r > 0) {
 			buf += r;
-	    bufLen -= r;
+			buflen -= r;
 		} else if (errno != EAGAIN)
 			return -1;
 	}
 	return 0;
 }

+
 /*
 * Probe the controller in genetlink to find the family id
 * for the TASKSTATS family
 */
 int get_family_id(int sd)
 {
-    struct {
-	struct nlmsghdr n;
-	struct genlmsghdr g;
-	char buf[256];
-    } family_req;
 	struct {
 		struct nlmsghdr n;
 		struct genlmsghdr g;
 		char buf[256];
 	} ans;

-    int id;
+	int id, rc;
 	struct nlattr *na;
 	int rep_len;

-    /* Get family name */
-    family_req.n.nlmsg_type = GENL_ID_CTRL;
-    family_req.n.nlmsg_flags = NLM_F_REQUEST;
-    family_req.n.nlmsg_seq = 0;
-    family_req.n.nlmsg_pid = getpid();
-    family_req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
-    family_req.g.cmd = CTRL_CMD_GETFAMILY;
-    family_req.g.version = 0x1;
-    na = (struct nlattr *) GENLMSG_DATA(&family_req);
-    na->nla_type = CTRL_ATTR_FAMILY_NAME;
-    na->nla_len = strlen(TASKSTATS_GENL_NAME) + 1 + NLA_HDRLEN;
-    strcpy(NLA_DATA(na), TASKSTATS_GENL_NAME);
-    family_req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
-
-    if (sendto_fd(sd, (char *) &family_req, family_req.n.nlmsg_len) < 0)
-	err(1, "error sending message via Netlink\n");
+	strcpy(name, TASKSTATS_GENL_NAME);
+	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+			CTRL_ATTR_FAMILY_NAME, (void *)name,
+			strlen(TASKSTATS_GENL_NAME)+1);

 	rep_len = recv(sd, &ans, sizeof(ans), 0);
-
-    if (rep_len < 0)
-	err(1, "error receiving reply message via Netlink\n");
-
-
-    /* Validate response message */
-    if (!NLMSG_OK((&ans.n), rep_len))
-	err(1, "invalid reply message received via Netlink\n");
-
-    if (ans.n.nlmsg_type == NLMSG_ERROR) {	/* error */
-	printf("error received NACK - leaving\n");
-	exit(1);
-    }
-
+	if (ans.n.nlmsg_type == NLMSG_ERROR ||
+	    (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
+		return 0;

 	na = (struct nlattr *) GENLMSG_DATA(&ans);
 	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
@ -144,7 +170,7 @@ int get_family_id(int sd)
 	return id;
 }

-void print_taskstats(struct taskstats *t)
+void print_delayacct(struct taskstats *t)
 {
 	printf("\n\nCPU   %15s%15s%15s%15s\n"
 	       "      %15llu%15llu%15llu%15llu\n"
@ -160,181 +186,154 @@ void print_taskstats(struct taskstats *t)
 	       "count", "delay total", t->swapin_count, t->swapin_delay_total);
 }

-void sigchld(int sig)
-{
-    done = 1;
-}
-
 int main(int argc, char *argv[])
 {
-    int rc;
-    int sk_nl;
-    struct nlmsghdr *nlh;
-    struct genlmsghdr *genlhdr;
-    char *buf;
-    struct taskstats_cmd_param *param;
+	int c, rc, rep_len, aggr_len, len2, cmd_type;
 	__u16 id;
+	__u32 mypid;
+
 	struct nlattr *na;
-
-    /* For receiving */
-    struct sockaddr_nl kern_nla, from_nla;
-    socklen_t from_nla_len;
-    int recv_len;
-    struct taskstats_reply *reply;
-
-    struct {
-	struct nlmsghdr n;
-	struct genlmsghdr g;
-	char buf[256];
-    } req;
-
-    struct {
-	struct nlmsghdr n;
-	struct genlmsghdr g;
-	char buf[256];
-    } ans;
-
 	int nl_sd = -1;
-    int rep_len;
 	int len = 0;
-    int aggr_len, len2;
-    struct sockaddr_nl nladdr;
 	pid_t tid = 0;
 	pid_t rtid = 0;
-    int cmd_type = TASKSTATS_TYPE_TGID;
-    int c, status;
-    int forking = 0;
-    struct sigaction act = {
-	.sa_handler = SIG_IGN,
-	.sa_mask = SA_NOMASK,
-    };
-    struct sigaction tact ;

-    if (argc < 3) {
-	printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
-	exit(-1);
-    }
+	int fd = 0;
+	int count = 0;
+	int write_file = 0;
+	int maskset = 0;
+	char logfile[128];
+	int loop = 0;

-    tact.sa_handler = sigchld;
-    sigemptyset(&tact.sa_mask);
-    if (sigaction(SIGCHLD, &tact, NULL) < 0)
-	err(1, "sigaction failed for SIGCHLD\n");
+	struct msgtemplate msg;

 	while (1) {
-
-	c = getopt(argc, argv, "t:p:c:");
+		c = getopt(argc, argv, "dw:r:m:t:p:v:l");
 		if (c < 0)
 			break;

 		switch (c) {
+		case 'd':
+			printf("print delayacct stats ON\n");
+			print_delays = 1;
+			break;
+		case 'w':
+			strncpy(logfile, optarg, MAX_FILENAME);
+			printf("write to file %s\n", logfile);
+			write_file = 1;
+			break;
+		case 'r':
+			rcvbufsz = atoi(optarg);
+			printf("receive buf size %d\n", rcvbufsz);
+			if (rcvbufsz < 0)
+				err(1, "Invalid rcv buf size\n");
+			break;
+		case 'm':
+			strncpy(cpumask, optarg, sizeof(cpumask));
+			maskset = 1;
+			printf("cpumask %s maskset %d\n", cpumask, maskset);
+			break;
 		case 't':
 			tid = atoi(optarg);
 			if (!tid)
 				err(1, "Invalid tgid\n");
 			cmd_type = TASKSTATS_CMD_ATTR_TGID;
+			print_delays = 1;
 			break;
 		case 'p':
 			tid = atoi(optarg);
 			if (!tid)
 				err(1, "Invalid pid\n");
-	    cmd_type = TASKSTATS_CMD_ATTR_TGID;
+			cmd_type = TASKSTATS_CMD_ATTR_PID;
+			print_delays = 1;
 			break;
-	case 'c':
-	    opterr = 0;
-	    tid = fork();
-	    if (tid < 0)
-		err(1, "fork failed\n");
-
-	    if (tid == 0) {	/* child process */
-		if (execvp(argv[optind - 1], &argv[optind - 1]) < 0) {
-		    exit(-1);
-		}
-	    }
-	    forking = 1;
+		case 'v':
+			printf("debug on\n");
+			dbg = 1;
+			break;
+		case 'l':
+			printf("listen forever\n");
+			loop = 1;
 			break;
 		default:
-	    printf("usage %s [-t tgid][-p pid][-c cmd]\n", argv[0]);
+			printf("Unknown option %d\n", c);
 			exit(-1);
-	    break;
 		}
-	if (c == 'c')
-	    break;
 	}

-    /* Construct Netlink request message */
+	if (write_file) {
+		fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC,
+			  S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+		if (fd == -1) {
+			perror("Cannot open output file\n");
+			exit(1);
+		}
+	}

-    /* Send Netlink request message & get reply */
-
-    if ((nl_sd =
-	 create_nl_socket(NETLINK_GENERIC, TASKSTATS_LISTEN_GROUP)) < 0)
+	if ((nl_sd = create_nl_socket(NETLINK_GENERIC)) < 0)
 		err(1, "error creating Netlink socket\n");


+	mypid = getpid();
 	id = get_family_id(nl_sd);
+	if (!id) {
+		printf("Error getting family id, errno %d", errno);
+		goto err;
+	}
+	PRINTF("family id %d\n", id);

-    /* Send command needed */
-    req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
-    req.n.nlmsg_type = id;
-    req.n.nlmsg_flags = NLM_F_REQUEST;
-    req.n.nlmsg_seq = 0;
-    req.n.nlmsg_pid = tid;
-    req.g.cmd = TASKSTATS_CMD_GET;
-    na = (struct nlattr *) GENLMSG_DATA(&req);
-    na->nla_type = cmd_type;
-    na->nla_len = sizeof(unsigned int) + NLA_HDRLEN;
-    *(__u32 *) NLA_DATA(na) = tid;
-    req.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+	if (maskset) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
+			      &cpumask, sizeof(cpumask));
+		PRINTF("Sent register cpumask, retval %d\n", rc);
+		if (rc < 0) {
+			printf("error sending register cpumask\n");
+			goto err;
+		}
+	}

-
-    if (!forking && sendto_fd(nl_sd, (char *) &req, req.n.nlmsg_len) < 0)
-	err(1, "error sending message via Netlink\n");
-
-    act.sa_handler = SIG_IGN;
-    sigemptyset(&act.sa_mask);
-    if (sigaction(SIGINT, &act, NULL) < 0)
-	err(1, "sigaction failed for SIGINT\n");
+	if (tid) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      cmd_type, &tid, sizeof(__u32));
+		PRINTF("Sent pid/tgid, retval %d\n", rc);
+		if (rc < 0) {
+			printf("error sending tid/tgid cmd\n");
+			goto done;
+		}
+	}

 	do {
 		int i;
-	struct pollfd pfd;
-	int pollres;

-	pfd.events = 0xffff & ~POLLOUT;
-	pfd.fd = nl_sd;
-	pollres = poll(&pfd, 1, 5000);
-	if (pollres < 0 || done) {
-	    break;
-	}
-
-	rep_len = recv(nl_sd, &ans, sizeof(ans), 0);
-	nladdr.nl_family = AF_NETLINK;
-	nladdr.nl_groups = TASKSTATS_LISTEN_GROUP;
-
-	if (ans.n.nlmsg_type == NLMSG_ERROR) {	/* error */
-	    printf("error received NACK - leaving\n");
-	    exit(1);
-	}
+		rep_len = recv(nl_sd, &msg, sizeof(msg), 0);
+		PRINTF("received %d bytes\n", rep_len);

 		if (rep_len < 0) {
-	    err(1, "error receiving reply message via Netlink\n");
-	    break;
+			printf("nonfatal reply error: errno %d\n", errno);
+			continue;
+		}
+		if (msg.n.nlmsg_type == NLMSG_ERROR ||
+		    !NLMSG_OK((&msg.n), rep_len)) {
+			printf("fatal reply error,  errno %d\n", errno);
+			goto done;
 		}

-	/* Validate response message */
-	if (!NLMSG_OK((&ans.n), rep_len))
-	    err(1, "invalid reply message received via Netlink\n");
+		PRINTF("nlmsghdr size=%d, nlmsg_len=%d, rep_len=%d\n",
+		       sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len);

-	rep_len = GENLMSG_PAYLOAD(&ans.n);

-	na = (struct nlattr *) GENLMSG_DATA(&ans);
+		rep_len = GENLMSG_PAYLOAD(&msg.n);
+
+		na = (struct nlattr *) GENLMSG_DATA(&msg);
 		len = 0;
 		i = 0;
 		while (len < rep_len) {
 			len += NLA_ALIGN(na->nla_len);
 			switch (na->nla_type) {
-	    case TASKSTATS_TYPE_AGGR_PID:
-		/* Fall through */
 			case TASKSTATS_TYPE_AGGR_TGID:
+				/* Fall through */
+			case TASKSTATS_TYPE_AGGR_PID:
 				aggr_len = NLA_PAYLOAD(na->nla_len);
 				len2 = 0;
 				/* For nested attributes, na follows */
@ -344,33 +343,54 @@ int main(int argc, char *argv[])
 					switch (na->nla_type) {
 					case TASKSTATS_TYPE_PID:
 						rtid = *(int *) NLA_DATA(na);
+						if (print_delays)
+							printf("PID\t%d\n", rtid);
 						break;
 					case TASKSTATS_TYPE_TGID:
 						rtid = *(int *) NLA_DATA(na);
+						if (print_delays)
+							printf("TGID\t%d\n", rtid);
 						break;
 					case TASKSTATS_TYPE_STATS:
-			if (rtid == tid) {
-			    print_taskstats((struct taskstats *)
-					    NLA_DATA(na));
-			    done = 1;
+						count++;
+						if (print_delays)
+							print_delayacct((struct taskstats *) NLA_DATA(na));
+						if (fd) {
+							if (write(fd, NLA_DATA(na), na->nla_len) < 0) {
+								err(1,"write error\n");
 							}
+						}
+						if (!loop)
+							goto done;
+						break;
+					default:
+						printf("Unknown nested nla_type %d\n", na->nla_type);
 						break;
 					}
 					len2 += NLA_ALIGN(na->nla_len);
 					na = (struct nlattr *) ((char *) na + len2);
-		    if (done)
+				}
 				break;
-		}
-	    }
-	    na = (struct nlattr *) (GENLMSG_DATA(&ans) + len);
-	    if (done)
-		break;
-	}
-	if (done)
-	    break;
-    }
-    while (1);

+			default:
+				printf("Unknown nla_type %d\n", na->nla_type);
+				break;
+			}
+			na = (struct nlattr *) (GENLMSG_DATA(&msg) + len);
+		}
+	} while (loop);
+done:
+	if (maskset) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
+			      &cpumask, sizeof(cpumask));
+		printf("Sent deregister mask, retval %d\n", rc);
+		if (rc < 0)
+			err(rc, "error sending deregister cpumask\n");
+	}
+err:
 	close(nl_sd);
+	if (fd)
+		close(fd);
 	return 0;
 }
--- a/Documentation/accounting/taskstats.txt
+++ b/Documentation/accounting/taskstats.txt
@ -26,20 +26,28 @@ leader - a process is deemed alive as long as it has any task belonging to it.
 Usage
 -----

-To get statistics during task's lifetime, userspace opens a unicast netlink
+To get statistics during a task's lifetime, userspace opens a unicast netlink
 socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
 The response contains statistics for a task (if pid is specified) or the sum of
 statistics for all tasks of the process (if tgid is specified).

-To obtain statistics for tasks which are exiting, userspace opens a multicast
-netlink socket. Each time a task exits, its per-pid statistics is always sent
-by the kernel to each listener on the multicast socket. In addition, if it is
-the last thread exiting its thread group, an additional record containing the
-per-tgid stats are also sent. The latter contains the sum of per-pid stats for
-all threads in the thread group, both past and present.
+To obtain statistics for tasks which are exiting, the userspace listener
+sends a register command and specifies a cpumask. Whenever a task exits on
+one of the cpus in the cpumask, its per-pid statistics are sent to the
+registered listener. Using cpumasks allows the data received by one listener
+to be limited and assists in flow control over the netlink interface and is
+explained in more detail below.
+
+If the exiting task is the last thread exiting its thread group,
+an additional record containing the per-tgid stats is also sent to userspace.
+The latter contains the sum of per-pid stats for all threads in the thread
+group, both past and present.

 getdelays.c is a simple utility demonstrating usage of the taskstats interface
-for reporting delay accounting statistics.
+for reporting delay accounting statistics. Users can register cpumasks,
+send commands and process responses, listen for per-tid/tgid exit data,
+write the data received to a file and do basic flow control by increasing
+receive buffer sizes.

 Interface
 ---------
@ -66,10 +74,20 @@ The messages are in the format

 The taskstats payload is one of the following three kinds:

-1. Commands: Sent from user to kernel. The payload is one attribute, of type
-TASKSTATS_CMD_ATTR_PID/TGID, containing a u32 pid or tgid in the attribute
-payload. The pid/tgid denotes the task/process for which userspace wants
-statistics.
+1. Commands: Sent from user to kernel. Commands to get data on
+a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
+containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
+the task/process for which userspace wants statistics.
+
+Commands to register/deregister interest in exit data from a set of cpus
+consist of one attribute, of type
+TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
+attribute payload. The cpumask is specified as an ascii string of
+comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
+the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
+in cpus before closing the listening socket, the kernel cleans up its interest
+set over time. However, for the sake of efficiency, an explicit deregistration
+is advisable.

 2. Response for a command: sent from the kernel in response to a userspace
 command. The payload is a series of three attributes of type:
@ -138,4 +156,26 @@ struct too much, requiring disparate userspace accounting utilities to
 unnecessarily receive large structures whose fields are of no interest, then
 extending the attributes structure would be worthwhile.

+Flow control for taskstats
+--------------------------
+
+When the rate of task exits becomes large, a listener may not be able to keep
+up with the kernel's rate of sending per-tid/tgid exit data leading to data
+loss. This possibility gets compounded when the taskstats structure gets
+extended and the number of cpus grows large.
+
+To avoid losing statistics, userspace should do one or more of the following:
+
+- increase the receive buffer sizes for the netlink sockets opened by
+listeners to receive exit data.
+
+- create more listeners and reduce the number of cpus being listened to by
+each listener. In the extreme case, there could be one listener for each cpu.
+Users may also consider setting the cpu affinity of the listener to the subset
+of cpus to which it listens, especially if they are listening to just one cpu.
+
+Despite these measures, if the userspace receives ENOBUFS error messages
+indicated overflow of receive buffers, it should take measures to handle the
+loss of data.
+
 ----