tracing/user_events: Use bits vs bytes for enabled status page data
User processes may require many events and when they do the cache performance of a byte index status check is less ideal than a bit index. The previous event limit per-page was 4096, the new limit is 32,768. This change adds a bitwise index to the user_reg struct. Programs check that the bit at status_bit has a bit set within the status page(s). Link: https://lkml.kernel.org/r/20220728233309.1896-6-beaub@linux.microsoft.com Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/ Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
This commit is contained in:
		
				
					committed by
					
						 Steven Rostedt (Google)
						Steven Rostedt (Google)
					
				
			
			
				
	
			
			
			
						parent
						
							d401b72458
						
					
				
				
					commit
					39d6d08b2e
				
			| @@ -20,15 +20,6 @@ | ||||
| #define USER_EVENTS_SYSTEM "user_events" | ||||
| #define USER_EVENTS_PREFIX "u:" | ||||
| 
 | ||||
| /* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */ | ||||
| #define EVENT_BIT_FTRACE 0 | ||||
| #define EVENT_BIT_PERF 1 | ||||
| #define EVENT_BIT_OTHER 7 | ||||
| 
 | ||||
| #define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE) | ||||
| #define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF) | ||||
| #define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER) | ||||
| 
 | ||||
| /* Create dynamic location entry within a 32-bit value */ | ||||
| #define DYN_LOC(offset, size) ((size) << 16 | (offset)) | ||||
| 
 | ||||
| @@ -45,12 +36,12 @@ struct user_reg { | ||||
| 	/* Input: Pointer to string with event name, description and flags */ | ||||
| 	__u64 name_args; | ||||
| 
 | ||||
| 	/* Output: Byte index of the event within the status page */ | ||||
| 	__u32 status_index; | ||||
| 	/* Output: Bitwise index of the event within the status page */ | ||||
| 	__u32 status_bit; | ||||
| 
 | ||||
| 	/* Output: Index of the event to use when writing data */ | ||||
| 	__u32 write_index; | ||||
| }; | ||||
| } __attribute__((__packed__)); | ||||
| 
 | ||||
| #define DIAG_IOC_MAGIC '*' | ||||
| 
 | ||||
|   | ||||
| @@ -40,17 +40,44 @@ | ||||
|  */ | ||||
| #define MAX_PAGE_ORDER 0 | ||||
| #define MAX_PAGES (1 << MAX_PAGE_ORDER) | ||||
| #define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) | ||||
| #define MAX_BYTES (MAX_PAGES * PAGE_SIZE) | ||||
| #define MAX_EVENTS (MAX_BYTES * 8) | ||||
| 
 | ||||
| /* Limit how long of an event name plus args within the subsystem. */ | ||||
| #define MAX_EVENT_DESC 512 | ||||
| #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) | ||||
| #define MAX_FIELD_ARRAY_SIZE 1024 | ||||
| 
 | ||||
| /*
 | ||||
|  * The MAP_STATUS_* macros are used for taking a index and determining the | ||||
|  * appropriate byte and the bit in the byte to set/reset for an event. | ||||
|  * | ||||
|  * The lower 3 bits of the index decide which bit to set. | ||||
|  * The remaining upper bits of the index decide which byte to use for the bit. | ||||
|  * | ||||
|  * This is used when an event has a probe attached/removed to reflect live | ||||
|  * status of the event wanting tracing or not to user-programs via shared | ||||
|  * memory maps. | ||||
|  */ | ||||
| #define MAP_STATUS_BYTE(index) ((index) >> 3) | ||||
| #define MAP_STATUS_MASK(index) BIT((index) & 7) | ||||
| 
 | ||||
| /*
 | ||||
|  * Internal bits (kernel side only) to keep track of connected probes: | ||||
|  * These are used when status is requested in text form about an event. These | ||||
|  * bits are compared against an internal byte on the event to determine which | ||||
|  * probes to print out to the user. | ||||
|  * | ||||
|  * These do not reflect the mapped bytes between the user and kernel space. | ||||
|  */ | ||||
| #define EVENT_STATUS_FTRACE BIT(0) | ||||
| #define EVENT_STATUS_PERF BIT(1) | ||||
| #define EVENT_STATUS_OTHER BIT(7) | ||||
| 
 | ||||
| static char *register_page_data; | ||||
| 
 | ||||
| static DEFINE_MUTEX(reg_mutex); | ||||
| static DEFINE_HASHTABLE(register_table, 4); | ||||
| static DEFINE_HASHTABLE(register_table, 8); | ||||
| static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); | ||||
| 
 | ||||
| /*
 | ||||
| @@ -72,6 +99,7 @@ struct user_event { | ||||
| 	int index; | ||||
| 	int flags; | ||||
| 	int min_size; | ||||
| 	char status; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
| @@ -106,6 +134,22 @@ static u32 user_event_key(char *name) | ||||
| 	return jhash(name, strlen(name), 0); | ||||
| } | ||||
| 
 | ||||
| static __always_inline | ||||
| void user_event_register_set(struct user_event *user) | ||||
| { | ||||
| 	int i = user->index; | ||||
| 
 | ||||
| 	register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); | ||||
| } | ||||
| 
 | ||||
| static __always_inline | ||||
| void user_event_register_clear(struct user_event *user) | ||||
| { | ||||
| 	int i = user->index; | ||||
| 
 | ||||
| 	register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); | ||||
| } | ||||
| 
 | ||||
| static __always_inline __must_check | ||||
| bool user_event_last_ref(struct user_event *user) | ||||
| { | ||||
| @@ -648,7 +692,7 @@ static int destroy_user_event(struct user_event *user) | ||||
| 
 | ||||
| 	dyn_event_remove(&user->devent); | ||||
| 
 | ||||
| 	register_page_data[user->index] = 0; | ||||
| 	user_event_register_clear(user); | ||||
| 	clear_bit(user->index, page_bitmap); | ||||
| 	hash_del(&user->node); | ||||
| 
 | ||||
| @@ -827,7 +871,12 @@ static void update_reg_page_for(struct user_event *user) | ||||
| 		rcu_read_unlock_sched(); | ||||
| 	} | ||||
| 
 | ||||
| 	register_page_data[user->index] = status; | ||||
| 	if (status) | ||||
| 		user_event_register_set(user); | ||||
| 	else | ||||
| 		user_event_register_clear(user); | ||||
| 
 | ||||
| 	user->status = status; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @@ -1332,7 +1381,17 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) | ||||
| 	if (size > PAGE_SIZE) | ||||
| 		return -E2BIG; | ||||
| 
 | ||||
| 	return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); | ||||
| 	if (size < offsetofend(struct user_reg, write_index)) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); | ||||
| 
 | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	kreg->size = size; | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @@ -1376,7 +1435,7 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	put_user((u32)ret, &ureg->write_index); | ||||
| 	put_user(user->index, &ureg->status_index); | ||||
| 	put_user(user->index, &ureg->status_bit); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| @@ -1485,7 +1544,7 @@ static int user_status_mmap(struct file *file, struct vm_area_struct *vma) | ||||
| { | ||||
| 	unsigned long size = vma->vm_end - vma->vm_start; | ||||
| 
 | ||||
| 	if (size != MAX_EVENTS) | ||||
| 	if (size != MAX_BYTES) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	return remap_pfn_range(vma, vma->vm_start, | ||||
| @@ -1520,7 +1579,7 @@ static int user_seq_show(struct seq_file *m, void *p) | ||||
| 	mutex_lock(®_mutex); | ||||
| 
 | ||||
| 	hash_for_each(register_table, i, user, node) { | ||||
| 		status = register_page_data[user->index]; | ||||
| 		status = user->status; | ||||
| 		flags = user->flags; | ||||
| 
 | ||||
| 		seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); | ||||
|   | ||||
| @@ -12,13 +12,21 @@ | ||||
| #include <fcntl.h> | ||||
| #include <stdio.h> | ||||
| #include <unistd.h> | ||||
| #include <asm/bitsperlong.h> | ||||
| #include <endian.h> | ||||
| #include <linux/user_events.h> | ||||
| 
 | ||||
| #if __BITS_PER_LONG == 64 | ||||
| #define endian_swap(x) htole64(x) | ||||
| #else | ||||
| #define endian_swap(x) htole32(x) | ||||
| #endif | ||||
| 
 | ||||
| /* Assumes debugfs is mounted */ | ||||
| const char *data_file = "/sys/kernel/debug/tracing/user_events_data"; | ||||
| const char *status_file = "/sys/kernel/debug/tracing/user_events_status"; | ||||
| 
 | ||||
| static int event_status(char **status) | ||||
| static int event_status(long **status) | ||||
| { | ||||
| 	int fd = open(status_file, O_RDONLY); | ||||
| 
 | ||||
| @@ -33,7 +41,8 @@ static int event_status(char **status) | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int event_reg(int fd, const char *command, int *status, int *write) | ||||
| static int event_reg(int fd, const char *command, long *index, long *mask, | ||||
| 		     int *write) | ||||
| { | ||||
| 	struct user_reg reg = {0}; | ||||
| 
 | ||||
| @@ -43,7 +52,8 @@ static int event_reg(int fd, const char *command, int *status, int *write) | ||||
| 	if (ioctl(fd, DIAG_IOCSREG, ®) == -1) | ||||
| 		return -1; | ||||
| 
 | ||||
| 	*status = reg.status_index; | ||||
| 	*index = reg.status_bit / __BITS_PER_LONG; | ||||
| 	*mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG)); | ||||
| 	*write = reg.write_index; | ||||
| 
 | ||||
| 	return 0; | ||||
| @@ -51,8 +61,9 @@ static int event_reg(int fd, const char *command, int *status, int *write) | ||||
| 
 | ||||
| int main(int argc, char **argv) | ||||
| { | ||||
| 	int data_fd, status, write; | ||||
| 	char *status_page; | ||||
| 	int data_fd, write; | ||||
| 	long index, mask; | ||||
| 	long *status_page; | ||||
| 	struct iovec io[2]; | ||||
| 	__u32 count = 0; | ||||
| 
 | ||||
| @@ -61,7 +72,7 @@ int main(int argc, char **argv) | ||||
| 
 | ||||
| 	data_fd = open(data_file, O_RDWR); | ||||
| 
 | ||||
| 	if (event_reg(data_fd, "test u32 count", &status, &write) == -1) | ||||
| 	if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1) | ||||
| 		return errno; | ||||
| 
 | ||||
| 	/* Setup iovec */ | ||||
| @@ -75,7 +86,7 @@ ask: | ||||
| 	getchar(); | ||||
| 
 | ||||
| 	/* Check if anyone is listening */ | ||||
| 	if (status_page[status]) { | ||||
| 	if (status_page[index] & mask) { | ||||
| 		/* Yep, trace out our data */ | ||||
| 		writev(data_fd, (const struct iovec *)io, 2); | ||||
| 
 | ||||
|   | ||||
| @@ -22,6 +22,11 @@ const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_e | ||||
| const char *trace_file = "/sys/kernel/debug/tracing/trace"; | ||||
| const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format"; | ||||
| 
 | ||||
| static inline int status_check(char *status_page, int status_bit) | ||||
| { | ||||
| 	return status_page[status_bit >> 3] & (1 << (status_bit & 7)); | ||||
| } | ||||
| 
 | ||||
| static int trace_bytes(void) | ||||
| { | ||||
| 	int fd = open(trace_file, O_RDONLY); | ||||
| @@ -197,12 +202,12 @@ TEST_F(user, register_events) { | ||||
| 	/* Register should work */ | ||||
| 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); | ||||
| 	ASSERT_EQ(0, reg.write_index); | ||||
| 	ASSERT_NE(0, reg.status_index); | ||||
| 	ASSERT_NE(0, reg.status_bit); | ||||
| 
 | ||||
| 	/* Multiple registers should result in same index */ | ||||
| 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); | ||||
| 	ASSERT_EQ(0, reg.write_index); | ||||
| 	ASSERT_NE(0, reg.status_index); | ||||
| 	ASSERT_NE(0, reg.status_bit); | ||||
| 
 | ||||
| 	/* Ensure disabled */ | ||||
| 	self->enable_fd = open(enable_file, O_RDWR); | ||||
| @@ -212,15 +217,15 @@ TEST_F(user, register_events) { | ||||
| 	/* MMAP should work and be zero'd */ | ||||
| 	ASSERT_NE(MAP_FAILED, status_page); | ||||
| 	ASSERT_NE(NULL, status_page); | ||||
| 	ASSERT_EQ(0, status_page[reg.status_index]); | ||||
| 	ASSERT_EQ(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	/* Enable event and ensure bits updated in status */ | ||||
| 	ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) | ||||
| 	ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]); | ||||
| 	ASSERT_NE(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	/* Disable event and ensure bits updated in status */ | ||||
| 	ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0"))) | ||||
| 	ASSERT_EQ(0, status_page[reg.status_index]); | ||||
| 	ASSERT_EQ(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	/* File still open should return -EBUSY for delete */ | ||||
| 	ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event")); | ||||
| @@ -240,6 +245,8 @@ TEST_F(user, write_events) { | ||||
| 	struct iovec io[3]; | ||||
| 	__u32 field1, field2; | ||||
| 	int before = 0, after = 0; | ||||
| 	int page_size = sysconf(_SC_PAGESIZE); | ||||
| 	char *status_page; | ||||
| 
 | ||||
| 	reg.size = sizeof(reg); | ||||
| 	reg.name_args = (__u64)"__test_event u32 field1; u32 field2"; | ||||
| @@ -254,10 +261,18 @@ TEST_F(user, write_events) { | ||||
| 	io[2].iov_base = &field2; | ||||
| 	io[2].iov_len = sizeof(field2); | ||||
| 
 | ||||
| 	status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, | ||||
| 			   self->status_fd, 0); | ||||
| 
 | ||||
| 	/* Register should work */ | ||||
| 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); | ||||
| 	ASSERT_EQ(0, reg.write_index); | ||||
| 	ASSERT_NE(0, reg.status_index); | ||||
| 	ASSERT_NE(0, reg.status_bit); | ||||
| 
 | ||||
| 	/* MMAP should work and be zero'd */ | ||||
| 	ASSERT_NE(MAP_FAILED, status_page); | ||||
| 	ASSERT_NE(NULL, status_page); | ||||
| 	ASSERT_EQ(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	/* Write should fail on invalid slot with ENOENT */ | ||||
| 	io[0].iov_base = &field2; | ||||
| @@ -271,6 +286,9 @@ TEST_F(user, write_events) { | ||||
| 	self->enable_fd = open(enable_file, O_RDWR); | ||||
| 	ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) | ||||
| 
 | ||||
| 	/* Event should now be enabled */ | ||||
| 	ASSERT_NE(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	/* Write should make it out to ftrace buffers */ | ||||
| 	before = trace_bytes(); | ||||
| 	ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3)); | ||||
| @@ -298,7 +316,7 @@ TEST_F(user, write_fault) { | ||||
| 	/* Register should work */ | ||||
| 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); | ||||
| 	ASSERT_EQ(0, reg.write_index); | ||||
| 	ASSERT_NE(0, reg.status_index); | ||||
| 	ASSERT_NE(0, reg.status_bit); | ||||
| 
 | ||||
| 	/* Write should work normally */ | ||||
| 	ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2)); | ||||
| @@ -315,6 +333,11 @@ TEST_F(user, write_validator) { | ||||
| 	int loc, bytes; | ||||
| 	char data[8]; | ||||
| 	int before = 0, after = 0; | ||||
| 	int page_size = sysconf(_SC_PAGESIZE); | ||||
| 	char *status_page; | ||||
| 
 | ||||
| 	status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, | ||||
| 			   self->status_fd, 0); | ||||
| 
 | ||||
| 	reg.size = sizeof(reg); | ||||
| 	reg.name_args = (__u64)"__test_event __rel_loc char[] data"; | ||||
| @@ -322,7 +345,12 @@ TEST_F(user, write_validator) { | ||||
| 	/* Register should work */ | ||||
| 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); | ||||
| 	ASSERT_EQ(0, reg.write_index); | ||||
| 	ASSERT_NE(0, reg.status_index); | ||||
| 	ASSERT_NE(0, reg.status_bit); | ||||
| 
 | ||||
| 	/* MMAP should work and be zero'd */ | ||||
| 	ASSERT_NE(MAP_FAILED, status_page); | ||||
| 	ASSERT_NE(NULL, status_page); | ||||
| 	ASSERT_EQ(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	io[0].iov_base = ®.write_index; | ||||
| 	io[0].iov_len = sizeof(reg.write_index); | ||||
| @@ -340,6 +368,9 @@ TEST_F(user, write_validator) { | ||||
| 	self->enable_fd = open(enable_file, O_RDWR); | ||||
| 	ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1"))) | ||||
| 
 | ||||
| 	/* Event should now be enabled */ | ||||
| 	ASSERT_NE(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	/* Full in-bounds write should work */ | ||||
| 	before = trace_bytes(); | ||||
| 	loc = DYN_LOC(0, bytes); | ||||
|   | ||||
| @@ -35,6 +35,11 @@ static long perf_event_open(struct perf_event_attr *pe, pid_t pid, | ||||
| 	return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags); | ||||
| } | ||||
| 
 | ||||
| static inline int status_check(char *status_page, int status_bit) | ||||
| { | ||||
| 	return status_page[status_bit >> 3] & (1 << (status_bit & 7)); | ||||
| } | ||||
| 
 | ||||
| static int get_id(void) | ||||
| { | ||||
| 	FILE *fp = fopen(id_file, "r"); | ||||
| @@ -120,8 +125,8 @@ TEST_F(user, perf_write) { | ||||
| 	/* Register should work */ | ||||
| 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, ®)); | ||||
| 	ASSERT_EQ(0, reg.write_index); | ||||
| 	ASSERT_NE(0, reg.status_index); | ||||
| 	ASSERT_EQ(0, status_page[reg.status_index]); | ||||
| 	ASSERT_NE(0, reg.status_bit); | ||||
| 	ASSERT_EQ(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	/* Id should be there */ | ||||
| 	id = get_id(); | ||||
| @@ -144,7 +149,7 @@ TEST_F(user, perf_write) { | ||||
| 	ASSERT_NE(MAP_FAILED, perf_page); | ||||
| 
 | ||||
| 	/* Status should be updated */ | ||||
| 	ASSERT_EQ(EVENT_STATUS_PERF, status_page[reg.status_index]); | ||||
| 	ASSERT_NE(0, status_check(status_page, reg.status_bit)); | ||||
| 
 | ||||
| 	event.index = reg.write_index; | ||||
| 	event.field1 = 0xc001; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user