prb_next_seq() always iterates from the first known sequence number. In the worst case, it might loop 8k times for a 256kB buffer, 15k times for a 512kB buffer, and 64k times for a 2MB buffer. It was reported that polling and reading using the syslog interface might occupy 50% of the CPU. Speed up the search by storing the @id of the last finalized descriptor. The loop is still needed because the @id is stored and read in a best-effort way. An atomic variable is used to keep the @id consistent. But the stores and reads are not serialized against each other. The descriptor could get reused in the meantime. The related sequence number will be used only when it is still valid. An invalid value should be read _only_ when there is a flood of messages and the ringbuffer is rapidly reused. Performance is the least of the problems in this case. Reported-by: Chunlei Wang <chunlei.wang@mediatek.com> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> Reviewed-by: John Ogness <john.ogness@linutronix.de> Signed-off-by: Petr Mladek <pmladek@suse.com> Link: https://lore.kernel.org/r/1642770388-17327-1-git-send-email-quic_mojha@quicinc.com Link: https://lore.kernel.org/lkml/YXlddJxLh77DKfIO@alley/T/#m43062e8b2a17f8dbc8c6ccdb8851fb0dbaabbb14
		
			
				
	
	
		
			385 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			385 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0 */
 | |
| 
 | |
| #ifndef _KERNEL_PRINTK_RINGBUFFER_H
 | |
| #define _KERNEL_PRINTK_RINGBUFFER_H
 | |
| 
 | |
| #include <linux/atomic.h>
 | |
| #include <linux/dev_printk.h>
 | |
| 
 | |
| /*
 | |
|  * Meta information about each stored message.
 | |
|  *
 | |
|  * All fields are set by the printk code except for @seq, which is
 | |
|  * set by the ringbuffer code.
 | |
|  */
 | |
| struct printk_info {
 | |
| 	u64	seq;		/* sequence number */
 | |
| 	u64	ts_nsec;	/* timestamp in nanoseconds */
 | |
| 	u16	text_len;	/* length of text message */
 | |
| 	u8	facility;	/* syslog facility */
 | |
| 	u8	flags:5;	/* internal record flags */
 | |
| 	u8	level:3;	/* syslog level */
 | |
| 	u32	caller_id;	/* thread id or processor id */
 | |
| 
 | |
| 	struct dev_printk_info	dev_info;
 | |
| };
 | |
| 
 | |
/*
 * A structure providing the buffers, used by writers and readers.
 *
 * Writers:
 * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
 * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
 * buffers reserved for that writer.
 *
 * Readers:
 * Using prb_rec_init_rd(), a reader sets all fields before calling
 * prb_read_valid(). Note that the reader provides the @info and @text_buf
 * buffers. On success, the struct pointed to by @info will be filled and
 * the char array pointed to by @text_buf will be filled with text data.
 */
struct printk_record {
	struct printk_info	*info;		/* record meta-data */
	char			*text_buf;	/* text data buffer */
	unsigned int		text_buf_size;	/* size of @text_buf */
};
 | |
| 
 | |
/* Specifies the logical position and span of a data block. */
struct prb_data_blk_lpos {
	unsigned long	begin;	/* logical position where the block begins */
	unsigned long	next;	/* logical position following the block */
};
 | |
| 
 | |
| /*
 | |
|  * A descriptor: the complete meta-data for a record.
 | |
|  *
 | |
|  * @state_var: A bitwise combination of descriptor ID and descriptor state.
 | |
|  */
 | |
| struct prb_desc {
 | |
| 	atomic_long_t			state_var;
 | |
| 	struct prb_data_blk_lpos	text_blk_lpos;
 | |
| };
 | |
| 
 | |
| /* A ringbuffer of "ID + data" elements. */
 | |
| struct prb_data_ring {
 | |
| 	unsigned int	size_bits;
 | |
| 	char		*data;
 | |
| 	atomic_long_t	head_lpos;
 | |
| 	atomic_long_t	tail_lpos;
 | |
| };
 | |
| 
 | |
| /* A ringbuffer of "struct prb_desc" elements. */
 | |
| struct prb_desc_ring {
 | |
| 	unsigned int		count_bits;
 | |
| 	struct prb_desc		*descs;
 | |
| 	struct printk_info	*infos;
 | |
| 	atomic_long_t		head_id;
 | |
| 	atomic_long_t		tail_id;
 | |
| 	atomic_long_t		last_finalized_id;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * The high level structure representing the printk ringbuffer.
 | |
|  *
 | |
|  * @fail: Count of failed prb_reserve() calls where not even a data-less
 | |
|  *        record was created.
 | |
|  */
 | |
| struct printk_ringbuffer {
 | |
| 	struct prb_desc_ring	desc_ring;
 | |
| 	struct prb_data_ring	text_data_ring;
 | |
| 	atomic_long_t		fail;
 | |
| };
 | |
| 
 | |
/*
 * Used by writers as a reserve/commit handle.
 *
 * @rb:         Ringbuffer where the entry is reserved.
 * @irqflags:   Saved irq flags to restore on entry commit.
 * @id:         ID of the reserved descriptor.
 * @text_space: Total occupied buffer space in the text data ring, including
 *              ID, alignment padding, and wrapping data blocks.
 *
 * This structure is an opaque handle for writers. Its contents are only
 * to be used by the ringbuffer implementation.
 */
struct prb_reserved_entry {
	struct printk_ringbuffer	*rb;
	unsigned long			irqflags;
	unsigned long			id;
	unsigned int			text_space;
};
 | |
| 
 | |
/* The possible responses of a descriptor state-query. */
enum desc_state {
	desc_miss	=  -1,	/* ID mismatch (pseudo state) */
	desc_reserved	= 0x0,	/* reserved, in use by writer */
	desc_committed	= 0x1,	/* committed by writer, could get reopened */
	desc_finalized	= 0x2,	/* committed, no further modification allowed */
	desc_reusable	= 0x3,	/* free, not yet used by any writer */
};
 | |
| 
 | |
/* Size in bytes of a data ring with @sz_bits size bits. */
#define _DATA_SIZE(sz_bits)	(1UL << (sz_bits))
/* Number of descriptors in a descriptor ring with @ct_bits count bits. */
#define _DESCS_COUNT(ct_bits)	(1U << (ct_bits))
/* A state value is an unsigned long: the top 2 bits hold the state flags. */
#define DESC_SV_BITS		(sizeof(unsigned long) * 8)
#define DESC_FLAGS_SHIFT	(DESC_SV_BITS - 2)
#define DESC_FLAGS_MASK		(3UL << DESC_FLAGS_SHIFT)
/*
 * Extract the state from / compose a state value. All arguments are
 * parenthesized so that expression arguments (e.g. "a | b") are evaluated
 * as a whole before the shift, mask, or cast is applied.
 */
#define DESC_STATE(sv)		(3UL & ((sv) >> DESC_FLAGS_SHIFT))
#define DESC_SV(id, state)	(((unsigned long)(state) << DESC_FLAGS_SHIFT) | (id))
/* The remaining (low) bits of a state value hold the descriptor ID. */
#define DESC_ID_MASK		(~DESC_FLAGS_MASK)
#define DESC_ID(sv)		((sv) & DESC_ID_MASK)
/* Special lpos values (FAILED_LPOS marks a data-less block). */
#define FAILED_LPOS		0x1
#define NO_LPOS			0x3

/* Initializer for a struct prb_data_blk_lpos without a data block. */
#define FAILED_BLK_LPOS	\
{				\
	.begin	= FAILED_LPOS,	\
	.next	= FAILED_LPOS,	\
}
 | |
| 
 | |
| /*
 | |
|  * Descriptor Bootstrap
 | |
|  *
 | |
|  * The descriptor array is minimally initialized to allow immediate usage
 | |
|  * by readers and writers. The requirements that the descriptor array
 | |
|  * initialization must satisfy:
 | |
|  *
 | |
|  *   Req1
 | |
|  *     The tail must point to an existing (committed or reusable) descriptor.
 | |
|  *     This is required by the implementation of prb_first_seq().
 | |
|  *
 | |
|  *   Req2
 | |
|  *     Readers must see that the ringbuffer is initially empty.
 | |
|  *
 | |
|  *   Req3
 | |
|  *     The first record reserved by a writer is assigned sequence number 0.
 | |
|  *
 | |
|  * To satisfy Req1, the tail initially points to a descriptor that is
 | |
|  * minimally initialized (having no data block, i.e. data-less with the
 | |
|  * data block's lpos @begin and @next values set to FAILED_LPOS).
 | |
|  *
 | |
|  * To satisfy Req2, the initial tail descriptor is initialized to the
 | |
|  * reusable state. Readers recognize reusable descriptors as existing
 | |
|  * records, but skip over them.
 | |
|  *
 | |
|  * To satisfy Req3, the last descriptor in the array is used as the initial
 | |
|  * head (and tail) descriptor. This allows the first record reserved by a
 | |
|  * writer (head + 1) to be the first descriptor in the array. (Only the first
 | |
|  * descriptor in the array could have a valid sequence number of 0.)
 | |
|  *
 | |
|  * The first time a descriptor is reserved, it is assigned a sequence number
 | |
|  * with the value of the array index. A "first time reserved" descriptor can
 | |
|  * be recognized because it has a sequence number of 0 but does not have an
 | |
|  * index of 0. (Only the first descriptor in the array could have a valid
 | |
|  * sequence number of 0.) After the first reservation, all future reservations
 | |
|  * (recycling) simply involve incrementing the sequence number by the array
 | |
|  * count.
 | |
|  *
 | |
|  *   Hack #1
 | |
|  *     Only the first descriptor in the array is allowed to have the sequence
 | |
|  *     number 0. In this case it is not possible to recognize if it is being
 | |
|  *     reserved the first time (set to index value) or has been reserved
 | |
|  *     previously (increment by the array count). This is handled by _always_
 | |
|  *     incrementing the sequence number by the array count when reserving the
 | |
|  *     first descriptor in the array. In order to satisfy Req3, the sequence
 | |
|  *     number of the first descriptor in the array is initialized to minus
 | |
|  *     the array count. Then, upon the first reservation, it is incremented
 | |
|  *     to 0, thus satisfying Req3.
 | |
|  *
 | |
|  *   Hack #2
 | |
|  *     prb_first_seq() can be called at any time by readers to retrieve the
 | |
|  *     sequence number of the tail descriptor. However, due to Req2 and Req3,
 | |
|  *     initially there are no records to report the sequence number of
 | |
|  *     (sequence numbers are u64 and there is nothing less than 0). To handle
 | |
|  *     this, the sequence number of the initial tail descriptor is initialized
 | |
|  *     to 0. Technically this is incorrect, because there is no record with
 | |
|  *     sequence number 0 (yet) and the tail descriptor is not the first
 | |
|  *     descriptor in the array. But it allows prb_read_valid() to correctly
 | |
|  *     report the existence of a record for _any_ given sequence number at all
 | |
|  *     times. Bootstrapping is complete when the tail is pushed the first
 | |
|  *     time, thus finally pointing to the first descriptor reserved by a
 | |
|  *     writer, which has the assigned sequence number 0.
 | |
|  */
 | |
| 
 | |
| /*
 | |
|  * Initiating Logical Value Overflows
 | |
|  *
 | |
|  * Both logical position (lpos) and ID values can be mapped to array indexes
 | |
|  * but may experience overflows during the lifetime of the system. To ensure
 | |
|  * that printk_ringbuffer can handle the overflows for these types, initial
 | |
|  * values are chosen that map to the correct initial array indexes, but will
 | |
|  * result in overflows soon.
 | |
|  *
 | |
|  *   BLK0_LPOS
 | |
|  *     The initial @head_lpos and @tail_lpos for data rings. It is at index
 | |
|  *     0 and the lpos value is such that it will overflow on the first wrap.
 | |
|  *
 | |
|  *   DESC0_ID
 | |
|  *     The initial @head_id and @tail_id for the desc ring. It is at the last
 | |
|  *     index of the descriptor array (see Req3 above) and the ID value is such
 | |
|  *     that it will overflow on the second wrap.
 | |
|  */
 | |
/* Initial lpos of a data ring: maps to index 0 of the data array. */
#define BLK0_LPOS(sz_bits)	(-(_DATA_SIZE(sz_bits)))
/*
 * Initial ID of the desc ring: maps to the last index of the descriptor array.
 *
 * NOTE(review): _DESCS_COUNT() is an unsigned int expression, so the negation
 * is evaluated in unsigned int before DESC_ID() widens it. Where unsigned long
 * is wider than unsigned int the result is a zero-extended value -- confirm
 * that this still provides the intended early ID overflow.
 */
#define DESC0_ID(ct_bits)	DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
/* Initial state value: the initial ID combined with the reusable state. */
#define DESC0_SV(ct_bits)	DESC_SV(DESC0_ID(ct_bits), desc_reusable)
 | |
| 
 | |
/*
 * Define a ringbuffer with an external text data buffer. The same as
 * DEFINE_PRINTKRB() but requires specifying an external buffer for the
 * text data.
 *
 * Note: The specified external buffer must be of the size:
 *       2 ^ (descbits + avgtextbits)
 *
 * All atomic_long_t members are initialized with ATOMIC_LONG_INIT() so that
 * the initializer always matches the member type.
 */
#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf)			\
static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = {				\
	/* the initial head and tail */								\
	[_DESCS_COUNT(descbits) - 1] = {							\
		/* reusable */									\
		.state_var	= ATOMIC_LONG_INIT(DESC0_SV(descbits)),				\
		/* no associated data block */							\
		.text_blk_lpos	= FAILED_BLK_LPOS,						\
	},											\
};												\
static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = {				\
	/* this will be the first record reserved by a writer */				\
	[0] = {											\
		/* will be incremented to 0 on the first reservation */				\
		.seq = -(u64)_DESCS_COUNT(descbits),						\
	},											\
	/* the initial head and tail */								\
	[_DESCS_COUNT(descbits) - 1] = {							\
		/* reports the first seq value during the bootstrap phase */			\
		.seq = 0,									\
	},											\
};												\
static struct printk_ringbuffer name = {							\
	.desc_ring = {										\
		.count_bits	= (descbits),							\
		.descs		= &_##name##_descs[0],						\
		.infos		= &_##name##_infos[0],						\
		.head_id	= ATOMIC_LONG_INIT(DESC0_ID(descbits)),				\
		.tail_id	= ATOMIC_LONG_INIT(DESC0_ID(descbits)),				\
		.last_finalized_id = ATOMIC_LONG_INIT(DESC0_ID(descbits)),			\
	},											\
	.text_data_ring = {									\
		.size_bits	= (avgtextbits) + (descbits),					\
		.data		= (text_buf),							\
		.head_lpos	= ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))),	\
		.tail_lpos	= ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))),	\
	},											\
	.fail			= ATOMIC_LONG_INIT(0),						\
}
 | |
| 
 | |
/**
 * DEFINE_PRINTKRB() - Define a ringbuffer.
 *
 * @name:        The name of the ringbuffer variable.
 * @descbits:    The number of descriptors as a power-of-2 value.
 * @avgtextbits: The average text data size per record as a power-of-2 value.
 *
 * This is a macro for defining a ringbuffer and all internal structures
 * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
 * variant where the text data buffer can be specified externally.
 */
#define DEFINE_PRINTKRB(name, descbits, avgtextbits)				\
static char _##name##_text[1U << ((avgtextbits) + (descbits))]			\
			__aligned(__alignof__(unsigned long));			\
_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])
 | |
| 
 | |
| /* Writer Interface */
 | |
| 
 | |
| /**
 | |
|  * prb_rec_init_wr() - Initialize a buffer for writing records.
 | |
|  *
 | |
|  * @r:             The record to initialize.
 | |
|  * @text_buf_size: The needed text buffer size.
 | |
|  */
 | |
| static inline void prb_rec_init_wr(struct printk_record *r,
 | |
| 				   unsigned int text_buf_size)
 | |
| {
 | |
| 	r->info = NULL;
 | |
| 	r->text_buf = NULL;
 | |
| 	r->text_buf_size = text_buf_size;
 | |
| }
 | |
| 
 | |
| bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
 | |
| 		 struct printk_record *r);
 | |
| bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
 | |
| 			 struct printk_record *r, u32 caller_id, unsigned int max_size);
 | |
| void prb_commit(struct prb_reserved_entry *e);
 | |
| void prb_final_commit(struct prb_reserved_entry *e);
 | |
| 
 | |
| void prb_init(struct printk_ringbuffer *rb,
 | |
| 	      char *text_buf, unsigned int text_buf_size,
 | |
| 	      struct prb_desc *descs, unsigned int descs_count_bits,
 | |
| 	      struct printk_info *infos);
 | |
| unsigned int prb_record_text_space(struct prb_reserved_entry *e);
 | |
| 
 | |
| /* Reader Interface */
 | |
| 
 | |
| /**
 | |
|  * prb_rec_init_rd() - Initialize a buffer for reading records.
 | |
|  *
 | |
|  * @r:             The record to initialize.
 | |
|  * @info:          A buffer to store record meta-data.
 | |
|  * @text_buf:      A buffer to store text data.
 | |
|  * @text_buf_size: The size of @text_buf.
 | |
|  *
 | |
|  * Initialize all the fields that a reader is interested in. All arguments
 | |
|  * (except @r) are optional. Only record data for arguments that are
 | |
|  * non-NULL or non-zero will be read.
 | |
|  */
 | |
| static inline void prb_rec_init_rd(struct printk_record *r,
 | |
| 				   struct printk_info *info,
 | |
| 				   char *text_buf, unsigned int text_buf_size)
 | |
| {
 | |
| 	r->info = info;
 | |
| 	r->text_buf = text_buf;
 | |
| 	r->text_buf_size = text_buf_size;
 | |
| }
 | |
| 
 | |
/**
 * prb_for_each_record() - Iterate over the records of a ringbuffer.
 *
 * @from: The sequence number to begin with.
 * @rb:   The ringbuffer to iterate over.
 * @s:    A u64 to store the sequence number on each iteration.
 * @r:    A printk_record to store the record on each iteration.
 *
 * This is a macro for conveniently iterating over a ringbuffer.
 * Note that @s may not be the sequence number of the record on each
 * iteration. For the sequence number, @r->info->seq should be checked.
 *
 * Context: Any context.
 */
#define prb_for_each_record(from, rb, s, r) \
for ((s) = (from); prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1)
 | |
| 
 | |
/**
 * prb_for_each_info() - Iterate over the meta data of a ringbuffer.
 *
 * @from: The sequence number to begin with.
 * @rb:   The ringbuffer to iterate over.
 * @s:    A u64 to store the sequence number on each iteration.
 * @i:    A printk_info to store the record meta data on each iteration.
 * @lc:   An unsigned int to store the text line count of each record.
 *
 * This is a macro for conveniently iterating over a ringbuffer.
 * Note that @s may not be the sequence number of the record on each
 * iteration. For the sequence number, @i->seq should be checked.
 *
 * Context: Any context.
 */
#define prb_for_each_info(from, rb, s, i, lc) \
for ((s) = (from); prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1)
 | |
| 
 | |
| bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
 | |
| 		    struct printk_record *r);
 | |
| bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
 | |
| 			 struct printk_info *info, unsigned int *line_count);
 | |
| 
 | |
| u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
 | |
| u64 prb_next_seq(struct printk_ringbuffer *rb);
 | |
| 
 | |
| #endif /* _KERNEL_PRINTK_RINGBUFFER_H */
 |