irq_work, smp: Allow irq_work on call_single_queue
Currently irq_work_queue_on() will issue an unconditional arch_send_call_function_single_ipi() and has the handler do irq_work_run(). This is unfortunate in that it makes the IPI handler look at a second cacheline and it misses the opportunity to avoid the IPI.

Instead, note that struct irq_work and struct __call_single_data are very similar in layout, so use a few bits in the flags word to encode a type and stick the irq_work on the call_single_queue list.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200526161908.011635912@infradead.org
parent b2a02fc43a
commit 4b44a21dd6
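The trick the message describes is easiest to see outside the kernel. Below is a minimal, self-contained user-space C sketch of the idea: both entry types share their leading layout (list node, flags word, function pointer), a type is encoded in spare high bits of the flags word, and a single dispatcher tells the entries apart, the way flush_smp_call_function_queue() does after this change. The simplified llist_node, struct definitions, dispatch() and main() are illustrative stand-ins, not the kernel's code.

/*
 * Stand-alone sketch of the shared-layout + type-bits idea (user space).
 * Build with: cc -std=c11 sketch.c
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct llist_node { struct llist_node *next; };

/* Type bits live in the high nibble of the flags word, as in the patch. */
enum {
	CSD_FLAG_LOCK      = 0x01,
	CSD_TYPE_ASYNC     = 0x00,
	CSD_TYPE_SYNC      = 0x10,
	CSD_TYPE_IRQ_WORK  = 0x20,
	CSD_FLAG_TYPE_MASK = 0xF0,
};

/* Mirrors struct __call_single_data after the patch. */
struct call_single_data {
	struct llist_node llist;
	unsigned int flags;
	void (*func)(void *info);
	void *info;
};

/* Mirrors struct irq_work after the patch: same leading layout. */
struct irq_work {
	struct llist_node llnode;
	atomic_uint flags;
	void (*func)(struct irq_work *);
};

/* The layout contract the patch enforces with BUILD_BUG_ON() in smp_init(). */
_Static_assert(offsetof(struct irq_work, llnode) ==
	       offsetof(struct call_single_data, llist), "list offset");
_Static_assert(offsetof(struct irq_work, flags) ==
	       offsetof(struct call_single_data, flags), "flags offset");
_Static_assert(offsetof(struct irq_work, func) ==
	       offsetof(struct call_single_data, func), "func offset");

/* Simplified model of the dispatch in flush_smp_call_function_queue(). */
static void dispatch(struct llist_node *node)
{
	struct call_single_data *csd = (struct call_single_data *)node;

	switch (csd->flags & CSD_FLAG_TYPE_MASK) {
	case CSD_TYPE_ASYNC:
	case CSD_TYPE_SYNC:
		csd->func(csd->info);
		break;
	case CSD_TYPE_IRQ_WORK: {
		/* Same node, reinterpreted as an irq_work entry. */
		struct irq_work *work = (struct irq_work *)node;
		work->func(work);
		break;
	}
	}
}

static void csd_func(void *info)
{
	printf("csd callback: %s\n", (char *)info);
}

static void work_func(struct irq_work *w)
{
	printf("irq_work callback, flags=%#x\n", (unsigned int)w->flags);
}

int main(void)
{
	struct call_single_data csd = {
		.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
		.func  = csd_func,
		.info  = "hello",
	};
	struct irq_work work = { .func = work_func };

	atomic_fetch_or(&work.flags, CSD_TYPE_IRQ_WORK);

	dispatch(&csd.llist);	/* runs csd_func("hello") */
	dispatch(&work.llnode);	/* runs work_func(&work)  */
	return 0;
}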
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -13,6 +13,8 @@
  * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
  */
 
+/* flags share CSD_FLAG_ space */
+
 #define IRQ_WORK_PENDING	BIT(0)
 #define IRQ_WORK_BUSY		BIT(1)
 
@@ -23,9 +25,12 @@
 
 #define IRQ_WORK_CLAIMED	(IRQ_WORK_PENDING | IRQ_WORK_BUSY)
 
+/*
+ * structure shares layout with single_call_data_t.
+ */
 struct irq_work {
-	atomic_t flags;
 	struct llist_node llnode;
+	atomic_t flags;
 	void (*func)(struct irq_work *);
 };
 
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -16,17 +16,38 @@
 typedef void (*smp_call_func_t)(void *info);
 typedef bool (*smp_cond_func_t)(int cpu, void *info);
 
+enum {
+	CSD_FLAG_LOCK		= 0x01,
+
+	/* IRQ_WORK_flags */
+
+	CSD_TYPE_ASYNC		= 0x00,
+	CSD_TYPE_SYNC		= 0x10,
+	CSD_TYPE_IRQ_WORK	= 0x20,
+	CSD_FLAG_TYPE_MASK	= 0xF0,
+};
+
+/*
+ * structure shares (partial) layout with struct irq_work
+ */
 struct __call_single_data {
 	struct llist_node llist;
+	unsigned int flags;
 	smp_call_func_t func;
 	void *info;
-	unsigned int flags;
 };
 
 /* Use __aligned() to avoid to use 2 cache lines for 1 csd */
 typedef struct __call_single_data call_single_data_t
 	__aligned(sizeof(struct __call_single_data));
 
+/*
+ * Enqueue a llist_node on the call_single_queue; be very careful, read
+ * flush_smp_call_function_queue() in detail.
+ */
+extern void __smp_call_single_queue(int cpu, struct llist_node *node);
+
 /* total number of cpus in this system (may exceed NR_CPUS) */
 extern unsigned int total_cpus;
 
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work)
 {
 	int oflags;
 
-	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
+	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
 	/*
 	 * If the work is already pending, no need to raise the IPI.
 	 * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
@@ -102,8 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
 	if (cpu != smp_processor_id()) {
 		/* Arch remote IPI send/receive backend aren't NMI safe */
 		WARN_ON_ONCE(in_nmi());
-		if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
-			arch_send_call_function_single_ipi(cpu);
+		__smp_call_single_queue(cpu, &work->llnode);
 	} else {
 		__irq_work_queue_local(work);
 	}
@@ -131,6 +130,31 @@ bool irq_work_needs_cpu(void)
 	return true;
 }
 
+void irq_work_single(void *arg)
+{
+	struct irq_work *work = arg;
+	int flags;
+
+	/*
+	 * Clear the PENDING bit, after this point the @work
+	 * can be re-used.
+	 * Make it immediately visible so that other CPUs trying
+	 * to claim that work don't rely on us to handle their data
+	 * while we are in the middle of the func.
+	 */
+	flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+
+	lockdep_irq_work_enter(work);
+	work->func(work);
+	lockdep_irq_work_exit(work);
+	/*
+	 * Clear the BUSY bit and return to the free state if
+	 * no-one else claimed it meanwhile.
+	 */
+	flags &= ~IRQ_WORK_PENDING;
+	(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+}
+
 static void irq_work_run_list(struct llist_head *list)
 {
 	struct irq_work *work, *tmp;
@@ -142,27 +166,8 @@ static void irq_work_run_list(struct llist_head *list)
 		return;
 
 	llnode = llist_del_all(list);
-	llist_for_each_entry_safe(work, tmp, llnode, llnode) {
-		int flags;
-		/*
-		 * Clear the PENDING bit, after this point the @work
-		 * can be re-used.
-		 * Make it immediately visible so that other CPUs trying
-		 * to claim that work don't rely on us to handle their data
-		 * while we are in the middle of the func.
-		 */
-		flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
-
-		lockdep_irq_work_enter(work);
-		work->func(work);
-		lockdep_irq_work_exit(work);
-		/*
-		 * Clear the BUSY bit and return to the free state if
-		 * no-one else claimed it meanwhile.
-		 */
-		flags &= ~IRQ_WORK_PENDING;
-		(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
-	}
+	llist_for_each_entry_safe(work, tmp, llnode, llnode)
+		irq_work_single(work);
 }
 
 /*
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -23,10 +23,8 @@
 
 #include "smpboot.h"
 
-enum {
-	CSD_FLAG_LOCK		= 0x01,
-	CSD_FLAG_SYNCHRONOUS	= 0x02,
-};
+
+#define CSD_TYPE(_csd)	((_csd)->flags & CSD_FLAG_TYPE_MASK)
 
 struct call_function_data {
 	call_single_data_t	__percpu *csd;
@@ -137,15 +135,33 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 
 extern void send_call_function_single_ipi(int cpu);
 
+void __smp_call_single_queue(int cpu, struct llist_node *node)
+{
+	/*
+	 * The list addition should be visible before sending the IPI
+	 * handler locks the list to pull the entry off it because of
+	 * normal cache coherency rules implied by spinlocks.
+	 *
+	 * If IPIs can go out of order to the cache coherency protocol
+	 * in an architecture, sufficient synchronisation should be added
+	 * to arch code to make it appear to obey cache coherency WRT
+	 * locking and barrier primitives. Generic code isn't really
+	 * equipped to do the right thing...
+	 */
+	if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+		send_call_function_single_ipi(cpu);
+}
+
 /*
  * Insert a previously allocated call_single_data_t element
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, call_single_data_t *csd,
-			       smp_call_func_t func, void *info)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
 {
 	if (cpu == smp_processor_id()) {
+		smp_call_func_t func = csd->func;
+		void *info = csd->info;
 		unsigned long flags;
 
 		/*
@@ -159,28 +175,12 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
 		return 0;
 	}
 
-
 	if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
 		csd_unlock(csd);
 		return -ENXIO;
 	}
 
-	csd->func = func;
-	csd->info = info;
-
-	/*
-	 * The list addition should be visible before sending the IPI
-	 * handler locks the list to pull the entry off it because of
-	 * normal cache coherency rules implied by spinlocks.
-	 *
-	 * If IPIs can go out of order to the cache coherency protocol
-	 * in an architecture, sufficient synchronisation should be added
-	 * to arch code to make it appear to obey cache coherency WRT
-	 * locking and barrier primitives. Generic code isn't really
-	 * equipped to do the right thing...
-	 */
-	if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
-		send_call_function_single_ipi(cpu);
+	__smp_call_single_queue(cpu, &csd->llist);
 
 	return 0;
 }
@@ -194,16 +194,10 @@ static int generic_exec_single(int cpu, call_single_data_t *csd,
 void generic_smp_call_function_single_interrupt(void)
 {
 	flush_smp_call_function_queue(true);
-
-	/*
-	 * Handle irq works queued remotely by irq_work_queue_on().
-	 * Smp functions above are typically synchronous so they
-	 * better run first since some other CPUs may be busy waiting
-	 * for them.
-	 */
-	irq_work_run();
 }
 
+extern void irq_work_single(void *);
+
 /**
  * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
  *
@@ -241,9 +235,21 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 		 * We don't have to use the _safe() variant here
 		 * because we are not invoking the IPI handlers yet.
 		 */
-		llist_for_each_entry(csd, entry, llist)
-			pr_warn("IPI callback %pS sent to offline CPU\n",
-				csd->func);
+		llist_for_each_entry(csd, entry, llist) {
+			switch (CSD_TYPE(csd)) {
+			case CSD_TYPE_ASYNC:
+			case CSD_TYPE_SYNC:
+			case CSD_TYPE_IRQ_WORK:
+				pr_warn("IPI callback %pS sent to offline CPU\n",
+					csd->func);
+				break;
+
+			default:
+				pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
+					CSD_TYPE(csd));
+				break;
+			}
+		}
 	}
 
 	/*
@@ -251,16 +257,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 	 */
 	prev = NULL;
 	llist_for_each_entry_safe(csd, csd_next, entry, llist) {
-		smp_call_func_t func = csd->func;
-		void *info = csd->info;
-
 		/* Do we wait until *after* callback? */
-		if (csd->flags & CSD_FLAG_SYNCHRONOUS) {
+		if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
+			smp_call_func_t func = csd->func;
+			void *info = csd->info;
+
 			if (prev) {
 				prev->next = &csd_next->llist;
 			} else {
 				entry = &csd_next->llist;
 			}
+
 			func(info);
 			csd_unlock(csd);
 		} else {
@@ -272,11 +279,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 	 * Second; run all !SYNC callbacks.
 	 */
 	llist_for_each_entry_safe(csd, csd_next, entry, llist) {
-		smp_call_func_t func = csd->func;
-		void *info = csd->info;
+		int type = CSD_TYPE(csd);
 
-		csd_unlock(csd);
-		func(info);
+		if (type == CSD_TYPE_ASYNC) {
+			smp_call_func_t func = csd->func;
+			void *info = csd->info;
+
+			csd_unlock(csd);
+			func(info);
+		} else if (type == CSD_TYPE_IRQ_WORK) {
+			irq_work_single(csd);
+		}
 	}
 }
 
@@ -305,7 +318,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 {
 	call_single_data_t *csd;
 	call_single_data_t csd_stack = {
-		.flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+		.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
 	};
 	int this_cpu;
 	int err;
@@ -339,7 +352,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 		csd_lock(csd);
 	}
 
-	err = generic_exec_single(cpu, csd, func, info);
+	csd->func = func;
+	csd->info = info;
+
+	err = generic_exec_single(cpu, csd);
 
 	if (wait)
 		csd_lock_wait(csd);
@@ -385,7 +401,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 	csd->flags = CSD_FLAG_LOCK;
 	smp_wmb();
 
-	err = generic_exec_single(cpu, csd, csd->func, csd->info);
+	err = generic_exec_single(cpu, csd);
 
 out:
 	preempt_enable();
@@ -500,7 +516,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 
 		csd_lock(csd);
 		if (wait)
-			csd->flags |= CSD_FLAG_SYNCHRONOUS;
+			csd->flags |= CSD_TYPE_SYNC;
 		csd->func = func;
 		csd->info = info;
 		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
@@ -632,6 +648,17 @@ void __init smp_init(void)
 {
 	int num_nodes, num_cpus;
 
+	/*
+	 * Ensure struct irq_work layout matches so that
+	 * flush_smp_call_function_queue() can do horrible things.
+	 */
+	BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
+		     offsetof(struct __call_single_data, llist));
+	BUILD_BUG_ON(offsetof(struct irq_work, func) !=
+		     offsetof(struct __call_single_data, func));
+	BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
+		     offsetof(struct __call_single_data, flags));
+
 	idle_threads_init();
 	cpuhp_threads_init();
 
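For reference, the caller-visible irq_work API is unchanged by this commit; a user still initialises a work item and queues it on a remote CPU the same way, only the queueing path underneath now goes through the target CPU's call_single_queue. A rough illustrative snippet follows; it is not part of this patch, and my_remote_work, my_work and kick_cpu are made-up names.

#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/smp.h>

/* Hypothetical callback: runs on the target CPU from the IPI path. */
static void my_remote_work(struct irq_work *work)
{
	pr_info("irq_work ran on CPU%d\n", smp_processor_id());
}

static struct irq_work my_work;

static void kick_cpu(int cpu)
{
	init_irq_work(&my_work, my_remote_work);

	/*
	 * After this patch the work item is pushed onto the target CPU's
	 * call_single_queue (tagged CSD_TYPE_IRQ_WORK in its flags word),
	 * and the IPI is skipped when that queue already has entries pending.
	 */
	irq_work_queue_on(&my_work, cpu);
}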