39d8f0d102
Recent improvements in LOCKDEP highlighted a potential A-A deadlock with pcpu_freelist in NMI: ./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi [ 18.984807] ================================ [ 18.984807] WARNING: inconsistent lock state [ 18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted [ 18.984809] -------------------------------- [ 18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage. [ 18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0xe3/0x180 [ 18.984813] {INITIAL USE} state was registered at: [ 18.984814] lock_acquire+0x175/0x7c0 [ 18.984814] _raw_spin_lock+0x2c/0x40 [ 18.984815] __pcpu_freelist_pop+0xe3/0x180 [ 18.984815] pcpu_freelist_pop+0x31/0x40 [ 18.984816] htab_map_alloc+0xbbf/0xf40 [ 18.984816] __do_sys_bpf+0x5aa/0x3ed0 [ 18.984817] do_syscall_64+0x2d/0x40 [ 18.984818] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 18.984818] irq event stamp: 12 [...] [ 18.984822] other info that might help us debug this: [ 18.984823] Possible unsafe locking scenario: [ 18.984823] [ 18.984824] CPU0 [ 18.984824] ---- [ 18.984824] lock(&head->lock); [ 18.984826] <Interrupt> [ 18.984826] lock(&head->lock); [ 18.984827] [ 18.984828] *** DEADLOCK *** [ 18.984828] [ 18.984829] 2 locks held by test_progs/1990: [...] [ 18.984838] <NMI> [ 18.984838] dump_stack+0x9a/0xd0 [ 18.984839] lock_acquire+0x5c9/0x7c0 [ 18.984839] ? lock_release+0x6f0/0x6f0 [ 18.984840] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984840] _raw_spin_lock+0x2c/0x40 [ 18.984841] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984841] __pcpu_freelist_pop+0xe3/0x180 [ 18.984842] pcpu_freelist_pop+0x17/0x40 [ 18.984842] ? lock_release+0x6f0/0x6f0 [ 18.984843] __bpf_get_stackid+0x534/0xaf0 [ 18.984843] bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350 [ 18.984844] bpf_overflow_handler+0x12f/0x3f0 This is because pcpu_freelist_head.lock is accessed in both NMI and non-NMI context. 
Fix this issue by using raw_spin_trylock() in NMI. Since NMI interrupts non-NMI context, when NMI context tries to lock the raw_spinlock, non-NMI context of the same CPU may already have locked a lock and is blocked from unlocking the lock. For a system with N CPUs, there could be N NMIs at the same time, and they may block N non-NMI raw_spinlocks. This is tricky for pcpu_freelist_push(), where unlike _pop(), failing _push() means leaking memory. This issue is more likely to trigger on non-SMP systems. Fix this issue with an extra list, pcpu_freelist.extralist. The extralist is primarily used to take _push() when raw_spin_trylock() fails on all the per CPU lists. It should be empty most of the time. The following table summarizes the behavior of pcpu_freelist in NMI and non-NMI: non-NMI pop(): use _lock(); check per CPU lists first; if all per CPU lists are empty, check extralist; if extralist is empty, return NULL. non-NMI push(): use _lock(); only push to per CPU lists. NMI pop(): use _trylock(); check per CPU lists first; if all per CPU lists are locked or empty, check extralist; if extralist is locked or empty, return NULL. NMI push(): use _trylock(); check per CPU lists first; if all per CPU lists are locked, try to push to extralist; if extralist is also locked, keep trying on per CPU lists. Reported-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Martin KaFai Lau <kafai@fb.com> Link: https://lore.kernel.org/bpf/20201005165838.3735218-1-songliubraving@fb.com
210 lines
4.7 KiB
C
210 lines
4.7 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/* Copyright (c) 2016 Facebook
|
|
*/
|
|
#include "percpu_freelist.h"
|
|
|
|
int pcpu_freelist_init(struct pcpu_freelist *s)
|
|
{
|
|
int cpu;
|
|
|
|
s->freelist = alloc_percpu(struct pcpu_freelist_head);
|
|
if (!s->freelist)
|
|
return -ENOMEM;
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
|
|
|
|
raw_spin_lock_init(&head->lock);
|
|
head->first = NULL;
|
|
}
|
|
raw_spin_lock_init(&s->extralist.lock);
|
|
s->extralist.first = NULL;
|
|
return 0;
|
|
}
|
|
|
|
void pcpu_freelist_destroy(struct pcpu_freelist *s)
|
|
{
|
|
free_percpu(s->freelist);
|
|
}
|
|
|
|
static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
|
|
struct pcpu_freelist_node *node)
|
|
{
|
|
node->next = head->first;
|
|
head->first = node;
|
|
}
|
|
|
|
static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
|
|
struct pcpu_freelist_node *node)
|
|
{
|
|
raw_spin_lock(&head->lock);
|
|
pcpu_freelist_push_node(head, node);
|
|
raw_spin_unlock(&head->lock);
|
|
}
|
|
|
|
static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
|
|
struct pcpu_freelist_node *node)
|
|
{
|
|
if (!raw_spin_trylock(&s->extralist.lock))
|
|
return false;
|
|
|
|
pcpu_freelist_push_node(&s->extralist, node);
|
|
raw_spin_unlock(&s->extralist.lock);
|
|
return true;
|
|
}
|
|
|
|
static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
|
|
struct pcpu_freelist_node *node)
|
|
{
|
|
int cpu, orig_cpu;
|
|
|
|
orig_cpu = cpu = raw_smp_processor_id();
|
|
while (1) {
|
|
struct pcpu_freelist_head *head;
|
|
|
|
head = per_cpu_ptr(s->freelist, cpu);
|
|
if (raw_spin_trylock(&head->lock)) {
|
|
pcpu_freelist_push_node(head, node);
|
|
raw_spin_unlock(&head->lock);
|
|
return;
|
|
}
|
|
cpu = cpumask_next(cpu, cpu_possible_mask);
|
|
if (cpu >= nr_cpu_ids)
|
|
cpu = 0;
|
|
|
|
/* cannot lock any per cpu lock, try extralist */
|
|
if (cpu == orig_cpu &&
|
|
pcpu_freelist_try_push_extra(s, node))
|
|
return;
|
|
}
|
|
}
|
|
|
|
void __pcpu_freelist_push(struct pcpu_freelist *s,
|
|
struct pcpu_freelist_node *node)
|
|
{
|
|
if (in_nmi())
|
|
___pcpu_freelist_push_nmi(s, node);
|
|
else
|
|
___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
|
|
}
|
|
|
|
/*
 * Push @node onto @s with local interrupts disabled for the duration of
 * the operation; the previous interrupt state is restored afterwards.
 */
void pcpu_freelist_push(struct pcpu_freelist *s,
			struct pcpu_freelist_node *node)
{
	unsigned long irq_state;

	local_irq_save(irq_state);
	__pcpu_freelist_push(s, node);
	local_irq_restore(irq_state);
}
|
|
|
|
/*
 * Seed the per-CPU lists of @s with @nr_elems nodes carved out of @buf.
 *
 * @s:         freelist previously set up with pcpu_freelist_init()
 * @buf:       start of a contiguous array of elements; each element is
 *             linked in as a struct pcpu_freelist_node at its first byte
 * @elem_size: stride in bytes between consecutive elements in @buf
 * @nr_elems:  number of elements in @buf
 *
 * Nodes are spread as evenly as possible: every possible CPU receives
 * nr_elems / num_possible_cpus() nodes and the first
 * nr_elems % num_possible_cpus() CPUs receive one more. This guarantees
 * that exactly @nr_elems nodes are linked (nothing at all when @nr_elems
 * is 0) and that no CPU is left empty while others hold a surplus.
 */
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
			    u32 nr_elems)
{
	struct pcpu_freelist_head *head;
	u32 base, extra, quota, i;
	u32 cpu_idx = 0;
	int cpu;

	base = nr_elems / num_possible_cpus();
	extra = nr_elems % num_possible_cpus();

	for_each_possible_cpu(cpu) {
		head = per_cpu_ptr(s->freelist, cpu);
		/* The first 'extra' CPUs absorb the division remainder. */
		quota = base + (cpu_idx < extra ? 1 : 0);
		for (i = 0; i < quota; i++) {
			/* No locking required as this is not visible yet. */
			pcpu_freelist_push_node(head, buf);
			buf += elem_size;
		}
		cpu_idx++;
	}
}
|
|
|
|
static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
|
|
{
|
|
struct pcpu_freelist_head *head;
|
|
struct pcpu_freelist_node *node;
|
|
int orig_cpu, cpu;
|
|
|
|
orig_cpu = cpu = raw_smp_processor_id();
|
|
while (1) {
|
|
head = per_cpu_ptr(s->freelist, cpu);
|
|
raw_spin_lock(&head->lock);
|
|
node = head->first;
|
|
if (node) {
|
|
head->first = node->next;
|
|
raw_spin_unlock(&head->lock);
|
|
return node;
|
|
}
|
|
raw_spin_unlock(&head->lock);
|
|
cpu = cpumask_next(cpu, cpu_possible_mask);
|
|
if (cpu >= nr_cpu_ids)
|
|
cpu = 0;
|
|
if (cpu == orig_cpu)
|
|
break;
|
|
}
|
|
|
|
/* per cpu lists are all empty, try extralist */
|
|
raw_spin_lock(&s->extralist.lock);
|
|
node = s->extralist.first;
|
|
if (node)
|
|
s->extralist.first = node->next;
|
|
raw_spin_unlock(&s->extralist.lock);
|
|
return node;
|
|
}
|
|
|
|
static struct pcpu_freelist_node *
|
|
___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
|
|
{
|
|
struct pcpu_freelist_head *head;
|
|
struct pcpu_freelist_node *node;
|
|
int orig_cpu, cpu;
|
|
|
|
orig_cpu = cpu = raw_smp_processor_id();
|
|
while (1) {
|
|
head = per_cpu_ptr(s->freelist, cpu);
|
|
if (raw_spin_trylock(&head->lock)) {
|
|
node = head->first;
|
|
if (node) {
|
|
head->first = node->next;
|
|
raw_spin_unlock(&head->lock);
|
|
return node;
|
|
}
|
|
raw_spin_unlock(&head->lock);
|
|
}
|
|
cpu = cpumask_next(cpu, cpu_possible_mask);
|
|
if (cpu >= nr_cpu_ids)
|
|
cpu = 0;
|
|
if (cpu == orig_cpu)
|
|
break;
|
|
}
|
|
|
|
/* cannot pop from per cpu lists, try extralist */
|
|
if (!raw_spin_trylock(&s->extralist.lock))
|
|
return NULL;
|
|
node = s->extralist.first;
|
|
if (node)
|
|
s->extralist.first = node->next;
|
|
raw_spin_unlock(&s->extralist.lock);
|
|
return node;
|
|
}
|
|
|
|
/*
 * Pop one node from @s, choosing the locking strategy by context: the
 * trylock-only variant in NMI, the blocking variant otherwise. Does not
 * touch the interrupt state itself. Returns NULL if no node was obtained.
 */
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
	return in_nmi() ? ___pcpu_freelist_pop_nmi(s)
			: ___pcpu_freelist_pop(s);
}
|
|
|
|
/*
 * Pop one node from @s with local interrupts disabled for the duration of
 * the operation; the previous interrupt state is restored before
 * returning. Returns the node, or NULL if none was obtained.
 */
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
{
	struct pcpu_freelist_node *popped;
	unsigned long irq_state;

	local_irq_save(irq_state);
	popped = __pcpu_freelist_pop(s);
	local_irq_restore(irq_state);

	return popped;
}
|