percpu: changes for v6.6

Merge tag 'percpu-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu

Pull percpu updates from Dennis Zhou:
 "One bigger change to percpu_counter's API: allow init and destroy of
  multiple counters via percpu_counter_init_many() and
  percpu_counter_destroy_many(). This is used to help begin remediating a
  performance regression with percpu rss stats.

  Additionally, it seems larger core count machines are feeling the burden
  of the single-threaded allocation of percpu. Mateusz is thinking about it
  and I will spend some time on it too.

  percpu:

   - A couple of cleanups by Baoquan He and Bibo Mao. The only behavior
     change is to start printing messages if we're under the warn limit for
     failed atomic allocations.

  percpu_counter:

   - Shakeel introduced percpu counters into mm_struct, which put percpu
     allocations on the hot path [1]. Originally I spent some time trying to
     improve the percpu allocator, but instead preferred what Mateusz Guzik
     proposed: grouping at the allocation site with
     percpu_counter_init_many(). This allows a single percpu allocation to
     be shared by the counters. I like this approach because it creates a
     shared lifetime for the allocations. Additionally, I believe many inits
     have higher-level synchronization requirements, like percpu_counter
     does against HOTPLUG_CPU. Therefore we can group these optimizations
     together"

Link: https://lore.kernel.org/linux-mm/20221024052841.3291983-1-shakeelb@google.com/ [1]

* tag 'percpu-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu:
  kernel/fork: group allocation/free of per-cpu counters for mm struct
  pcpcntr: add group allocation/free
  mm/percpu.c: print error message too if atomic alloc failed
  mm/percpu.c: optimize the code in pcpu_setup_first_chunk() a little bit
  mm/percpu.c: remove redundant check
  mm/percpu: Remove some local variables in pcpu_populate_pte
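From the caller's side, the grouped API looks roughly like the sketch below. This is a minimal illustration based on the declarations in the diff that follows; struct foo, NR_FOO_COUNTERS and the foo_*_stats helpers are invented names for the example, not part of the series:

	#include <linux/percpu_counter.h>
	#include <linux/gfp.h>

	#define NR_FOO_COUNTERS	4

	struct foo {
		/* counters must be one contiguous array to share an allocation */
		struct percpu_counter stat[NR_FOO_COUNTERS];
	};

	static int foo_init_stats(struct foo *f)
	{
		/* one backing percpu allocation serves all NR_FOO_COUNTERS counters */
		return percpu_counter_init_many(f->stat, 0, GFP_KERNEL,
						NR_FOO_COUNTERS);
	}

	static void foo_destroy_stats(struct foo *f)
	{
		/* tears down every counter and frees the shared percpu region once */
		percpu_counter_destroy_many(f->stat, NR_FOO_COUNTERS);
	}

Keeping the counters in one array gives them a single allocation and a common lifetime, which is exactly how kernel/fork.c converts the mm_struct rss_stat counters below.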
commit e987af4546
include/linux/percpu_counter.h
@@ -30,17 +30,28 @@ struct percpu_counter {
 
 extern int percpu_counter_batch;
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
-			  struct lock_class_key *key);
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
+			       gfp_t gfp, u32 nr_counters,
+			       struct lock_class_key *key);
 
-#define percpu_counter_init(fbc, value, gfp)				\
+#define percpu_counter_init_many(fbc, value, gfp, nr_counters)		\
 	({								\
 		static struct lock_class_key __key;			\
 									\
-		__percpu_counter_init(fbc, value, gfp, &__key);		\
+		__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
+					   &__key);			\
 	})
 
-void percpu_counter_destroy(struct percpu_counter *fbc);
+
+#define percpu_counter_init(fbc, value, gfp)				\
+	percpu_counter_init_many(fbc, value, gfp, 1)
+
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
+static inline void percpu_counter_destroy(struct percpu_counter *fbc)
+{
+	percpu_counter_destroy_many(fbc, 1);
+}
+
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
@@ -116,11 +127,27 @@ struct percpu_counter {
 	s64 count;
 };
 
+static inline int percpu_counter_init_many(struct percpu_counter *fbc,
+					   s64 amount, gfp_t gfp,
+					   u32 nr_counters)
+{
+	u32 i;
+
+	for (i = 0; i < nr_counters; i++)
+		fbc[i].count = amount;
+
+	return 0;
+}
+
 static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
 				      gfp_t gfp)
 {
-	fbc->count = amount;
-	return 0;
+	return percpu_counter_init_many(fbc, amount, gfp, 1);
 }
 
+static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
+					       u32 nr_counters)
+{
+}
+
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
kernel/fork.c
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
-	int i;
-
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
 
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		percpu_counter_destroy(&mm->rss_stat[i]);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1260,8 +1257,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
				 struct user_namespace *user_ns)
 {
-	int i;
-
 	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
 	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
 	atomic_set(&mm->mm_users, 1);
@@ -1309,17 +1304,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
-			goto fail_pcpu;
+	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+				     NR_MM_COUNTERS))
+		goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;
 
 fail_pcpu:
-	while (i > 0)
-		percpu_counter_destroy(&mm->rss_stat[--i]);
 	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
lib/percpu_counter.c
@@ -151,48 +151,72 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
-			  struct lock_class_key *key)
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
+			       gfp_t gfp, u32 nr_counters,
+			       struct lock_class_key *key)
 {
 	unsigned long flags __maybe_unused;
+	size_t counter_size;
+	s32 __percpu *counters;
+	u32 i;
 
-	raw_spin_lock_init(&fbc->lock);
-	lockdep_set_class(&fbc->lock, key);
-	fbc->count = amount;
-	fbc->counters = alloc_percpu_gfp(s32, gfp);
-	if (!fbc->counters)
+	counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
+	counters = __alloc_percpu_gfp(nr_counters * counter_size,
+				      __alignof__(*counters), gfp);
+	if (!counters) {
+		fbc[0].counters = NULL;
 		return -ENOMEM;
+	}
 
-	debug_percpu_counter_activate(fbc);
+	for (i = 0; i < nr_counters; i++) {
+		raw_spin_lock_init(&fbc[i].lock);
+		lockdep_set_class(&fbc[i].lock, key);
+#ifdef CONFIG_HOTPLUG_CPU
+		INIT_LIST_HEAD(&fbc[i].list);
+#endif
+		fbc[i].count = amount;
+		fbc[i].counters = (void *)counters + (i * counter_size);
+
+		debug_percpu_counter_activate(&fbc[i]);
+	}
 
 #ifdef CONFIG_HOTPLUG_CPU
-	INIT_LIST_HEAD(&fbc->list);
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_add(&fbc->list, &percpu_counters);
+	for (i = 0; i < nr_counters; i++)
+		list_add(&fbc[i].list, &percpu_counters);
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(__percpu_counter_init_many);
 
-void percpu_counter_destroy(struct percpu_counter *fbc)
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
 {
 	unsigned long flags __maybe_unused;
+	u32 i;
 
-	if (!fbc->counters)
+	if (WARN_ON_ONCE(!fbc))
 		return;
 
-	debug_percpu_counter_deactivate(fbc);
+	if (!fbc[0].counters)
+		return;
+
+	for (i = 0; i < nr_counters; i++)
+		debug_percpu_counter_deactivate(&fbc[i]);
 
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_del(&fbc->list);
+	for (i = 0; i < nr_counters; i++)
+		list_del(&fbc[i].list);
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
-	free_percpu(fbc->counters);
-	fbc->counters = NULL;
+
+	free_percpu(fbc[0].counters);
+
+	for (i = 0; i < nr_counters; i++)
+		fbc[i].counters = NULL;
 }
-EXPORT_SYMBOL(percpu_counter_destroy);
+EXPORT_SYMBOL(percpu_counter_destroy_many);
 
 int percpu_counter_batch __read_mostly = 32;
 EXPORT_SYMBOL(percpu_counter_batch);
mm/percpu.c
@@ -1890,13 +1890,15 @@ fail_unlock:
 fail:
 	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
 
-	if (!is_atomic && do_warn && warn_limit) {
+	if (do_warn && warn_limit) {
 		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
 			size, align, is_atomic, err);
-		dump_stack();
+		if (!is_atomic)
+			dump_stack();
 		if (!--warn_limit)
 			pr_info("limit reached, disable warning\n");
 	}
+
 	if (is_atomic) {
 		/* see the flag handling in pcpu_balance_workfn() */
 		pcpu_atomic_alloc_failed = true;
@@ -2581,14 +2583,12 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 {
 	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
 	size_t static_size, dyn_size;
-	struct pcpu_chunk *chunk;
 	unsigned long *group_offsets;
 	size_t *group_sizes;
 	unsigned long *unit_off;
 	unsigned int cpu;
 	int *unit_map;
 	int group, unit, i;
-	int map_size;
 	unsigned long tmp_addr;
 	size_t alloc_size;
 
@@ -2615,7 +2615,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
 	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
-	PCPU_SETUP_BUG_ON(!ai->dyn_size);
 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
 	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
 			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
@@ -2698,7 +2697,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_atom_size = ai->atom_size;
-	pcpu_chunk_struct_size = struct_size(chunk, populated,
+	pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
 					     BITS_TO_LONGS(pcpu_unit_pages));
 
 	pcpu_stats_save_ai(ai);
@@ -2735,29 +2734,23 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	dyn_size = ai->dyn_size - (static_size - ai->static_size);
 
 	/*
-	 * Initialize first chunk.
-	 * If the reserved_size is non-zero, this initializes the reserved
-	 * chunk. If the reserved_size is zero, the reserved chunk is NULL
-	 * and the dynamic region is initialized here. The first chunk,
-	 * pcpu_first_chunk, will always point to the chunk that serves
-	 * the dynamic region.
+	 * Initialize first chunk:
+	 * This chunk is broken up into 3 parts:
+	 *         < static | [reserved] | dynamic >
+	 * - static - there is no backing chunk because these allocations can
+	 *   never be freed.
+	 * - reserved (pcpu_reserved_chunk) - exists primarily to serve
+	 *   allocations from module load.
+	 * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
+	 *   chunk.
 	 */
 	tmp_addr = (unsigned long)base_addr + static_size;
-	map_size = ai->reserved_size ?: dyn_size;
-	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
-
-	/* init dynamic chunk if necessary */
-	if (ai->reserved_size) {
-		pcpu_reserved_chunk = chunk;
-
-		tmp_addr = (unsigned long)base_addr + static_size +
-			   ai->reserved_size;
-		map_size = dyn_size;
-		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
-	}
+	if (ai->reserved_size)
+		pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
+						ai->reserved_size);
+	tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
+	pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
 
-	/* link the first chunk in */
-	pcpu_first_chunk = chunk;
 	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
@@ -3189,32 +3182,26 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
 	pmd_t *pmd;
 
 	if (pgd_none(*pgd)) {
-		p4d_t *new;
-
-		new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
-		if (!new)
+		p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+		if (!p4d)
 			goto err_alloc;
-		pgd_populate(&init_mm, pgd, new);
+		pgd_populate(&init_mm, pgd, p4d);
 	}
 
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d)) {
-		pud_t *new;
-
-		new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
-		if (!new)
+		pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+		if (!pud)
 			goto err_alloc;
-		p4d_populate(&init_mm, p4d, new);
+		p4d_populate(&init_mm, p4d, pud);
 	}
 
 	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud)) {
-		pmd_t *new;
-
-		new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
-		if (!new)
+		pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+		if (!pmd)
 			goto err_alloc;
-		pud_populate(&init_mm, pud, new);
+		pud_populate(&init_mm, pud, pmd);
 	}
 
 	pmd = pmd_offset(pud, addr);