6d4675e601
The rmap locks (i_mmap_rwsem and anon_vma->root->rwsem) can become contended under memory pressure if processes keep working on their VMAs (e.g., fork, mmap, munmap), which stalls the reclaim path. In our real workload traces, kswapd waits on the lock for 300ms+ (in the worst case, about a second), which pushes other processes into direct reclaim, where they also get stuck on the same lock.

This patch makes the LRU aging path use try_lock mode, like shrink_page_list, so the reclaim context keeps working on the next LRU pages instead of being stuck. If the rmap lock is found contended, the page is rotated back to the head of its LRU, in both the active and inactive LRUs, to keep the behavior consistent; this is a basic starting point rather than adding more heuristics.

Since this patch introduces a new "contended" field as an out-param along with the "try_lock" in-param in rmap_walk_control, the structure is no longer immutable when try_lock is set, so remove the const keywords on the rmap-related functions. Since rmap walking is already an expensive operation, I doubt the const provided a sizable benefit (and we didn't have it until 5.17).

In a heavy app workload on Android, the trace shows the following statistics. It almost entirely removes rmap lock contention from the reclaim path.

Martin Liu reported:

Before:

   max_dur(ms)  min_dur(ms)  max-min(dur)ms  avg_dur(ms)  sum_dur(ms)  count  blocked_function
          1632            0            1631   151.542173        31672    209  page_lock_anon_vma_read
           601            0             601   145.544681        28817    198  rmap_walk_file

After:

   max_dur(ms)  min_dur(ms)  max-min(dur)ms  avg_dur(ms)  sum_dur(ms)  count  blocked_function
           NaN          NaN             NaN          NaN          NaN    0.0  NaN
             0            0               0     0.127645            1     12  rmap_walk_file

[minchan@kernel.org: add comment, per Matthew]
  Link: https://lkml.kernel.org/r/YnNqeB5tUf6LZ57b@google.com
Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: John Dias <joaodias@google.com>
Cc: Tim Murray <timmurray@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Martin Liu <liumartin@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
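As a rough sketch of the interface described above (the caller below is hypothetical; only the try_lock and contended fields, the rmap_walk() call, and the rotate-back-to-LRU behavior come from this description), a reclaim-side walk in try_lock mode could look like:

	struct rmap_walk_control rwc = {
		.rmap_one = some_referenced_check,	/* hypothetical callback */
		.try_lock = true,	/* don't sleep on i_mmap_rwsem/anon_vma lock */
	};

	rmap_walk(folio, &rwc);
	if (rwc.contended) {
		/*
		 * The rmap lock was contended: rotate the folio back to the
		 * head of its LRU (active or inactive) and move on to the
		 * next folio instead of waiting.
		 */
	}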
// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>

#include "internal.h"

#define BITMAP_CHUNK_SIZE	sizeof(u64)
#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)

/*
 * Idle page tracking only considers user memory pages, for other types of
 * pages the idle flag is always unset and an attempt to set it is silently
 * ignored.
 *
 * We treat a page as a user memory page if it is on an LRU list, because it is
 * always safe to pass such a page to rmap_walk(), which is essential for idle
 * page tracking. With such an indicator of user pages we can skip isolated
 * pages, but since there are not usually many of them, it will hardly affect
 * the overall result.
 *
 * This function tries to get a user memory page by pfn as described above.
 */
static struct page *page_idle_get_page(unsigned long pfn)
{
	struct page *page = pfn_to_online_page(pfn);

	if (!page || !PageLRU(page) ||
	    !get_page_unless_zero(page))
		return NULL;

	if (unlikely(!PageLRU(page))) {
		put_page(page);
		page = NULL;
	}
	return page;
}

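/*
 * rmap_one() callback for rmap_walk(): clear the accessed ("young") bit in
 * every PTE/PMD mapping this folio within @vma and record whether any
 * mapping had been referenced.
 */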
static bool page_idle_clear_pte_refs_one(struct folio *folio,
					struct vm_area_struct *vma,
					unsigned long addr, void *arg)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
	bool referenced = false;

	while (page_vma_mapped_walk(&pvmw)) {
		addr = pvmw.address;
		if (pvmw.pte) {
			/*
			 * For PTE-mapped THP, one sub page is referenced,
			 * the whole THP is referenced.
			 */
			if (ptep_clear_young_notify(vma, addr, pvmw.pte))
				referenced = true;
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
				referenced = true;
		} else {
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
		}
	}

	if (referenced) {
		folio_clear_idle(folio);
		/*
		 * We cleared the referenced bit in a mapping to this page. To
		 * avoid interference with page reclaim, mark it young so that
		 * folio_referenced() will return > 0.
		 */
		folio_set_young(folio);
	}
	return true;
}

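/*
 * Clear the accessed bits in all mappings of @page so that a later idle-bit
 * check only reports the page as idle if it has not been touched since.
 */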
static void page_idle_clear_pte_refs(struct page *page)
{
	struct folio *folio = page_folio(page);

	/*
	 * Since rwc.try_lock is unused, rwc is effectively immutable, so we
	 * can make it static to save some cycles and stack.
	 */
	static struct rmap_walk_control rwc = {
		.rmap_one = page_idle_clear_pte_refs_one,
		.anon_lock = folio_lock_anon_vma_read,
	};
	bool need_lock;

	if (!folio_mapped(folio) || !folio_raw_mapping(folio))
		return;

	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
	if (need_lock && !folio_trylock(folio))
		return;

	rmap_walk(folio, &rwc);

	if (need_lock)
		folio_unlock(folio);
}

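/*
 * Read handler for /sys/kernel/mm/page_idle/bitmap.
 *
 * Bit N of the bitmap corresponds to PFN N, so the 8-byte chunk at file
 * offset "pos" covers PFNs starting at pos * BITS_PER_BYTE; "pos" and
 * "count" must be multiples of the chunk size. A bit is reported set only
 * if the page is still idle after its PTE references have been rechecked.
 */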
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
				     struct bin_attribute *attr, char *buf,
				     loff_t pos, size_t count)
{
	u64 *out = (u64 *)buf;
	struct page *page;
	unsigned long pfn, end_pfn;
	int bit;

	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
		return -EINVAL;

	pfn = pos * BITS_PER_BYTE;
	if (pfn >= max_pfn)
		return 0;

	end_pfn = pfn + count * BITS_PER_BYTE;
	if (end_pfn > max_pfn)
		end_pfn = max_pfn;

	for (; pfn < end_pfn; pfn++) {
		bit = pfn % BITMAP_CHUNK_BITS;
		if (!bit)
			*out = 0ULL;
		page = page_idle_get_page(pfn);
		if (page) {
			if (page_is_idle(page)) {
				/*
				 * The page might have been referenced via a
				 * pte, in which case it is not idle. Clear
				 * refs and recheck.
				 */
				page_idle_clear_pte_refs(page);
				if (page_is_idle(page))
					*out |= 1ULL << bit;
			}
			put_page(page);
		}
		if (bit == BITMAP_CHUNK_BITS - 1)
			out++;
		cond_resched();
	}
	return (char *)out - buf;
}

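/*
 * Write handler for /sys/kernel/mm/page_idle/bitmap.
 *
 * For every bit set in the user-supplied chunks, clear the accessed bits in
 * all page table entries mapping the corresponding page and mark the page
 * idle, so that a subsequent read reports it idle only if it has not been
 * referenced in the meantime.
 */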
static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
				      struct bin_attribute *attr, char *buf,
				      loff_t pos, size_t count)
{
	const u64 *in = (u64 *)buf;
	struct page *page;
	unsigned long pfn, end_pfn;
	int bit;

	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
		return -EINVAL;

	pfn = pos * BITS_PER_BYTE;
	if (pfn >= max_pfn)
		return -ENXIO;

	end_pfn = pfn + count * BITS_PER_BYTE;
	if (end_pfn > max_pfn)
		end_pfn = max_pfn;

	for (; pfn < end_pfn; pfn++) {
		bit = pfn % BITMAP_CHUNK_BITS;
		if ((*in >> bit) & 1) {
			page = page_idle_get_page(pfn);
			if (page) {
				page_idle_clear_pte_refs(page);
				set_page_idle(page);
				put_page(page);
			}
		}
		if (bit == BITMAP_CHUNK_BITS - 1)
			in++;
		cond_resched();
	}
	return (char *)in - buf;
}

static struct bin_attribute page_idle_bitmap_attr =
		__BIN_ATTR(bitmap, 0600,
			   page_idle_bitmap_read, page_idle_bitmap_write, 0);

static struct bin_attribute *page_idle_bin_attrs[] = {
	&page_idle_bitmap_attr,
	NULL,
};

static const struct attribute_group page_idle_attr_group = {
	.bin_attrs = page_idle_bin_attrs,
	.name = "page_idle",
};

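/* Register the page_idle sysfs group under /sys/kernel/mm/ at boot. */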
static int __init page_idle_init(void)
{
	int err;

	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
	if (err) {
		pr_err("page_idle: register sysfs failed\n");
		return err;
	}
	return 0;
}
subsys_initcall(page_idle_init);