58ac9a8993
The main benefit of THPs is that they can be mapped at the pmd level, increasing the likelihood of a TLB hit and spending fewer cycles in page table walks. pte-mapped hugepages - that is - hugepage-aligned compound pages of order HPAGE_PMD_ORDER mapped by ptes - although being contiguous in physical memory, don't have this advantage. In fact, one could argue they are detrimental to system performance overall since they occupy a precious hugepage-aligned/sized region of physical memory that could otherwise be used more effectively. Additionally, pte-mapped hugepages can be the cheapest memory to collapse for khugepaged since no new hugepage allocation or copying of memory contents is necessary - we only need to update the mapping page tables. In the anonymous collapse path, we are able to collapse pte-mapped hugepages (albeit, perhaps suboptimally), but the file/shmem path makes no effort when compound pages (of any order) are encountered. Identify pte-mapped hugepages in the file/shmem collapse path. The final step of this identification makes a racy check of the value of the pmd to ensure it maps a pte table. This should be fine, since races that result in false-positives (i.e. attempt collapse even though we shouldn't) will fail later in collapse_pte_mapped_thp() once we actually lock mmap_lock and reinspect the pmd value. Races that result in false-negatives (i.e. where we decide to not attempt collapse, but should have) shouldn't be an issue, since in the worst case, we do nothing - which is what we've done up to this point. We make a similar check in retract_page_tables(). If we do think we've found a pte-mapped hugepage in khugepaged context, attempt to update page tables mapping this hugepage. Note that these collapses still count towards the /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed counter, and if the pte-mapped hugepage was also mapped into multiple processes' address spaces, could be incremented for each page table update. 
Since we increment the counter when a pte-mapped hugepage is successfully added to the list of to-collapse pte-mapped THPs, it's possible that we never actually update the page table either. This is different from how file/shmem pages_collapsed accounting works today where only a successful page cache update is counted (it's also possible here that no page tables are actually changed). Though it incurs some slop, this is preferred to either not accounting for the event at all, or plumbing through data in struct mm_slot on whether to account for the collapse or not. Also note that work still needs to be done to support arbitrary compound pages, and that this should all be converted to using folios. [shy828301@gmail.com: Spelling mistake, update comment, and add Documentation] Link: https://lore.kernel.org/linux-mm/CAHbLzkpHwZxFzjfX9nxVoRhzup8WMjMfyL6Xiq8mZ9M-N3ombw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-3-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-3-zokeefe@google.com Signed-off-by: Zach O'Keefe <zokeefe@google.com> Reviewed-by: Yang Shi <shy828301@gmail.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Chris Kennelly <ckennelly@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: James Houghton <jthoughton@google.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Rongwei Wang <rongwei.wang@linux.alibaba.com> Cc: SeongJae Park <sj@kernel.org> Cc: Song Liu <songliubraving@fb.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
||
---|---|---|
.. | ||
9p.h | ||
afs.h | ||
alarmtimer.h | ||
asoc.h | ||
avc.h | ||
bcache.h | ||
block.h | ||
bpf_test_run.h | ||
bridge.h | ||
btrfs.h | ||
cachefiles.h | ||
cgroup.h | ||
clk.h | ||
cma.h | ||
compaction.h | ||
context_tracking.h | ||
cpuhp.h | ||
damon.h | ||
devfreq.h | ||
devlink.h | ||
dlm.h | ||
dma_fence.h | ||
erofs.h | ||
error_report.h | ||
ext4.h | ||
f2fs.h | ||
fib6.h | ||
fib.h | ||
filelock.h | ||
filemap.h | ||
fs_dax.h | ||
fs.h | ||
fscache.h | ||
fsi_master_aspeed.h | ||
fsi_master_ast_cf.h | ||
fsi_master_gpio.h | ||
fsi.h | ||
gpio.h | ||
gpu_mem.h | ||
host1x.h | ||
huge_memory.h | ||
hwmon.h | ||
i2c_slave.h | ||
i2c.h | ||
ib_mad.h | ||
ib_umad.h | ||
initcall.h | ||
intel_ifs.h | ||
intel_ish.h | ||
intel-sst.h | ||
io_uring.h | ||
iocost.h | ||
iommu.h | ||
ipi.h | ||
irq_matrix.h | ||
irq.h | ||
iscsi.h | ||
jbd2.h | ||
kmem.h | ||
kvm.h | ||
kyber.h | ||
libata.h | ||
lock.h | ||
maple_tree.h | ||
mce.h | ||
mctp.h | ||
mdio.h | ||
migrate.h | ||
mlxsw.h | ||
mmap_lock.h | ||
mmap.h | ||
mmc.h | ||
mmflags.h | ||
module.h | ||
mptcp.h | ||
napi.h | ||
nbd.h | ||
neigh.h | ||
net_probe_common.h | ||
net.h | ||
netfs.h | ||
netlink.h | ||
nfs.h | ||
nilfs2.h | ||
nmi.h | ||
objagg.h | ||
oom.h | ||
osnoise.h | ||
page_isolation.h | ||
page_pool.h | ||
page_ref.h | ||
pagemap.h | ||
percpu.h | ||
power_cpu_migrate.h | ||
power.h | ||
preemptirq.h | ||
printk.h | ||
pwc.h | ||
pwm.h | ||
qdisc.h | ||
qla.h | ||
qrtr.h | ||
rcu.h | ||
rdma_core.h | ||
rdma.h | ||
regulator.h | ||
rpcgss.h | ||
rpcrdma.h | ||
rpm.h | ||
rseq.h | ||
rtc.h | ||
rv.h | ||
rwmmio.h | ||
rxrpc.h | ||
sched.h | ||
scmi.h | ||
scsi.h | ||
sctp.h | ||
signal.h | ||
siox.h | ||
skb.h | ||
smbus.h | ||
sock.h | ||
spi.h | ||
spmi.h | ||
sunrpc_base.h | ||
sunrpc.h | ||
sunvnet.h | ||
swiotlb.h | ||
syscalls.h | ||
target.h | ||
task.h | ||
tcp.h | ||
tegra_apb_dma.h | ||
thermal_power_allocator.h | ||
thermal_pressure.h | ||
thermal.h | ||
thp.h | ||
timer.h | ||
tlb.h | ||
udp.h | ||
ufs.h | ||
v4l2.h | ||
vb2.h | ||
vmscan.h | ||
vsock_virtio_transport_common.h | ||
wbt.h | ||
workqueue.h | ||
writeback.h | ||
xdp.h | ||
xen.h |