linux/drivers/dma-buf/heaps/system_heap.c
Linus Torvalds 7fa8a8ee94 - Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of
switching from a user process to a kernel thread.
 
 - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav.
 
 - zsmalloc performance improvements from Sergey Senozhatsky.
 
 - Yue Zhao has found and fixed some data race issues around the
   alteration of memcg userspace tunables.
 
 - VFS rationalizations from Christoph Hellwig:
 
   - removal of most of the callers of write_one_page().
 
   - make __filemap_get_folio()'s return value more useful
 
 - Luis Chamberlain has changed tmpfs so it no longer requires swap
   backing.  Use `mount -o noswap'.
 
 - Qi Zheng has made the slab shrinkers operate locklessly, providing
   some scalability benefits.
 
 - Keith Busch has improved dmapool's performance, making part of its
   operations O(1) rather than O(n).
 
 - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd,
   permitting userspace to wr-protect anon memory unpopulated ptes.
 
 - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather
   than exclusive, and has fixed a bunch of errors which were caused by its
   unintuitive meaning.
 
 - Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
   which causes minor faults to install a write-protected pte.
 
 - Vlastimil Babka has done some maintenance work on vma_merge():
   cleanups to the kernel code and improvements to our userspace test
   harness.
 
 - Cleanups to do_fault_around() by Lorenzo Stoakes.
 
 - Mike Rapoport has moved a lot of initialization code out of various
   mm/ files and into mm/mm_init.c.
 
 - Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for
   DRM, but DRM doesn't use it any more.
 
 - Lorenzo has also coverted read_kcore() and vread() to use iterators
   and has thereby removed the use of bounce buffers in some cases.
 
 - Lorenzo has also contributed further cleanups of vma_merge().
 
 - Chaitanya Prakash provides some fixes to the mmap selftesting code.
 
 - Matthew Wilcox changes xfs and afs so they no longer take sleeping
   locks in ->map_page(), a step towards RCUification of pagefaults.
 
 - Suren Baghdasaryan has improved mmap_lock scalability by switching to
   per-VMA locking.
 
 - Frederic Weisbecker has reworked the percpu cache draining so that it
   no longer causes latency glitches on cpu isolated workloads.
 
 - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
   logic.
 
 - Liu Shixin has changed zswap's initialization so we no longer waste a
   chunk of memory if zswap is not being used.
 
 - Yosry Ahmed has improved the performance of memcg statistics flushing.
 
 - David Stevens has fixed several issues involving khugepaged,
   userfaultfd and shmem.
 
 - Christoph Hellwig has provided some cleanup work to zram's IO-related
   code paths.
 
 - David Hildenbrand has fixed up some issues in the selftest code's
   testing of our pte state changing.
 
 - Pankaj Raghav has made page_endio() unneeded and has removed it.
 
 - Peter Xu contributed some rationalizations of the userfaultfd
   selftests.
 
 - Yosry Ahmed has fixed an issue around memcg's page recalim accounting.
 
 - Chaitanya Prakash has fixed some arm-related issues in the
   selftests/mm code.
 
 - Longlong Xia has improved the way in which KSM handles hwpoisoned
   pages.
 
 - Peter Xu fixes a few issues with uffd-wp at fork() time.
 
 - Stefan Roesch has changed KSM so that it may now be used on a
   per-process and per-cgroup basis.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZEr3zQAKCRDdBJ7gKXxA
 jlLoAP0fpQBipwFxED0Us4SKQfupV6z4caXNJGPeay7Aj11/kQD/aMRC2uPfgr96
 eMG3kwn2pqkB9ST2QpkaRbxA//eMbQY=
 =J+Dj
 -----END PGP SIGNATURE-----

Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull MM updates from Andrew Morton:

 - Nick Piggin's "shoot lazy tlbs" series, to improve the performance of
   switching from a user process to a kernel thread.

 - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj
   Raghav.

 - zsmalloc performance improvements from Sergey Senozhatsky.

 - Yue Zhao has found and fixed some data race issues around the
   alteration of memcg userspace tunables.

 - VFS rationalizations from Christoph Hellwig:
     - removal of most of the callers of write_one_page()
     - make __filemap_get_folio()'s return value more useful

 - Luis Chamberlain has changed tmpfs so it no longer requires swap
   backing. Use `mount -o noswap'.

 - Qi Zheng has made the slab shrinkers operate locklessly, providing
   some scalability benefits.

 - Keith Busch has improved dmapool's performance, making part of its
   operations O(1) rather than O(n).

 - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultfd,
   permitting userspace to wr-protect anon memory unpopulated ptes.

 - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive
   rather than exclusive, and has fixed a bunch of errors which were
   caused by its unintuitive meaning.

 - Axel Rasmussen gives userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
   which causes minor faults to install a write-protected pte.

 - Vlastimil Babka has done some maintenance work on vma_merge():
   cleanups to the kernel code and improvements to our userspace test
   harness.

 - Cleanups to do_fault_around() by Lorenzo Stoakes.

 - Mike Rapoport has moved a lot of initialization code out of various
   mm/ files and into mm/mm_init.c.

 - Lorenzo Stoakes removed vmf_insert_mixed_prot(), which was added for
   DRM, but DRM doesn't use it any more.

 - Lorenzo has also converted read_kcore() and vread() to use iterators
   and has thereby removed the use of bounce buffers in some cases.

 - Lorenzo has also contributed further cleanups of vma_merge().

 - Chaitanya Prakash provides some fixes to the mmap selftesting code.

 - Matthew Wilcox changes xfs and afs so they no longer take sleeping
   locks in ->map_page(), a step towards RCUification of pagefaults.

 - Suren Baghdasaryan has improved mmap_lock scalability by switching to
   per-VMA locking.

 - Frederic Weisbecker has reworked the percpu cache draining so that it
   no longer causes latency glitches on cpu isolated workloads.

 - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
   logic.

 - Liu Shixin has changed zswap's initialization so we no longer waste a
   chunk of memory if zswap is not being used.

 - Yosry Ahmed has improved the performance of memcg statistics
   flushing.

 - David Stevens has fixed several issues involving khugepaged,
   userfaultfd and shmem.

 - Christoph Hellwig has provided some cleanup work to zram's IO-related
   code paths.

 - David Hildenbrand has fixed up some issues in the selftest code's
   testing of our pte state changing.

 - Pankaj Raghav has made page_endio() unneeded and has removed it.

 - Peter Xu contributed some rationalizations of the userfaultfd
   selftests.

 - Yosry Ahmed has fixed an issue around memcg's page reclaim
   accounting.

 - Chaitanya Prakash has fixed some arm-related issues in the
   selftests/mm code.

 - Longlong Xia has improved the way in which KSM handles hwpoisoned
   pages.

 - Peter Xu fixes a few issues with uffd-wp at fork() time.

 - Stefan Roesch has changed KSM so that it may now be used on a
   per-process and per-cgroup basis.

* tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
  mm,unmap: avoid flushing TLB in batch if PTE is inaccessible
  shmem: restrict noswap option to initial user namespace
  mm/khugepaged: fix conflicting mods to collapse_file()
  sparse: remove unnecessary 0 values from rc
  mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()
  hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()
  maple_tree: fix allocation in mas_sparse_area()
  mm: do not increment pgfault stats when page fault handler retries
  zsmalloc: allow only one active pool compaction context
  selftests/mm: add new selftests for KSM
  mm: add new KSM process and sysfs knobs
  mm: add new api to enable ksm per process
  mm: shrinkers: fix debugfs file permissions
  mm: don't check VMA write permissions if the PTE/PMD indicates write permissions
  migrate_pages_batch: fix statistics for longterm pin retry
  userfaultfd: use helper function range_in_vma()
  lib/show_mem.c: use for_each_populated_zone() simplify code
  mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()
  fs/buffer: convert create_page_buffers to folio_create_buffers
  fs/buffer: add folio_create_empty_buffers helper
  ...
2023-04-27 19:42:02 -07:00

442 lines
10 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* DMABUF System heap exporter
*
* Copyright (C) 2011 Google, Inc.
* Copyright (C) 2019, 2020 Linaro Ltd.
*
* Portions based off of Andrew Davis' SRAM heap:
* Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com/
* Andrew F. Davis <afd@ti.com>
*/
#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>
#include <linux/dma-heap.h>
#include <linux/dma-resv.h>
#include <linux/err.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
static struct dma_heap *sys_heap;
/**
 * struct system_heap_buffer - per-buffer state, stored in dma_buf->priv
 * @heap: heap the buffer was allocated from (set in system_heap_allocate())
 * @attachments: list of struct dma_heap_attachment, linked through their
 *               @list member
 * @lock: held while @attachments, @vmap_cnt or @vaddr are read or changed
 * @len: buffer length in bytes, as passed to system_heap_allocate()
 * @sg_table: scatterlist with one entry per allocated page chunk
 * @vmap_cnt: number of system_heap_vmap() calls not yet balanced by
 *            system_heap_vunmap()
 * @vaddr: kernel mapping created by vmap(); reset to NULL when @vmap_cnt
 *         drops to zero
 */
struct system_heap_buffer {
struct dma_heap *heap;
struct list_head attachments;
struct mutex lock;
unsigned long len;
struct sg_table sg_table;
int vmap_cnt;
void *vaddr;
};
/**
 * struct dma_heap_attachment - per-attachment state, stored in
 * dma_buf_attachment->priv
 * @dev: device copied from the dma_buf_attachment at attach time
 * @table: private duplicate of the buffer's sg_table (see dup_sg_table())
 * @list: entry in system_heap_buffer.attachments
 * @mapped: set by map_dma_buf, cleared by unmap_dma_buf; the CPU-access
 *          callbacks only sync attachments that have it set
 */
struct dma_heap_attachment {
struct device *dev;
struct sg_table *table;
struct list_head list;
bool mapped;
};
/* GFP mask used for the order-0 fallback: zeroed GFP_HIGHUSER pages. */
#define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO)
/*
 * GFP mask used for the larger orders: zeroed GFP_HIGHUSER pages with
 * __GFP_NOWARN | __GFP_NORETRY set and the __GFP_RECLAIM bits masked out,
 * plus __GFP_COMP so the allocation is a compound page.
 */
#define HIGH_ORDER_GFP (((GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN \
| __GFP_NORETRY) & ~__GFP_RECLAIM) \
| __GFP_COMP)
/*
 * order_flags[i] is the GFP mask passed to alloc_pages() for orders[i];
 * the two arrays are indexed together and must stay the same length.
 */
static gfp_t order_flags[] = {HIGH_ORDER_GFP, HIGH_ORDER_GFP, LOW_ORDER_GFP};
/*
 * The allocation orders (8, 4, 0, i.e. 1MB, 64K and 4K chunks with 4K
 * pages) are chosen to match the sizes often found in IOMMUs. Using
 * order 4 pages instead of order 0 pages can significantly improve the
 * performance of many IOMMUs by reducing TLB pressure and the time spent
 * updating page tables. The array is ordered largest first, which is the
 * order alloc_largest_available() tries them in.
 */
static const unsigned int orders[] = {8, 4, 0};
#define NUM_ORDERS ARRAY_SIZE(orders)
/*
 * Build a freshly allocated sg_table that references the same pages, with
 * the same lengths and offsets, as @table. DMA addresses are not copied.
 *
 * Returns the new table, or ERR_PTR(-ENOMEM) if either the table header or
 * its scatterlist could not be allocated. The caller owns the result and
 * must release it with sg_free_table() followed by kfree().
 */
static struct sg_table *dup_sg_table(struct sg_table *table)
{
	struct scatterlist *src, *dst;
	struct sg_table *copy;
	int idx;

	copy = kzalloc(sizeof(*copy), GFP_KERNEL);
	if (!copy)
		return ERR_PTR(-ENOMEM);

	if (sg_alloc_table(copy, table->orig_nents, GFP_KERNEL)) {
		kfree(copy);
		return ERR_PTR(-ENOMEM);
	}

	/* Walk both lists in lock step, mirroring each source entry. */
	dst = copy->sgl;
	for_each_sgtable_sg(table, src, idx) {
		sg_set_page(dst, sg_page(src), src->length, src->offset);
		dst = sg_next(dst);
	}

	return copy;
}
/*
 * dma_buf_ops.attach: create the per-attachment bookkeeping.
 *
 * Each attachment gets its own duplicate of the buffer's sg_table and is
 * linked onto buffer->attachments under buffer->lock.
 *
 * Returns 0 on success or -ENOMEM if any allocation fails.
 */
static int system_heap_attach(struct dma_buf *dmabuf,
			      struct dma_buf_attachment *attachment)
{
	struct system_heap_buffer *buffer = dmabuf->priv;
	struct dma_heap_attachment *a;

	a = kzalloc(sizeof(*a), GFP_KERNEL);
	if (!a)
		return -ENOMEM;

	a->table = dup_sg_table(&buffer->sg_table);
	if (IS_ERR(a->table)) {
		kfree(a);
		return -ENOMEM;
	}

	a->dev = attachment->dev;
	a->mapped = false;
	INIT_LIST_HEAD(&a->list);
	attachment->priv = a;

	mutex_lock(&buffer->lock);
	list_add(&a->list, &buffer->attachments);
	mutex_unlock(&buffer->lock);

	return 0;
}
/*
 * dma_buf_ops.detach: undo system_heap_attach().
 *
 * Unlinks the attachment from buffer->attachments under buffer->lock, then
 * frees its private sg_table and the attachment record itself.
 */
static void system_heap_detach(struct dma_buf *dmabuf,
			       struct dma_buf_attachment *attachment)
{
	struct dma_heap_attachment *entry = attachment->priv;
	struct system_heap_buffer *buffer = dmabuf->priv;
	struct sg_table *table = entry->table;

	mutex_lock(&buffer->lock);
	list_del(&entry->list);
	mutex_unlock(&buffer->lock);

	sg_free_table(table);
	kfree(table);
	kfree(entry);
}
/*
 * dma_buf_ops.map_dma_buf: DMA-map the attachment's private sg_table for
 * attachment->dev.
 *
 * Returns the mapped table and marks the attachment as mapped, or an
 * ERR_PTR() carrying the dma_map_sgtable() error.
 */
static struct sg_table *system_heap_map_dma_buf(struct dma_buf_attachment *attachment,
						enum dma_data_direction direction)
{
	struct dma_heap_attachment *a = attachment->priv;
	int err;

	err = dma_map_sgtable(attachment->dev, a->table, direction, 0);
	if (err)
		return ERR_PTR(err);

	a->mapped = true;
	return a->table;
}
/*
 * dma_buf_ops.unmap_dma_buf: clear the attachment's mapped flag, then
 * DMA-unmap @table from attachment->dev.
 */
static void system_heap_unmap_dma_buf(struct dma_buf_attachment *attachment,
				      struct sg_table *table,
				      enum dma_data_direction direction)
{
	struct dma_heap_attachment *entry = attachment->priv;

	/* Flag is dropped first, matching the original statement order. */
	entry->mapped = false;
	dma_unmap_sgtable(attachment->dev, table, direction, 0);
}
/*
 * dma_buf_ops.begin_cpu_access: prepare the buffer for CPU access.
 *
 * Under buffer->lock: invalidates the kernel vmap range if the buffer is
 * currently vmapped, then syncs every mapped attachment for the CPU.
 * Attachments that are not mapped are skipped.
 *
 * Always returns 0.
 */
static int system_heap_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
						enum dma_data_direction direction)
{
	struct system_heap_buffer *buffer = dmabuf->priv;
	struct dma_heap_attachment *att;

	mutex_lock(&buffer->lock);

	if (buffer->vmap_cnt)
		invalidate_kernel_vmap_range(buffer->vaddr, buffer->len);

	list_for_each_entry(att, &buffer->attachments, list) {
		if (att->mapped)
			dma_sync_sgtable_for_cpu(att->dev, att->table,
						 direction);
	}

	mutex_unlock(&buffer->lock);

	return 0;
}
/*
 * dma_buf_ops.end_cpu_access: hand the buffer back to devices.
 *
 * Under buffer->lock: flushes the kernel vmap range if the buffer is
 * currently vmapped, then syncs every mapped attachment for the device.
 * Attachments that are not mapped are skipped.
 *
 * Always returns 0.
 */
static int system_heap_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
					      enum dma_data_direction direction)
{
	struct system_heap_buffer *buffer = dmabuf->priv;
	struct dma_heap_attachment *att;

	mutex_lock(&buffer->lock);

	if (buffer->vmap_cnt)
		flush_kernel_vmap_range(buffer->vaddr, buffer->len);

	list_for_each_entry(att, &buffer->attachments, list) {
		if (att->mapped)
			dma_sync_sgtable_for_device(att->dev, att->table,
						    direction);
	}

	mutex_unlock(&buffer->lock);

	return 0;
}
/*
 * dma_buf_ops.mmap: map the buffer's pages into a userspace VMA.
 *
 * Starting at page offset vma->vm_pgoff within the buffer, each page is
 * inserted one PAGE_SIZE at a time with remap_pfn_range() until the VMA is
 * full or the buffer runs out of pages. The caller must hold dmabuf->resv.
 *
 * Returns 0 on success or the first remap_pfn_range() error.
 */
static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
	struct system_heap_buffer *buffer = dmabuf->priv;
	unsigned long uaddr = vma->vm_start;
	struct sg_page_iter piter;

	dma_resv_assert_held(dmabuf->resv);

	for_each_sgtable_page(&buffer->sg_table, &piter, vma->vm_pgoff) {
		unsigned long pfn = page_to_pfn(sg_page_iter_page(&piter));
		int err;

		err = remap_pfn_range(vma, uaddr, pfn, PAGE_SIZE,
				      vma->vm_page_prot);
		if (err)
			return err;

		uaddr += PAGE_SIZE;
		if (uaddr >= vma->vm_end)
			break;
	}

	return 0;
}
/*
 * Create a contiguous kernel mapping covering the whole buffer.
 *
 * A temporary array with one struct page pointer per PAGE_SIZE page is
 * filled from the buffer's sg_table, handed to vmap() and then freed; only
 * the resulting mapping outlives this function.
 *
 * Returns the kernel virtual address, or ERR_PTR(-ENOMEM) if either the
 * temporary array or the mapping could not be allocated.
 */
static void *system_heap_do_vmap(struct system_heap_buffer *buffer)
{
	struct sg_table *table = &buffer->sg_table;
	int npages = PAGE_ALIGN(buffer->len) / PAGE_SIZE;
	struct page **pages = vmalloc(sizeof(struct page *) * npages);
	struct page **tmp = pages;
	struct sg_page_iter piter;
	void *vaddr;

	if (!pages)
		return ERR_PTR(-ENOMEM);

	for_each_sgtable_page(table, &piter, 0) {
		/*
		 * The sg_table should describe exactly npages pages. If it
		 * ever yields more, warn and stop instead of writing past
		 * the end of pages[].
		 */
		if (WARN_ON(tmp - pages >= npages))
			break;
		*tmp++ = sg_page_iter_page(&piter);
	}

	vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
	vfree(pages);

	if (!vaddr)
		return ERR_PTR(-ENOMEM);

	return vaddr;
}
/*
 * dma_buf_ops.vmap: return a kernel mapping of the buffer in @map.
 *
 * The mapping is created on the first call and shared by later callers;
 * buffer->vmap_cnt counts the users. Everything happens under
 * buffer->lock.
 *
 * Returns 0 on success, or the error from system_heap_do_vmap(), in which
 * case @map and the use count are left untouched.
 */
static int system_heap_vmap(struct dma_buf *dmabuf, struct iosys_map *map)
{
	struct system_heap_buffer *buffer = dmabuf->priv;
	int ret = 0;

	mutex_lock(&buffer->lock);

	/* First user: build the mapping. Later users reuse buffer->vaddr. */
	if (!buffer->vmap_cnt) {
		void *vaddr = system_heap_do_vmap(buffer);

		if (IS_ERR(vaddr)) {
			ret = PTR_ERR(vaddr);
			goto out;
		}
		buffer->vaddr = vaddr;
	}

	buffer->vmap_cnt++;
	iosys_map_set_vaddr(map, buffer->vaddr);
out:
	mutex_unlock(&buffer->lock);

	return ret;
}
/*
 * dma_buf_ops.vunmap: drop one user of the kernel mapping.
 *
 * When the use count reaches zero the mapping is torn down with vunmap()
 * and buffer->vaddr is reset. @map is cleared after the lock is released.
 */
static void system_heap_vunmap(struct dma_buf *dmabuf, struct iosys_map *map)
{
	struct system_heap_buffer *buffer = dmabuf->priv;

	mutex_lock(&buffer->lock);
	buffer->vmap_cnt--;
	if (buffer->vmap_cnt == 0) {
		vunmap(buffer->vaddr);
		buffer->vaddr = NULL;
	}
	mutex_unlock(&buffer->lock);

	iosys_map_clear(map);
}
/*
 * dma_buf_ops.release: free everything owned by the buffer.
 *
 * Every sg entry holds one page chunk; each is returned with the order
 * reported by compound_order(). The sg_table and the buffer structure are
 * freed afterwards.
 */
static void system_heap_dma_buf_release(struct dma_buf *dmabuf)
{
	struct system_heap_buffer *buffer = dmabuf->priv;
	struct sg_table *table = &buffer->sg_table;
	struct scatterlist *sg;
	int idx;

	for_each_sgtable_sg(table, sg, idx) {
		struct page *chunk = sg_page(sg);

		__free_pages(chunk, compound_order(chunk));
	}

	sg_free_table(table);
	kfree(buffer);
}
/*
 * dma_buf callbacks for buffers exported by this heap; installed through
 * exp_info.ops in system_heap_allocate().
 */
static const struct dma_buf_ops system_heap_buf_ops = {
.attach = system_heap_attach,
.detach = system_heap_detach,
.map_dma_buf = system_heap_map_dma_buf,
.unmap_dma_buf = system_heap_unmap_dma_buf,
.begin_cpu_access = system_heap_dma_buf_begin_cpu_access,
.end_cpu_access = system_heap_dma_buf_end_cpu_access,
.mmap = system_heap_mmap,
.vmap = system_heap_vmap,
.vunmap = system_heap_vunmap,
.release = system_heap_dma_buf_release,
};
static struct page *alloc_largest_available(unsigned long size,
unsigned int max_order)
{
struct page *page;
int i;
for (i = 0; i < NUM_ORDERS; i++) {
if (size < (PAGE_SIZE << orders[i]))
continue;
if (max_order < orders[i])
continue;
page = alloc_pages(order_flags[i], orders[i]);
if (!page)
continue;
return page;
}
return NULL;
}
/*
 * dma_heap_ops.allocate: build a @len byte buffer out of page chunks and
 * export it as a dma-buf.
 *
 * @heap: heap the request came in on; recorded in the buffer and used for
 *        the export name
 * @len: buffer size in bytes
 * @fd_flags: passed through as the dma-buf export flags
 * @heap_flags: not used by this heap
 *
 * Chunks are collected on a temporary list first, then transferred into
 * the buffer's sg_table once their number is known.
 *
 * Returns the new dma-buf, or an ERR_PTR(): -EINTR if a fatal signal is
 * pending, -ENOMEM if any allocation fails, or the dma_buf_export() error.
 */
static struct dma_buf *system_heap_allocate(struct dma_heap *heap,
					    unsigned long len,
					    unsigned long fd_flags,
					    unsigned long heap_flags)
{
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
	struct system_heap_buffer *buffer;
	struct page *page, *next_page;
	unsigned long remaining = len;
	unsigned int order_cap = orders[0];
	struct scatterlist *sg;
	struct sg_table *table;
	struct list_head pages;
	struct dma_buf *dmabuf;
	int nents = 0;
	int ret = -ENOMEM;
	int idx;

	buffer = kzalloc(sizeof(*buffer), GFP_KERNEL);
	if (!buffer)
		return ERR_PTR(-ENOMEM);

	buffer->heap = heap;
	buffer->len = len;
	mutex_init(&buffer->lock);
	INIT_LIST_HEAD(&buffer->attachments);
	INIT_LIST_HEAD(&pages);

	while (remaining) {
		/*
		 * Avoid trying to allocate memory if the process
		 * has been killed by SIGKILL
		 */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto free_buffer;
		}

		page = alloc_largest_available(remaining, order_cap);
		if (!page)
			goto free_buffer;

		list_add_tail(&page->lru, &pages);
		/* Later chunks never use a larger order than this one. */
		order_cap = compound_order(page);
		remaining -= page_size(page);
		nents++;
	}

	table = &buffer->sg_table;
	if (sg_alloc_table(table, nents, GFP_KERNEL))
		goto free_buffer;

	/* Move every chunk from the temporary list into the sg_table. */
	sg = table->sgl;
	list_for_each_entry_safe(page, next_page, &pages, lru) {
		sg_set_page(sg, page, page_size(page), 0);
		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/* create the dmabuf */
	exp_info.exp_name = dma_heap_get_name(heap);
	exp_info.ops = &system_heap_buf_ops;
	exp_info.size = buffer->len;
	exp_info.flags = fd_flags;
	exp_info.priv = buffer;
	dmabuf = dma_buf_export(&exp_info);
	if (IS_ERR(dmabuf)) {
		ret = PTR_ERR(dmabuf);
		goto free_pages;
	}

	return dmabuf;

free_pages:
	/* Chunks now live in the sg_table; the temporary list is empty. */
	for_each_sgtable_sg(table, sg, idx) {
		struct page *chunk = sg_page(sg);

		__free_pages(chunk, compound_order(chunk));
	}
	sg_free_table(table);
free_buffer:
	/* Releases whatever is still on the temporary list. */
	list_for_each_entry_safe(page, next_page, &pages, lru)
		__free_pages(page, compound_order(page));
	kfree(buffer);

	return ERR_PTR(ret);
}
/* Heap callbacks registered with dma_heap_add() in system_heap_create(). */
static const struct dma_heap_ops system_heap_ops = {
.allocate = system_heap_allocate,
};
static int system_heap_create(void)
{
struct dma_heap_export_info exp_info;
exp_info.name = "system";
exp_info.ops = &system_heap_ops;
exp_info.priv = NULL;
sys_heap = dma_heap_add(&exp_info);
if (IS_ERR(sys_heap))
return PTR_ERR(sys_heap);
return 0;
}
module_init(system_heap_create);