Linus Torvalds 4f9701057a IOMMU Updates for Linux v5.13
Including:
 
 	- Big cleanup of almost unsused parts of the IOMMU API by
 	  Christoph Hellwig. This mostly affects the Freescale PAMU
 	  driver.
 
 	- New IOMMU driver for Unisoc SOCs
 
 	- ARM SMMU Updates from Will:
 
 	  - SMMUv3: Drop vestigial PREFETCH_ADDR support
 	  - SMMUv3: Elide TLB sync logic for empty gather
 	  - SMMUv3: Fix "Service Failure Mode" handling
     	  - SMMUv2: New Qualcomm compatible string
 
 	- Removal of the AMD IOMMU performance counter writeable check
 	  on AMD. It caused long boot delays on some machines and is
 	  only needed to work around an errata on some older (possibly
 	  pre-production) chips. If someone is still hit by this
 	  hardware issue anyway the performance counters will just
 	  return 0.
 
 	- Support for targeted invalidations in the AMD IOMMU driver.
 	  Before that the driver only invalidated a single 4k page or the
 	  whole IO/TLB for an address space. This has been extended now
 	  and is mostly useful for emulated AMD IOMMUs.
 
 	- Several fixes for the Shared Virtual Memory support in the
 	  Intel VT-d driver
 
 	- Mediatek drivers can now be built as modules
 
 	- Re-introduction of the forcedac boot option which got lost
 	  when converting the Intel VT-d driver to the common dma-iommu
 	  implementation.
 
 	- Extension of the IOMMU device registration interface and
 	  support iommu_ops to be const again when drivers are built as
 	  modules.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEr9jSbILcajRFYWYyK/BELZcBGuMFAmCMEIoACgkQK/BELZcB
 GuOu9xAAvg6aR0uHlxvRq6cgNnHN9Ltp5+t3qFYtRRrauY0iOPMO62k0QQli5shX
 CGeczD0e59KAZqI0zNJnQn8hMY5dg7XVkFCC5BrSzuCDCtwJZ0N5Tq3pfUlaV1rw
 BJf41t79Fd+jp7kn53tu+vRAfYZ3+sLOx/6U3c15pqKRZSkyFWbQllOtD3J5LnLu
 1PyPlfiNpMwCajiS7aQbN+fuJ/lKIFeA2MDPOsCBzhbfxiJUqJxZOKAZO3rOjFfK
 feTibqQ+3Zz6MPXt9st1cvPpy8jCosv81OY6Knqvxf/oB5q+fEdi2uNrKISonb/t
 Fw331oOIwg2A+HOpwC9MN1AumOIqiHSWWENAMk9SlP+TMIWKQ8kZreyI6IEB23dV
 +QvP3DVA+CfLwtNY/Zh0IqKh28D+IHlKbpWNU1m+9AUe468mV/MTjfwxr9Yfffhm
 LZ6C0DgFdmtqv8jPuDGUOgo3RNeN8bLnUSEHG9gHibA+RKujl5BWDjKkwILqMQTt
 Ysdsu8TiNtFIULomizqCpgqEbQfW8TLFvASXCM1VMQ/PDURxvchZPxFDJonYXy+K
 z2HGaG3eUE07YrAdRKH69aMVIbmS+sjEhvmi4xZ1Lh7wWcIE2AZVvO8qNb+Ckcp3
 4tLPPDksm/iQngnFf6gdgH3qv4rgbzE4+74GXqeANiQCjY9dSJI=
 =qF2C
 -----END PGP SIGNATURE-----

Merge tag 'iommu-updates-v5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu

Pull iommu updates from Joerg Roedel:

 - Big cleanup of almost unsused parts of the IOMMU API by Christoph
   Hellwig. This mostly affects the Freescale PAMU driver.

 - New IOMMU driver for Unisoc SOCs

 - ARM SMMU Updates from Will:
     - Drop vestigial PREFETCH_ADDR support (SMMUv3)
     - Elide TLB sync logic for empty gather (SMMUv3)
     - Fix "Service Failure Mode" handling (SMMUv3)
     - New Qualcomm compatible string (SMMUv2)

 - Removal of the AMD IOMMU performance counter writeable check on AMD.
   It caused long boot delays on some machines and is only needed to
   work around an errata on some older (possibly pre-production) chips.
   If someone is still hit by this hardware issue anyway the performance
   counters will just return 0.

 - Support for targeted invalidations in the AMD IOMMU driver. Before
   that the driver only invalidated a single 4k page or the whole IO/TLB
   for an address space. This has been extended now and is mostly useful
   for emulated AMD IOMMUs.

 - Several fixes for the Shared Virtual Memory support in the Intel VT-d
   driver

 - Mediatek drivers can now be built as modules

 - Re-introduction of the forcedac boot option which got lost when
   converting the Intel VT-d driver to the common dma-iommu
   implementation.

 - Extension of the IOMMU device registration interface and support
   iommu_ops to be const again when drivers are built as modules.

* tag 'iommu-updates-v5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu: (84 commits)
  iommu: Streamline registration interface
  iommu: Statically set module owner
  iommu/mediatek-v1: Add error handle for mtk_iommu_probe
  iommu/mediatek-v1: Avoid build fail when build as module
  iommu/mediatek: Always enable the clk on resume
  iommu/fsl-pamu: Fix uninitialized variable warning
  iommu/vt-d: Force to flush iotlb before creating superpage
  iommu/amd: Put newline after closing bracket in warning
  iommu/vt-d: Fix an error handling path in 'intel_prepare_irq_remapping()'
  iommu/vt-d: Fix build error of pasid_enable_wpe() with !X86
  iommu/amd: Remove performance counter pre-initialization test
  Revert "iommu/amd: Fix performance counter initialization"
  iommu/amd: Remove duplicate check of devid
  iommu/exynos: Remove unneeded local variable initialization
  iommu/amd: Page-specific invalidations for more than one page
  iommu/arm-smmu-v3: Remove the unused fields for PREFETCH_CONFIG command
  iommu/vt-d: Avoid unnecessary cache flush in pasid entry teardown
  iommu/vt-d: Invalidate PASID cache when root/context entry changed
  iommu/vt-d: Remove WO permissions on second-level paging entries
  iommu/vt-d: Report the right page fault address
  ...
2021-05-01 09:33:00 -07:00

1137 lines
25 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2018-2020 Intel Corporation.
* Copyright (C) 2020 Red Hat, Inc.
*
* Author: Tiwei Bie <tiwei.bie@intel.com>
* Jason Wang <jasowang@redhat.com>
*
* Thanks Michael S. Tsirkin for the valuable comments and
* suggestions. And thanks to Cunming Liang and Zhihong Wang for all
* their supports.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/iommu.h>
#include <linux/uuid.h>
#include <linux/vdpa.h>
#include <linux/nospec.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include "vhost.h"
enum {
VHOST_VDPA_BACKEND_FEATURES =
(1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) |
(1ULL << VHOST_BACKEND_F_IOTLB_BATCH),
};
#define VHOST_VDPA_DEV_MAX (1U << MINORBITS)
struct vhost_vdpa {
struct vhost_dev vdev;
struct iommu_domain *domain;
struct vhost_virtqueue *vqs;
struct completion completion;
struct vdpa_device *vdpa;
struct device dev;
struct cdev cdev;
atomic_t opened;
int nvqs;
int virtio_id;
int minor;
struct eventfd_ctx *config_ctx;
int in_batch;
struct vdpa_iova_range range;
};
static DEFINE_IDA(vhost_vdpa_ida);
static dev_t vhost_vdpa_major;
static void handle_vq_kick(struct vhost_work *work)
{
struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
poll.work);
struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
const struct vdpa_config_ops *ops = v->vdpa->config;
ops->kick_vq(v->vdpa, vq - v->vqs);
}
static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
{
struct vhost_virtqueue *vq = private;
struct eventfd_ctx *call_ctx = vq->call_ctx.ctx;
if (call_ctx)
eventfd_signal(call_ctx, 1);
return IRQ_HANDLED;
}
static irqreturn_t vhost_vdpa_config_cb(void *private)
{
struct vhost_vdpa *v = private;
struct eventfd_ctx *config_ctx = v->config_ctx;
if (config_ctx)
eventfd_signal(config_ctx, 1);
return IRQ_HANDLED;
}
static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
{
struct vhost_virtqueue *vq = &v->vqs[qid];
const struct vdpa_config_ops *ops = v->vdpa->config;
struct vdpa_device *vdpa = v->vdpa;
int ret, irq;
if (!ops->get_vq_irq)
return;
irq = ops->get_vq_irq(vdpa, qid);
irq_bypass_unregister_producer(&vq->call_ctx.producer);
if (!vq->call_ctx.ctx || irq < 0)
return;
vq->call_ctx.producer.token = vq->call_ctx.ctx;
vq->call_ctx.producer.irq = irq;
ret = irq_bypass_register_producer(&vq->call_ctx.producer);
if (unlikely(ret))
dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n",
qid, vq->call_ctx.producer.token, ret);
}
static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
{
struct vhost_virtqueue *vq = &v->vqs[qid];
irq_bypass_unregister_producer(&vq->call_ctx.producer);
}
static void vhost_vdpa_reset(struct vhost_vdpa *v)
{
struct vdpa_device *vdpa = v->vdpa;
vdpa_reset(vdpa);
v->in_batch = 0;
}
static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
u32 device_id;
device_id = ops->get_device_id(vdpa);
if (copy_to_user(argp, &device_id, sizeof(device_id)))
return -EFAULT;
return 0;
}
static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
u8 status;
status = ops->get_status(vdpa);
if (copy_to_user(statusp, &status, sizeof(status)))
return -EFAULT;
return 0;
}
static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
u8 status, status_old;
int nvqs = v->nvqs;
u16 i;
if (copy_from_user(&status, statusp, sizeof(status)))
return -EFAULT;
status_old = ops->get_status(vdpa);
/*
* Userspace shouldn't remove status bits unless reset the
* status to 0.
*/
if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
return -EINVAL;
ops->set_status(vdpa, status);
if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK))
for (i = 0; i < nvqs; i++)
vhost_vdpa_setup_vq_irq(v, i);
if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && !(status & VIRTIO_CONFIG_S_DRIVER_OK))
for (i = 0; i < nvqs; i++)
vhost_vdpa_unsetup_vq_irq(v, i);
return 0;
}
static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
struct vhost_vdpa_config *c)
{
long size = 0;
switch (v->virtio_id) {
case VIRTIO_ID_NET:
size = sizeof(struct virtio_net_config);
break;
}
if (c->len == 0)
return -EINVAL;
if (c->len > size - c->off)
return -E2BIG;
return 0;
}
static long vhost_vdpa_get_config(struct vhost_vdpa *v,
struct vhost_vdpa_config __user *c)
{
struct vdpa_device *vdpa = v->vdpa;
struct vhost_vdpa_config config;
unsigned long size = offsetof(struct vhost_vdpa_config, buf);
u8 *buf;
if (copy_from_user(&config, c, size))
return -EFAULT;
if (vhost_vdpa_config_validate(v, &config))
return -EINVAL;
buf = kvzalloc(config.len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
vdpa_get_config(vdpa, config.off, buf, config.len);
if (copy_to_user(c->buf, buf, config.len)) {
kvfree(buf);
return -EFAULT;
}
kvfree(buf);
return 0;
}
static long vhost_vdpa_set_config(struct vhost_vdpa *v,
struct vhost_vdpa_config __user *c)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
struct vhost_vdpa_config config;
unsigned long size = offsetof(struct vhost_vdpa_config, buf);
u8 *buf;
if (copy_from_user(&config, c, size))
return -EFAULT;
if (vhost_vdpa_config_validate(v, &config))
return -EINVAL;
buf = vmemdup_user(c->buf, config.len);
if (IS_ERR(buf))
return PTR_ERR(buf);
ops->set_config(vdpa, config.off, buf, config.len);
kvfree(buf);
return 0;
}
static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
u64 features;
features = ops->get_features(vdpa);
if (copy_to_user(featurep, &features, sizeof(features)))
return -EFAULT;
return 0;
}
static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
u64 features;
/*
* It's not allowed to change the features after they have
* been negotiated.
*/
if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
return -EBUSY;
if (copy_from_user(&features, featurep, sizeof(features)))
return -EFAULT;
if (vdpa_set_features(vdpa, features))
return -EINVAL;
return 0;
}
static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
u16 num;
num = ops->get_vq_num_max(vdpa);
if (copy_to_user(argp, &num, sizeof(num)))
return -EFAULT;
return 0;
}
static void vhost_vdpa_config_put(struct vhost_vdpa *v)
{
if (v->config_ctx) {
eventfd_ctx_put(v->config_ctx);
v->config_ctx = NULL;
}
}
static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp)
{
struct vdpa_callback cb;
int fd;
struct eventfd_ctx *ctx;
cb.callback = vhost_vdpa_config_cb;
cb.private = v->vdpa;
if (copy_from_user(&fd, argp, sizeof(fd)))
return -EFAULT;
ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
swap(ctx, v->config_ctx);
if (!IS_ERR_OR_NULL(ctx))
eventfd_ctx_put(ctx);
if (IS_ERR(v->config_ctx)) {
long ret = PTR_ERR(v->config_ctx);
v->config_ctx = NULL;
return ret;
}
v->vdpa->config->set_config_cb(v->vdpa, &cb);
return 0;
}
static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp)
{
struct vhost_vdpa_iova_range range = {
.first = v->range.first,
.last = v->range.last,
};
if (copy_to_user(argp, &range, sizeof(range)))
return -EFAULT;
return 0;
}
static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
void __user *argp)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
struct vdpa_vq_state vq_state;
struct vdpa_callback cb;
struct vhost_virtqueue *vq;
struct vhost_vring_state s;
u32 idx;
long r;
r = get_user(idx, (u32 __user *)argp);
if (r < 0)
return r;
if (idx >= v->nvqs)
return -ENOBUFS;
idx = array_index_nospec(idx, v->nvqs);
vq = &v->vqs[idx];
switch (cmd) {
case VHOST_VDPA_SET_VRING_ENABLE:
if (copy_from_user(&s, argp, sizeof(s)))
return -EFAULT;
ops->set_vq_ready(vdpa, idx, s.num);
return 0;
case VHOST_GET_VRING_BASE:
r = ops->get_vq_state(v->vdpa, idx, &vq_state);
if (r)
return r;
vq->last_avail_idx = vq_state.avail_index;
break;
}
r = vhost_vring_ioctl(&v->vdev, cmd, argp);
if (r)
return r;
switch (cmd) {
case VHOST_SET_VRING_ADDR:
if (ops->set_vq_address(vdpa, idx,
(u64)(uintptr_t)vq->desc,
(u64)(uintptr_t)vq->avail,
(u64)(uintptr_t)vq->used))
r = -EINVAL;
break;
case VHOST_SET_VRING_BASE:
vq_state.avail_index = vq->last_avail_idx;
if (ops->set_vq_state(vdpa, idx, &vq_state))
r = -EINVAL;
break;
case VHOST_SET_VRING_CALL:
if (vq->call_ctx.ctx) {
cb.callback = vhost_vdpa_virtqueue_cb;
cb.private = vq;
} else {
cb.callback = NULL;
cb.private = NULL;
}
ops->set_vq_cb(vdpa, idx, &cb);
vhost_vdpa_setup_vq_irq(v, idx);
break;
case VHOST_SET_VRING_NUM:
ops->set_vq_num(vdpa, idx, vq->num);
break;
}
return r;
}
static long vhost_vdpa_unlocked_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
{
struct vhost_vdpa *v = filep->private_data;
struct vhost_dev *d = &v->vdev;
void __user *argp = (void __user *)arg;
u64 __user *featurep = argp;
u64 features;
long r = 0;
if (cmd == VHOST_SET_BACKEND_FEATURES) {
if (copy_from_user(&features, featurep, sizeof(features)))
return -EFAULT;
if (features & ~VHOST_VDPA_BACKEND_FEATURES)
return -EOPNOTSUPP;
vhost_set_backend_features(&v->vdev, features);
return 0;
}
mutex_lock(&d->mutex);
switch (cmd) {
case VHOST_VDPA_GET_DEVICE_ID:
r = vhost_vdpa_get_device_id(v, argp);
break;
case VHOST_VDPA_GET_STATUS:
r = vhost_vdpa_get_status(v, argp);
break;
case VHOST_VDPA_SET_STATUS:
r = vhost_vdpa_set_status(v, argp);
break;
case VHOST_VDPA_GET_CONFIG:
r = vhost_vdpa_get_config(v, argp);
break;
case VHOST_VDPA_SET_CONFIG:
r = vhost_vdpa_set_config(v, argp);
break;
case VHOST_GET_FEATURES:
r = vhost_vdpa_get_features(v, argp);
break;
case VHOST_SET_FEATURES:
r = vhost_vdpa_set_features(v, argp);
break;
case VHOST_VDPA_GET_VRING_NUM:
r = vhost_vdpa_get_vring_num(v, argp);
break;
case VHOST_SET_LOG_BASE:
case VHOST_SET_LOG_FD:
r = -ENOIOCTLCMD;
break;
case VHOST_VDPA_SET_CONFIG_CALL:
r = vhost_vdpa_set_config_call(v, argp);
break;
case VHOST_GET_BACKEND_FEATURES:
features = VHOST_VDPA_BACKEND_FEATURES;
if (copy_to_user(featurep, &features, sizeof(features)))
r = -EFAULT;
break;
case VHOST_VDPA_GET_IOVA_RANGE:
r = vhost_vdpa_get_iova_range(v, argp);
break;
default:
r = vhost_dev_ioctl(&v->vdev, cmd, argp);
if (r == -ENOIOCTLCMD)
r = vhost_vdpa_vring_ioctl(v, cmd, argp);
break;
}
mutex_unlock(&d->mutex);
return r;
}
static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
{
struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct vhost_iotlb_map *map;
struct page *page;
unsigned long pfn, pinned;
while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
pinned = map->size >> PAGE_SHIFT;
for (pfn = map->addr >> PAGE_SHIFT;
pinned > 0; pfn++, pinned--) {
page = pfn_to_page(pfn);
if (map->perm & VHOST_ACCESS_WO)
set_page_dirty_lock(page);
unpin_user_page(page);
}
atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm);
vhost_iotlb_map_free(iotlb, map);
}
}
static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)
{
struct vhost_dev *dev = &v->vdev;
vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1);
kfree(dev->iotlb);
dev->iotlb = NULL;
}
static int perm_to_iommu_flags(u32 perm)
{
int flags = 0;
switch (perm) {
case VHOST_ACCESS_WO:
flags |= IOMMU_WRITE;
break;
case VHOST_ACCESS_RO:
flags |= IOMMU_READ;
break;
case VHOST_ACCESS_RW:
flags |= (IOMMU_WRITE | IOMMU_READ);
break;
default:
WARN(1, "invalidate vhost IOTLB permission\n");
break;
}
return flags | IOMMU_CACHE;
}
static int vhost_vdpa_map(struct vhost_vdpa *v,
u64 iova, u64 size, u64 pa, u32 perm)
{
struct vhost_dev *dev = &v->vdev;
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
int r = 0;
r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1,
pa, perm);
if (r)
return r;
if (ops->dma_map) {
r = ops->dma_map(vdpa, iova, size, pa, perm);
} else if (ops->set_map) {
if (!v->in_batch)
r = ops->set_map(vdpa, dev->iotlb);
} else {
r = iommu_map(v->domain, iova, pa, size,
perm_to_iommu_flags(perm));
}
if (r)
vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1);
else
atomic64_add(size >> PAGE_SHIFT, &dev->mm->pinned_vm);
return r;
}
static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
{
struct vhost_dev *dev = &v->vdev;
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1);
if (ops->dma_map) {
ops->dma_unmap(vdpa, iova, size);
} else if (ops->set_map) {
if (!v->in_batch)
ops->set_map(vdpa, dev->iotlb);
} else {
iommu_unmap(v->domain, iova, size);
}
}
static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
struct vhost_iotlb_msg *msg)
{
struct vhost_dev *dev = &v->vdev;
struct vhost_iotlb *iotlb = dev->iotlb;
struct page **page_list;
unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
unsigned int gup_flags = FOLL_LONGTERM;
unsigned long npages, cur_base, map_pfn, last_pfn = 0;
unsigned long lock_limit, sz2pin, nchunks, i;
u64 iova = msg->iova;
long pinned;
int ret = 0;
if (msg->iova < v->range.first ||
msg->iova + msg->size - 1 > v->range.last)
return -EINVAL;
if (vhost_iotlb_itree_first(iotlb, msg->iova,
msg->iova + msg->size - 1))
return -EEXIST;
/* Limit the use of memory for bookkeeping */
page_list = (struct page **) __get_free_page(GFP_KERNEL);
if (!page_list)
return -ENOMEM;
if (msg->perm & VHOST_ACCESS_WO)
gup_flags |= FOLL_WRITE;
npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
if (!npages) {
ret = -EINVAL;
goto free;
}
mmap_read_lock(dev->mm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) {
ret = -ENOMEM;
goto unlock;
}
cur_base = msg->uaddr & PAGE_MASK;
iova &= PAGE_MASK;
nchunks = 0;
while (npages) {
sz2pin = min_t(unsigned long, npages, list_size);
pinned = pin_user_pages(cur_base, sz2pin,
gup_flags, page_list, NULL);
if (sz2pin != pinned) {
if (pinned < 0) {
ret = pinned;
} else {
unpin_user_pages(page_list, pinned);
ret = -ENOMEM;
}
goto out;
}
nchunks++;
if (!last_pfn)
map_pfn = page_to_pfn(page_list[0]);
for (i = 0; i < pinned; i++) {
unsigned long this_pfn = page_to_pfn(page_list[i]);
u64 csize;
if (last_pfn && (this_pfn != last_pfn + 1)) {
/* Pin a contiguous chunk of memory */
csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
ret = vhost_vdpa_map(v, iova, csize,
map_pfn << PAGE_SHIFT,
msg->perm);
if (ret) {
/*
* Unpin the pages that are left unmapped
* from this point on in the current
* page_list. The remaining outstanding
* ones which may stride across several
* chunks will be covered in the common
* error path subsequently.
*/
unpin_user_pages(&page_list[i],
pinned - i);
goto out;
}
map_pfn = this_pfn;
iova += csize;
nchunks = 0;
}
last_pfn = this_pfn;
}
cur_base += pinned << PAGE_SHIFT;
npages -= pinned;
}
/* Pin the rest chunk */
ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT,
map_pfn << PAGE_SHIFT, msg->perm);
out:
if (ret) {
if (nchunks) {
unsigned long pfn;
/*
* Unpin the outstanding pages which are yet to be
* mapped but haven't due to vdpa_map() or
* pin_user_pages() failure.
*
* Mapped pages are accounted in vdpa_map(), hence
* the corresponding unpinning will be handled by
* vdpa_unmap().
*/
WARN_ON(!last_pfn);
for (pfn = map_pfn; pfn <= last_pfn; pfn++)
unpin_user_page(pfn_to_page(pfn));
}
vhost_vdpa_unmap(v, msg->iova, msg->size);
}
unlock:
mmap_read_unlock(dev->mm);
free:
free_page((unsigned long)page_list);
return ret;
}
static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev,
struct vhost_iotlb_msg *msg)
{
struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev);
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
int r = 0;
mutex_lock(&dev->mutex);
r = vhost_dev_check_owner(dev);
if (r)
goto unlock;
switch (msg->type) {
case VHOST_IOTLB_UPDATE:
r = vhost_vdpa_process_iotlb_update(v, msg);
break;
case VHOST_IOTLB_INVALIDATE:
vhost_vdpa_unmap(v, msg->iova, msg->size);
break;
case VHOST_IOTLB_BATCH_BEGIN:
v->in_batch = true;
break;
case VHOST_IOTLB_BATCH_END:
if (v->in_batch && ops->set_map)
ops->set_map(vdpa, dev->iotlb);
v->in_batch = false;
break;
default:
r = -EINVAL;
break;
}
unlock:
mutex_unlock(&dev->mutex);
return r;
}
static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct vhost_vdpa *v = file->private_data;
struct vhost_dev *dev = &v->vdev;
return vhost_chr_write_iter(dev, from);
}
static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
{
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
struct device *dma_dev = vdpa_get_dma_dev(vdpa);
struct bus_type *bus;
int ret;
/* Device want to do DMA by itself */
if (ops->set_map || ops->dma_map)
return 0;
bus = dma_dev->bus;
if (!bus)
return -EFAULT;
if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
return -ENOTSUPP;
v->domain = iommu_domain_alloc(bus);
if (!v->domain)
return -EIO;
ret = iommu_attach_device(v->domain, dma_dev);
if (ret)
goto err_attach;
return 0;
err_attach:
iommu_domain_free(v->domain);
return ret;
}
static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
{
struct vdpa_device *vdpa = v->vdpa;
struct device *dma_dev = vdpa_get_dma_dev(vdpa);
if (v->domain) {
iommu_detach_device(v->domain, dma_dev);
iommu_domain_free(v->domain);
}
v->domain = NULL;
}
static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
{
struct vdpa_iova_range *range = &v->range;
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
if (ops->get_iova_range) {
*range = ops->get_iova_range(vdpa);
} else if (v->domain && v->domain->geometry.force_aperture) {
range->first = v->domain->geometry.aperture_start;
range->last = v->domain->geometry.aperture_end;
} else {
range->first = 0;
range->last = ULLONG_MAX;
}
}
static int vhost_vdpa_open(struct inode *inode, struct file *filep)
{
struct vhost_vdpa *v;
struct vhost_dev *dev;
struct vhost_virtqueue **vqs;
int nvqs, i, r, opened;
v = container_of(inode->i_cdev, struct vhost_vdpa, cdev);
opened = atomic_cmpxchg(&v->opened, 0, 1);
if (opened)
return -EBUSY;
nvqs = v->nvqs;
vhost_vdpa_reset(v);
vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
if (!vqs) {
r = -ENOMEM;
goto err;
}
dev = &v->vdev;
for (i = 0; i < nvqs; i++) {
vqs[i] = &v->vqs[i];
vqs[i]->handle_kick = handle_vq_kick;
}
vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
vhost_vdpa_process_iotlb_msg);
dev->iotlb = vhost_iotlb_alloc(0, 0);
if (!dev->iotlb) {
r = -ENOMEM;
goto err_init_iotlb;
}
r = vhost_vdpa_alloc_domain(v);
if (r)
goto err_init_iotlb;
vhost_vdpa_set_iova_range(v);
filep->private_data = v;
return 0;
err_init_iotlb:
vhost_dev_cleanup(&v->vdev);
kfree(vqs);
err:
atomic_dec(&v->opened);
return r;
}
static void vhost_vdpa_clean_irq(struct vhost_vdpa *v)
{
int i;
for (i = 0; i < v->nvqs; i++)
vhost_vdpa_unsetup_vq_irq(v, i);
}
static int vhost_vdpa_release(struct inode *inode, struct file *filep)
{
struct vhost_vdpa *v = filep->private_data;
struct vhost_dev *d = &v->vdev;
mutex_lock(&d->mutex);
filep->private_data = NULL;
vhost_vdpa_reset(v);
vhost_dev_stop(&v->vdev);
vhost_vdpa_iotlb_free(v);
vhost_vdpa_free_domain(v);
vhost_vdpa_config_put(v);
vhost_vdpa_clean_irq(v);
vhost_dev_cleanup(&v->vdev);
kfree(v->vdev.vqs);
mutex_unlock(&d->mutex);
atomic_dec(&v->opened);
complete(&v->completion);
return 0;
}
#ifdef CONFIG_MMU
static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf)
{
struct vhost_vdpa *v = vmf->vma->vm_file->private_data;
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
struct vdpa_notification_area notify;
struct vm_area_struct *vma = vmf->vma;
u16 index = vma->vm_pgoff;
notify = ops->get_vq_notification(vdpa, index);
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
if (remap_pfn_range(vma, vmf->address & PAGE_MASK,
notify.addr >> PAGE_SHIFT, PAGE_SIZE,
vma->vm_page_prot))
return VM_FAULT_SIGBUS;
return VM_FAULT_NOPAGE;
}
static const struct vm_operations_struct vhost_vdpa_vm_ops = {
.fault = vhost_vdpa_fault,
};
static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma)
{
struct vhost_vdpa *v = vma->vm_file->private_data;
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
struct vdpa_notification_area notify;
unsigned long index = vma->vm_pgoff;
if (vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EINVAL;
if ((vma->vm_flags & VM_SHARED) == 0)
return -EINVAL;
if (vma->vm_flags & VM_READ)
return -EINVAL;
if (index > 65535)
return -EINVAL;
if (!ops->get_vq_notification)
return -ENOTSUPP;
/* To be safe and easily modelled by userspace, We only
* support the doorbell which sits on the page boundary and
* does not share the page with other registers.
*/
notify = ops->get_vq_notification(vdpa, index);
if (notify.addr & (PAGE_SIZE - 1))
return -EINVAL;
if (vma->vm_end - vma->vm_start != notify.size)
return -ENOTSUPP;
vma->vm_ops = &vhost_vdpa_vm_ops;
return 0;
}
#endif /* CONFIG_MMU */
static const struct file_operations vhost_vdpa_fops = {
.owner = THIS_MODULE,
.open = vhost_vdpa_open,
.release = vhost_vdpa_release,
.write_iter = vhost_vdpa_chr_write_iter,
.unlocked_ioctl = vhost_vdpa_unlocked_ioctl,
#ifdef CONFIG_MMU
.mmap = vhost_vdpa_mmap,
#endif /* CONFIG_MMU */
.compat_ioctl = compat_ptr_ioctl,
};
static void vhost_vdpa_release_dev(struct device *device)
{
struct vhost_vdpa *v =
container_of(device, struct vhost_vdpa, dev);
ida_simple_remove(&vhost_vdpa_ida, v->minor);
kfree(v->vqs);
kfree(v);
}
static int vhost_vdpa_probe(struct vdpa_device *vdpa)
{
const struct vdpa_config_ops *ops = vdpa->config;
struct vhost_vdpa *v;
int minor;
int r;
/* Currently, we only accept the network devices. */
if (ops->get_device_id(vdpa) != VIRTIO_ID_NET)
return -ENOTSUPP;
v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!v)
return -ENOMEM;
minor = ida_simple_get(&vhost_vdpa_ida, 0,
VHOST_VDPA_DEV_MAX, GFP_KERNEL);
if (minor < 0) {
kfree(v);
return minor;
}
atomic_set(&v->opened, 0);
v->minor = minor;
v->vdpa = vdpa;
v->nvqs = vdpa->nvqs;
v->virtio_id = ops->get_device_id(vdpa);
device_initialize(&v->dev);
v->dev.release = vhost_vdpa_release_dev;
v->dev.parent = &vdpa->dev;
v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor);
v->vqs = kmalloc_array(v->nvqs, sizeof(struct vhost_virtqueue),
GFP_KERNEL);
if (!v->vqs) {
r = -ENOMEM;
goto err;
}
r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
if (r)
goto err;
cdev_init(&v->cdev, &vhost_vdpa_fops);
v->cdev.owner = THIS_MODULE;
r = cdev_device_add(&v->cdev, &v->dev);
if (r)
goto err;
init_completion(&v->completion);
vdpa_set_drvdata(vdpa, v);
return 0;
err:
put_device(&v->dev);
return r;
}
static void vhost_vdpa_remove(struct vdpa_device *vdpa)
{
struct vhost_vdpa *v = vdpa_get_drvdata(vdpa);
int opened;
cdev_device_del(&v->cdev, &v->dev);
do {
opened = atomic_cmpxchg(&v->opened, 0, 1);
if (!opened)
break;
wait_for_completion(&v->completion);
} while (1);
put_device(&v->dev);
}
static struct vdpa_driver vhost_vdpa_driver = {
.driver = {
.name = "vhost_vdpa",
},
.probe = vhost_vdpa_probe,
.remove = vhost_vdpa_remove,
};
static int __init vhost_vdpa_init(void)
{
int r;
r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX,
"vhost-vdpa");
if (r)
goto err_alloc_chrdev;
r = vdpa_register_driver(&vhost_vdpa_driver);
if (r)
goto err_vdpa_register_driver;
return 0;
err_vdpa_register_driver:
unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
err_alloc_chrdev:
return r;
}
module_init(vhost_vdpa_init);
static void __exit vhost_vdpa_exit(void)
{
vdpa_unregister_driver(&vhost_vdpa_driver);
unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
}
module_exit(vhost_vdpa_exit);
MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Intel Corporation");
MODULE_DESCRIPTION("vDPA-based vhost backend for virtio");