abafbc551f
Accessing the disabled memory space of a PCI device would typically result in a master abort response on conventional PCI, or an unsupported request on PCI express. The user would generally see these as a -1 response for the read return data and the write would be silently discarded, possibly with an uncorrected, non-fatal AER error triggered on the host. Some systems however take it upon themselves to bring down the entire system when they see something that might indicate a loss of data, such as this discarded write to a disabled memory space. To avoid this, we want to try to block the user from accessing memory spaces while they're disabled. We start with a semaphore around the memory enable bit, where writers modify the memory enable state and must be serialized, while readers make use of the memory region and can access in parallel. Writers include both direct manipulation via the command register, as well as any reset path where the internal mechanics of the reset may both explicitly and implicitly disable memory access, and manipulation of the MSI-X configuration, where the MSI-X vector table resides in MMIO space of the device. Readers include the read and write file ops to access the vfio device fd offsets as well as memory mapped access. In the latter case, we make use of our new vma list support to zap, or invalidate, those memory mappings in order to force them to be faulted back in on access. Our semaphore usage will stall user access to MMIO spaces across internal operations like reset, but the user might experience new behavior when trying to access the MMIO space while disabled via the PCI command register. Access via read or write while disabled will return -EIO and access via memory maps will result in a SIGBUS. 
This is expected to be compatible with known use cases and potentially provides better error handling capabilities than present in the hardware, while avoiding the more readily accessible and severe platform error responses that might otherwise occur.

Fixes: CVE-2020-12888
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
402 lines
8.4 KiB
C
402 lines
8.4 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* VFIO PCI I/O Port & MMIO access
|
|
*
|
|
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
|
* Author: Alex Williamson <alex.williamson@redhat.com>
|
|
*
|
|
* Derived from original vfio:
|
|
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
|
* Author: Tom Lyon, pugs@cisco.com
|
|
*/
|
|
|
|
#include <linux/fs.h>
|
|
#include <linux/pci.h>
|
|
#include <linux/uaccess.h>
|
|
#include <linux/io.h>
|
|
#include <linux/vfio.h>
|
|
#include <linux/vgaarb.h>
|
|
|
|
#include "vfio_pci_private.h"
|
|
|
|
/*
 * Width-specific accessor aliases used by the read/write paths in this file.
 *
 * Byte accesses always alias io{read,write}8().  The 16/32/64-bit variants
 * alias the plain io{read,write}N() helpers when __LITTLE_ENDIAN is defined
 * and the io{read,write}Nbe() helpers otherwise.
 */
#define vfio_ioread8	ioread8
#define vfio_iowrite8	iowrite8

#ifdef __LITTLE_ENDIAN
#define vfio_ioread16	ioread16
#define vfio_iowrite16	iowrite16
#define vfio_ioread32	ioread32
#define vfio_iowrite32	iowrite32
#define vfio_ioread64	ioread64
#define vfio_iowrite64	iowrite64
#else
#define vfio_ioread16	ioread16be
#define vfio_iowrite16	iowrite16be
#define vfio_ioread32	ioread32be
#define vfio_iowrite32	iowrite32be
#define vfio_ioread64	ioread64be
#define vfio_iowrite64	iowrite64be
#endif
|
|
|
|
/*
|
|
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
|
|
* range which is inaccessible. The excluded range drops writes and fills
|
|
* reads with -1. This is intended for handling MSI-X vector tables and
|
|
* leftover space for ROM BARs.
|
|
*/
|
|
static ssize_t do_io_rw(void __iomem *io, char __user *buf,
|
|
loff_t off, size_t count, size_t x_start,
|
|
size_t x_end, bool iswrite)
|
|
{
|
|
ssize_t done = 0;
|
|
|
|
while (count) {
|
|
size_t fillable, filled;
|
|
|
|
if (off < x_start)
|
|
fillable = min(count, (size_t)(x_start - off));
|
|
else if (off >= x_end)
|
|
fillable = count;
|
|
else
|
|
fillable = 0;
|
|
|
|
if (fillable >= 4 && !(off % 4)) {
|
|
u32 val;
|
|
|
|
if (iswrite) {
|
|
if (copy_from_user(&val, buf, 4))
|
|
return -EFAULT;
|
|
|
|
vfio_iowrite32(val, io + off);
|
|
} else {
|
|
val = vfio_ioread32(io + off);
|
|
|
|
if (copy_to_user(buf, &val, 4))
|
|
return -EFAULT;
|
|
}
|
|
|
|
filled = 4;
|
|
} else if (fillable >= 2 && !(off % 2)) {
|
|
u16 val;
|
|
|
|
if (iswrite) {
|
|
if (copy_from_user(&val, buf, 2))
|
|
return -EFAULT;
|
|
|
|
vfio_iowrite16(val, io + off);
|
|
} else {
|
|
val = vfio_ioread16(io + off);
|
|
|
|
if (copy_to_user(buf, &val, 2))
|
|
return -EFAULT;
|
|
}
|
|
|
|
filled = 2;
|
|
} else if (fillable) {
|
|
u8 val;
|
|
|
|
if (iswrite) {
|
|
if (copy_from_user(&val, buf, 1))
|
|
return -EFAULT;
|
|
|
|
vfio_iowrite8(val, io + off);
|
|
} else {
|
|
val = vfio_ioread8(io + off);
|
|
|
|
if (copy_to_user(buf, &val, 1))
|
|
return -EFAULT;
|
|
}
|
|
|
|
filled = 1;
|
|
} else {
|
|
/* Fill reads with -1, drop writes */
|
|
filled = min(count, (size_t)(x_end - off));
|
|
if (!iswrite) {
|
|
u8 val = 0xFF;
|
|
size_t i;
|
|
|
|
for (i = 0; i < filled; i++)
|
|
if (copy_to_user(buf + i, &val, 1))
|
|
return -EFAULT;
|
|
}
|
|
}
|
|
|
|
count -= filled;
|
|
done += filled;
|
|
off += filled;
|
|
buf += filled;
|
|
}
|
|
|
|
return done;
|
|
}
|
|
|
|
static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar)
|
|
{
|
|
struct pci_dev *pdev = vdev->pdev;
|
|
int ret;
|
|
void __iomem *io;
|
|
|
|
if (vdev->barmap[bar])
|
|
return 0;
|
|
|
|
ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
|
|
if (ret)
|
|
return ret;
|
|
|
|
io = pci_iomap(pdev, bar, 0);
|
|
if (!io) {
|
|
pci_release_selected_regions(pdev, 1 << bar);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
vdev->barmap[bar] = io;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Read from or write to a PCI BAR (or the ROM) on behalf of user space.
 *
 * @ppos encodes both the region index and the offset inside the region; it
 * is advanced by the number of bytes transferred.  @count is clamped so the
 * access never runs past the end of the region.
 *
 * For resources flagged IORESOURCE_MEM the access is performed with
 * vdev->memory_lock held for reading, and is refused with -EIO when
 * __vfio_pci_memory_enabled() reports that memory is disabled.
 *
 * Returns the number of bytes transferred or a negative errno:
 *   -EINVAL  the BAR has no start address (and is not a shadow ROM), or the
 *            offset lies at or beyond the end of the region
 *   -EIO     memory resource accessed while memory is disabled
 *   -ENOMEM  the ROM could not be mapped
 *   other    whatever vfio_pci_setup_barmap() or do_io_rw() reported
 */
ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
			size_t count, loff_t *ppos, bool iswrite)
{
	struct pci_dev *pdev = vdev->pdev;
	int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct resource *res = &pdev->resource[bar];
	size_t x_start = 0, x_end = 0;
	resource_size_t end;
	void __iomem *io;
	ssize_t done;

	/* Work out how large the accessible region is. */
	if (pci_resource_start(pdev, bar)) {
		end = pci_resource_len(pdev, bar);
	} else if (bar == PCI_ROM_RESOURCE &&
		   (res->flags & IORESOURCE_ROM_SHADOW)) {
		/* Shadow ROM without a start address: fixed 0x20000 length. */
		end = 0x20000;
	} else {
		return -EINVAL;
	}

	if (pos >= end)
		return -EINVAL;

	count = min(count, (size_t)(end - pos));

	/* Memory resources may only be touched while memory is enabled. */
	if (res->flags & IORESOURCE_MEM) {
		down_read(&vdev->memory_lock);
		if (!__vfio_pci_memory_enabled(vdev)) {
			up_read(&vdev->memory_lock);
			return -EIO;
		}
	}

	if (bar != PCI_ROM_RESOURCE) {
		int ret = vfio_pci_setup_barmap(vdev, bar);

		if (ret) {
			done = ret;
			goto out;
		}

		io = vdev->barmap[bar];
	} else {
		/*
		 * The ROM can fill less space than the BAR, so we start the
		 * excluded range at the end of the actual ROM.  This makes
		 * filling large ROM BARs much faster.
		 */
		io = pci_map_rom(pdev, &x_start);
		if (!io) {
			done = -ENOMEM;
			goto out;
		}
		x_end = end;
	}

	/* Hide the MSI-X range of this BAR from the direct access path. */
	if (bar == vdev->msix_bar) {
		x_start = vdev->msix_offset;
		x_end = vdev->msix_offset + vdev->msix_size;
	}

	done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite);

	if (done >= 0)
		*ppos += done;

	/* The ROM mapping is temporary; BAR mappings stay in vdev->barmap. */
	if (bar == PCI_ROM_RESOURCE)
		pci_unmap_rom(pdev, io);
out:
	if (res->flags & IORESOURCE_MEM)
		up_read(&vdev->memory_lock);

	return done;
}
|
|
|
|
/*
 * Read from or write to the legacy VGA ranges on behalf of user space.
 *
 * The offset encoded in @ppos must fall in one of three windows:
 *   0xa0000-0xbffff  legacy VGA memory (mapped with ioremap())
 *   0x3b0-0x3bb      legacy VGA I/O ports (mapped with ioport_map())
 *   0x3c0-0x3df      legacy VGA I/O ports (mapped with ioport_map())
 * @count is clamped to the end of the window that was hit.  The matching
 * VGA resource is acquired with vga_get_interruptible() for the duration of
 * the transfer, and the temporary mapping is torn down before returning.
 *
 * Returns the number of bytes transferred (and advances @ppos by it), or a
 * negative errno: -EINVAL if the device has no VGA or the offset is outside
 * every window, -ENOMEM if the mapping failed, or the error reported by
 * vga_get_interruptible() / do_io_rw().
 */
ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
			size_t count, loff_t *ppos, bool iswrite)
{
	loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK;
	void __iomem *iomem = NULL;
	unsigned int rsrc;
	bool is_ioport;
	ssize_t done;
	u32 addr;

	if (!vdev->has_vga)
		return -EINVAL;

	if (pos > 0xbfffful)
		return -EINVAL;

	/* Classify the offset and map the window it belongs to. */
	addr = (u32)pos;
	if (addr >= 0xa0000 && addr <= 0xbffff) {
		is_ioport = false;
		rsrc = VGA_RSRC_LEGACY_MEM;
		off = pos - 0xa0000;
		count = min(count, (size_t)(0xc0000 - pos));
		iomem = ioremap(0xa0000, 0xbffff - 0xa0000 + 1);
	} else if (addr >= 0x3b0 && addr <= 0x3bb) {
		is_ioport = true;
		rsrc = VGA_RSRC_LEGACY_IO;
		off = pos - 0x3b0;
		count = min(count, (size_t)(0x3bc - pos));
		iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1);
	} else if (addr >= 0x3c0 && addr <= 0x3df) {
		is_ioport = true;
		rsrc = VGA_RSRC_LEGACY_IO;
		off = pos - 0x3c0;
		count = min(count, (size_t)(0x3e0 - pos));
		iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1);
	} else {
		return -EINVAL;
	}

	if (!iomem)
		return -ENOMEM;

	/* On failure skip the transfer but still drop the mapping. */
	done = vga_get_interruptible(vdev->pdev, rsrc);
	if (done)
		goto out_unmap;

	/* No excluded range applies to the VGA windows. */
	done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite);

	vga_put(vdev->pdev, rsrc);

	if (done >= 0)
		*ppos += done;

out_unmap:
	if (is_ioport)
		ioport_unmap(iomem);
	else
		iounmap(iomem);

	return done;
}
|
|
|
|
static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
|
|
{
|
|
struct vfio_pci_ioeventfd *ioeventfd = opaque;
|
|
|
|
switch (ioeventfd->count) {
|
|
case 1:
|
|
vfio_iowrite8(ioeventfd->data, ioeventfd->addr);
|
|
break;
|
|
case 2:
|
|
vfio_iowrite16(ioeventfd->data, ioeventfd->addr);
|
|
break;
|
|
case 4:
|
|
vfio_iowrite32(ioeventfd->data, ioeventfd->addr);
|
|
break;
|
|
#ifdef iowrite64
|
|
case 8:
|
|
vfio_iowrite64(ioeventfd->data, ioeventfd->addr);
|
|
break;
|
|
#endif
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
|
|
uint64_t data, int count, int fd)
|
|
{
|
|
struct pci_dev *pdev = vdev->pdev;
|
|
loff_t pos = offset & VFIO_PCI_OFFSET_MASK;
|
|
int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset);
|
|
struct vfio_pci_ioeventfd *ioeventfd;
|
|
|
|
/* Only support ioeventfds into BARs */
|
|
if (bar > VFIO_PCI_BAR5_REGION_INDEX)
|
|
return -EINVAL;
|
|
|
|
if (pos + count > pci_resource_len(pdev, bar))
|
|
return -EINVAL;
|
|
|
|
/* Disallow ioeventfds working around MSI-X table writes */
|
|
if (bar == vdev->msix_bar &&
|
|
!(pos + count <= vdev->msix_offset ||
|
|
pos >= vdev->msix_offset + vdev->msix_size))
|
|
return -EINVAL;
|
|
|
|
#ifndef iowrite64
|
|
if (count == 8)
|
|
return -EINVAL;
|
|
#endif
|
|
|
|
ret = vfio_pci_setup_barmap(vdev, bar);
|
|
if (ret)
|
|
return ret;
|
|
|
|
mutex_lock(&vdev->ioeventfds_lock);
|
|
|
|
list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) {
|
|
if (ioeventfd->pos == pos && ioeventfd->bar == bar &&
|
|
ioeventfd->data == data && ioeventfd->count == count) {
|
|
if (fd == -1) {
|
|
vfio_virqfd_disable(&ioeventfd->virqfd);
|
|
list_del(&ioeventfd->next);
|
|
vdev->ioeventfds_nr--;
|
|
kfree(ioeventfd);
|
|
ret = 0;
|
|
} else
|
|
ret = -EEXIST;
|
|
|
|
goto out_unlock;
|
|
}
|
|
}
|
|
|
|
if (fd < 0) {
|
|
ret = -ENODEV;
|
|
goto out_unlock;
|
|
}
|
|
|
|
if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) {
|
|
ret = -ENOSPC;
|
|
goto out_unlock;
|
|
}
|
|
|
|
ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL);
|
|
if (!ioeventfd) {
|
|
ret = -ENOMEM;
|
|
goto out_unlock;
|
|
}
|
|
|
|
ioeventfd->addr = vdev->barmap[bar] + pos;
|
|
ioeventfd->data = data;
|
|
ioeventfd->pos = pos;
|
|
ioeventfd->bar = bar;
|
|
ioeventfd->count = count;
|
|
|
|
ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler,
|
|
NULL, NULL, &ioeventfd->virqfd, fd);
|
|
if (ret) {
|
|
kfree(ioeventfd);
|
|
goto out_unlock;
|
|
}
|
|
|
|
list_add(&ioeventfd->next, &vdev->ioeventfds_list);
|
|
vdev->ioeventfds_nr++;
|
|
|
|
out_unlock:
|
|
mutex_unlock(&vdev->ioeventfds_lock);
|
|
|
|
return ret;
|
|
}
|