// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO PCI NVIDIA Witherspoon GPU support a.k.a. NVLink2.
 *
 * Copyright (C) 2018 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Register an on-GPU RAM region for cacheable access.
 *
 * Derived from original vfio_pci_igd.c:
 * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
 *	Author: Alex Williamson <alex.williamson@redhat.com>
 */
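
/*
 * Userspace consumes the regions registered below through the standard
 * VFIO region API. A minimal sketch of the expected flow (assuming a VFIO
 * device fd is already open and the vendor region index has been
 * enumerated; error handling elided):
 *
 *	struct vfio_region_info info = {
 *		.argsz = sizeof(info),
 *		.index = region_index,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
 *	if (info.flags & VFIO_REGION_INFO_FLAG_MMAP)
 *		ram = mmap(NULL, info.size, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, device_fd, info.offset);
 *
 * Mapping ends up in vfio_pci_nvgpu_mmap() below; reads/writes on the fd
 * at the region offset go through vfio_pci_nvgpu_rw() instead.
 */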
#include <linux/io.h>
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/mmu_context.h>
#include <asm/kvm_ppc.h>
#include "vfio_pci_private.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);

struct vfio_pci_nvgpu_data {
	unsigned long gpu_hpa;	/* GPU RAM physical address */
	unsigned long gpu_tgt;	/* TGT address of corresponding GPU RAM */
	unsigned long useraddr;	/* GPU RAM userspace address */
	unsigned long size;	/* Size of the GPU RAM window (usually 128GB) */
	struct mm_struct *mm;
	struct mm_iommu_table_group_mem_t *mem;	/* Pre-registered RAM descriptor */
	struct pci_dev *gpdev;
	struct notifier_block group_notifier;
};
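
/*
 * Emulated read/write for the GPU RAM region: the requested chunk is
 * ioremap'ed cacheable for just the duration of the access and copied
 * to/from userspace (see the rationale comment inside the function).
 */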
static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
	size_t sizealigned;
	void __iomem *ptr;

	if (pos >= vdev->region[i].size)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));

	/*
	 * We map only a bit of GPU RAM for a short time instead of mapping it
	 * for the guest lifetime as:
	 *
	 * 1) we do not know the GPU RAM size, only the aperture, which is
	 *    4-8 times bigger than the actual RAM size (16/32GB RAM vs.
	 *    128GB aperture);
	 * 2) mapping GPU RAM allows the CPU to prefetch; if this happens
	 *    before the NVLink bridge is reset (which fences GPU RAM),
	 *    hardware management interrupts (HMI) might occur and freeze
	 *    the NVLink bridge.
	 *
	 * This is not a fast path anyway.
	 */
	sizealigned = ALIGN(posoff + count, PAGE_SIZE);
	ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
	if (!ptr)
		return -EFAULT;

	if (iswrite) {
		if (copy_from_user(ptr + posoff, buf, count))
			count = -EFAULT;
		else
			*ppos += count;
	} else {
		if (copy_to_user(buf, ptr + posoff, count))
			count = -EFAULT;
		else
			*ppos += count;
	}

	iounmap(ptr);

	return count;
}

static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region)
{
	struct vfio_pci_nvgpu_data *data = region->data;
	long ret;

	/* If there were any mappings at all... */
	if (data->mm) {
		if (data->mem) {
			ret = mm_iommu_put(data->mm, data->mem);
			WARN_ON(ret);
		}

		mmdrop(data->mm);
	}

	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&data->group_notifier);

	pnv_npu2_unmap_lpar_dev(data->gpdev);

	kfree(data);
}
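
/*
 * Page fault handler for user mappings of the GPU RAM region: it resolves
 * the faulting address to a PFN inside the GPU RAM window and inserts it
 * into the VMA one page at a time.
 */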
static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_region *region = vma->vm_private_data;
	struct vfio_pci_nvgpu_data *data = region->data;
	unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
	unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
	unsigned long vm_pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	unsigned long pfn = nv2pg + vm_pgoff + vmf_off;

	ret = vmf_insert_pfn(vma, vmf->address, pfn);
	trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
			vmf->address, ret);

	return ret;
}

static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
	.fault = vfio_pci_nvgpu_mmap_fault,
};

static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	int ret;
	struct vfio_pci_nvgpu_data *data = region->data;

	if (data->useraddr)
		return -EPERM;

	if (vma->vm_end - vma->vm_start > data->size)
		return -EINVAL;

	vma->vm_private_data = region;
	vma->vm_flags |= VM_PFNMAP;
	vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;

	/*
	 * Call mm_iommu_newdev() here just once, while the region is not
	 * yet registered, so the initialization happens now. Later callers
	 * will use mm_iommu_find(), which returns the already registered
	 * @mem and does not call gup() again.
	 */
	data->useraddr = vma->vm_start;
	data->mm = current->mm;

	mmgrab(data->mm);
	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
			vma_pages(vma), data->gpu_hpa, &data->mem);

	trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
			vma->vm_end - vma->vm_start, ret);

	return ret;
}

static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_pci_nvgpu_data *data = region->data;
	struct vfio_region_info_cap_nvlink2_ssatgt cap = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
		.header.version = 1,
		.tgt = data->gpu_tgt
	};

	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
}
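
/*
 * Userspace discovers the SSATGT capability through the standard VFIO
 * capability chain. A minimal parsing sketch (assuming userspace has
 * already re-issued VFIO_DEVICE_GET_REGION_INFO with argsz large enough
 * to fit the chain into a buffer "info"):
 *
 *	struct vfio_info_cap_header *hdr;
 *	__u32 off = info->cap_offset;
 *
 *	while (off) {
 *		hdr = (void *)info + off;
 *		if (hdr->id == VFIO_REGION_INFO_CAP_NVLINK2_SSATGT)
 *			tgt = ((struct vfio_region_info_cap_nvlink2_ssatgt *)
 *					hdr)->tgt;
 *		off = hdr->next;
 *	}
 */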

static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
	.rw = vfio_pci_nvgpu_rw,
	.release = vfio_pci_nvgpu_release,
	.mmap = vfio_pci_nvgpu_mmap,
	.add_capability = vfio_pci_nvgpu_add_capability,
};
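
/*
 * VFIO group notifier: when a KVM instance is attached to the group,
 * map the GPU to the LPAR of that KVM instance so the guest can access
 * the GPU RAM.
 */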
static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
		unsigned long action, void *opaque)
{
	struct kvm *kvm = opaque;
	struct vfio_pci_nvgpu_data *data = container_of(nb,
			struct vfio_pci_nvgpu_data,
			group_notifier);

	if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
			pnv_npu2_map_lpar_dev(data->gpdev,
				kvm->arch.lpid, MSR_DR | MSR_PR))
		return NOTIFY_BAD;

	return NOTIFY_OK;
}
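
/*
 * Set up the GPU RAM region for an NVIDIA V100: locate the NPU peer
 * device, read the RAM window address/size and the TGT address from the
 * device tree, map the GPU to the LPAR via the group notifier, and
 * register the vendor-specific VFIO region.
 */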
int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
{
	int ret;
	u64 reg[2];
	u64 tgt = 0;
	struct device_node *npu_node, *mem_node;
	struct pci_dev *npu_dev;
	struct vfio_pci_nvgpu_data *data;
	uint32_t mem_phandle = 0;
	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;

	/*
	 * PCI config space does not tell us about NVLink presence, but
	 * the platform does; use that.
	 */
	npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
	if (!npu_dev)
		return -ENODEV;

	npu_node = pci_device_to_OF_node(npu_dev);
	if (!npu_node)
		return -EINVAL;

	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
		return -EINVAL;

	mem_node = of_find_node_by_phandle(mem_phandle);
	if (!mem_node)
		return -EINVAL;

	if (of_property_read_variable_u64_array(mem_node, "reg", reg,
				ARRAY_SIZE(reg), ARRAY_SIZE(reg)) !=
			ARRAY_SIZE(reg))
		return -EINVAL;

	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
		return -EFAULT;
	}

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->gpu_hpa = reg[0];
	data->gpu_tgt = tgt;
	data->size = reg[1];

	dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
			data->gpu_hpa + data->size - 1);

	data->gpdev = vdev->pdev;
	data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;

	ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&events, &data->group_notifier);
	if (ret)
		goto free_exit;

	/*
	 * We have just set KVM, so we do not need the listener anymore.
	 * Also, keeping it registered means that if more than one GPU is
	 * assigned, we would get several similar notifiers notifying about
	 * the same device again, which does not help with anything.
	 */
	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&data->group_notifier);

	ret = vfio_pci_register_dev_region(vdev,
			PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
			&vfio_pci_nvgpu_regops,
			data->size,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_MMAP,
			data);
	if (ret)
		goto free_exit;

	return 0;

free_exit:
	kfree(data);

	return ret;
}

/*
 * IBM NPU2 bridge
 */
struct vfio_pci_npu2_data {
	void *base; /* ATSD register virtual address, for emulated access */
	unsigned long mmio_atsd; /* ATSD physical address */
	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
};
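
/*
 * Emulated read/write for the ATSD register region; unlike the GPU RAM
 * path above, the registers are memremap'ed once at init time, so the
 * access is a plain copy to/from that mapping.
 */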
static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct vfio_pci_npu2_data *data = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vdev->region[i].size)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));

	if (iswrite) {
		if (copy_from_user(data->base + pos, buf, count))
			return -EFAULT;
	} else {
		if (copy_to_user(buf, data->base + pos, count))
			return -EFAULT;
	}
	*ppos += count;

	return count;
}

static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	int ret;
	struct vfio_pci_npu2_data *data = region->data;
	unsigned long req_len = vma->vm_end - vma->vm_start;

	if (req_len != PAGE_SIZE)
		return -EINVAL;

	vma->vm_flags |= VM_PFNMAP;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
			req_len, vma->vm_page_prot);
	trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
			vma->vm_end - vma->vm_start, ret);

	return ret;
}

static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region)
{
	struct vfio_pci_npu2_data *data = region->data;

	memunmap(data->base);
	kfree(data);
}

static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_pci_npu2_data *data = region->data;
	struct vfio_region_info_cap_nvlink2_ssatgt captgt = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
		.header.version = 1,
		.tgt = data->gpu_tgt
	};
	struct vfio_region_info_cap_nvlink2_lnkspd capspd = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD,
		.header.version = 1,
		.link_speed = data->link_speed
	};
	int ret;

	ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
	if (ret)
		return ret;

	return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
}

static const struct vfio_pci_regops vfio_pci_npu2_regops = {
	.rw = vfio_pci_npu2_rw,
	.mmap = vfio_pci_npu2_mmap,
	.release = vfio_pci_npu2_release,
	.add_capability = vfio_pci_npu2_add_capability,
};
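
/*
 * Set up the ATSD register region for an NPU2 NVLink bridge: pick the
 * ATSD register for this link (falling back to ibm,mmio-atsd[0], or to
 * no register at all), read the TGT address and link speed from the
 * device tree, and register the vendor-specific VFIO region.
 */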
int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
{
	int ret;
	struct vfio_pci_npu2_data *data;
	struct device_node *nvlink_dn;
	u32 nvlink_index = 0;
	struct pci_dev *npdev = vdev->pdev;
	struct device_node *npu_node = pci_device_to_OF_node(npdev);
	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
	u64 mmio_atsd = 0;
	u64 tgt = 0;
	u32 link_speed = 0xff;

	/*
	 * PCI config space does not tell us about NVLink presence, but
	 * the platform does; use that.
	 */
	if (!pnv_pci_get_gpu_dev(vdev->pdev))
		return -ENODEV;

	/*
	 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links,
	 * so we can allocate one register per link, using the nvlink index
	 * as a key.
	 * There is always at least one ATSD register, so as long as at least
	 * NVLink bridge #0 is passed to the guest, ATSD will be available.
	 */
	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
			&nvlink_index)))
		return -ENODEV;

	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
			&mmio_atsd)) {
		if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0,
				&mmio_atsd)) {
			dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
			mmio_atsd = 0;
		} else {
			dev_warn(&vdev->pdev->dev,
				 "Using fallback ibm,mmio-atsd[0] for ATSD.\n");
		}
	}

	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
		return -EFAULT;
	}

	if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
		dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
		return -EFAULT;
	}

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->mmio_atsd = mmio_atsd;
	data->gpu_tgt = tgt;
	data->link_speed = link_speed;
	if (data->mmio_atsd) {
		data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
		if (!data->base) {
			ret = -ENOMEM;
			goto free_exit;
		}
	}

	/*
	 * We want to expose the capability even if this specific NVLink
	 * did not get its own ATSD register because capabilities
	 * belong to VFIO regions and normally there will be an ATSD
	 * register assigned to the NVLink bridge.
	 */
	ret = vfio_pci_register_dev_region(vdev,
			PCI_VENDOR_ID_IBM |
			VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
			&vfio_pci_npu2_regops,
			data->mmio_atsd ? PAGE_SIZE : 0,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_MMAP,
			data);
	if (ret)
		goto free_exit;

	return 0;

free_exit:
	if (data->base)
		memunmap(data->base);
	kfree(data);

	return ret;
}