powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2
The skiboot firmware has a hot reset handler which fences the NVIDIA V100 GPU RAM on Witherspoons and makes accesses no-op instead of throwing HMIs: https://github.com/open-power/skiboot/commit/fca2b2b839a67 Now we are going to pass V100 via VFIO, which most certainly involves KVM guests; these are often terminated without getting a chance to offline GPU RAM, so we end up with a running machine with misconfigured memory. Accessing this memory produces hardware management interrupts (HMIs) which bring the host down. To suppress HMIs, this wires up the hot reset hook to vfio_pci_disable() via pci_disable_device(), which switches NPU2 to a safe mode and prevents HMIs. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Acked-by: Alistair Popple <alistair@popple.id.au> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
parent
ffca395b11
commit
ab7032e793
@ -3667,6 +3667,15 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
|
||||
pnv_ioda_release_pe(pe);
|
||||
}
|
||||
|
||||
static void pnv_npu_disable_device(struct pci_dev *pdev)
|
||||
{
|
||||
struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
|
||||
struct eeh_pe *eehpe = edev ? edev->pe : NULL;
|
||||
|
||||
if (eehpe && eeh_ops && eeh_ops->reset)
|
||||
eeh_ops->reset(eehpe, EEH_RESET_HOT);
|
||||
}
|
||||
|
||||
static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
|
||||
{
|
||||
struct pnv_phb *phb = hose->private_data;
|
||||
@ -3707,6 +3716,7 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
|
||||
.reset_secondary_bus = pnv_pci_reset_secondary_bus,
|
||||
.dma_set_mask = pnv_npu_dma_set_mask,
|
||||
.shutdown = pnv_pci_ioda_shutdown,
|
||||
.disable_device = pnv_npu_disable_device,
|
||||
};
|
||||
|
||||
static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
|
||||
|
Loading…
x
Reference in New Issue
Block a user