From 64536abc62c17ef6c7b0f17b820de9923b80e745 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 27 May 2020 12:38:16 +0300 Subject: [PATCH 1/6] habanalabs: block scalar load_and_exe on external queue In Gaudi, the user can't execute scalar load_and_exe on external queue because it can be a security hole. The driver doesn't parse the commands being loaded and it can be msg_prot, which the user isn't allowed to use. Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 25 ++++++++++++++++++- .../habanalabs/include/gaudi/gaudi_packets.h | 3 +++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 61f88e9884ce..f34ac8d35a14 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -3790,6 +3790,25 @@ static int gaudi_validate_dma_pkt_no_mmu(struct hl_device *hdev, src_in_host); } +static int gaudi_validate_load_and_exe_pkt(struct hl_device *hdev, + struct hl_cs_parser *parser, + struct packet_load_and_exe *user_pkt) +{ + u32 cfg; + + cfg = le32_to_cpu(user_pkt->cfg); + + if (cfg & GAUDI_PKT_LOAD_AND_EXE_CFG_DST_MASK) { + dev_err(hdev->dev, + "User not allowed to use Load and Execute\n"); + return -EPERM; + } + + parser->patched_cb_size += sizeof(struct packet_load_and_exe); + + return 0; +} + static int gaudi_validate_cb(struct hl_device *hdev, struct hl_cs_parser *parser, bool is_mmu) { @@ -3838,6 +3857,11 @@ static int gaudi_validate_cb(struct hl_device *hdev, rc = -EPERM; break; + case PACKET_LOAD_AND_EXE: + rc = gaudi_validate_load_and_exe_pkt(hdev, parser, + (struct packet_load_and_exe *) user_pkt); + break; + case PACKET_LIN_DMA: parser->contains_dma_pkt = true; if (is_mmu) @@ -3855,7 +3879,6 @@ static int gaudi_validate_cb(struct hl_device *hdev, case PACKET_FENCE: case PACKET_NOP: case PACKET_ARB_POINT: - case PACKET_LOAD_AND_EXE: parser->patched_cb_size += pkt_size; break; diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h index 9a5800b0086b..0f0cd067bb43 100644 --- a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h +++ b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h @@ -197,6 +197,9 @@ struct packet_wait { __le32 ctl; }; +#define GAUDI_PKT_LOAD_AND_EXE_CFG_DST_SHIFT 0 +#define GAUDI_PKT_LOAD_AND_EXE_CFG_DST_MASK 0x00000001 + struct packet_load_and_exe { __le32 cfg; __le32 ctl; From cfd4176dc0bbbd3d01d6764cd85e1f1705b35548 Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Wed, 3 Jun 2020 13:03:35 +0300 Subject: [PATCH 2/6] habanalabs: use PI in MMU cache invalidation The PS flow for MMU cache invalidation caused timeouts in stress tests. Use PS + PI flow so no timeouts should happen whatsoever. Signed-off-by: Omer Shpigelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 8 ++++++++ drivers/misc/habanalabs/gaudi/gaudiP.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index f34ac8d35a14..211547d4f8a7 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -2725,6 +2725,12 @@ static int gaudi_mmu_init(struct hl_device *hdev) WREG32(mmSTLB_HOP_CONFIGURATION, hdev->mmu_huge_page_opt ? 0x30440 : 0x40440); + /* + * The H/W expects the first PI after init to be 1. After wraparound + * we'll write 0. + */ + gaudi->mmu_cache_inv_pi = 1; + gaudi->hw_cap_initialized |= HW_CAP_MMU; return 0; @@ -6017,6 +6023,8 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, mutex_lock(&hdev->mmu_cache_lock); /* L0 & L1 invalidation */ + WREG32(mmSTLB_INV_PS, 3); + WREG32(mmSTLB_CACHE_INV, gaudi->mmu_cache_inv_pi++); WREG32(mmSTLB_INV_PS, 2); rc = hl_poll_timeout( diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index a46530d375fa..41a8d9bff6bf 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -229,6 +229,8 @@ struct gaudi_internal_qman_info { * @multi_msi_mode: whether we are working in multi MSI single MSI mode. * Multi MSI is possible only with IOMMU enabled. * @ext_queue_idx: helper index for external queues initialization. + * @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an + * 8-bit value so use u8. */ struct gaudi_device { int (*armcp_info_get)(struct hl_device *hdev); @@ -248,6 +250,7 @@ struct gaudi_device { u32 hw_cap_initialized; u8 multi_msi_mode; u8 ext_queue_idx; + u8 mmu_cache_inv_pi; }; void gaudi_init_security(struct hl_device *hdev); From dd2fde10934fd0f21e144cf7fbb76564d1d57c77 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 7 Jun 2020 08:21:48 +0300 Subject: [PATCH 3/6] habanalabs: rename mmu_write() to mmu_asid_va_write() The function name conflicts with a static inline function in arch/m68k/include/asm/mcfmmu.h Reported-by: kernel test robot Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c index 3c8dcdfba20c..fc4372c18ce2 100644 --- a/drivers/misc/habanalabs/debugfs.c +++ b/drivers/misc/habanalabs/debugfs.c @@ -480,7 +480,7 @@ out: return 0; } -static ssize_t mmu_write(struct file *file, const char __user *buf, +static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf, size_t count, loff_t *f_pos) { struct seq_file *s = file->private_data; @@ -1125,7 +1125,7 @@ static const struct hl_info_list hl_debugfs_list[] = { {"command_submission_jobs", command_submission_jobs_show, NULL}, {"userptr", userptr_show, NULL}, {"vm", vm_show, NULL}, - {"mmu", mmu_show, mmu_write}, + {"mmu", mmu_show, mmu_asid_va_write}, {"engines", engines_show, NULL} }; From 647e835e67421ee47b0ff8acc2808de86f7d5e01 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 7 Jun 2020 11:26:48 +0300 Subject: [PATCH 4/6] habanalabs: increase GAUDI QMAN ARB WDT timeout The current timeout is too low for some of the workloads and we see false errors as a result. Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 211547d4f8a7..69317d2ebdfa 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -96,7 +96,7 @@ #define GAUDI_NUM_OF_QM_ARB_ERR_CAUSE 3 -#define GAUDI_ARB_WDT_TIMEOUT 0x400000 +#define GAUDI_ARB_WDT_TIMEOUT 0x1000000 static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = { "gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3", From 3292055c85aa02723b7b7a1114c4c91deefad5e4 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 11 Jun 2020 11:43:23 +0300 Subject: [PATCH 5/6] habanalabs: Correct handling when failing to enqueue CB The fence release flow is different if the CS was never submitted. In that case, we don't have an hw_sob object attached that we need to "put". While if the CS was aborted, we do need to "put" the hw_sob. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/command_submission.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index f82974a916c3..b0f62cbbdc87 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -62,6 +62,12 @@ static void hl_fence_release(struct dma_fence *fence) container_of(fence, struct hl_cs_compl, base_fence); struct hl_device *hdev = hl_cs_cmpl->hdev; + /* EBUSY means the CS was never submitted and hence we don't have + * an attached hw_sob object that we should handle here + */ + if (fence->error == -EBUSY) + goto free; + if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) || (hl_cs_cmpl->type == CS_TYPE_WAIT)) { @@ -92,6 +98,7 @@ static void hl_fence_release(struct dma_fence *fence) kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset); } +free: kfree_rcu(hl_cs_cmpl, base_fence.rcu); } @@ -328,10 +335,16 @@ static void cs_do_release(struct kref *ref) hl_ctx_put(cs->ctx); + /* We need to mark an error for not submitted because in that case + * the dma fence release flow is different. Mainly, we don't need + * to handle hw_sob for signal/wait + */ if (cs->timedout) dma_fence_set_error(cs->fence, -ETIMEDOUT); else if (cs->aborted) dma_fence_set_error(cs->fence, -EIO); + else if (!cs->submitted) + dma_fence_set_error(cs->fence, -EBUSY); dma_fence_signal(cs->fence); dma_fence_put(cs->fence); From ce04326edd59d7902d7ef3a9d853864096e0cd1a Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Tue, 16 Jun 2020 17:56:27 +0300 Subject: [PATCH 6/6] habanalabs: increase h/w timer when checking idle In GAUDI the current timer value for the hardware to check if it is in IDLE state is too low. As a result, there are occasions where the H/W wrongly reports it is not IDLE. The driver checks that before submitting work on behalf of the driver during initialization, so a false report might cause the driver to fail during device initialization. Signed-off-by: Omer Shpigelman Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 69317d2ebdfa..834470d10b46 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -1893,6 +1893,8 @@ static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id, WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_LO_0 + q_off, so_base_ws_lo); WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_HI_0 + q_off, so_base_ws_hi); + WREG32(mmDMA0_QM_CP_BARRIER_CFG_0 + q_off, 0x100); + /* The following configuration is needed only once per QMAN */ if (qman_id == 0) { /* Configure RAZWI IRQ */