diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index 2cebc1c47248..af9d2e22c6e7 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -14,7 +14,7 @@ #define HL_CS_FLAGS_TYPE_MASK (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \ HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \ HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND | \ - HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES) + HL_CS_FLAGS_ENGINES_COMMAND | HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES) #define MAX_TS_ITER_NUM 100 @@ -1319,6 +1319,8 @@ static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags) return CS_UNRESERVE_SIGNALS; else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND) return CS_TYPE_ENGINE_CORE; + else if (cs_type_flags & HL_CS_FLAGS_ENGINES_COMMAND) + return CS_TYPE_ENGINES; else if (cs_type_flags & HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES) return CS_TYPE_FLUSH_PCI_HBW_WRITES; else @@ -2444,10 +2446,13 @@ out: static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores, u32 num_engine_cores, u32 core_command) { - int rc; struct hl_device *hdev = hpriv->hdev; void __user *engine_cores_arr; u32 *cores; + int rc; + + if (!hdev->asic_prop.supports_engine_modes) + return -EPERM; if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) { dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores); @@ -2476,6 +2481,48 @@ static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores, return rc; } +static int cs_ioctl_engines(struct hl_fpriv *hpriv, u64 engines_arr_user_addr, + u32 num_engines, enum hl_engine_command command) +{ + struct hl_device *hdev = hpriv->hdev; + u32 *engines, max_num_of_engines; + void __user *engines_arr; + int rc; + + if (!hdev->asic_prop.supports_engine_modes) + return -EPERM; + + if (command >= HL_ENGINE_COMMAND_MAX) { + dev_err(hdev->dev, "Engine command is invalid\n"); + return -EINVAL; + } + + max_num_of_engines = hdev->asic_prop.max_num_of_engines; + if (command == HL_ENGINE_CORE_RUN || command == HL_ENGINE_CORE_HALT) + max_num_of_engines = hdev->asic_prop.num_engine_cores; + + if (!num_engines || num_engines > max_num_of_engines) { + dev_err(hdev->dev, "Number of engines %d is invalid\n", num_engines); + return -EINVAL; + } + + engines_arr = (void __user *) (uintptr_t) engines_arr_user_addr; + engines = kmalloc_array(num_engines, sizeof(u32), GFP_KERNEL); + if (!engines) + return -ENOMEM; + + if (copy_from_user(engines, engines_arr, num_engines * sizeof(u32))) { + dev_err(hdev->dev, "Failed to copy engine-ids array from user\n"); + kfree(engines); + return -EFAULT; + } + + rc = hdev->asic_funcs->set_engines(hdev, engines, num_engines, command); + kfree(engines); + + return rc; +} + static int cs_ioctl_flush_pci_hbw_writes(struct hl_fpriv *hpriv) { struct hl_device *hdev = hpriv->hdev; @@ -2547,6 +2594,10 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores, args->in.num_engine_cores, args->in.core_command); break; + case CS_TYPE_ENGINES: + rc = cs_ioctl_engines(hpriv, args->in.engines, + args->in.num_engines, args->in.engine_command); + break; case CS_TYPE_FLUSH_PCI_HBW_WRITES: rc = cs_ioctl_flush_pci_hbw_writes(hpriv); break; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index e03f9c125e30..af5a51f8c173 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -372,6 +372,7 @@ enum hl_cs_type { CS_RESERVE_SIGNALS, CS_UNRESERVE_SIGNALS, CS_TYPE_ENGINE_CORE, + CS_TYPE_ENGINES, CS_TYPE_FLUSH_PCI_HBW_WRITES, }; @@ -644,7 +645,8 @@ struct hl_hints_range { * which the property supports_user_set_page_size is true * (i.e. the DRAM supports multiple page sizes), otherwise * it will shall be equal to dram_page_size. - * @num_engine_cores: number of engine cpu cores + * @num_engine_cores: number of engine cpu cores. + * @max_num_of_engines: maximum number of all engines in the ASIC. * @num_of_special_blocks: special_blocks array size. * @glbl_err_cause_num: global err cause number. * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is @@ -695,6 +697,7 @@ struct hl_hints_range { * @supports_user_set_page_size: true if user can set the allocation page size. * @dma_mask: the dma mask to be set for this device * @supports_advanced_cpucp_rc: true if new cpucp opcodes are supported. + * @supports_engine_modes: true if changing engines/engine_cores modes is supported. */ struct asic_fixed_properties { struct hw_queue_properties *hw_queues_props; @@ -773,6 +776,7 @@ struct asic_fixed_properties { u32 xbar_edge_enabled_mask; u32 device_mem_alloc_default_page_size; u32 num_engine_cores; + u32 max_num_of_engines; u32 num_of_special_blocks; u32 glbl_err_cause_num; u32 hbw_flush_reg; @@ -810,6 +814,7 @@ struct asic_fixed_properties { u8 supports_user_set_page_size; u8 dma_mask; u8 supports_advanced_cpucp_rc; + u8 supports_engine_modes; }; /** @@ -1564,6 +1569,7 @@ struct engines_data { * @access_dev_mem: access device memory * @set_dram_bar_base: set the base of the DRAM BAR * @set_engine_cores: set a config command to engine cores + * @set_engines: set a config command to user engines * @send_device_activity: indication to FW about device availability * @set_dram_properties: set DRAM related properties. * @set_binning_masks: set binning/enable masks for all relevant components. @@ -1703,6 +1709,8 @@ struct hl_asic_funcs { u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr); int (*set_engine_cores)(struct hl_device *hdev, u32 *core_ids, u32 num_cores, u32 core_command); + int (*set_engines)(struct hl_device *hdev, u32 *engine_ids, + u32 num_engines, u32 engine_command); int (*send_device_activity)(struct hl_device *hdev, bool open); int (*set_dram_properties)(struct hl_device *hdev); int (*set_binning_masks)(struct hl_device *hdev); @@ -1826,7 +1834,7 @@ struct hl_cs_outcome_store { * @hpriv: pointer to the private (Kernel Driver) data of the process (fd). * @hdev: pointer to the device structure. * @refcount: reference counter for the context. Context is released only when - * this hits 0l. It is incremented on CS and CS_WAIT. + * this hits 0. It is incremented on CS and CS_WAIT. * @cs_pending: array of hl fence objects representing pending CS. * @outcome_store: storage data structure used to remember outcomes of completed * command submissions for a long time after CS id wraparound. diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index d37c5d4893b9..004846bc086e 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -656,6 +656,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->cfg_size = CFG_SIZE; prop->max_asid = MAX_ASID; prop->num_of_events = GAUDI_EVENT_SIZE; + prop->max_num_of_engines = GAUDI_ENGINE_ID_SIZE; prop->tpc_enabled_mask = TPC_ENABLED_MASK; set_default_power_values(hdev); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 1131aae68690..166469179628 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -1714,6 +1714,34 @@ static const u32 gaudi2_tpc_cfg_blocks_bases[TPC_ID_SIZE] = { [TPC_ID_DCORE0_TPC6] = mmDCORE0_TPC6_CFG_BASE, }; +static const u32 gaudi2_tpc_eml_cfg_blocks_bases[TPC_ID_SIZE] = { + [TPC_ID_DCORE0_TPC0] = mmDCORE0_TPC0_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC1] = mmDCORE0_TPC1_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC2] = mmDCORE0_TPC2_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC3] = mmDCORE0_TPC3_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC4] = mmDCORE0_TPC4_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC5] = mmDCORE0_TPC5_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC0] = mmDCORE1_TPC0_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC1] = mmDCORE1_TPC1_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC2] = mmDCORE1_TPC2_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC3] = mmDCORE1_TPC3_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC4] = mmDCORE1_TPC4_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC5] = mmDCORE1_TPC5_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC0] = mmDCORE2_TPC0_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC1] = mmDCORE2_TPC1_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC2] = mmDCORE2_TPC2_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC3] = mmDCORE2_TPC3_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC4] = mmDCORE2_TPC4_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC5] = mmDCORE2_TPC5_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC0] = mmDCORE3_TPC0_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC1] = mmDCORE3_TPC1_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC2] = mmDCORE3_TPC2_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC3] = mmDCORE3_TPC3_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC4] = mmDCORE3_TPC4_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC5] = mmDCORE3_TPC5_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC6] = mmDCORE0_TPC6_EML_CFG_BASE, +}; + const u32 gaudi2_rot_blocks_bases[ROTATOR_ID_SIZE] = { [ROTATOR_ID_0] = mmROT0_BASE, [ROTATOR_ID_1] = mmROT1_BASE @@ -1752,6 +1780,56 @@ static const u32 gaudi2_rot_id_to_queue_id[ROTATOR_ID_SIZE] = { [ROTATOR_ID_1] = GAUDI2_QUEUE_ID_ROT_1_0, }; +static const u32 gaudi2_tpc_engine_id_to_tpc_id[] = { + [GAUDI2_DCORE0_ENGINE_ID_TPC_0] = TPC_ID_DCORE0_TPC0, + [GAUDI2_DCORE0_ENGINE_ID_TPC_1] = TPC_ID_DCORE0_TPC1, + [GAUDI2_DCORE0_ENGINE_ID_TPC_2] = TPC_ID_DCORE0_TPC2, + [GAUDI2_DCORE0_ENGINE_ID_TPC_3] = TPC_ID_DCORE0_TPC3, + [GAUDI2_DCORE0_ENGINE_ID_TPC_4] = TPC_ID_DCORE0_TPC4, + [GAUDI2_DCORE0_ENGINE_ID_TPC_5] = TPC_ID_DCORE0_TPC5, + [GAUDI2_DCORE1_ENGINE_ID_TPC_0] = TPC_ID_DCORE1_TPC0, + [GAUDI2_DCORE1_ENGINE_ID_TPC_1] = TPC_ID_DCORE1_TPC1, + [GAUDI2_DCORE1_ENGINE_ID_TPC_2] = TPC_ID_DCORE1_TPC2, + [GAUDI2_DCORE1_ENGINE_ID_TPC_3] = TPC_ID_DCORE1_TPC3, + [GAUDI2_DCORE1_ENGINE_ID_TPC_4] = TPC_ID_DCORE1_TPC4, + [GAUDI2_DCORE1_ENGINE_ID_TPC_5] = TPC_ID_DCORE1_TPC5, + [GAUDI2_DCORE2_ENGINE_ID_TPC_0] = TPC_ID_DCORE2_TPC0, + [GAUDI2_DCORE2_ENGINE_ID_TPC_1] = TPC_ID_DCORE2_TPC1, + [GAUDI2_DCORE2_ENGINE_ID_TPC_2] = TPC_ID_DCORE2_TPC2, + [GAUDI2_DCORE2_ENGINE_ID_TPC_3] = TPC_ID_DCORE2_TPC3, + [GAUDI2_DCORE2_ENGINE_ID_TPC_4] = TPC_ID_DCORE2_TPC4, + [GAUDI2_DCORE2_ENGINE_ID_TPC_5] = TPC_ID_DCORE2_TPC5, + [GAUDI2_DCORE3_ENGINE_ID_TPC_0] = TPC_ID_DCORE3_TPC0, + [GAUDI2_DCORE3_ENGINE_ID_TPC_1] = TPC_ID_DCORE3_TPC1, + [GAUDI2_DCORE3_ENGINE_ID_TPC_2] = TPC_ID_DCORE3_TPC2, + [GAUDI2_DCORE3_ENGINE_ID_TPC_3] = TPC_ID_DCORE3_TPC3, + [GAUDI2_DCORE3_ENGINE_ID_TPC_4] = TPC_ID_DCORE3_TPC4, + [GAUDI2_DCORE3_ENGINE_ID_TPC_5] = TPC_ID_DCORE3_TPC5, + /* the PCI TPC is placed last (mapped liked HW) */ + [GAUDI2_DCORE0_ENGINE_ID_TPC_6] = TPC_ID_DCORE0_TPC6, +}; + +static const u32 gaudi2_mme_engine_id_to_mme_id[] = { + [GAUDI2_DCORE0_ENGINE_ID_MME] = MME_ID_DCORE0, + [GAUDI2_DCORE1_ENGINE_ID_MME] = MME_ID_DCORE1, + [GAUDI2_DCORE2_ENGINE_ID_MME] = MME_ID_DCORE2, + [GAUDI2_DCORE3_ENGINE_ID_MME] = MME_ID_DCORE3, +}; + +static const u32 gaudi2_edma_engine_id_to_edma_id[] = { + [GAUDI2_ENGINE_ID_PDMA_0] = DMA_CORE_ID_PDMA0, + [GAUDI2_ENGINE_ID_PDMA_1] = DMA_CORE_ID_PDMA1, + [GAUDI2_DCORE0_ENGINE_ID_EDMA_0] = DMA_CORE_ID_EDMA0, + [GAUDI2_DCORE0_ENGINE_ID_EDMA_1] = DMA_CORE_ID_EDMA1, + [GAUDI2_DCORE1_ENGINE_ID_EDMA_0] = DMA_CORE_ID_EDMA2, + [GAUDI2_DCORE1_ENGINE_ID_EDMA_1] = DMA_CORE_ID_EDMA3, + [GAUDI2_DCORE2_ENGINE_ID_EDMA_0] = DMA_CORE_ID_EDMA4, + [GAUDI2_DCORE2_ENGINE_ID_EDMA_1] = DMA_CORE_ID_EDMA5, + [GAUDI2_DCORE3_ENGINE_ID_EDMA_0] = DMA_CORE_ID_EDMA6, + [GAUDI2_DCORE3_ENGINE_ID_EDMA_1] = DMA_CORE_ID_EDMA7, + [GAUDI2_ENGINE_ID_KDMA] = DMA_CORE_ID_KDMA, +}; + const u32 edma_stream_base[NUM_OF_EDMA_PER_DCORE * NUM_OF_DCORES] = { GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0, GAUDI2_QUEUE_ID_DCORE0_EDMA_1_0, @@ -2026,6 +2104,12 @@ static void gaudi2_set_arc_id_cap(struct hl_device *hdev, u64 arc_id); static void gaudi2_memset_device_lbw(struct hl_device *hdev, u32 addr, u32 size, u32 val); static int gaudi2_send_job_to_kdma(struct hl_device *hdev, u64 src_addr, u64 dst_addr, u32 size, bool is_memset); +static bool gaudi2_get_tpc_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, + struct engines_data *e); +static bool gaudi2_get_mme_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, + struct engines_data *e); +static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, + struct engines_data *e); static u64 gaudi2_mmu_scramble_addr(struct hl_device *hdev, u64 raw_addr); static void gaudi2_init_scrambler_hbm(struct hl_device *hdev) @@ -2326,11 +2410,14 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->pmmu_huge.end_addr = VA_HOST_SPACE_HPAGE_END; } + prop->max_num_of_engines = GAUDI2_ENGINE_ID_SIZE; prop->num_engine_cores = CPU_ID_MAX; prop->cfg_size = CFG_SIZE; prop->max_asid = MAX_ASID; prop->num_of_events = GAUDI2_EVENT_SIZE; + prop->supports_engine_modes = true; + prop->dc_power_default = DC_POWER_DEFAULT; prop->cb_pool_cb_cnt = GAUDI2_CB_POOL_CB_CNT; @@ -4322,7 +4409,6 @@ static int gaudi2_set_engine_cores(struct hl_device *hdev, u32 *core_ids, { int i, rc; - for (i = 0 ; i < num_cores ; i++) { if (gaudi2_is_arc_enabled(hdev, core_ids[i])) gaudi2_set_arc_running_mode(hdev, core_ids[i], core_command); @@ -4344,6 +4430,172 @@ static int gaudi2_set_engine_cores(struct hl_device *hdev, u32 *core_ids, return 0; } +static int gaudi2_set_tpc_engine_mode(struct hl_device *hdev, u32 engine_id, u32 engine_command) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u32 reg_base, reg_addr, reg_val, tpc_id; + + if (!(gaudi2->tpc_hw_cap_initialized & HW_CAP_TPC_MASK)) + return 0; + + tpc_id = gaudi2_tpc_engine_id_to_tpc_id[engine_id]; + if (!(gaudi2->tpc_hw_cap_initialized & BIT_ULL(HW_CAP_TPC_SHIFT + tpc_id))) + return 0; + + reg_base = gaudi2_tpc_cfg_blocks_bases[tpc_id]; + reg_addr = reg_base + TPC_CFG_STALL_OFFSET; + reg_val = FIELD_PREP(DCORE0_TPC0_CFG_TPC_STALL_V_MASK, + !!(engine_command == HL_ENGINE_STALL)); + WREG32(reg_addr, reg_val); + + if (engine_command == HL_ENGINE_RESUME) { + reg_base = gaudi2_tpc_eml_cfg_blocks_bases[tpc_id]; + reg_addr = reg_base + TPC_EML_CFG_DBG_CNT_OFFSET; + RMWREG32(reg_addr, 0x1, DCORE0_TPC0_EML_CFG_DBG_CNT_DBG_EXIT_MASK); + } + + return 0; +} + +static int gaudi2_set_mme_engine_mode(struct hl_device *hdev, u32 engine_id, u32 engine_command) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u32 reg_base, reg_addr, reg_val, mme_id; + + mme_id = gaudi2_mme_engine_id_to_mme_id[engine_id]; + if (!(gaudi2->hw_cap_initialized & BIT_ULL(HW_CAP_MME_SHIFT + mme_id))) + return 0; + + reg_base = gaudi2_mme_ctrl_lo_blocks_bases[mme_id]; + reg_addr = reg_base + MME_CTRL_LO_QM_STALL_OFFSET; + reg_val = FIELD_PREP(DCORE0_MME_CTRL_LO_QM_STALL_V_MASK, + !!(engine_command == HL_ENGINE_STALL)); + WREG32(reg_addr, reg_val); + + return 0; +} + +static int gaudi2_set_edma_engine_mode(struct hl_device *hdev, u32 engine_id, u32 engine_command) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u32 reg_base, reg_addr, reg_val, edma_id; + + if (!(gaudi2->hw_cap_initialized & HW_CAP_EDMA_MASK)) + return 0; + + edma_id = gaudi2_edma_engine_id_to_edma_id[engine_id]; + if (!(gaudi2->hw_cap_initialized & BIT_ULL(HW_CAP_EDMA_SHIFT + edma_id))) + return 0; + + reg_base = gaudi2_dma_core_blocks_bases[edma_id]; + reg_addr = reg_base + EDMA_CORE_CFG_STALL_OFFSET; + reg_val = FIELD_PREP(DCORE0_EDMA0_CORE_CFG_1_HALT_MASK, + !!(engine_command == HL_ENGINE_STALL)); + WREG32(reg_addr, reg_val); + + if (engine_command == HL_ENGINE_STALL) { + reg_val = FIELD_PREP(DCORE0_EDMA0_CORE_CFG_1_HALT_MASK, 0x1) | + FIELD_PREP(DCORE0_EDMA0_CORE_CFG_1_FLUSH_MASK, 0x1); + WREG32(reg_addr, reg_val); + } + + return 0; +} + +static int gaudi2_set_engine_modes(struct hl_device *hdev, + u32 *engine_ids, u32 num_engines, u32 engine_command) +{ + int i, rc; + + for (i = 0 ; i < num_engines ; ++i) { + switch (engine_ids[i]) { + case GAUDI2_DCORE0_ENGINE_ID_TPC_0 ... GAUDI2_DCORE0_ENGINE_ID_TPC_5: + case GAUDI2_DCORE1_ENGINE_ID_TPC_0 ... GAUDI2_DCORE1_ENGINE_ID_TPC_5: + case GAUDI2_DCORE2_ENGINE_ID_TPC_0 ... GAUDI2_DCORE2_ENGINE_ID_TPC_5: + case GAUDI2_DCORE3_ENGINE_ID_TPC_0 ... GAUDI2_DCORE3_ENGINE_ID_TPC_5: + rc = gaudi2_set_tpc_engine_mode(hdev, engine_ids[i], engine_command); + if (rc) + return rc; + + break; + case GAUDI2_DCORE0_ENGINE_ID_MME: + case GAUDI2_DCORE1_ENGINE_ID_MME: + case GAUDI2_DCORE2_ENGINE_ID_MME: + case GAUDI2_DCORE3_ENGINE_ID_MME: + rc = gaudi2_set_mme_engine_mode(hdev, engine_ids[i], engine_command); + if (rc) + return rc; + + break; + case GAUDI2_DCORE0_ENGINE_ID_EDMA_0 ... GAUDI2_DCORE0_ENGINE_ID_EDMA_1: + case GAUDI2_DCORE1_ENGINE_ID_EDMA_0 ... GAUDI2_DCORE1_ENGINE_ID_EDMA_1: + case GAUDI2_DCORE2_ENGINE_ID_EDMA_0 ... GAUDI2_DCORE2_ENGINE_ID_EDMA_1: + case GAUDI2_DCORE3_ENGINE_ID_EDMA_0 ... GAUDI2_DCORE3_ENGINE_ID_EDMA_1: + rc = gaudi2_set_edma_engine_mode(hdev, engine_ids[i], engine_command); + if (rc) + return rc; + + break; + default: + dev_err(hdev->dev, "Invalid engine ID %u\n", engine_ids[i]); + return -EINVAL; + } + } + + return 0; +} + +static int gaudi2_verify_engine_modes(struct hl_device *hdev, u32 *engine_ids, + u32 num_engines, u32 engine_command) +{ + bool is_engine_idle = true; + u64 mask_arr = 0; + int i; + + gaudi2_get_tpc_idle_status(hdev, &mask_arr, 8 * sizeof(mask_arr), NULL); + gaudi2_get_mme_idle_status(hdev, &mask_arr, 8 * sizeof(mask_arr), NULL); + gaudi2_get_edma_idle_status(hdev, &mask_arr, 8 * sizeof(mask_arr), NULL); + + for (i = 0 ; i < num_engines ; ++i) { + is_engine_idle = !(mask_arr & BIT_ULL(engine_ids[i])); + if ((engine_command == HL_ENGINE_RESUME) && !is_engine_idle) { + dev_err(hdev->dev, "Engine ID %u remained NOT idle!\n", engine_ids[i]); + return -EBUSY; + } else if ((engine_command == HL_ENGINE_STALL) && is_engine_idle) { + dev_err(hdev->dev, "Engine ID %u remained idle!\n", engine_ids[i]); + return -EBUSY; + } + } + + return 0; +} + +static int gaudi2_set_engines(struct hl_device *hdev, u32 *engine_ids, + u32 num_engines, u32 engine_command) +{ + int rc; + + switch (engine_command) { + case HL_ENGINE_CORE_HALT: + case HL_ENGINE_CORE_RUN: + return gaudi2_set_engine_cores(hdev, engine_ids, num_engines, engine_command); + + case HL_ENGINE_STALL: + case HL_ENGINE_RESUME: + rc = gaudi2_set_engine_modes(hdev, engine_ids, num_engines, engine_command); + if (rc) + return rc; + + return gaudi2_verify_engine_modes(hdev, engine_ids, num_engines, engine_command); + + default: + dev_err(hdev->dev, "failed to execute command id %u\n", engine_command); + return -EINVAL; + } + + return 0; +} + static void gaudi2_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset) { u32 wait_timeout_ms; @@ -10883,6 +11135,7 @@ static const struct hl_asic_funcs gaudi2_funcs = { .access_dev_mem = hl_access_dev_mem, .set_dram_bar_base = gaudi2_set_hbm_bar_base, .set_engine_cores = gaudi2_set_engine_cores, + .set_engines = gaudi2_set_engines, .send_device_activity = gaudi2_send_device_activity, .set_dram_properties = gaudi2_set_dram_properties, .set_binning_masks = gaudi2_set_binning_masks, diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h b/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h index 74bc1daaeeda..e6664c4a2cf5 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h @@ -86,6 +86,8 @@ #define DCORE0_TPC0_QM_CGM_STS_AGENT_IDLE_MASK 0x100 +#define DCORE0_TPC0_EML_CFG_DBG_CNT_DBG_EXIT_MASK 0x40 + /* CGM_IDLE_MASK is valid for all engines CGM idle check */ #define CGM_IDLE_MASK DCORE0_TPC0_QM_CGM_STS_AGENT_IDLE_MASK diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h index 0bf3092bfeea..452b379f39f6 100644 --- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h +++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h @@ -164,6 +164,8 @@ #define mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR 0x4800040 +#define mmDCORE0_TPC0_EML_CFG_DBG_CNT 0x40000 + #define SM_OBJS_PROT_BITS_OFFS 0x14000 #define DCORE_OFFSET (mmDCORE1_TPC0_QM_BASE - mmDCORE0_TPC0_QM_BASE) @@ -185,7 +187,10 @@ #define TPC_CFG_STALL_ON_ERR_OFFSET (mmDCORE0_TPC0_CFG_STALL_ON_ERR - mmDCORE0_TPC0_CFG_BASE) #define TPC_CFG_TPC_INTR_MASK_OFFSET (mmDCORE0_TPC0_CFG_TPC_INTR_MASK - mmDCORE0_TPC0_CFG_BASE) #define TPC_CFG_MSS_CONFIG_OFFSET (mmDCORE0_TPC0_CFG_MSS_CONFIG - mmDCORE0_TPC0_CFG_BASE) +#define TPC_EML_CFG_DBG_CNT_OFFSET (mmDCORE0_TPC0_EML_CFG_DBG_CNT - mmDCORE0_TPC0_EML_CFG_BASE) +#define EDMA_CORE_CFG_STALL_OFFSET (mmDCORE0_EDMA0_CORE_CFG_1 - mmDCORE0_EDMA0_CORE_BASE) +#define MME_CTRL_LO_QM_STALL_OFFSET (mmDCORE0_MME_CTRL_LO_QM_STALL - mmDCORE0_MME_CTRL_LO_BASE) #define MME_ACC_INTR_MASK_OFFSET (mmDCORE0_MME_ACC_INTR_MASK - mmDCORE0_MME_ACC_BASE) #define MME_ACC_WR_AXI_AGG_COUT0_OFFSET (mmDCORE0_MME_ACC_WR_AXI_AGG_COUT0 - mmDCORE0_MME_ACC_BASE) #define MME_ACC_WR_AXI_AGG_COUT1_OFFSET (mmDCORE0_MME_ACC_WR_AXI_AGG_COUT1 - mmDCORE0_MME_ACC_BASE) diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index 359b19ef3c3f..7ca0ef802fd1 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -1535,17 +1535,31 @@ struct hl_cs_chunk { */ #define HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES 0x8000 +/* + * The engines CS is merged into the existing CS ioctls. + * Use it to control engines modes. + */ +#define HL_CS_FLAGS_ENGINES_COMMAND 0x10000 + #define HL_CS_STATUS_SUCCESS 0 #define HL_MAX_JOBS_PER_CS 512 -/* HL_ENGINE_CORE_ values +/* + * enum hl_engine_command - engine command * - * HL_ENGINE_CORE_HALT: engine core halt - * HL_ENGINE_CORE_RUN: engine core run + * @HL_ENGINE_CORE_HALT: engine core halt + * @HL_ENGINE_CORE_RUN: engine core run + * @HL_ENGINE_STALL: user engine/s stall + * @HL_ENGINE_RESUME: user engine/s resume */ -#define HL_ENGINE_CORE_HALT (1 << 0) -#define HL_ENGINE_CORE_RUN (1 << 1) +enum hl_engine_command { + HL_ENGINE_CORE_HALT = 1, + HL_ENGINE_CORE_RUN = 2, + HL_ENGINE_STALL = 3, + HL_ENGINE_RESUME = 4, + HL_ENGINE_COMMAND_MAX +}; struct hl_cs_in { @@ -1569,6 +1583,18 @@ struct hl_cs_in { /* the core command to be sent towards engine cores */ __u32 core_command; }; + + /* Valid only when HL_CS_FLAGS_ENGINES_COMMAND is set */ + struct { + /* this holds address of array of uint32 for engines */ + __u64 engines; + + /* number of engines in engines array */ + __u32 num_engines; + + /* the engine command to be sent towards engines */ + __u32 engine_command; + }; }; union {