diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 3affb350070c..a5e9bb0a4855 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -585,6 +585,29 @@ void hl_cs_rollback_all(struct hl_device *hdev)
 	}
 }
 
+/**
+ * hl_pending_cb_list_flush() - free all entries of a context's pending CB list
+ * @ctx: context whose pending_cb_list is emptied.
+ *
+ * Every entry is unlinked, hl_cb_put() is called on its command buffer and
+ * the entry itself is freed.
+ *
+ * The list is walked without taking ctx->pending_cb_lock.
+ * NOTE(review): presumably the caller guarantees exclusive access to the
+ * list (the visible caller is hl_ctx_fini()) - confirm for any new caller.
+ */
+void hl_pending_cb_list_flush(struct hl_ctx *ctx)
+{
+	struct hl_pending_cb *pending_cb, *tmp;
+
+	list_for_each_entry_safe(pending_cb, tmp,
+			&ctx->pending_cb_list, cb_node) {
+		list_del(&pending_cb->cb_node);
+		hl_cb_put(pending_cb->cb);
+		kfree(pending_cb);
+	}
+}
+
 static void job_wq_completion(struct work_struct *work)
 {
 	struct hl_cs_job *job = container_of(work, struct hl_cs_job,
@@ -954,6 +977,161 @@ out:
 	return rc;
 }
 
+/**
+ * pending_cb_create_job() - create a job for a pending CB and add it to a CS
+ * @hdev: habanalabs device structure.
+ * @ctx: context whose out_of_mem_drop_cnt is incremented if allocation fails.
+ * @cs: command submission the new job is attached to.
+ * @cb: command buffer used as both the user CB and the patched CB of the job.
+ * @size: stored as both user_cb_size and job_cb_size of the job.
+ * @hw_queue_id: index into hdev->asic_prop.hw_queues_props[]; also stored in
+ *               the job and used to index cs->jobs_in_queue_cnt[].
+ *
+ * On success the cs_cnt of @cb is incremented, cs_get() is called on @cs and
+ * the job is appended to cs->job_list.
+ *
+ * Return: 0 on success, -ENOMEM if the job could not be allocated.
+ */
+static int pending_cb_create_job(struct hl_device *hdev, struct hl_ctx *ctx,
+		struct hl_cs *cs, struct hl_cb *cb, u32 size, u32 hw_queue_id)
+{
+	struct hw_queue_properties *hw_queue_prop;
+	struct hl_cs_counters_atomic *cntr;
+	struct hl_cs_job *job;
+
+	hw_queue_prop = &hdev->asic_prop.hw_queues_props[hw_queue_id];
+	cntr = &hdev->aggregated_cs_counters;
+
+	job = hl_cs_allocate_job(hdev, hw_queue_prop->type, true);
+	if (!job) {
+		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
+		atomic64_inc(&cntr->out_of_mem_drop_cnt);
+		dev_err(hdev->dev, "Failed to allocate a new job\n");
+		return -ENOMEM;
+	}
+
+	job->id = 0;
+	job->cs = cs;
+	job->user_cb = cb;
+	atomic_inc(&job->user_cb->cs_cnt);
+	job->user_cb_size = size;
+	job->hw_queue_id = hw_queue_id;
+	job->patched_cb = job->user_cb;
+	job->job_cb_size = job->user_cb_size;
+
+	/* increment refcount as for external queues we get completion */
+	cs_get(cs);
+
+	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+	list_add_tail(&job->cs_node, &cs->job_list);
+
+	hl_debugfs_add_job(hdev, job);
+
+	return 0;
+}
+
+/**
+ * hl_submit_pending_cb() - submit the context's pending CBs as one CS
+ * @hpriv: supplies the device (hpriv->hdev) and the context (hpriv->ctx)
+ *         whose pending CB list is processed.
+ *
+ * All entries are taken off ctx->pending_cb_list, one job per entry is created
+ * on a newly allocated CS and that CS is scheduled. Only the thread that
+ * obtains ctx->thread_pending_cb_token processes the list; any other thread
+ * returns 0 immediately. On failure the entries are put back at the head of
+ * the pending list in their original order.
+ *
+ * Return: 0 on success or when there is nothing to do, otherwise the error
+ * returned by allocate_cs(), pending_cb_create_job() or
+ * hl_hw_queue_schedule_cs().
+ */
+static int hl_submit_pending_cb(struct hl_fpriv *hpriv)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_ctx *ctx = hpriv->ctx;
+	struct hl_pending_cb *pending_cb, *tmp;
+	struct list_head local_cb_list;
+	struct hl_cs *cs;
+	struct hl_cb *cb;
+	u32 hw_queue_id;
+	u32 cb_size;
+	int process_list, rc = 0;
+
+	if (list_empty(&ctx->pending_cb_list))
+		return 0;
+
+	process_list = atomic_cmpxchg(&ctx->thread_pending_cb_token, 1, 0);
+
+	/* Only a single thread is allowed to process the list */
+	if (!process_list)
+		return 0;
+
+	if (list_empty(&ctx->pending_cb_list))
+		goto free_pending_cb_token;
+
+	/* move all list elements to a local list */
+	INIT_LIST_HEAD(&local_cb_list);
+	spin_lock(&ctx->pending_cb_lock);
+	list_splice_tail_init(&ctx->pending_cb_list, &local_cb_list);
+	spin_unlock(&ctx->pending_cb_lock);
+
+	rc = allocate_cs(hdev, ctx, CS_TYPE_DEFAULT, &cs);
+	if (rc)
+		goto add_list_elements;
+
+	hl_debugfs_add_cs(cs);
+
+	/* Iterate through pending cb list, create jobs and add to CS */
+	list_for_each_entry(pending_cb, &local_cb_list, cb_node) {
+		cb = pending_cb->cb;
+		cb_size = pending_cb->cb_size;
+		hw_queue_id = pending_cb->hw_queue_id;
+
+		rc = pending_cb_create_job(hdev, ctx, cs, cb, cb_size,
+								hw_queue_id);
+		if (rc)
+			goto free_cs_object;
+	}
+
+	rc = hl_hw_queue_schedule_cs(cs);
+	if (rc) {
+		if (rc != -EAGAIN)
+			dev_err(hdev->dev,
+				"Failed to submit CS %d.%llu (%d)\n",
+				ctx->asid, cs->sequence, rc);
+		goto free_cs_object;
+	}
+
+	/* pending cb was scheduled successfully */
+	list_for_each_entry_safe(pending_cb, tmp, &local_cb_list, cb_node) {
+		list_del(&pending_cb->cb_node);
+		kfree(pending_cb);
+	}
+
+	cs_put(cs);
+
+	goto free_pending_cb_token;
+
+free_cs_object:
+	/*
+	 * NOTE(review): the entries are re-queued below still pointing at
+	 * their CBs - confirm cs_rollback() does not release the CBs of the
+	 * jobs created above.
+	 */
+	cs_rollback(hdev, cs);
+	cs_put(cs);
+add_list_elements:
+	/* put the entries back, in order, ahead of anything queued meanwhile */
+	spin_lock(&ctx->pending_cb_lock);
+	list_splice(&local_cb_list, &ctx->pending_cb_list);
+	spin_unlock(&ctx->pending_cb_lock);
+free_pending_cb_token:
+	atomic_set(&ctx->thread_pending_cb_token, 1);
+
+	return rc;
+}
+
 static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 				u64 *cs_seq)
 {
@@ -1353,6 +1531,10 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	if (rc)
 		goto out;
 
+	rc = hl_submit_pending_cb(hpriv);
+	if (rc)
+		goto out;
+
 	cs_type = hl_cs_get_cs_type(args->in.cs_flags &
 					~HL_CS_FLAGS_FORCE_RESTORE);
 	chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 3d86b83f4ca6..829fe98eed61 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -15,6 +15,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 	u64 idle_mask = 0;
 	int i;
 
+	/* Release all allocated pending cb's, those cb's were never
+	 * scheduled so it is safe to release them here
+	 */
+	hl_pending_cb_list_flush(ctx);
+
 	/*
 	 * If we arrived here, there are no jobs waiting for this context
 	 * on its queues so we can safely remove it.
@@ -142,8 +147,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 	kref_init(&ctx->refcount);
 
 	ctx->cs_sequence = 1;
+	INIT_LIST_HEAD(&ctx->pending_cb_list);
+	spin_lock_init(&ctx->pending_cb_lock);
 	spin_lock_init(&ctx->cs_lock);
 	atomic_set(&ctx->thread_ctx_switch_token, 1);
+	atomic_set(&ctx->thread_pending_cb_token, 1);
 	ctx->thread_ctx_switch_wait_token = 0;
 	ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
 				sizeof(struct hl_fence *),
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index eb43fb3065ce..8e0553bf3e0e 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1016,6 +1016,20 @@ struct hl_cs_counters_atomic {
 	atomic64_t validation_drop_cnt;
 };
 
+/**
+ * struct hl_pending_cb - pending command buffer structure
+ * @cb_node: cb node in pending cb list
+ * @cb: command buffer to send in next submission
+ * @cb_size: command buffer size
+ * @hw_queue_id: destination queue id
+ */
+struct hl_pending_cb {
+	struct list_head	cb_node;
+	struct hl_cb		*cb;
+	u32			cb_size;
+	u32			hw_queue_id;
+};
+
 /**
  * struct hl_ctx - user/kernel context.
  * @mem_hash: holds mapping from virtual address to virtual memory area
@@ -1031,6 +1045,8 @@ struct hl_cs_counters_atomic {
  * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
  *            MMU hash or walking the PGT requires talking this lock.
  * @debugfs_list: node in debugfs list of contexts.
+ * @pending_cb_list: list of pending command buffers waiting to be sent upon
+ *                   next user command submission context.
  * @cs_counters: context command submission counters.
  * @cb_va_pool: device VA pool for command buffers which are mapped to the
  *              device's MMU.
@@ -1039,11 +1055,17 @@ struct hl_cs_counters_atomic {
  *                    index to cs_pending array.
  * @dram_default_hops: array that holds all hops addresses needed for default
  *                     DRAM mapping.
+ * @pending_cb_lock: spinlock to protect pending cb list
  * @cs_lock: spinlock to protect cs_sequence.
  * @dram_phys_mem: amount of used physical DRAM memory by this context.
  * @thread_ctx_switch_token: token to prevent multiple threads of the same
  *				context from running the context switch phase.
  *				Only a single thread should run it.
+ * @thread_pending_cb_token: token to prevent multiple threads from processing
+ *				the pending CB list. Only a single thread should
+ *				process the list since it is protected by a
+ *				spinlock and we don't want to halt the entire
+ *				command submission sequence.
  * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run
  *				the context switch phase from moving to their
  *				execution phase before the context switch phase
@@ -1062,13 +1084,16 @@ struct hl_ctx {
 	struct mutex			mem_hash_lock;
 	struct mutex			mmu_lock;
 	struct list_head		debugfs_list;
+	struct list_head		pending_cb_list;
 	struct hl_cs_counters_atomic	cs_counters;
 	struct gen_pool			*cb_va_pool;
 	u64				cs_sequence;
 	u64				*dram_default_hops;
+	spinlock_t			pending_cb_lock;
 	spinlock_t			cs_lock;
 	atomic64_t			dram_phys_mem;
 	atomic_t			thread_ctx_switch_token;
+	atomic_t			thread_pending_cb_token;
 	u32				thread_ctx_switch_wait_token;
 	u32				asid;
 	u32				handle;
@@ -2143,6 +2168,7 @@ int hl_cb_va_pool_init(struct hl_ctx *ctx);
 void hl_cb_va_pool_fini(struct hl_ctx *ctx);
 
 void hl_cs_rollback_all(struct hl_device *hdev);
+void hl_pending_cb_list_flush(struct hl_ctx *ctx);
 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 		enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 void hl_sob_reset_error(struct kref *ref);