habanalabs: fix race between hl_get_compute_ctx() and hl_ctx_put()
hl_get_compute_ctx() is used to get the pointer to the compute context from the hpriv object. The function is called in code paths that are not necessarily initiated by user, so it is possible that a context release process will happen in parallel. This can lead to a race condition in which hl_get_compute_ctx() retrieves the context pointer, and just before it increments the context refcount, the context object is released and a freed memory is accessed. To avoid this race, add a mutex to protect the context pointer in hpriv. With this lock, hl_get_compute_ctx() will be able to detect if the context has been released or is about to be released. struct hl_ctx_mgr has a mutex for contexts IDR with a similar "ctx_lock" name, so rename it to just "lock" to avoid a confusion with the new lock. Signed-off-by: Tomer Tayar <ttayar@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
2bc61bc4f3
commit
41021f728a
@ -125,15 +125,22 @@ void hl_ctx_do_release(struct kref *ref)
|
||||
|
||||
hl_ctx_fini(ctx);
|
||||
|
||||
if (ctx->hpriv)
|
||||
hl_hpriv_put(ctx->hpriv);
|
||||
if (ctx->hpriv) {
|
||||
struct hl_fpriv *hpriv = ctx->hpriv;
|
||||
|
||||
mutex_lock(&hpriv->ctx_lock);
|
||||
hpriv->ctx = NULL;
|
||||
mutex_unlock(&hpriv->ctx_lock);
|
||||
|
||||
hl_hpriv_put(hpriv);
|
||||
}
|
||||
|
||||
kfree(ctx);
|
||||
}
|
||||
|
||||
int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv)
|
||||
{
|
||||
struct hl_ctx_mgr *mgr = &hpriv->ctx_mgr;
|
||||
struct hl_ctx_mgr *ctx_mgr = &hpriv->ctx_mgr;
|
||||
struct hl_ctx *ctx;
|
||||
int rc;
|
||||
|
||||
@ -143,9 +150,9 @@ int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv)
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
mutex_lock(&mgr->ctx_lock);
|
||||
rc = idr_alloc(&mgr->ctx_handles, ctx, 1, 0, GFP_KERNEL);
|
||||
mutex_unlock(&mgr->ctx_lock);
|
||||
mutex_lock(&ctx_mgr->lock);
|
||||
rc = idr_alloc(&ctx_mgr->handles, ctx, 1, 0, GFP_KERNEL);
|
||||
mutex_unlock(&ctx_mgr->lock);
|
||||
|
||||
if (rc < 0) {
|
||||
dev_err(hdev->dev, "Failed to allocate IDR for a new CTX\n");
|
||||
@ -170,9 +177,9 @@ int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv)
|
||||
return 0;
|
||||
|
||||
remove_from_idr:
|
||||
mutex_lock(&mgr->ctx_lock);
|
||||
idr_remove(&mgr->ctx_handles, ctx->handle);
|
||||
mutex_unlock(&mgr->ctx_lock);
|
||||
mutex_lock(&ctx_mgr->lock);
|
||||
idr_remove(&ctx_mgr->handles, ctx->handle);
|
||||
mutex_unlock(&ctx_mgr->lock);
|
||||
free_ctx:
|
||||
kfree(ctx);
|
||||
out_err:
|
||||
@ -269,6 +276,11 @@ err_hw_block_mem_fini:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int hl_ctx_get_unless_zero(struct hl_ctx *ctx)
|
||||
{
|
||||
return kref_get_unless_zero(&ctx->refcount);
|
||||
}
|
||||
|
||||
void hl_ctx_get(struct hl_ctx *ctx)
|
||||
{
|
||||
kref_get(&ctx->refcount);
|
||||
@ -287,11 +299,15 @@ struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev)
|
||||
mutex_lock(&hdev->fpriv_list_lock);
|
||||
|
||||
list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
|
||||
/* There can only be a single user which has opened the compute device, so exit
|
||||
* immediately once we find him
|
||||
*/
|
||||
mutex_lock(&hpriv->ctx_lock);
|
||||
ctx = hpriv->ctx;
|
||||
hl_ctx_get(ctx);
|
||||
if (ctx && !hl_ctx_get_unless_zero(ctx))
|
||||
ctx = NULL;
|
||||
mutex_unlock(&hpriv->ctx_lock);
|
||||
|
||||
/* There can only be a single user which has opened the compute device, so exit
|
||||
* immediately once we find its context or if we see that it has been released
|
||||
*/
|
||||
break;
|
||||
}
|
||||
|
||||
@ -383,37 +399,37 @@ int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
|
||||
/*
|
||||
* hl_ctx_mgr_init - initialize the context manager
|
||||
*
|
||||
* @mgr: pointer to context manager structure
|
||||
* @ctx_mgr: pointer to context manager structure
|
||||
*
|
||||
* This manager is an object inside the hpriv object of the user process.
|
||||
* The function is called when a user process opens the FD.
|
||||
*/
|
||||
void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr)
|
||||
void hl_ctx_mgr_init(struct hl_ctx_mgr *ctx_mgr)
|
||||
{
|
||||
mutex_init(&mgr->ctx_lock);
|
||||
idr_init(&mgr->ctx_handles);
|
||||
mutex_init(&ctx_mgr->lock);
|
||||
idr_init(&ctx_mgr->handles);
|
||||
}
|
||||
|
||||
/*
|
||||
* hl_ctx_mgr_fini - finalize the context manager
|
||||
*
|
||||
* @hdev: pointer to device structure
|
||||
* @mgr: pointer to context manager structure
|
||||
* @ctx_mgr: pointer to context manager structure
|
||||
*
|
||||
* This function goes over all the contexts in the manager and frees them.
|
||||
* It is called when a process closes the FD.
|
||||
*/
|
||||
void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr)
|
||||
void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *ctx_mgr)
|
||||
{
|
||||
struct hl_ctx *ctx;
|
||||
struct idr *idp;
|
||||
u32 id;
|
||||
|
||||
idp = &mgr->ctx_handles;
|
||||
idp = &ctx_mgr->handles;
|
||||
|
||||
idr_for_each_entry(idp, ctx, id)
|
||||
kref_put(&ctx->refcount, hl_ctx_do_release);
|
||||
|
||||
idr_destroy(&mgr->ctx_handles);
|
||||
mutex_destroy(&mgr->ctx_lock);
|
||||
idr_destroy(&ctx_mgr->handles);
|
||||
mutex_destroy(&ctx_mgr->lock);
|
||||
}
|
||||
|
@ -245,6 +245,7 @@ static void hpriv_release(struct kref *ref)
|
||||
|
||||
hl_debugfs_remove_file(hpriv);
|
||||
|
||||
mutex_destroy(&hpriv->ctx_lock);
|
||||
mutex_destroy(&hpriv->restore_phase_mutex);
|
||||
|
||||
if ((!hdev->pldm) && (hdev->pdev) &&
|
||||
|
@ -1638,12 +1638,12 @@ struct hl_ctx {
|
||||
|
||||
/**
|
||||
* struct hl_ctx_mgr - for handling multiple contexts.
|
||||
* @ctx_lock: protects ctx_handles.
|
||||
* @ctx_handles: idr to hold all ctx handles.
|
||||
* @lock: protects ctx_handles.
|
||||
* @handles: idr to hold all ctx handles.
|
||||
*/
|
||||
struct hl_ctx_mgr {
|
||||
struct mutex ctx_lock;
|
||||
struct idr ctx_handles;
|
||||
struct mutex lock;
|
||||
struct idr handles;
|
||||
};
|
||||
|
||||
|
||||
@ -1998,6 +1998,8 @@ struct hl_notifier_event {
|
||||
* @dev_node: node in the device list of file private data
|
||||
* @refcount: number of related contexts.
|
||||
* @restore_phase_mutex: lock for context switch and restore phase.
|
||||
* @ctx_lock: protects the pointer to current executing context pointer. TODO: remove for multiple
|
||||
* ctx per process.
|
||||
*/
|
||||
struct hl_fpriv {
|
||||
struct hl_device *hdev;
|
||||
@ -2011,6 +2013,7 @@ struct hl_fpriv {
|
||||
struct list_head dev_node;
|
||||
struct kref refcount;
|
||||
struct mutex restore_phase_mutex;
|
||||
struct mutex ctx_lock;
|
||||
};
|
||||
|
||||
|
||||
|
@ -137,6 +137,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
|
||||
|
||||
mutex_init(&hpriv->notifier_event.lock);
|
||||
mutex_init(&hpriv->restore_phase_mutex);
|
||||
mutex_init(&hpriv->ctx_lock);
|
||||
kref_init(&hpriv->refcount);
|
||||
nonseekable_open(inode, filp);
|
||||
|
||||
@ -209,6 +210,7 @@ out_err:
|
||||
hl_mem_mgr_fini(&hpriv->mem_mgr);
|
||||
hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
|
||||
filp->private_data = NULL;
|
||||
mutex_destroy(&hpriv->ctx_lock);
|
||||
mutex_destroy(&hpriv->restore_phase_mutex);
|
||||
mutex_destroy(&hpriv->notifier_event.lock);
|
||||
put_pid(hpriv->taskpid);
|
||||
|
Loading…
x
Reference in New Issue
Block a user