habanalabs: clear non-released encapsulated signals
Reserved encapsulated signals which were not released hold the context refcount, leading to a failure when killing the user process on device reset or device fini. Add the release of these leftover signals to the CS roll-back process. Signed-off-by: Tomer Tayar <ttayar@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
1f615120fc
commit
893afb248c
@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref)
|
||||
*/
|
||||
if (hl_cs_cmpl->encaps_signals)
|
||||
kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
|
||||
hl_encaps_handle_do_release);
|
||||
hl_encaps_release_handle_and_put_ctx);
|
||||
}
|
||||
|
||||
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
|
||||
&& cs->encaps_signals)
|
||||
kref_put(&cs->encaps_sig_hdl->refcount,
|
||||
hl_encaps_handle_do_release);
|
||||
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
|
||||
kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
|
||||
|
||||
out:
|
||||
/* Must be called before hl_ctx_put because inside we use ctx to get
|
||||
@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
|
||||
hl_complete_job(hdev, job);
|
||||
}
|
||||
|
||||
/*
|
||||
* release_reserved_encaps_signals() - release reserved encapsulated signals.
|
||||
* @hdev: pointer to habanalabs device structure
|
||||
*
|
||||
* Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
|
||||
* encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
|
||||
* For these signals, we also need to put the refcount of the H/W SOB which was taken at the
|
||||
* reservation.
|
||||
*/
|
||||
static void release_reserved_encaps_signals(struct hl_device *hdev)
|
||||
{
|
||||
struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
|
||||
struct hl_cs_encaps_sig_handle *handle;
|
||||
struct hl_encaps_signals_mgr *mgr;
|
||||
u32 id;
|
||||
|
||||
if (!ctx)
|
||||
return;
|
||||
|
||||
mgr = &ctx->sig_mgr;
|
||||
|
||||
idr_for_each_entry(&mgr->handles, handle, id)
|
||||
if (handle->cs_seq == ULLONG_MAX)
|
||||
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);
|
||||
|
||||
hl_ctx_put(ctx);
|
||||
}
|
||||
|
||||
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
|
||||
{
|
||||
int i;
|
||||
@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
|
||||
}
|
||||
|
||||
force_complete_multi_cs(hdev);
|
||||
|
||||
release_reserved_encaps_signals(hdev);
|
||||
}
|
||||
|
||||
static void
|
||||
@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
|
||||
*/
|
||||
handle->pre_sob_val = prop->next_sob_val - handle->count;
|
||||
|
||||
handle->cs_seq = ULLONG_MAX;
|
||||
|
||||
*signals_count = prop->next_sob_val;
|
||||
hdev->asic_funcs->hw_queues_unlock(hdev);
|
||||
|
||||
@ -2350,10 +2380,8 @@ put_cs:
|
||||
/* We finished with the CS in this function, so put the ref */
|
||||
cs_put(cs);
|
||||
free_cs_chunk_array:
|
||||
if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
|
||||
is_wait_cs)
|
||||
kref_put(&encaps_sig_hdl->refcount,
|
||||
hl_encaps_handle_do_release);
|
||||
if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
|
||||
kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
|
||||
kfree(cs_chunk_array);
|
||||
out:
|
||||
return rc;
|
||||
|
@ -9,37 +9,46 @@
|
||||
|
||||
#include <linux/slab.h>
|
||||
|
||||
void hl_encaps_handle_do_release(struct kref *ref)
|
||||
static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob,
|
||||
bool put_ctx)
|
||||
{
|
||||
struct hl_cs_encaps_sig_handle *handle =
|
||||
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
|
||||
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
|
||||
|
||||
if (put_hw_sob)
|
||||
hw_sob_put(handle->hw_sob);
|
||||
|
||||
spin_lock(&mgr->lock);
|
||||
idr_remove(&mgr->handles, handle->id);
|
||||
spin_unlock(&mgr->lock);
|
||||
|
||||
hl_ctx_put(handle->ctx);
|
||||
if (put_ctx)
|
||||
hl_ctx_put(handle->ctx);
|
||||
|
||||
kfree(handle);
|
||||
}
|
||||
|
||||
static void hl_encaps_handle_do_release_sob(struct kref *ref)
|
||||
void hl_encaps_release_handle_and_put_ctx(struct kref *ref)
|
||||
{
|
||||
struct hl_cs_encaps_sig_handle *handle =
|
||||
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
|
||||
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
|
||||
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
|
||||
|
||||
/* if we're here, then there was a signals reservation but cs with
|
||||
* encaps signals wasn't submitted, so need to put refcount
|
||||
* to hw_sob taken at the reservation.
|
||||
*/
|
||||
hw_sob_put(handle->hw_sob);
|
||||
encaps_handle_do_release(handle, false, true);
|
||||
}
|
||||
|
||||
spin_lock(&mgr->lock);
|
||||
idr_remove(&mgr->handles, handle->id);
|
||||
spin_unlock(&mgr->lock);
|
||||
static void hl_encaps_release_handle_and_put_sob(struct kref *ref)
|
||||
{
|
||||
struct hl_cs_encaps_sig_handle *handle =
|
||||
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
|
||||
|
||||
kfree(handle);
|
||||
encaps_handle_do_release(handle, true, false);
|
||||
}
|
||||
|
||||
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref)
|
||||
{
|
||||
struct hl_cs_encaps_sig_handle *handle =
|
||||
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
|
||||
|
||||
encaps_handle_do_release(handle, true, true);
|
||||
}
|
||||
|
||||
static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
|
||||
@ -48,8 +57,7 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
|
||||
idr_init(&mgr->handles);
|
||||
}
|
||||
|
||||
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
|
||||
struct hl_encaps_signals_mgr *mgr)
|
||||
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr)
|
||||
{
|
||||
struct hl_cs_encaps_sig_handle *handle;
|
||||
struct idr *idp;
|
||||
@ -57,11 +65,14 @@ static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
|
||||
|
||||
idp = &mgr->handles;
|
||||
|
||||
/* The IDR is expected to be empty at this stage, because any leftover signal should have been
|
||||
* released as part of CS roll-back.
|
||||
*/
|
||||
if (!idr_is_empty(idp)) {
|
||||
dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n");
|
||||
dev_warn(hdev->dev,
|
||||
"device released while some encaps signals handles are still allocated\n");
|
||||
idr_for_each_entry(idp, handle, id)
|
||||
kref_put(&handle->refcount,
|
||||
hl_encaps_handle_do_release_sob);
|
||||
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob);
|
||||
}
|
||||
|
||||
idr_destroy(&mgr->handles);
|
||||
|
@ -3775,7 +3775,8 @@ void hl_sysfs_add_dev_vrm_attr(struct hl_device *hdev, struct attribute_group *d
|
||||
|
||||
void hw_sob_get(struct hl_hw_sob *hw_sob);
|
||||
void hw_sob_put(struct hl_hw_sob *hw_sob);
|
||||
void hl_encaps_handle_do_release(struct kref *ref);
|
||||
void hl_encaps_release_handle_and_put_ctx(struct kref *ref);
|
||||
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref);
|
||||
void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
|
||||
struct hl_cs *cs, struct hl_cs_job *job,
|
||||
struct hl_cs_compl *cs_cmpl);
|
||||
|
Loading…
x
Reference in New Issue
Block a user