habanalabs: clear non-released encapsulated signals

Reserved encapsulated signals which were not released hold the context
refcount, leading to a failure when killing the user process on device
reset or device fini.
Add the release of these left signals in the CS roll-back process.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Tomer Tayar 2022-11-17 15:22:31 +02:00 committed by Oded Gabbay
parent 1f615120fc
commit 893afb248c
3 changed files with 71 additions and 31 deletions

View File

@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref)
*/
if (hl_cs_cmpl->encaps_signals)
kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
hl_encaps_handle_do_release);
hl_encaps_release_handle_and_put_ctx);
}
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
&& cs->encaps_signals)
kref_put(&cs->encaps_sig_hdl->refcount,
hl_encaps_handle_do_release);
if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
out:
/* Must be called before hl_ctx_put because inside we use ctx to get
@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
hl_complete_job(hdev, job);
}
/*
* release_reserved_encaps_signals() - release reserved encapsulated signals.
* @hdev: pointer to habanalabs device structure
*
* Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
* encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
* For these signals need also to put the refcount of the H/W SOB which was taken at the
* reservation.
*/
static void release_reserved_encaps_signals(struct hl_device *hdev)
{
struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
struct hl_cs_encaps_sig_handle *handle;
struct hl_encaps_signals_mgr *mgr;
u32 id;
if (!ctx)
return;
mgr = &ctx->sig_mgr;
idr_for_each_entry(&mgr->handles, handle, id)
if (handle->cs_seq == ULLONG_MAX)
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);
hl_ctx_put(ctx);
}
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
{
int i;
@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
}
force_complete_multi_cs(hdev);
release_reserved_encaps_signals(hdev);
}
static void
@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
*/
handle->pre_sob_val = prop->next_sob_val - handle->count;
handle->cs_seq = ULLONG_MAX;
*signals_count = prop->next_sob_val;
hdev->asic_funcs->hw_queues_unlock(hdev);
@ -2350,10 +2380,8 @@ put_cs:
/* We finished with the CS in this function, so put the ref */
cs_put(cs);
free_cs_chunk_array:
if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
is_wait_cs)
kref_put(&encaps_sig_hdl->refcount,
hl_encaps_handle_do_release);
if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
kfree(cs_chunk_array);
out:
return rc;

View File

@ -9,37 +9,46 @@
#include <linux/slab.h>
void hl_encaps_handle_do_release(struct kref *ref)
static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob,
bool put_ctx)
{
struct hl_cs_encaps_sig_handle *handle =
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
if (put_hw_sob)
hw_sob_put(handle->hw_sob);
spin_lock(&mgr->lock);
idr_remove(&mgr->handles, handle->id);
spin_unlock(&mgr->lock);
hl_ctx_put(handle->ctx);
if (put_ctx)
hl_ctx_put(handle->ctx);
kfree(handle);
}
static void hl_encaps_handle_do_release_sob(struct kref *ref)
void hl_encaps_release_handle_and_put_ctx(struct kref *ref)
{
struct hl_cs_encaps_sig_handle *handle =
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
/* if we're here, then there was a signals reservation but cs with
* encaps signals wasn't submitted, so need to put refcount
* to hw_sob taken at the reservation.
*/
hw_sob_put(handle->hw_sob);
encaps_handle_do_release(handle, false, true);
}
spin_lock(&mgr->lock);
idr_remove(&mgr->handles, handle->id);
spin_unlock(&mgr->lock);
static void hl_encaps_release_handle_and_put_sob(struct kref *ref)
{
struct hl_cs_encaps_sig_handle *handle =
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
kfree(handle);
encaps_handle_do_release(handle, true, false);
}
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref)
{
struct hl_cs_encaps_sig_handle *handle =
container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
encaps_handle_do_release(handle, true, true);
}
static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
@ -48,8 +57,7 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
idr_init(&mgr->handles);
}
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
struct hl_encaps_signals_mgr *mgr)
static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr)
{
struct hl_cs_encaps_sig_handle *handle;
struct idr *idp;
@ -57,11 +65,14 @@ static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
idp = &mgr->handles;
/* The IDR is expected to be empty at this stage, because any left signal should have been
* released as part of CS roll-back.
*/
if (!idr_is_empty(idp)) {
dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n");
dev_warn(hdev->dev,
"device released while some encaps signals handles are still allocated\n");
idr_for_each_entry(idp, handle, id)
kref_put(&handle->refcount,
hl_encaps_handle_do_release_sob);
kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob);
}
idr_destroy(&mgr->handles);

View File

@ -3775,7 +3775,8 @@ void hl_sysfs_add_dev_vrm_attr(struct hl_device *hdev, struct attribute_group *d
void hw_sob_get(struct hl_hw_sob *hw_sob);
void hw_sob_put(struct hl_hw_sob *hw_sob);
void hl_encaps_handle_do_release(struct kref *ref);
void hl_encaps_release_handle_and_put_ctx(struct kref *ref);
void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref);
void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
struct hl_cs *cs, struct hl_cs_job *job,
struct hl_cs_compl *cs_cmpl);