net/mlx5: Fix fatal error handling during device load
[ Upstream commit b6e0b6bebe0732d5cac51f0791f269d2413b8980 ] Currently, if a fatal error occurs during mlx5_load_one(), we cannot enter the error state until mlx5_load_one() has finished. This can take several minutes, because the commands it issues cannot be processed due to the fatal error and each of them has to time out. Fix it by setting dev->state to MLX5_DEVICE_STATE_INTERNAL_ERROR before requesting the lock (dev->intf_state_mutex). Fixes: c1d4d2e92ad6 ("net/mlx5: Avoid calling sleeping function by the health poll thread") Signed-off-by: Shay Drory <shayd@mellanox.com> Reviewed-by: Moshe Shemesh <moshe@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
parent
cd5d0278dc
commit
b05f2b6021
@@ -193,15 +193,23 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
|
||||
|
||||
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
|
||||
{
|
||||
bool err_detected = false;
|
||||
|
||||
/* Mark the device as fatal in order to abort FW commands */
|
||||
if ((check_fatal_sensors(dev) || force) &&
|
||||
dev->state == MLX5_DEVICE_STATE_UP) {
|
||||
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||
err_detected = true;
|
||||
}
|
||||
mutex_lock(&dev->intf_state_mutex);
|
||||
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
|
||||
goto unlock;
|
||||
if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
|
||||
goto unlock;/* a previous error is still being handled */
|
||||
if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) {
|
||||
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if (check_fatal_sensors(dev) || force) {
|
||||
if (check_fatal_sensors(dev) || force) { /* protected state setting */
|
||||
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||
mlx5_cmd_flush(dev);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user