drm/panthor: Make sure we handle 'unknown group state' case properly
authorBoris Brezillon <boris.brezillon@collabora.com>
Thu, 2 May 2024 15:52:48 +0000 (17:52 +0200)
committerBoris Brezillon <boris.brezillon@collabora.com>
Thu, 2 May 2024 16:42:13 +0000 (18:42 +0200)
When we check for state values returned by the FW, we only cover part of
the 0:7 range. Make sure we catch FW inconsistencies by adding a default
to the switch statement, and flagging the group state as unknown in that
case.

When an unknown state is detected, we trigger a reset, and consider the
group as unusable after that point, to prevent the potential corruption
from creeping in other places if we continue executing stuff on this
context.

v2:
- Add Steve's R-b
- Fix commit message

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/dri-devel/3b7fd2f2-679e-440c-81cd-42fc2573b515@moroto.mountain/T/#u
Suggested-by: Steven Price <steven.price@arm.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240502155248.1430582-1-boris.brezillon@collabora.com
drivers/gpu/drm/panthor/panthor_sched.c

index 8ec66dd47db3dece6eabe6266a91eb6b70368831..7f16a4a14e9af45251336ffed24f32db8933f4a3 100644 (file)
@@ -490,6 +490,18 @@ enum panthor_group_state {
         * Can no longer be scheduled. The only allowed action is a destruction.
         */
        PANTHOR_CS_GROUP_TERMINATED,
+
+       /**
+        * @PANTHOR_CS_GROUP_UNKNOWN_STATE: Group is an unknown state.
+        *
+        * The FW returned an inconsistent state. The group is flagged unusable
+        * and can no longer be scheduled. The only allowed action is a
+        * destruction.
+        *
+        * When that happens, we also schedule a FW reset, to start from a fresh
+        * state.
+        */
+       PANTHOR_CS_GROUP_UNKNOWN_STATE,
 };
 
 /**
@@ -1127,6 +1139,7 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
        struct panthor_fw_csg_iface *csg_iface;
        struct panthor_group *group;
        enum panthor_group_state new_state, old_state;
+       u32 csg_state;
 
        lockdep_assert_held(&ptdev->scheduler->lock);
 
@@ -1137,7 +1150,8 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
                return;
 
        old_state = group->state;
-       switch (csg_iface->output->ack & CSG_STATE_MASK) {
+       csg_state = csg_iface->output->ack & CSG_STATE_MASK;
+       switch (csg_state) {
        case CSG_STATE_START:
        case CSG_STATE_RESUME:
                new_state = PANTHOR_CS_GROUP_ACTIVE;
@@ -1148,11 +1162,28 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
        case CSG_STATE_SUSPEND:
                new_state = PANTHOR_CS_GROUP_SUSPENDED;
                break;
+       default:
+               /* The unknown state might be caused by a FW state corruption,
+                * which means the group metadata can't be trusted anymore, and
+                * the SUSPEND operation might propagate the corruption to the
+                * suspend buffers. Flag the group state as unknown to make
+                * sure it's unusable after that point.
+                */
+               drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
+                       csg_id, csg_state);
+               new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
+               break;
        }
 
        if (old_state == new_state)
                return;
 
+       /* The unknown state might be caused by a FW issue, reset the FW to
+        * take a fresh start.
+        */
+       if (new_state == PANTHOR_CS_GROUP_UNKNOWN_STATE)
+               panthor_device_schedule_reset(ptdev);
+
        if (new_state == PANTHOR_CS_GROUP_SUSPENDED)
                csg_slot_sync_queues_state_locked(ptdev, csg_id);
 
@@ -1783,6 +1814,7 @@ static bool
 group_can_run(struct panthor_group *group)
 {
        return group->state != PANTHOR_CS_GROUP_TERMINATED &&
+              group->state != PANTHOR_CS_GROUP_UNKNOWN_STATE &&
               !group->destroyed && group->fatal_queues == 0 &&
               !group->timedout;
 }
@@ -2557,7 +2589,8 @@ void panthor_sched_suspend(struct panthor_device *ptdev)
 
                if (csg_slot->group) {
                        csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i,
-                                               CSG_STATE_SUSPEND,
+                                               group_can_run(csg_slot->group) ?
+                                               CSG_STATE_SUSPEND : CSG_STATE_TERMINATE,
                                                CSG_STATE_MASK);
                }
        }