drm/xe: fix pvc unload issue
author: Chang, Bruce <yu.bruce.chang@intel.com>
author date: Mon, 3 Apr 2023 22:20:31 +0000 (22:20 +0000)
committer: Rodrigo Vivi <rodrigo.vivi@intel.com>
commit date: Tue, 19 Dec 2023 23:31:30 +0000 (18:31 -0500)
Currently, unloading the driver on PVC triggers a NULL pointer
dereference; the call stack is shown below.

[ 4850.618000] Call Trace:
[ 4850.620740]  <TASK>
[ 4850.623134]  ttm_bo_cleanup_memtype_use+0x3f/0x50 [ttm]
[ 4850.628661]  ttm_bo_release+0x154/0x2c0 [ttm]
[ 4850.633317]  ? drm_buddy_fini+0x62/0x80 [drm_buddy]
[ 4850.638487]  ? __kmem_cache_free+0x27d/0x2c0
[ 4850.643054]  ttm_bo_put+0x38/0x60 [ttm]
[ 4850.647190]  xe_gem_object_free+0x1f/0x30 [xe]
[ 4850.651945]  drm_gem_object_free+0x1e/0x30 [drm]
[ 4850.656904]  ggtt_fini_noalloc+0x9d/0xe0 [xe]
[ 4850.661574]  drm_managed_release+0xb5/0x150 [drm]
[ 4850.666617]  drm_dev_release+0x30/0x50 [drm]
[ 4850.671209]  devm_drm_dev_init_release+0x3c/0x60 [drm]

There are a couple of issues, but the main one is that TTM has only
one TTM_PL_TT region, while PVC has 2 tiles and tries to set up one
TTM_PL_TT manager per tile. The second one overwrites the first.

At unload time, the first tile resets the TTM_PL_TT manager, so when
the second tile then tries to free a BO it triggers the NULL pointer
dereference, because the TTM manager has already been reset to NULL.

The fix is to use one global TTM_PL_TT manager.

v2: make gtt mgr global and change the name to sys_mgr

Cc: Stuart Summers <stuart.summers@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Vivi, Rodrigo <rodrigo.vivi@intel.com>
Signed-off-by: Bruce Chang <yu.bruce.chang@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
drivers/gpu/drm/xe/Makefile
drivers/gpu/drm/xe/xe_device.c
drivers/gpu/drm/xe/xe_device.h
drivers/gpu/drm/xe/xe_device_types.h
drivers/gpu/drm/xe/xe_gt.c
drivers/gpu/drm/xe/xe_gt_types.h
drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c [deleted file]
drivers/gpu/drm/xe/xe_ttm_gtt_mgr.h [deleted file]
drivers/gpu/drm/xe/xe_ttm_gtt_mgr_types.h [deleted file]
drivers/gpu/drm/xe/xe_ttm_sys_mgr.c [new file with mode: 0644]
drivers/gpu/drm/xe/xe_ttm_sys_mgr.h [new file with mode: 0644]

index 6ef80889fddb329c5c3abb3ed7f53332c49d43d8..42459727e67ac82262f4c2e9029aa8a89fc3d71b 100644 (file)
@@ -88,7 +88,7 @@ xe-y += xe_bb.o \
        xe_step.o \
        xe_sync.o \
        xe_trace.o \
-       xe_ttm_gtt_mgr.o \
+       xe_ttm_sys_mgr.o \
        xe_ttm_stolen_mgr.o \
        xe_ttm_vram_mgr.o \
        xe_tuning.o \
index ffacf80c89422e077a78c900b7b75fb0469382ea..b13bbdeeef51a441b4a5f7f23c6698512789b2cc 100644 (file)
@@ -27,6 +27,7 @@
 #include "xe_pm.h"
 #include "xe_query.h"
 #include "xe_ttm_stolen_mgr.h"
+#include "xe_ttm_sys_mgr.h"
 #include "xe_vm.h"
 #include "xe_vm_madvise.h"
 #include "xe_wait_user_fence.h"
@@ -262,6 +263,8 @@ int xe_device_probe(struct xe_device *xe)
        if (err)
                goto err_irq_shutdown;
 
+       xe_ttm_sys_mgr_init(xe);
+
        for_each_gt(gt, xe, id) {
                err = xe_gt_init_noalloc(gt);
                if (err)
index d277f8985f7bfb5d431ea7ba0a91d875a0ce4a18..cbae480a20922f666f46852e0a4e4598d1b1165d 100644 (file)
@@ -116,4 +116,5 @@ static inline bool xe_device_has_flat_ccs(struct xe_device *xe)
 }
 
 u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size);
+
 #endif
index 3917b9152eb94e2d00f96d54f0a81b11d9e3b548..74326091bf98f60f5719e8cc07cce96556578f5a 100644 (file)
@@ -134,6 +134,8 @@ struct xe_device {
                        /** @mapping: pointer to VRAM mappable space */
                        void *__iomem mapping;
                } vram;
+               /** @sys_mgr: system TTM manager */
+               struct ttm_resource_manager sys_mgr;
        } mem;
 
        /** @usm: unified memory state */
index bc821f431c45ab0eed2afa390e6741ecd7616b57..daaf93e23bbfc4ed6e8c50714588571a8bc21094 100644 (file)
@@ -36,7 +36,6 @@
 #include "xe_ring_ops.h"
 #include "xe_sa.h"
 #include "xe_sched_job.h"
-#include "xe_ttm_gtt_mgr.h"
 #include "xe_ttm_vram_mgr.h"
 #include "xe_tuning.h"
 #include "xe_uc.h"
@@ -77,16 +76,11 @@ int xe_gt_alloc(struct xe_device *xe, struct xe_gt *gt)
                if (!gt->mem.vram_mgr)
                        return -ENOMEM;
 
-               gt->mem.gtt_mgr = drmm_kzalloc(drm, sizeof(*gt->mem.gtt_mgr),
-                                              GFP_KERNEL);
-               if (!gt->mem.gtt_mgr)
-                       return -ENOMEM;
        } else {
                struct xe_gt *full_gt = xe_find_full_gt(gt);
 
                gt->mem.ggtt = full_gt->mem.ggtt;
                gt->mem.vram_mgr = full_gt->mem.vram_mgr;
-               gt->mem.gtt_mgr = full_gt->mem.gtt_mgr;
        }
 
        gt->ordered_wq = alloc_ordered_workqueue("gt-ordered-wq", 0);
@@ -98,26 +92,14 @@ static int gt_ttm_mgr_init(struct xe_gt *gt)
 {
        struct xe_device *xe = gt_to_xe(gt);
        int err;
-       struct sysinfo si;
-       u64 gtt_size;
-
-       si_meminfo(&si);
-       gtt_size = (u64)si.totalram * si.mem_unit * 3/4;
 
        if (gt->mem.vram.size) {
                err = xe_ttm_vram_mgr_init(gt, gt->mem.vram_mgr);
                if (err)
                        return err;
-               gtt_size = min(max((XE_DEFAULT_GTT_SIZE_MB << 20),
-                                  (u64)gt->mem.vram.size),
-                              gtt_size);
                xe->info.mem_region_mask |= BIT(gt->info.vram_id) << 1;
        }
 
-       err = xe_ttm_gtt_mgr_init(gt, gt->mem.gtt_mgr, gtt_size);
-       if (err)
-               return err;
-
        return 0;
 }
 
index 8f29aba455e05dc323906812810b47d7646ccaae..9d3117fad2e4b27282c2119184b783e7acb21543 100644 (file)
@@ -162,8 +162,6 @@ struct xe_gt {
                } vram;
                /** @vram_mgr: VRAM TTM manager */
                struct xe_ttm_vram_mgr *vram_mgr;
-               /** @gtt_mr: GTT TTM manager */
-               struct xe_ttm_gtt_mgr *gtt_mgr;
                /** @ggtt: Global graphics translation table */
                struct xe_ggtt *ggtt;
        } mem;
diff --git a/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c b/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.c
deleted file mode 100644 (file)
index 8075781..0000000
+++ /dev/null
@@ -1,130 +0,0 @@
-// SPDX-License-Identifier: MIT
-/*
- * Copyright © 2021-2022 Intel Corporation
- * Copyright (C) 2021-2002 Red Hat
- */
-
-#include <drm/drm_managed.h>
-
-#include <drm/ttm/ttm_placement.h>
-#include <drm/ttm/ttm_range_manager.h>
-#include <drm/ttm/ttm_tt.h>
-
-#include "xe_bo.h"
-#include "xe_gt.h"
-#include "xe_ttm_gtt_mgr.h"
-
-struct xe_ttm_gtt_node {
-       struct ttm_buffer_object *tbo;
-       struct ttm_range_mgr_node base;
-};
-
-static inline struct xe_ttm_gtt_mgr *
-to_gtt_mgr(struct ttm_resource_manager *man)
-{
-       return container_of(man, struct xe_ttm_gtt_mgr, manager);
-}
-
-static inline struct xe_ttm_gtt_node *
-to_xe_ttm_gtt_node(struct ttm_resource *res)
-{
-       return container_of(res, struct xe_ttm_gtt_node, base.base);
-}
-
-static int xe_ttm_gtt_mgr_new(struct ttm_resource_manager *man,
-                             struct ttm_buffer_object *tbo,
-                             const struct ttm_place *place,
-                             struct ttm_resource **res)
-{
-       struct xe_ttm_gtt_node *node;
-       int r;
-
-       node = kzalloc(struct_size(node, base.mm_nodes, 1), GFP_KERNEL);
-       if (!node)
-               return -ENOMEM;
-
-       node->tbo = tbo;
-       ttm_resource_init(tbo, place, &node->base.base);
-
-       if (!(place->flags & TTM_PL_FLAG_TEMPORARY) &&
-           ttm_resource_manager_usage(man) > (man->size << PAGE_SHIFT)) {
-               r = -ENOSPC;
-               goto err_fini;
-       }
-
-       node->base.mm_nodes[0].start = 0;
-       node->base.mm_nodes[0].size = PFN_UP(node->base.base.size);
-       node->base.base.start = XE_BO_INVALID_OFFSET;
-
-       *res = &node->base.base;
-
-       return 0;
-
-err_fini:
-       ttm_resource_fini(man, &node->base.base);
-       kfree(node);
-       return r;
-}
-
-static void xe_ttm_gtt_mgr_del(struct ttm_resource_manager *man,
-                              struct ttm_resource *res)
-{
-       struct xe_ttm_gtt_node *node = to_xe_ttm_gtt_node(res);
-
-       ttm_resource_fini(man, res);
-       kfree(node);
-}
-
-static void xe_ttm_gtt_mgr_debug(struct ttm_resource_manager *man,
-                                struct drm_printer *printer)
-{
-
-}
-
-static const struct ttm_resource_manager_func xe_ttm_gtt_mgr_func = {
-       .alloc = xe_ttm_gtt_mgr_new,
-       .free = xe_ttm_gtt_mgr_del,
-       .debug = xe_ttm_gtt_mgr_debug
-};
-
-static void ttm_gtt_mgr_fini(struct drm_device *drm, void *arg)
-{
-       struct xe_ttm_gtt_mgr *mgr = arg;
-       struct xe_device *xe = gt_to_xe(mgr->gt);
-       struct ttm_resource_manager *man = &mgr->manager;
-       int err;
-
-       ttm_resource_manager_set_used(man, false);
-
-       err = ttm_resource_manager_evict_all(&xe->ttm, man);
-       if (err)
-               return;
-
-       ttm_resource_manager_cleanup(man);
-       ttm_set_driver_manager(&xe->ttm, XE_PL_TT, NULL);
-}
-
-int xe_ttm_gtt_mgr_init(struct xe_gt *gt, struct xe_ttm_gtt_mgr *mgr,
-                       u64 gtt_size)
-{
-       struct xe_device *xe = gt_to_xe(gt);
-       struct ttm_resource_manager *man = &mgr->manager;
-       int err;
-
-       XE_BUG_ON(xe_gt_is_media_type(gt));
-
-       mgr->gt = gt;
-       man->use_tt = true;
-       man->func = &xe_ttm_gtt_mgr_func;
-
-       ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT);
-
-       ttm_set_driver_manager(&xe->ttm, XE_PL_TT, &mgr->manager);
-       ttm_resource_manager_set_used(man, true);
-
-       err = drmm_add_action_or_reset(&xe->drm, ttm_gtt_mgr_fini, mgr);
-       if (err)
-               return err;
-
-       return 0;
-}
diff --git a/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.h b/drivers/gpu/drm/xe/xe_ttm_gtt_mgr.h
deleted file mode 100644 (file)
index d1d57cb..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: MIT */
-/*
- * Copyright © 2022 Intel Corporation
- */
-
-#ifndef _XE_TTGM_GTT_MGR_H_
-#define _XE_TTGM_GTT_MGR_H_
-
-#include "xe_ttm_gtt_mgr_types.h"
-
-struct xe_gt;
-
-int xe_ttm_gtt_mgr_init(struct xe_gt *gt, struct xe_ttm_gtt_mgr *mgr,
-                       u64 gtt_size);
-
-#endif
diff --git a/drivers/gpu/drm/xe/xe_ttm_gtt_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_gtt_mgr_types.h
deleted file mode 100644 (file)
index c667374..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: MIT */
-/*
- * Copyright © 2022 Intel Corporation
- */
-
-#ifndef _XE_TTM_GTT_MGR_TYPES_H_
-#define _XE_TTM_GTT_MGR_TYPES_H_
-
-#include <drm/ttm/ttm_device.h>
-
-struct xe_gt;
-
-struct xe_ttm_gtt_mgr {
-       struct xe_gt *gt;
-       struct ttm_resource_manager manager;
-};
-
-#endif
diff --git a/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c b/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c
new file mode 100644 (file)
index 0000000..5b0674b
--- /dev/null
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2021-2022 Intel Corporation
+ * Copyright (C) 2021-2002 Red Hat
+ */
+
+#include "xe_ttm_sys_mgr.h"
+
+#include <drm/drm_managed.h>
+
+#include <drm/ttm/ttm_placement.h>
+#include <drm/ttm/ttm_range_manager.h>
+#include <drm/ttm/ttm_tt.h>
+
+#include "xe_bo.h"
+#include "xe_gt.h"
+
+struct xe_ttm_sys_node {
+       struct ttm_buffer_object *tbo;
+       struct ttm_range_mgr_node base;
+};
+
+static inline struct xe_ttm_sys_node *
+to_xe_ttm_sys_node(struct ttm_resource *res)
+{
+       return container_of(res, struct xe_ttm_sys_node, base.base);
+}
+
+static int xe_ttm_sys_mgr_new(struct ttm_resource_manager *man,
+                             struct ttm_buffer_object *tbo,
+                             const struct ttm_place *place,
+                             struct ttm_resource **res)
+{
+       struct xe_ttm_sys_node *node;
+       int r;
+
+       node = kzalloc(struct_size(node, base.mm_nodes, 1), GFP_KERNEL);
+       if (!node)
+               return -ENOMEM;
+
+       node->tbo = tbo;
+       ttm_resource_init(tbo, place, &node->base.base);
+
+       if (!(place->flags & TTM_PL_FLAG_TEMPORARY) &&
+           ttm_resource_manager_usage(man) > (man->size << PAGE_SHIFT)) {
+               r = -ENOSPC;
+               goto err_fini;
+       }
+
+       node->base.mm_nodes[0].start = 0;
+       node->base.mm_nodes[0].size = PFN_UP(node->base.base.size);
+       node->base.base.start = XE_BO_INVALID_OFFSET;
+
+       *res = &node->base.base;
+
+       return 0;
+
+err_fini:
+       ttm_resource_fini(man, &node->base.base);
+       kfree(node);
+       return r;
+}
+
+static void xe_ttm_sys_mgr_del(struct ttm_resource_manager *man,
+                              struct ttm_resource *res)
+{
+       struct xe_ttm_sys_node *node = to_xe_ttm_sys_node(res);
+
+       ttm_resource_fini(man, res);
+       kfree(node);
+}
+
+static void xe_ttm_sys_mgr_debug(struct ttm_resource_manager *man,
+                                struct drm_printer *printer)
+{
+
+}
+
+static const struct ttm_resource_manager_func xe_ttm_sys_mgr_func = {
+       .alloc = xe_ttm_sys_mgr_new,
+       .free = xe_ttm_sys_mgr_del,
+       .debug = xe_ttm_sys_mgr_debug
+};
+
+static void ttm_sys_mgr_fini(struct drm_device *drm, void *arg)
+{
+       struct xe_device *xe = (struct xe_device *)arg;
+       struct ttm_resource_manager *man = &xe->mem.sys_mgr;
+       int err;
+
+       ttm_resource_manager_set_used(man, false);
+
+       err = ttm_resource_manager_evict_all(&xe->ttm, man);
+       if (err)
+               return;
+
+       ttm_resource_manager_cleanup(man);
+       ttm_set_driver_manager(&xe->ttm, XE_PL_TT, NULL);
+}
+
+int xe_ttm_sys_mgr_init(struct xe_device *xe)
+{
+       struct ttm_resource_manager *man = &xe->mem.sys_mgr;
+       struct sysinfo si;
+       u64 gtt_size;
+
+       si_meminfo(&si);
+       gtt_size = (u64)si.totalram * si.mem_unit * 3/4;
+       man->use_tt = true;
+       man->func = &xe_ttm_sys_mgr_func;
+       ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT);
+       ttm_set_driver_manager(&xe->ttm, XE_PL_TT, man);
+       ttm_resource_manager_set_used(man, true);
+       return drmm_add_action_or_reset(&xe->drm, ttm_sys_mgr_fini, xe);
+}
diff --git a/drivers/gpu/drm/xe/xe_ttm_sys_mgr.h b/drivers/gpu/drm/xe/xe_ttm_sys_mgr.h
new file mode 100644 (file)
index 0000000..e8f5cd3
--- /dev/null
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#ifndef _XE_TTM_SYS_MGR_H_
+#define _XE_TTM_SYS_MGR_H_
+
+struct xe_device;
+
+int xe_ttm_sys_mgr_init(struct xe_device *xe);
+
+#endif