drm/i915: Flush TLBs before releasing backing store
authorTvrtko Ursulin <tvrtko.ursulin@intel.com>
Tue, 19 Oct 2021 12:27:10 +0000 (13:27 +0100)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 29 Jan 2022 09:58:24 +0000 (10:58 +0100)
commit 7938d61591d33394a21bdd7797a245b65428f44c upstream.

We need to flush TLBs before releasing backing store otherwise userspace
is able to encounter stale entries if a) it is not declaring access to
certain buffers and b) it races with the backing store release from a
such undeclared execution already executing on the GPU in parallel.

The approach taken is to mark any buffer objects which were ever bound
to the GPU and to trigger a serialized TLB flush when their backing
store is released.

Alternatively the flushing could be done on VMA unbind, at which point
we would be able to ascertain whether there is potential a parallel GPU
execution (which could race), but essentially it boils down to paying
the cost of TLB flushes potentially needlessly at VMA unbind time (when
the backing store is not known to be going away so not needed for
safety), versus potentially needlessly at backing store relase time
(since we at that point cannot tell whether there is anything executing
on the GPU which uses that object).

Thereforce simplicity of implementation has been chosen for now with
scope to benchmark and refine later as required.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reported-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Dave Airlie <airlied@redhat.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/gpu/drm/i915/gem/i915_gem_object_types.h
drivers/gpu/drm/i915/gem/i915_gem_pages.c
drivers/gpu/drm/i915/gt/intel_gt.c
drivers/gpu/drm/i915/gt/intel_gt.h
drivers/gpu/drm/i915/gt/intel_gt_types.h
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/i915_vma.c
drivers/gpu/drm/i915/intel_uncore.c
drivers/gpu/drm/i915/intel_uncore.h

index 2471f36aaff38836a4d4e31c88f402965a3b909c..3012cbe5b0b7ce309527addb5d6bb904fadbc292 100644 (file)
@@ -298,6 +298,7 @@ struct drm_i915_gem_object {
                             I915_BO_ALLOC_USER)
 #define I915_BO_READONLY         BIT(4)
 #define I915_TILING_QUIRK_BIT    5 /* unknown swizzling; do not release! */
+#define I915_BO_WAS_BOUND_BIT    6
 
        /**
         * @mem_flags - Mutable placement-related flags
index 8eb1c3a6fc9cd123b1ee1859815e166221482b38..8d6c38a6220161dd04602cfa48e19e0f27b056e4 100644 (file)
@@ -10,6 +10,8 @@
 #include "i915_gem_lmem.h"
 #include "i915_gem_mman.h"
 
+#include "gt/intel_gt.h"
+
 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
                                 struct sg_table *pages,
                                 unsigned int sg_page_sizes)
@@ -218,6 +220,14 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
        __i915_gem_object_reset_page_iter(obj);
        obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
 
+       if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
+               struct drm_i915_private *i915 = to_i915(obj->base.dev);
+               intel_wakeref_t wakeref;
+
+               with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref)
+                       intel_gt_invalidate_tlbs(&i915->gt);
+       }
+
        return pages;
 }
 
index 62d40c9866427e7dfbd9838cdeb86956b32e5525..e1e1d17d49fdd325f59b99534119c3763174f113 100644 (file)
@@ -29,6 +29,8 @@ void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
 
        spin_lock_init(&gt->irq_lock);
 
+       mutex_init(&gt->tlb_invalidate_lock);
+
        INIT_LIST_HEAD(&gt->closed_vma);
        spin_lock_init(&gt->closed_lock);
 
@@ -895,3 +897,103 @@ void intel_gt_info_print(const struct intel_gt_info *info,
 
        intel_sseu_dump(&info->sseu, p);
 }
+
+struct reg_and_bit {
+       i915_reg_t reg;
+       u32 bit;
+};
+
+static struct reg_and_bit
+get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8,
+               const i915_reg_t *regs, const unsigned int num)
+{
+       const unsigned int class = engine->class;
+       struct reg_and_bit rb = { };
+
+       if (drm_WARN_ON_ONCE(&engine->i915->drm,
+                            class >= num || !regs[class].reg))
+               return rb;
+
+       rb.reg = regs[class];
+       if (gen8 && class == VIDEO_DECODE_CLASS)
+               rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */
+       else
+               rb.bit = engine->instance;
+
+       rb.bit = BIT(rb.bit);
+
+       return rb;
+}
+
+void intel_gt_invalidate_tlbs(struct intel_gt *gt)
+{
+       static const i915_reg_t gen8_regs[] = {
+               [RENDER_CLASS]                  = GEN8_RTCR,
+               [VIDEO_DECODE_CLASS]            = GEN8_M1TCR, /* , GEN8_M2TCR */
+               [VIDEO_ENHANCEMENT_CLASS]       = GEN8_VTCR,
+               [COPY_ENGINE_CLASS]             = GEN8_BTCR,
+       };
+       static const i915_reg_t gen12_regs[] = {
+               [RENDER_CLASS]                  = GEN12_GFX_TLB_INV_CR,
+               [VIDEO_DECODE_CLASS]            = GEN12_VD_TLB_INV_CR,
+               [VIDEO_ENHANCEMENT_CLASS]       = GEN12_VE_TLB_INV_CR,
+               [COPY_ENGINE_CLASS]             = GEN12_BLT_TLB_INV_CR,
+       };
+       struct drm_i915_private *i915 = gt->i915;
+       struct intel_uncore *uncore = gt->uncore;
+       struct intel_engine_cs *engine;
+       enum intel_engine_id id;
+       const i915_reg_t *regs;
+       unsigned int num = 0;
+
+       if (I915_SELFTEST_ONLY(gt->awake == -ENODEV))
+               return;
+
+       if (GRAPHICS_VER(i915) == 12) {
+               regs = gen12_regs;
+               num = ARRAY_SIZE(gen12_regs);
+       } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) {
+               regs = gen8_regs;
+               num = ARRAY_SIZE(gen8_regs);
+       } else if (GRAPHICS_VER(i915) < 8) {
+               return;
+       }
+
+       if (drm_WARN_ONCE(&i915->drm, !num,
+                         "Platform does not implement TLB invalidation!"))
+               return;
+
+       GEM_TRACE("\n");
+
+       assert_rpm_wakelock_held(&i915->runtime_pm);
+
+       mutex_lock(&gt->tlb_invalidate_lock);
+       intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
+
+       for_each_engine(engine, gt, id) {
+               /*
+                * HW architecture suggest typical invalidation time at 40us,
+                * with pessimistic cases up to 100us and a recommendation to
+                * cap at 1ms. We go a bit higher just in case.
+                */
+               const unsigned int timeout_us = 100;
+               const unsigned int timeout_ms = 4;
+               struct reg_and_bit rb;
+
+               rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
+               if (!i915_mmio_reg_offset(rb.reg))
+                       continue;
+
+               intel_uncore_write_fw(uncore, rb.reg, rb.bit);
+               if (__intel_wait_for_register_fw(uncore,
+                                                rb.reg, rb.bit, 0,
+                                                timeout_us, timeout_ms,
+                                                NULL))
+                       drm_err_ratelimited(&gt->i915->drm,
+                                           "%s TLB invalidation did not complete in %ums!\n",
+                                           engine->name, timeout_ms);
+       }
+
+       intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL);
+       mutex_unlock(&gt->tlb_invalidate_lock);
+}
index 74e771871a9bd71e2400424cdf9ad773b488f163..c0169d6017c2d38d9cfd91e01df72cf1bb52d655 100644 (file)
@@ -90,4 +90,6 @@ void intel_gt_info_print(const struct intel_gt_info *info,
 
 void intel_gt_watchdog_work(struct work_struct *work);
 
+void intel_gt_invalidate_tlbs(struct intel_gt *gt);
+
 #endif /* __INTEL_GT_H__ */
index a81e21bf1bd1a74b2225c2db1da7d163ac496700..9fbcbcc6c35db5de225c2940798aa866fdb43c88 100644 (file)
@@ -72,6 +72,8 @@ struct intel_gt {
 
        struct intel_uc uc;
 
+       struct mutex tlb_invalidate_lock;
+
        struct intel_gt_timelines {
                spinlock_t lock; /* protects active_list */
                struct list_head active_list;
index 9023d4ecf3b37252069472cc6f9a624046d37016..c65473fc90935fd4b50c2118c393165e7c78f077 100644 (file)
@@ -2669,6 +2669,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING     (1 << 28)
 #define   GAMT_CHKN_DISABLE_I2M_CYCLE_ON_WR_PORT       (1 << 24)
 
+#define GEN8_RTCR      _MMIO(0x4260)
+#define GEN8_M1TCR     _MMIO(0x4264)
+#define GEN8_M2TCR     _MMIO(0x4268)
+#define GEN8_BTCR      _MMIO(0x426c)
+#define GEN8_VTCR      _MMIO(0x4270)
+
 #if 0
 #define PRB0_TAIL      _MMIO(0x2030)
 #define PRB0_HEAD      _MMIO(0x2034)
@@ -2763,6 +2769,11 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   FAULT_VA_HIGH_BITS           (0xf << 0)
 #define   FAULT_GTT_SEL                        (1 << 4)
 
+#define GEN12_GFX_TLB_INV_CR   _MMIO(0xced8)
+#define GEN12_VD_TLB_INV_CR    _MMIO(0xcedc)
+#define GEN12_VE_TLB_INV_CR    _MMIO(0xcee0)
+#define GEN12_BLT_TLB_INV_CR   _MMIO(0xcee4)
+
 #define GEN12_AUX_ERR_DBG              _MMIO(0x43f4)
 
 #define FPGA_DBG               _MMIO(0x42300)
index 4b7fc4647e460eff4a87db4fb3b8be19f6cb2b0f..dfd20060812bcdd631c720594e982350d4bd4893 100644 (file)
@@ -434,6 +434,9 @@ int i915_vma_bind(struct i915_vma *vma,
                vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags);
        }
 
+       if (vma->obj)
+               set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags);
+
        atomic_or(bind_flags, &vma->flags);
        return 0;
 }
index 6b38bc2811c1ba70b60b74f0fcf719bc0d6a2aef..de8d0558389c493353e55dd6c30e7c6ac3076a92 100644 (file)
@@ -718,7 +718,8 @@ void intel_uncore_forcewake_get__locked(struct intel_uncore *uncore,
 }
 
 static void __intel_uncore_forcewake_put(struct intel_uncore *uncore,
-                                        enum forcewake_domains fw_domains)
+                                        enum forcewake_domains fw_domains,
+                                        bool delayed)
 {
        struct intel_uncore_forcewake_domain *domain;
        unsigned int tmp;
@@ -733,7 +734,11 @@ static void __intel_uncore_forcewake_put(struct intel_uncore *uncore,
                        continue;
                }
 
-               uncore->funcs.force_wake_put(uncore, domain->mask);
+               if (delayed &&
+                   !(domain->uncore->fw_domains_timer & domain->mask))
+                       fw_domain_arm_timer(domain);
+               else
+                       uncore->funcs.force_wake_put(uncore, domain->mask);
        }
 }
 
@@ -754,7 +759,20 @@ void intel_uncore_forcewake_put(struct intel_uncore *uncore,
                return;
 
        spin_lock_irqsave(&uncore->lock, irqflags);
-       __intel_uncore_forcewake_put(uncore, fw_domains);
+       __intel_uncore_forcewake_put(uncore, fw_domains, false);
+       spin_unlock_irqrestore(&uncore->lock, irqflags);
+}
+
+void intel_uncore_forcewake_put_delayed(struct intel_uncore *uncore,
+                                       enum forcewake_domains fw_domains)
+{
+       unsigned long irqflags;
+
+       if (!uncore->funcs.force_wake_put)
+               return;
+
+       spin_lock_irqsave(&uncore->lock, irqflags);
+       __intel_uncore_forcewake_put(uncore, fw_domains, true);
        spin_unlock_irqrestore(&uncore->lock, irqflags);
 }
 
@@ -796,7 +814,7 @@ void intel_uncore_forcewake_put__locked(struct intel_uncore *uncore,
        if (!uncore->funcs.force_wake_put)
                return;
 
-       __intel_uncore_forcewake_put(uncore, fw_domains);
+       __intel_uncore_forcewake_put(uncore, fw_domains, false);
 }
 
 void assert_forcewakes_inactive(struct intel_uncore *uncore)
index 3c0b0a8b5250d1d6750e07385098aaca29ea3178..4c63209dcf53069b8bd87f398fc158e3e4656c45 100644 (file)
@@ -229,6 +229,8 @@ void intel_uncore_forcewake_get(struct intel_uncore *uncore,
                                enum forcewake_domains domains);
 void intel_uncore_forcewake_put(struct intel_uncore *uncore,
                                enum forcewake_domains domains);
+void intel_uncore_forcewake_put_delayed(struct intel_uncore *uncore,
+                                       enum forcewake_domains domains);
 void intel_uncore_forcewake_flush(struct intel_uncore *uncore,
                                  enum forcewake_domains fw_domains);