drm/amdgpu: Set MTYPE in PTE based on BO flags
author Felix Kuehling <Felix.Kuehling@amd.com>
Fri, 26 Aug 2022 22:22:35 +0000 (18:22 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 9 Nov 2022 22:41:42 +0000 (17:41 -0500)
The same BO may need different MTYPEs and SNOOP flags in PTEs depending
on its current location relative to the mapping GPU. Setting MTYPEs from
clients ahead of time is not practical for coherent memory sharing.
Instead, determine the correct MTYPE for the desired coherence model and
current BO location when updating the page tables.

To maintain backwards compatibility with MTYPE-selection in
AMDGPU_VA_OP_MAP, the coherence-model-based MTYPE selection is only
applied if it chooses an MTYPE other than MTYPE_NC (the default).

Add two AMDGPU_GEM_CREATE_... flags to indicate the coherence model. The
default if no flag is specified is non-coherent (i.e. coarse-grained
coherent at dispatch boundaries).

Update amdgpu_amdkfd_gpuvm.c to use this new method to choose the
correct MTYPE depending on the current memory location.

v2:
* check that bo is not NULL (e.g. PRT mappings)
* Fix missing ~ bitmask in gmc_v11_0.c
v3:
* squash in "drm/amdgpu: Inherit coherence flags on dmabuf import"

Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
include/uapi/drm/amdgpu_drm.h

index ba72a910d0d598429d7c08b2521226c231d65690..c5c9bfa2772efb3d15dcc4708cd82cbc4c9c866a 100644 (file)
@@ -405,63 +405,15 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
 
 static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
 {
-       struct amdgpu_device *bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
-       bool coherent = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT;
-       bool uncached = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED;
-       uint32_t mapping_flags;
-       uint64_t pte_flags;
-       bool snoop = false;
+       uint32_t mapping_flags = AMDGPU_VM_PAGE_READABLE |
+                                AMDGPU_VM_MTYPE_DEFAULT;
 
-       mapping_flags = AMDGPU_VM_PAGE_READABLE;
        if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE)
                mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;
        if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE)
                mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
 
-       switch (adev->ip_versions[GC_HWIP][0]) {
-       case IP_VERSION(9, 4, 1):
-       case IP_VERSION(9, 4, 2):
-               if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-                       if (bo_adev == adev) {
-                               if (uncached)
-                                       mapping_flags |= AMDGPU_VM_MTYPE_UC;
-                               else if (coherent)
-                                       mapping_flags |= AMDGPU_VM_MTYPE_CC;
-                               else
-                                       mapping_flags |= AMDGPU_VM_MTYPE_RW;
-                               if ((adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) &&
-                                   adev->gmc.xgmi.connected_to_cpu)
-                                       snoop = true;
-                       } else {
-                               if (uncached || coherent)
-                                       mapping_flags |= AMDGPU_VM_MTYPE_UC;
-                               else
-                                       mapping_flags |= AMDGPU_VM_MTYPE_NC;
-                               if (amdgpu_xgmi_same_hive(adev, bo_adev))
-                                       snoop = true;
-                       }
-               } else {
-                       if (uncached || coherent)
-                               mapping_flags |= AMDGPU_VM_MTYPE_UC;
-                       else
-                               mapping_flags |= AMDGPU_VM_MTYPE_NC;
-                       snoop = true;
-               }
-               break;
-       default:
-               if (uncached || coherent)
-                       mapping_flags |= AMDGPU_VM_MTYPE_UC;
-               else
-                       mapping_flags |= AMDGPU_VM_MTYPE_NC;
-
-               if (!(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM))
-                       snoop = true;
-       }
-
-       pte_flags = amdgpu_gem_va_map_flags(adev, mapping_flags);
-       pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
-
-       return pte_flags;
+       return amdgpu_gem_va_map_flags(adev, mapping_flags);
 }
 
 /**
@@ -1673,6 +1625,11 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
                }
        }
 
+       if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
+               alloc_flags |= AMDGPU_GEM_CREATE_COHERENT;
+       if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED)
+               alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED;
+
        *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
        if (!*mem) {
                ret = -ENOMEM;
index 7bd8e33b14be5a478d71017040263f8905d92104..271e30e34d93213cce01e9c4e2b7e976ad2bebcc 100644 (file)
@@ -328,7 +328,9 @@ amdgpu_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf)
        if (dma_buf->ops == &amdgpu_dmabuf_ops) {
                struct amdgpu_bo *other = gem_to_amdgpu_bo(dma_buf->priv);
 
-               flags |= other->flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+               flags |= other->flags & (AMDGPU_GEM_CREATE_CPU_GTT_USWC |
+                                        AMDGPU_GEM_CREATE_COHERENT |
+                                        AMDGPU_GEM_CREATE_UNCACHED);
        }
 
        ret = amdgpu_gem_object_create(adev, dma_buf->size, PAGE_SIZE,
index f513e2c2e964f0c9b3c8d8d522e96692eb55259b..a83efdc8aa0c5a64e401d77c890e04e4b95606cc 100644 (file)
@@ -612,6 +612,8 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev,
                                 struct amdgpu_bo_va_mapping *mapping,
                                 uint64_t *flags)
 {
+       struct amdgpu_bo *bo = mapping->bo_va->base.bo;
+
        *flags &= ~AMDGPU_PTE_EXECUTABLE;
        *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
 
@@ -628,6 +630,11 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev,
                *flags |= AMDGPU_PTE_SYSTEM;
                *flags &= ~AMDGPU_PTE_VALID;
        }
+
+       if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
+                              AMDGPU_GEM_CREATE_UNCACHED))
+               *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
+                        AMDGPU_PTE_MTYPE_NV10(MTYPE_UC);
 }
 
 static unsigned gmc_v10_0_get_vbios_fb_size(struct amdgpu_device *adev)
index 66dfb574cc7d1bbe2490f8a6572a66e4825f29d6..16f52049d9863b28257c9f417e760f376eed3dea 100644 (file)
@@ -503,6 +503,8 @@ static void gmc_v11_0_get_vm_pte(struct amdgpu_device *adev,
                                 struct amdgpu_bo_va_mapping *mapping,
                                 uint64_t *flags)
 {
+       struct amdgpu_bo *bo = mapping->bo_va->base.bo;
+
        *flags &= ~AMDGPU_PTE_EXECUTABLE;
        *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
 
@@ -519,6 +521,11 @@ static void gmc_v11_0_get_vm_pte(struct amdgpu_device *adev,
                *flags |= AMDGPU_PTE_SYSTEM;
                *flags &= ~AMDGPU_PTE_VALID;
        }
+
+       if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
+                              AMDGPU_GEM_CREATE_UNCACHED))
+               *flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
+                        AMDGPU_PTE_MTYPE_NV10(MTYPE_UC);
 }
 
 static unsigned gmc_v11_0_get_vbios_fb_size(struct amdgpu_device *adev)
index 67ca16a8027c7d9afe83e30a916e8a9c0984e0a0..50386eb2eec8dd3614cb80b400a05cb7bf70b4af 100644 (file)
@@ -1113,6 +1113,74 @@ static void gmc_v9_0_get_vm_pde(struct amdgpu_device *adev, int level,
        }
 }
 
+static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
+                                        struct amdgpu_bo *bo,
+                                        struct amdgpu_bo_va_mapping *mapping,
+                                        uint64_t *flags)
+{
+       struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
+       bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM;
+       bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
+       bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED;
+       unsigned int mtype;
+       bool snoop = false;
+
+       switch (adev->ip_versions[GC_HWIP][0]) {
+       case IP_VERSION(9, 4, 1):
+       case IP_VERSION(9, 4, 2):
+               if (is_vram) {
+                       if (bo_adev == adev) {
+                               if (uncached)
+                                       mtype = MTYPE_UC;
+                               else if (coherent)
+                                       mtype = MTYPE_CC;
+                               else
+                                       mtype = MTYPE_RW;
+                               /* FIXME: is this still needed? Or does
+                                * amdgpu_ttm_tt_pde_flags already handle this?
+                                */
+                               if (adev->ip_versions[GC_HWIP][0] ==
+                                       IP_VERSION(9, 4, 2) &&
+                                   adev->gmc.xgmi.connected_to_cpu)
+                                       snoop = true;
+                       } else {
+                               if (uncached || coherent)
+                                       mtype = MTYPE_UC;
+                               else
+                                       mtype = MTYPE_NC;
+                               if (mapping->bo_va->is_xgmi)
+                                       snoop = true;
+                       }
+               } else {
+                       if (uncached || coherent)
+                               mtype = MTYPE_UC;
+                       else
+                               mtype = MTYPE_NC;
+                       /* FIXME: is this still needed? Or does
+                        * amdgpu_ttm_tt_pde_flags already handle this?
+                        */
+                       snoop = true;
+               }
+               break;
+       default:
+               if (uncached || coherent)
+                       mtype = MTYPE_UC;
+               else
+                       mtype = MTYPE_NC;
+
+               /* FIXME: is this still needed? Or does
+                * amdgpu_ttm_tt_pde_flags already handle this?
+                */
+               if (!is_vram)
+                       snoop = true;
+       }
+
+       if (mtype != MTYPE_NC)
+               *flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) |
+                        AMDGPU_PTE_MTYPE_VG10(mtype);
+       *flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
+}
+
 static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev,
                                struct amdgpu_bo_va_mapping *mapping,
                                uint64_t *flags)
@@ -1128,14 +1196,9 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev,
                *flags &= ~AMDGPU_PTE_VALID;
        }
 
-       if ((adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1) ||
-            adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) &&
-           !(*flags & AMDGPU_PTE_SYSTEM) &&
-           mapping->bo_va->is_xgmi)
-               *flags |= AMDGPU_PTE_SNOOPED;
-
-       if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
-               *flags |= mapping->flags & AMDGPU_PTE_SNOOPED;
+       if (mapping->bo_va->base.bo)
+               gmc_v9_0_get_coherence_flags(adev, mapping->bo_va->base.bo,
+                                            mapping, flags);
 }
 
 static unsigned gmc_v9_0_get_vbios_fb_size(struct amdgpu_device *adev)
index 0d93ec132ebbcb1cdab91a1ae0831b02ffeb782d..4038abe8505affd7191fa2629401aece46b7d761 100644 (file)
@@ -144,6 +144,20 @@ extern "C" {
  * content.
  */
 #define AMDGPU_GEM_CREATE_DISCARDABLE          (1 << 12)
+/* Flag that BO is shared coherently between multiple devices or CPU threads.
+ * May depend on GPU instructions to flush caches explicitly
+ *
+ * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
+ * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
+ */
+#define AMDGPU_GEM_CREATE_COHERENT             (1 << 13)
+/* Flag that BO should not be cached by GPU. Coherent without having to flush
+ * GPU caches explicitly
+ *
+ * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
+ * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
+ */
+#define AMDGPU_GEM_CREATE_UNCACHED             (1 << 14)
 
 struct drm_amdgpu_gem_create_in  {
        /** the requested memory size */