drm/amdgpu: use scheduler fault instead of reset work
author Christian König <christian.koenig@amd.com>
Tue, 16 Oct 2018 11:08:21 +0000 (13:08 +0200)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 5 Nov 2018 19:21:03 +0000 (14:21 -0500)
Signal a fault to the scheduler on an illegal instruction or register
access violation instead of kicking off the reset handler directly.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
drivers/gpu/drm/amd/amdgpu/cik_sdma.c
drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

index fb922a872a80420ed381820f98402422b39dee56..9348eb5a3c8394bdd2dc874a0e389ac6c813e1ee 100644 (file)
@@ -830,7 +830,6 @@ struct amdgpu_device {
        bool                            need_dma32;
        bool                            need_swiotlb;
        bool                            accel_working;
-       struct work_struct              reset_work;
        struct notifier_block           acpi_nb;
        struct amdgpu_i2c_chan          *i2c_bus[AMDGPU_MAX_I2C_BUS];
        struct amdgpu_debugfs           debugfs[AMDGPU_DEBUGFS_MAX_COMPONENTS];
index 52c17f6219a706d2793d999eef39979342c5ab28..6b6524f04ce09246b906a9da07a4f8d8b767a417 100644 (file)
@@ -93,23 +93,6 @@ static void amdgpu_hotplug_work_func(struct work_struct *work)
        drm_helper_hpd_irq_event(dev);
 }
 
-/**
- * amdgpu_irq_reset_work_func - execute GPU reset
- *
- * @work: work struct pointer
- *
- * Execute scheduled GPU reset (Cayman+).
- * This function is called when the IRQ handler thinks we need a GPU reset.
- */
-static void amdgpu_irq_reset_work_func(struct work_struct *work)
-{
-       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
-                                                 reset_work);
-
-       if (!amdgpu_sriov_vf(adev) && amdgpu_device_should_recover_gpu(adev))
-               amdgpu_device_gpu_recover(adev, NULL);
-}
-
 /**
  * amdgpu_irq_disable_all - disable *all* interrupts
  *
@@ -262,15 +245,12 @@ int amdgpu_irq_init(struct amdgpu_device *adev)
                                amdgpu_hotplug_work_func);
        }
 
-       INIT_WORK(&adev->reset_work, amdgpu_irq_reset_work_func);
-
        adev->irq.installed = true;
        r = drm_irq_install(adev->ddev, adev->ddev->pdev->irq);
        if (r) {
                adev->irq.installed = false;
                if (!amdgpu_device_has_dc_support(adev))
                        flush_work(&adev->hotplug_work);
-               cancel_work_sync(&adev->reset_work);
                return r;
        }
        adev->ddev->max_vblank_count = 0x00ffffff;
@@ -299,7 +279,6 @@ void amdgpu_irq_fini(struct amdgpu_device *adev)
                        pci_disable_msi(adev->pdev);
                if (!amdgpu_device_has_dc_support(adev))
                        flush_work(&adev->hotplug_work);
-               cancel_work_sync(&adev->reset_work);
        }
 
        for (i = 0; i < AMDGPU_IRQ_CLIENTID_MAX; ++i) {
index b918c8886b75c4104d2fc5b03c7863bf9a4d6e41..32eb43d165f260967a632c7ec22e2d5ebaefdcd8 100644 (file)
@@ -1214,8 +1214,11 @@ static int cik_sdma_process_illegal_inst_irq(struct amdgpu_device *adev,
                                             struct amdgpu_irq_src *source,
                                             struct amdgpu_iv_entry *entry)
 {
+       u8 instance_id;
+
        DRM_ERROR("Illegal instruction in SDMA command stream\n");
-       schedule_work(&adev->reset_work);
+       instance_id = (entry->ring_id & 0x3) >> 0;
+       drm_sched_fault(&adev->sdma.instance[instance_id].ring.sched);
        return 0;
 }
 
index d76eb27945dc897230640e1727cc2b8def35f1cb..622dd70f310e076b8f03addd26006b101926ca69 100644 (file)
@@ -3393,12 +3393,31 @@ static int gfx_v6_0_eop_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static void gfx_v6_0_fault(struct amdgpu_device *adev,
+                          struct amdgpu_iv_entry *entry)
+{
+       struct amdgpu_ring *ring;
+
+       switch (entry->ring_id) {
+       case 0:
+               ring = &adev->gfx.gfx_ring[0];
+               break;
+       case 1:
+       case 2:
+               ring = &adev->gfx.compute_ring[entry->ring_id - 1];
+               break;
+       default:
+               return;
+       }
+       drm_sched_fault(&ring->sched);
+}
+
 static int gfx_v6_0_priv_reg_irq(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal register access in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v6_0_fault(adev, entry);
        return 0;
 }
 
@@ -3407,7 +3426,7 @@ static int gfx_v6_0_priv_inst_irq(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal instruction in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v6_0_fault(adev, entry);
        return 0;
 }
 
index 0e72bc09939aca1415320b027d9f57380e6eebc4..9fadb32da827938ca58405e46bd7a58cbff71d4e 100644 (file)
@@ -4959,12 +4959,36 @@ static int gfx_v7_0_eop_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static void gfx_v7_0_fault(struct amdgpu_device *adev,
+                          struct amdgpu_iv_entry *entry)
+{
+       struct amdgpu_ring *ring;
+       u8 me_id, pipe_id;
+       int i;
+
+       me_id = (entry->ring_id & 0x0c) >> 2;
+       pipe_id = (entry->ring_id & 0x03) >> 0;
+       switch (me_id) {
+       case 0:
+               drm_sched_fault(&adev->gfx.gfx_ring[0].sched);
+               break;
+       case 1:
+       case 2:
+               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+                       ring = &adev->gfx.compute_ring[i];
+                       if ((ring->me == me_id) && (ring->pipe == pipe_id))
+                               drm_sched_fault(&ring->sched);
+               }
+               break;
+       }
+}
+
 static int gfx_v7_0_priv_reg_irq(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal register access in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v7_0_fault(adev, entry);
        return 0;
 }
 
@@ -4974,7 +4998,7 @@ static int gfx_v7_0_priv_inst_irq(struct amdgpu_device *adev,
 {
        DRM_ERROR("Illegal instruction in command stream\n");
        // XXX soft reset the gfx block only
-       schedule_work(&adev->reset_work);
+       gfx_v7_0_fault(adev, entry);
        return 0;
 }
 
index 617b0c8908a375aa0d132af1868f3eaf9e2067b1..ba614f26f55342474efcdb76f677beea705b3a4c 100644 (file)
@@ -6738,12 +6738,39 @@ static int gfx_v8_0_eop_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static void gfx_v8_0_fault(struct amdgpu_device *adev,
+                          struct amdgpu_iv_entry *entry)
+{
+       u8 me_id, pipe_id, queue_id;
+       struct amdgpu_ring *ring;
+       int i;
+
+       me_id = (entry->ring_id & 0x0c) >> 2;
+       pipe_id = (entry->ring_id & 0x03) >> 0;
+       queue_id = (entry->ring_id & 0x70) >> 4;
+
+       switch (me_id) {
+       case 0:
+               drm_sched_fault(&adev->gfx.gfx_ring[0].sched);
+               break;
+       case 1:
+       case 2:
+               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+                       ring = &adev->gfx.compute_ring[i];
+                       if (ring->me == me_id && ring->pipe == pipe_id &&
+                           ring->queue == queue_id)
+                               drm_sched_fault(&ring->sched);
+               }
+               break;
+       }
+}
+
 static int gfx_v8_0_priv_reg_irq(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal register access in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v8_0_fault(adev, entry);
        return 0;
 }
 
@@ -6752,7 +6779,7 @@ static int gfx_v8_0_priv_inst_irq(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal instruction in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v8_0_fault(adev, entry);
        return 0;
 }
 
index 6d7baf59d6e11e947c83ef34d716c5a546d6460f..0ce1e14099bc72afa4c0a3bddb1a8501b935d318 100644 (file)
@@ -4695,12 +4695,39 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static void gfx_v9_0_fault(struct amdgpu_device *adev,
+                          struct amdgpu_iv_entry *entry)
+{
+       u8 me_id, pipe_id, queue_id;
+       struct amdgpu_ring *ring;
+       int i;
+
+       me_id = (entry->ring_id & 0x0c) >> 2;
+       pipe_id = (entry->ring_id & 0x03) >> 0;
+       queue_id = (entry->ring_id & 0x70) >> 4;
+
+       switch (me_id) {
+       case 0:
+               drm_sched_fault(&adev->gfx.gfx_ring[0].sched);
+               break;
+       case 1:
+       case 2:
+               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+                       ring = &adev->gfx.compute_ring[i];
+                       if (ring->me == me_id && ring->pipe == pipe_id &&
+                           ring->queue == queue_id)
+                               drm_sched_fault(&ring->sched);
+               }
+               break;
+       }
+}
+
 static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal register access in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v9_0_fault(adev, entry);
        return 0;
 }
 
@@ -4709,7 +4736,7 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal instruction in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v9_0_fault(adev, entry);
        return 0;
 }
 
index 2d4770e173dd373f6ece0bc6c65f9a6b627fead5..bedbd5f296c5f493ad4c37e5b3ab44396469e159 100644 (file)
@@ -1105,8 +1105,14 @@ static int sdma_v2_4_process_illegal_inst_irq(struct amdgpu_device *adev,
                                              struct amdgpu_irq_src *source,
                                              struct amdgpu_iv_entry *entry)
 {
+       u8 instance_id, queue_id;
+
        DRM_ERROR("Illegal instruction in SDMA command stream\n");
-       schedule_work(&adev->reset_work);
+       instance_id = (entry->ring_id & 0x3) >> 0;
+       queue_id = (entry->ring_id & 0xc) >> 2;
+
+       if (instance_id <= 1 && queue_id == 0)
+               drm_sched_fault(&adev->sdma.instance[instance_id].ring.sched);
        return 0;
 }
 
index 6fb3edaba0ec065fdd57f5af7866e93006bf39b2..415968dc6c872bad5642493036e0fab03384e294 100644 (file)
@@ -1440,8 +1440,14 @@ static int sdma_v3_0_process_illegal_inst_irq(struct amdgpu_device *adev,
                                              struct amdgpu_irq_src *source,
                                              struct amdgpu_iv_entry *entry)
 {
+       u8 instance_id, queue_id;
+
        DRM_ERROR("Illegal instruction in SDMA command stream\n");
-       schedule_work(&adev->reset_work);
+       instance_id = (entry->ring_id & 0x3) >> 0;
+       queue_id = (entry->ring_id & 0xc) >> 2;
+
+       if (instance_id <= 1 && queue_id == 0)
+               drm_sched_fault(&adev->sdma.instance[instance_id].ring.sched);
        return 0;
 }
 
index c0d1650d6c71195562ef6fa06019136702fca802..88d93430dfb1eb71beae7f917b4142095d1ff773 100644 (file)
@@ -1717,12 +1717,29 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
                                              struct amdgpu_irq_src *source,
                                              struct amdgpu_iv_entry *entry)
 {
+       int instance;
+
        DRM_ERROR("Illegal instruction in SDMA command stream\n");
-       schedule_work(&adev->reset_work);
+
+       switch (entry->client_id) {
+       case SOC15_IH_CLIENTID_SDMA0:
+               instance = 0;
+               break;
+       case SOC15_IH_CLIENTID_SDMA1:
+               instance = 1;
+               break;
+       default:
+               return 0;
+       }
+
+       switch (entry->ring_id) {
+       case 0:
+               drm_sched_fault(&adev->sdma.instance[instance].ring.sched);
+               break;
+       }
        return 0;
 }
 
-
 static void sdma_v4_0_update_medium_grain_clock_gating(
                struct amdgpu_device *adev,
                bool enable)