I'm not increasing the DRM version because GDS isn't totally without bugs yet.
v2: update emit_ib_size
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
  * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
  * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation.
  * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES
+ * - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID
  */
 #define KMS_DRIVER_MAJOR       3
-#define KMS_DRIVER_MINOR       28
+#define KMS_DRIVER_MINOR       29
 #define KMS_DRIVER_PATCHLEVEL  0
 
 int amdgpu_vram_limit = 0;
 
        struct amdgpu_gds_asic_info     mem;
        struct amdgpu_gds_asic_info     gws;
        struct amdgpu_gds_asic_info     oa;
+       uint32_t                        gds_compute_max_wave_id;
+
        /* At present, GDS, GWS and OA resources for gfx (graphics)
         * is always pre-allocated and available for graphics operation.
         * Such resource is shared between all gfx clients.
 
        unsigned vmid = AMDGPU_JOB_GET_VMID(job);
        u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+       /* Currently, there is a high possibility to get wave ID mismatch
+        * between ME and GDS, leading to a hw deadlock, because ME generates
+        * different wave IDs than the GDS expects. This situation happens
+        * randomly when at least 5 compute pipes use GDS ordered append.
+        * The wave IDs generated by ME are also wrong after suspend/resume.
+        * Those are probably bugs somewhere else in the kernel driver.
+        *
+        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+        * GDS to 0 for this ring (me/pipe).
+        */
+       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+               amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+       }
+
        amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
        amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
                7 + /* gfx_v7_0_ring_emit_pipeline_sync */
                CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
                7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
-       .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
+       .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */
        .emit_ib = gfx_v7_0_ring_emit_ib_compute,
        .emit_fence = gfx_v7_0_ring_emit_fence_compute,
        .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
        adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
        adev->gds.gws.total_size = 64;
        adev->gds.oa.total_size = 16;
+       adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
        if (adev->gds.mem.total_size == 64 * 1024) {
                adev->gds.mem.gfx_partition_size = 4096;
 
        unsigned vmid = AMDGPU_JOB_GET_VMID(job);
        u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+       /* Currently, there is a high possibility to get wave ID mismatch
+        * between ME and GDS, leading to a hw deadlock, because ME generates
+        * different wave IDs than the GDS expects. This situation happens
+        * randomly when at least 5 compute pipes use GDS ordered append.
+        * The wave IDs generated by ME are also wrong after suspend/resume.
+        * Those are probably bugs somewhere else in the kernel driver.
+        *
+        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+        * GDS to 0 for this ring (me/pipe).
+        */
+       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+               amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+       }
+
        amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
        amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
                7 + /* gfx_v8_0_ring_emit_pipeline_sync */
                VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
                7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
-       .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
+       .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
        .emit_ib = gfx_v8_0_ring_emit_ib_compute,
        .emit_fence = gfx_v8_0_ring_emit_fence_compute,
        .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
                7 + /* gfx_v8_0_ring_emit_pipeline_sync */
                17 + /* gfx_v8_0_ring_emit_vm_flush */
                7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-       .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
+       .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
        .emit_fence = gfx_v8_0_ring_emit_fence_kiq,
        .test_ring = gfx_v8_0_ring_test_ring,
        .insert_nop = amdgpu_ring_insert_nop,
        adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
        adev->gds.gws.total_size = 64;
        adev->gds.oa.total_size = 16;
+       adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
        if (adev->gds.mem.total_size == 64 * 1024) {
                adev->gds.mem.gfx_partition_size = 4096;
 
        unsigned vmid = AMDGPU_JOB_GET_VMID(job);
        u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+       /* Currently, there is a high possibility to get wave ID mismatch
+        * between ME and GDS, leading to a hw deadlock, because ME generates
+        * different wave IDs than the GDS expects. This situation happens
+        * randomly when at least 5 compute pipes use GDS ordered append.
+        * The wave IDs generated by ME are also wrong after suspend/resume.
+        * Those are probably bugs somewhere else in the kernel driver.
+        *
+        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+        * GDS to 0 for this ring (me/pipe).
+        */
+       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+               amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
+               amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+       }
+
        amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
        BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
        amdgpu_ring_write(ring,
                SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
                2 + /* gfx_v9_0_ring_emit_vm_flush */
                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
-       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
+       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
        .emit_ib = gfx_v9_0_ring_emit_ib_compute,
        .emit_fence = gfx_v9_0_ring_emit_fence,
        .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
                SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
                2 + /* gfx_v9_0_ring_emit_vm_flush */
                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
-       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
+       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
        .emit_fence = gfx_v9_0_ring_emit_fence_kiq,
        .test_ring = gfx_v9_0_ring_test_ring,
        .insert_nop = amdgpu_ring_insert_nop,
                break;
        }
 
+       switch (adev->asic_type) {
+       case CHIP_VEGA10:
+       case CHIP_VEGA20:
+               adev->gds.gds_compute_max_wave_id = 0x7ff;
+               break;
+       case CHIP_VEGA12:
+               adev->gds.gds_compute_max_wave_id = 0x27f;
+               break;
+       case CHIP_RAVEN:
+               if (adev->rev_id >= 0x8)
+                       adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
+               else
+                       adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
+               break;
+       default:
+               /* this really depends on the chip */
+               adev->gds.gds_compute_max_wave_id = 0x7ff;
+               break;
+       }
+
        adev->gds.gws.total_size = 64;
        adev->gds.oa.total_size = 16;
 
 
  * caches (L2/vL1/sL1/I$). */
 #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
 
+/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
+ * This will reset wave ID counters for the IB.
+ */
+#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
+
 struct drm_amdgpu_cs_chunk_ib {
        __u32 _pad;
        /** AMDGPU_IB_FLAG_* */