Due to a HW bug, waves in only half the shader arrays can enter trap.
When starting a debug session, relocate all waves to the first shader
array of each shader engine and mask off the 2nd shader array as
unavailable.
When ending a debug session, re-enable the 2nd shader array per
shader engine.
User CU masking per queue cannot be guaranteed to remain functional
if requested during debugging (e.g. user cu mask requests only 2nd shader
array as an available resource leading to zero HW resources available)
nor can runtime be alerted of any of these changes during execution.
Make user CU masking and debugging mutual exclusive with respect to
availability.
If the debugger tries to attach to a process with a user cu masked
queue, return the runtime status as enabled but busy.
If the debugger tries to attach and fails to reallocate queue waves to
the first shader array of each shader engine, return the runtime status
as enabled but with an error.
In addition, like any other mutli-process debug supported devices,
disable trap temporary setup per-process to avoid performance impact from
setup overhead.
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
uint32_t gws_size;
uint64_t tba_addr;
uint64_t tma_addr;
+ uint32_t trap_en;
+ uint32_t skip_process_ctx_clear;
uint32_t is_kfd_process;
uint32_t is_aql_queue;
uint32_t queue_size;
mes_add_queue_pkt.gws_size = input->gws_size;
mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
mes_add_queue_pkt.tma_addr = input->tma_addr;
+ mes_add_queue_pkt.trap_en = input->trap_en;
+ mes_add_queue_pkt.skip_process_ctx_clear = input->skip_process_ctx_clear;
mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
- if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 4) &&
- (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) &&
- (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3))))
- mes_add_queue_pkt.trap_en = 1;
-
/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
goto out;
}
- minfo.update_flag = UPDATE_FLAG_CU_MASK;
-
mutex_lock(&p->mutex);
retval = pqm_update_mqd(&p->pqm, args->queue_id, &minfo);
#include "kfd_device_queue_manager.h"
#include <linux/file.h>
+static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
+{
+ struct mqd_update_info minfo = {0};
+ int err;
+
+ if (!q)
+ return 0;
+
+ if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
+ KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
+ return 0;
+
+ if (enable && q->properties.is_user_cu_masked)
+ return -EBUSY;
+
+ minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
+
+ q->properties.is_dbg_wa = enable;
+ err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
+ if (err)
+ q->properties.is_dbg_wa = false;
+
+ return err;
+}
+
+static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
+{
+ struct process_queue_manager *pqm = &target->pqm;
+ struct process_queue_node *pqn;
+ int r = 0;
+
+ list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+ r = kfd_dbg_set_queue_workaround(pqn->q, enable);
+ if (enable && r)
+ goto unwind;
+ }
+
+ return 0;
+
+unwind:
+ list_for_each_entry(pqn, &pqm->queues, process_queue_list)
+ kfd_dbg_set_queue_workaround(pqn->q, false);
+
+ if (enable)
+ target->runtime_info.runtime_state = r == -EBUSY ?
+ DEBUG_RUNTIME_STATE_ENABLED_BUSY :
+ DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+
+ return r;
+}
+
static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
else
kfd_dbg_set_mes_debug_mode(pdd);
}
+
+ kfd_dbg_set_workaround(target, false);
}
int kfd_dbg_trap_disable(struct kfd_process *target)
{
int i, r = 0;
+ r = kfd_dbg_set_workaround(target, true);
+ if (r)
+ return r;
+
for (i = 0; i < target->n_pdds; i++) {
struct kfd_process_device *pdd = target->pdds[i];
uint32_t *runtime_info_size);
static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev)
{
- return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
+ return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
+ KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0);
}
/*
queue_input.paging = false;
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
+ queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
+ KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0) ||
+ q->properties.is_dbg_wa;
+ queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled;
queue_type = convert_to_mes_queue_type(q->properties.type);
if (queue_type < 0) {
* updates the is_evicted flag but is a no-op otherwise.
*/
q->properties.is_evicted = !!qpd->evicted;
+ q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
+ KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) &&
+ KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0);
if (qd)
mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr,
struct cik_mqd *m;
uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
- if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
- !minfo->cu_mask.ptr)
+ if (!minfo || !minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,
struct v10_compute_mqd *m;
uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
- if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
- !minfo->cu_mask.ptr)
+ if (!minfo || !minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,
{
struct v11_compute_mqd *m;
uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
+ bool has_wa_flag = minfo && (minfo->update_flag & (UPDATE_FLAG_DBG_WA_ENABLE |
+ UPDATE_FLAG_DBG_WA_DISABLE));
- if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
- !minfo->cu_mask.ptr)
+ if (!minfo || !(has_wa_flag || minfo->cu_mask.ptr))
return;
+ m = get_mqd(mqd);
+
+ if (has_wa_flag) {
+ uint32_t wa_mask = minfo->update_flag == UPDATE_FLAG_DBG_WA_ENABLE ?
+ 0xffff : 0xffffffff;
+
+ m->compute_static_thread_mgmt_se0 = wa_mask;
+ m->compute_static_thread_mgmt_se1 = wa_mask;
+ m->compute_static_thread_mgmt_se2 = wa_mask;
+ m->compute_static_thread_mgmt_se3 = wa_mask;
+ m->compute_static_thread_mgmt_se4 = wa_mask;
+ m->compute_static_thread_mgmt_se5 = wa_mask;
+ m->compute_static_thread_mgmt_se6 = wa_mask;
+ m->compute_static_thread_mgmt_se7 = wa_mask;
+
+ return;
+ }
+
mqd_symmetrically_map_cu_mask(mm,
minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask);
- m = get_mqd(mqd);
m->compute_static_thread_mgmt_se0 = se_mask[0];
m->compute_static_thread_mgmt_se1 = se_mask[1];
m->compute_static_thread_mgmt_se2 = se_mask[2];
uint64_t addr;
struct v11_compute_mqd *m;
int size;
+ uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff;
m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr;
addr = mqd_mem_obj->gpu_addr;
m->header = 0xC0310800;
m->compute_pipelinestat_enable = 1;
- m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
- m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
- m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
- m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
- m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
- m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
- m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
- m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
+
+ m->compute_static_thread_mgmt_se0 = wa_mask;
+ m->compute_static_thread_mgmt_se1 = wa_mask;
+ m->compute_static_thread_mgmt_se2 = wa_mask;
+ m->compute_static_thread_mgmt_se3 = wa_mask;
+ m->compute_static_thread_mgmt_se4 = wa_mask;
+ m->compute_static_thread_mgmt_se5 = wa_mask;
+ m->compute_static_thread_mgmt_se6 = wa_mask;
+ m->compute_static_thread_mgmt_se7 = wa_mask;
m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
struct v9_mqd *m;
uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
- if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
- !minfo->cu_mask.ptr)
+ if (!minfo || !minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,
struct vi_mqd *m;
uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
- if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
- !minfo->cu_mask.ptr)
+ if (!minfo || !minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,
bool is_active;
bool is_gws;
uint32_t pm4_target_xcc;
+ bool is_dbg_wa;
+ bool is_user_cu_masked;
/* Not relevant for user mode queues in cp scheduling */
unsigned int vmid;
/* Relevant only for sdma queues*/
!(q).is_evicted)
enum mqd_update_flag {
- UPDATE_FLAG_CU_MASK = 0,
+ UPDATE_FLAG_DBG_WA_ENABLE = 1,
+ UPDATE_FLAG_DBG_WA_DISABLE = 2,
};
struct mqd_update_info {
return -EFAULT;
}
+ /* CUs are masked for debugger requirements so deny user mask */
+ if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
+ return -EBUSY;
+
/* ASICs that have WGPs must enforce pairwise enabled mask checks. */
- if (minfo && minfo->update_flag == UPDATE_FLAG_CU_MASK && minfo->cu_mask.ptr &&
+ if (minfo && minfo->cu_mask.ptr &&
KFD_GC_VERSION(pqn->q->device) >= IP_VERSION(10, 0, 0)) {
int i;
if (retval != 0)
return retval;
+ if (minfo && minfo->cu_mask.ptr)
+ pqn->q->properties.is_user_cu_masked = true;
+
return 0;
}
{
bool firmware_supported = true;
+ /*
+ * FIXME: GFX11 FW currently not sufficient to deal with CWSR WA.
+ * Updated FW with API changes coming soon.
+ */
if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) &&
KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) {
- firmware_supported =
- (dev->gpu->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 9;
+ firmware_supported = false;
goto out;
}