drm/amdkfd: add raise exception event function

Exception events can be generated from interrupts or queue activity.

The raise event function will save the exception status of a queue, device
or process, then notify the debugger of the status change by writing to a
debugger-polled file descriptor that the debugger provides during debug
attach.
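
For illustration only (not part of this patch), a minimal sketch of the
debugger side of this handshake, assuming the polled descriptor is the read
end of a pipe whose write end is handed to KFD at attach time; the attach
and query ioctls themselves are out of scope here:

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int ev[2];	/* ev[1] would be handed to KFD at debug attach (not shown) */
	char drain[64];

	if (pipe(ev))
		return 1;

	for (;;) {
		struct pollfd pfd = { .fd = ev[0], .events = POLLIN };

		if (poll(&pfd, 1, -1) < 0)
			break;

		/* drain the '.' notification bytes, then query the saved
		 * exception state (the query ioctl comes in follow-up patches)
		 */
		if (read(ev[0], drain, sizeof(drain)) <= 0)
			break;
		printf("exception event pending\n");
	}
	return 0;
}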

For memory violation exceptions, extra exception data will be saved.

The debugger will be able to query the saved exception states through query
operations provided by follow-up patches.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Jonathan Kim 2022-04-06 12:03:31 -04:00 committed by Alex Deucher
parent 69a8c3ae2d
commit 44b87bb083
4 changed files with 123 additions and 0 deletions

drivers/gpu/drm/amd/amdkfd/kfd_debug.c

@@ -24,6 +24,107 @@
#include "kfd_device_queue_manager.h"
#include <linux/file.h>

void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;
	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}
/* update process/device/queue exception status, write to descriptor
 * only if exception_status is enabled.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
@@ -99,6 +200,9 @@ static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int
{
	int i;

	if (!unwind)
		cancel_work_sync(&target->debug_event_workarea);

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
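
For context only, a hedged sketch of a possible caller (not from this patch)
reporting a device-scope memory violation through the new helper.
report_memory_violation() and its calling convention are hypothetical;
kfd_dbg_ev_raise(), KFD_EC_MASK() and EC_DEVICE_MEMORY_VIOLATION come from
this series:

#include "kfd_priv.h"
#include "kfd_debug.h"

/* Hypothetical caller, for illustration only. */
static void report_memory_violation(struct kfd_process *p, struct kfd_node *dev,
				    void *data, size_t data_size)
{
	/* Device-scope event: kfd_dbg_ev_raise() routes it to the matching
	 * kfd_process_device, saves a copy of the data blob (first fault only)
	 * for a later debugger query, and, with use_worker set, defers the
	 * descriptor write to debug_event_write_work_handler().
	 */
	if (!kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
			      p, dev, 0 /* source_id unused for device scope */,
			      true, data, data_size))
		pr_debug("debugger is not subscribed to memory violations\n");
}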

drivers/gpu/drm/amd/amdkfd/kfd_debug.h

@@ -25,6 +25,11 @@
#include "kfd_priv.h"

bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data,
			size_t exception_data_size);
int kfd_dbg_trap_disable(struct kfd_process *target);
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info,
@@ -35,6 +40,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev)
		KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0);
}

void debug_event_write_work_handler(struct work_struct *work);

/*
 * If GFX off is enabled, chips that do not support RLC restore for the debug
 * registers will disable GFX off temporarily for the entire debug session.

drivers/gpu/drm/amd/amdkfd/kfd_priv.h

@@ -529,6 +529,7 @@ struct queue_properties {
	uint32_t ctl_stack_size;
	uint64_t tba_addr;
	uint64_t tma_addr;
	uint64_t exception_status;
};

#define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
@@ -820,6 +821,11 @@ struct kfd_process_device {
	uint64_t page_in;
	uint64_t page_out;

	/* Exception code status */
	uint64_t exception_status;
	void *vm_fault_exc_data;
	size_t vm_fault_exc_data_size;

	/* Tracks debug per-vmid request settings */
	uint32_t spi_dbg_override;
	uint32_t spi_dbg_launch_mode;
@@ -955,12 +961,16 @@ struct kfd_process {
	/* Exception code enable mask and status */
	uint64_t exception_enable_mask;
	uint64_t exception_status;

	/* shared virtual memory registered by this process */
	struct svm_range_list svms;

	bool xnack_enabled;

	/* Work area for debugger event writer worker. */
	struct work_struct debug_event_workarea;

	/* Tracks debug per-vmid request for debug flags */
	bool dbg_flags;

drivers/gpu/drm/amd/amdkfd/kfd_process.c

@@ -1509,6 +1509,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
	kfd_unref_process(process);

	get_task_struct(process->lead_thread);

	INIT_WORK(&process->debug_event_workarea, debug_event_write_work_handler);

	return process;

err_register_notifier:
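
As a closing illustration (a toy module, not KFD code), a sketch of the
work-item lifecycle this patch wires up across the files above, using only
the generic workqueue API: INIT_WORK() at process creation, schedule_work()
when an event is raised with use_worker set, and cancel_work_sync() when the
debug trap is deactivated.

#include <linux/module.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

static struct work_struct demo_work;

/* runs in process context, like debug_event_write_work_handler() */
static void demo_handler(struct work_struct *work)
{
	pr_info("deferred notification\n");
}

static int __init demo_init(void)
{
	INIT_WORK(&demo_work, demo_handler);	/* cf. create_process() */
	schedule_work(&demo_work);		/* cf. kfd_dbg_ev_raise(use_worker) */
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_work_sync(&demo_work);		/* cf. kfd_dbg_trap_deactivate() */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");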