drm/amdgpu/ras: fix and update the documentation for RAS

Add new sections to amdgpu.rst, fix up formatting issues,
add additional documentation to each section.

Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Alex Deucher 2019-09-19 15:09:56 -05:00
parent a667b75c1e
commit f77c7109c0
2 changed files with 68 additions and 9 deletions

View file

@ -79,12 +79,32 @@ AMDGPU XGMI Support
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
:internal:
AMDGPU RAS debugfs control interface
====================================
AMDGPU RAS Support
==================
RAS debugfs/sysfs Control and Error Injection Interfaces
--------------------------------------------------------
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS debugfs control interface
RAS Error Count sysfs Interface
-------------------------------
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS sysfs Error Count Interface
RAS EEPROM debugfs Interface
----------------------------
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS debugfs EEPROM table reset interface
RAS VRAM Bad Pages sysfs Interface
----------------------------------
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:internal:

View file

@ -310,7 +310,18 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
/**
* DOC: AMDGPU RAS debugfs EEPROM table reset interface
*
* Usage: echo 1 > ../ras/ras_eeprom_reset will reset EEPROM table to 0 entries.
* Some boards contain an EEPROM which is used to persistently store a list of
* bad pages containing ECC errors detected in vram. This interface provides
* a way to reset the EEPROM, e.g., after testing error injection.
*
* Usage:
*
* .. code-block:: bash
*
* echo 1 > ../ras/ras_eeprom_reset
*
* will reset EEPROM table to 0 entries.
*
*/
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
size_t size, loff_t *pos)
@ -337,6 +348,27 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
.llseek = default_llseek
};
/**
* DOC: AMDGPU RAS sysfs Error Count Interface
*
* It allows user to read the error count for each IP block on the gpu through
* /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
*
* It outputs the multiple lines which report the uncorrected (ue) and corrected
* (ce) error counts.
*
* The format of one line is below,
*
* [ce|ue]: count
*
* Example:
*
* .. code-block:: bash
*
* ue: 0
* ce: 1
*
*/
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
struct device_attribute *attr, char *buf)
{
@ -781,8 +813,8 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
};
}
/*
* DOC: ras sysfs gpu_vram_bad_pages interface
/**
* DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
*
* It allows user to read the bad pages of vram on the gpu through
* /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
@ -794,14 +826,21 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
*
* gpu pfn and gpu page size are printed in hex format.
* flags can be one of below character,
*
* R: reserved, this gpu page is reserved and not able to use.
*
* P: pending for reserve, this gpu page is marked as bad, will be reserved
* in next window of page_reserve.
* in next window of page_reserve.
*
* F: unable to reserve. this gpu page can't be reserved due to some reasons.
*
* examples:
* 0x00000001 : 0x00001000 : R
* 0x00000002 : 0x00001000 : P
* Examples:
*
* .. code-block:: bash
*
* 0x00000001 : 0x00001000 : R
* 0x00000002 : 0x00001000 : P
*
*/
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,