linux-stable/drivers/gpu/drm/i915/i915_gpu_error.h

/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright <EFBFBD> 2008-2018 Intel Corporation
 */

#ifndef _I915_GPU_ERROR_H_
#define _I915_GPU_ERROR_H_

#include <linux/atomic.h>
#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/sched.h>

#include <drm/drm_mm.h>

#include "gt/intel_engine.h"
#include "gt/uc/intel_uc_fw.h"

#include "intel_device_info.h"

#include "i915_gem.h"
#include "i915_gem_gtt.h"
#include "i915_params.h"
#include "i915_scheduler.h"

struct drm_i915_private;
struct intel_overlay_error_state;
struct intel_display_error_state;

struct i915_gpu_state {
	struct kref ref;
	ktime_t time;
	ktime_t boottime;
	ktime_t uptime;
	unsigned long capture;
	unsigned long epoch;

	struct drm_i915_private *i915;

	char error_msg[128];
	bool simulated;
	bool awake;
	bool wakelock;
	bool suspended;
	int iommu;
	u32 reset_count;
	u32 suspend_count;
	struct intel_device_info device_info;
	struct intel_runtime_info runtime_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct i915_error_uc {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct drm_i915_error_object *guc_log;
	} uc;

	/* Generic register state */
	u32 eir;
	u32 pgtbl_er;
	u32 ier;
	u32 gtier[6], ngtier;
	u32 ccid;
	u32 derrmr;
	u32 forcewake;
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */
	u32 done_reg;
	u32 gac_eco;
	u32 gam_ecochk;
	u32 gab_ctl;
	u32 gfx_mode;
	u32 gtt_cache;

	u32 nfence;
	u64 fence[I915_MAX_NUM_FENCES];
	struct intel_overlay_error_state *overlay;
	struct intel_display_error_state *display;

	struct drm_i915_error_engine {
		const struct intel_engine_cs *engine;

		/* Software tracked state */
		bool idle;
		unsigned long hangcheck_timestamp;
		int num_requests;
		u32 reset_count;

		/* position of active request inside the ring */
		u32 rq_head, rq_post, rq_tail;

		/* our own tracking of ring head and tail */
		u32 cpu_ring_head;
		u32 cpu_ring_tail;

		/* Register state */
		u32 start;
		u32 tail;
		u32 head;
		u32 ctl;
		u32 mode;
		u32 hws;
		u32 ipeir;
		u32 ipehr;
		u32 bbstate;
		u32 instpm;
		u32 instps;
		u64 bbaddr;
		u64 acthd;
		u32 fault_reg;
		u64 faddr;
		u32 rc_psmi; /* sleep state */
		struct intel_instdone instdone;

		struct drm_i915_error_context {
			char comm[TASK_COMM_LEN];
			pid_t pid;
			int active;
			int guilty;
			struct i915_sched_attr sched_attr;
		} context;

		struct drm_i915_error_object {
			u64 gtt_offset;
			u64 gtt_size;
			u32 gtt_page_sizes;
			int num_pages;
			int page_count;
			int unused;
			u32 *pages[0];
		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;

		struct drm_i915_error_object **user_bo;
		long user_bo_count;

		struct drm_i915_error_object *wa_ctx;
		struct drm_i915_error_object *default_state;

		struct drm_i915_error_request {
			unsigned long flags;
			long jiffies;
			pid_t pid;
			u32 context;
			u32 seqno;
			u32 start;
			u32 head;
			u32 tail;
			struct i915_sched_attr sched_attr;
		} *requests, execlist[EXECLIST_MAX_PORTS];
		unsigned int num_ports;

		struct {
			u32 gfx_mode;
			union {
				u64 pdp[4];
				u32 pp_dir_base;
			};
		} vm_info;

		struct drm_i915_error_engine *next;
	} *engine;

	struct scatterlist *sgl, *fit;
};

struct i915_gpu_error {
	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_state *first_error;

	atomic_t pending_fb_pin;

	/** Number of times the device has been reset (global) */
	atomic_t reset_count;

	/** Number of times an engine has been reset */
	atomic_t reset_engine_count[I915_NUM_ENGINES];
};

struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	struct scatterlist *sgl, *cur, *end;

	char *buf;
	size_t bytes;
	size_t size;
	loff_t iter;

	int err;
};

#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)

__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);

struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
			      intel_engine_mask_t engine_mask,
			      const char *error_msg);

static inline struct i915_gpu_state *
i915_gpu_state_get(struct i915_gpu_state *gpu)
{
	kref_get(&gpu->ref);
	return gpu;
}

ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
				      char *buf, loff_t offset, size_t count);

void __i915_gpu_state_free(struct kref *kref);
static inline void i915_gpu_state_put(struct i915_gpu_state *gpu)
{
	if (gpu)
		kref_put(&gpu->ref, __i915_gpu_state_free);
}

struct i915_gpu_state *i915_first_error_state(struct drm_i915_private *i915);
void i915_reset_error_state(struct drm_i915_private *i915);
void i915_disable_error_state(struct drm_i915_private *i915, int err);

#else

static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
					    u32 engine_mask,
					    const char *error_msg)
{
}

static inline struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	return ERR_PTR(-ENODEV);
}

static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}

static inline void i915_disable_error_state(struct drm_i915_private *i915,
					    int err)
{
}

#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */

#endif /* _I915_GPU_ERROR_H_ */
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								/*
 								 * SPDX-License-Identifier: MIT
 								 *
 								 * Copyright <EFBFBD> 2008-2018 Intel Corporation
 								 */
 								#ifndef _I915_GPU_ERROR_H_
 								#define _I915_GPU_ERROR_H_
-												drm/i915/gt: Use intel_gt as the primary object for handling resets

Having taken the first step in encapsulating the functionality by moving
the related files under gt/, the next step is to start encapsulating by
passing around the relevant structs rather than the global
drm_i915_private. In this step, we pass intel_gt to intel_reset.c

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190712192953.9187-1-chris@chris-wilson.co.uk

											
										
										
											2019-07-12 19:29:53 +00:00
+								#include <linux/atomic.h>
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								#include <linux/kref.h>
 								#include <linux/ktime.h>
 								#include <linux/sched.h>
 								#include <drm/drm_mm.h>
-												drm/i915: Move GraphicsTechnology files under gt/

Start partitioning off the code that talks to the hardware (GT) from the
uapi layers and move the device facing code under gt/

One casualty is s/intel_ringbuffer.h/intel_engine.h/ with the plan to
subdivide that header and body further (and split out the submission
code from the ringbuffer and logical context handling). This patch aims
to be simple motion so git can fixup inflight patches with little mess.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424174839.7141-1-chris@chris-wilson.co.uk

											
										
										
											2019-04-24 17:48:39 +00:00
+								#include "gt/intel_engine.h"
-												drm/i915/uc: move GuC and HuC files under gt/uc/

Both microcontrollers are part of the GT HW and are closely related to
GT operations. To keep all the files cleanly together, they've been
placed in their own subdir inside the gt/ folder

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190713100016.8026-6-chris@chris-wilson.co.uk
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

											
										
										
											2019-07-13 10:00:11 +00:00
+								#include "gt/uc/intel_uc_fw.h"
-												drm/i915: Move GraphicsTechnology files under gt/

Start partitioning off the code that talks to the hardware (GT) from the
uapi layers and move the device facing code under gt/

One casualty is s/intel_ringbuffer.h/intel_engine.h/ with the plan to
subdivide that header and body further (and split out the submission
code from the ringbuffer and logical context handling). This patch aims
to be simple motion so git can fixup inflight patches with little mess.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190424174839.7141-1-chris@chris-wilson.co.uk

											
										
										
											2019-04-24 17:48:39 +00:00
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								#include "intel_device_info.h"
 								#include "i915_gem.h"
 								#include "i915_gem_gtt.h"
 								#include "i915_params.h"
-												drm/i915: Pack params to engine->schedule() into a struct

Today we only want to pass along the priority to engine->schedule(), but
in the future we want to have much more control over the various aspects
of the GPU during a context's execution, for example controlling the
frequency allowed. As we need an ever growing number of parameters for
scheduling, move those into a struct for convenience.

v2: Move the anonymous struct into its own function for legibility and
ye olde gcc.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180418184052.7129-3-chris@chris-wilson.co.uk

											
										
										
											2018-04-18 18:40:52 +00:00
+								#include "i915_scheduler.h"
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 								struct drm_i915_private;
 								struct intel_overlay_error_state;
 								struct intel_display_error_state;
 								struct i915_gpu_state {
 									struct kref ref;
 									ktime_t time;
 									ktime_t boottime;
 									ktime_t uptime;
-												drm/i915: Print error state times relative to capture

Using plain jiffies in error state output makes the output
time differences relative to the current system time. This
is wrong as it makes output time differences dependent
of when the error state is printed rather than when it is
captured.

Store capture jiffies into error state and use it
when outputting the state to fix time differences output.

v2: use engine timestamp as epoch, output formatting (Chris)
v3: pass epoch to print_engine/request (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180430075259.4476-1-mika.kuoppala@linux.intel.com

											
										
										
											2018-04-30 07:52:59 +00:00
+									unsigned long capture;
 									unsigned long epoch;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 									struct drm_i915_private *i915;
 									char error_msg[128];
 									bool simulated;
 									bool awake;
 									bool wakelock;
 									bool suspended;
 									int iommu;
 									u32 reset_count;
 									u32 suspend_count;
 									struct intel_device_info device_info;
-												drm/i915: start moving runtime device info to a separate struct

First move the low hanging fruit, the fields that are only initialized
runtime. Use RUNTIME_INFO() exclusively to access the fields.

Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/c24fe7a4b0492a888690c46814c0ff21ce2f12b1.1546267488.git.jani.nikula@intel.com

											
										
										
											2018-12-31 14:56:41 +00:00
+									struct intel_runtime_info runtime_info;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+									struct intel_driver_caps driver_caps;
 									struct i915_params params;
 									struct i915_error_uc {
 										struct intel_uc_fw guc_fw;
 										struct intel_uc_fw huc_fw;
 										struct drm_i915_error_object *guc_log;
 									} uc;
 									/* Generic register state */
 									u32 eir;
 									u32 pgtbl_er;
 									u32 ier;
-												drm/i915/icl: Read the correct Gen11 interrupt registers

Stop reading some now deprecated interrupt registers in both
debugfs and error state. Instead, read the new equivalents in the
Gen11 interrupt repartitioning scheme.

Note that the equivalent to the PM ISR & IIR cannot be read without
affecting the current state of the system, so I've opted for leaving
them out. See gen11_reset_one_iir() for more info.

v2: else if !!! (Paulo)
v3: another else if (Vinay)
v4:
  - Rebased
  - Renamed patch
  - Improved the ordering of GENs
  - Improved the printing of per-GEN info
v5: Avoid maybe-unitialized & add comment explaining the lack
    of PM ISR & IIR

Suggested-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Signed-off-by: Oscar Mateo <oscar.mateo@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Sagar Arun Kamble <sagar.a.kamble@intel.com>
Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
[Paulo: fix commit message and coding style.]
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/1525989595-18220-1-git-send-email-oscar.mateo@intel.com

											
										
										
											2018-05-10 21:59:55 +00:00
+									u32 gtier[6], ngtier;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+									u32 ccid;
 									u32 derrmr;
 									u32 forcewake;
 									u32 error; /* gen6+ */
 									u32 err_int; /* gen7 */
 									u32 fault_data0; /* gen8, gen9 */
 									u32 fault_data1; /* gen8, gen9 */
 									u32 done_reg;
 									u32 gac_eco;
 									u32 gam_ecochk;
 									u32 gab_ctl;
 									u32 gfx_mode;
-												drm/i915: include GTT page-size info in error state

It might prove useful in the future to know if the vma is utilising
huge-GTT-pages. Related to this is the GTT cache, where there is some HW
"quirkiness" where it must be disabled if using 2M pages, so include
that for good measure.

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190909171646.22090-1-matthew.auld@intel.com

											
										
										
											2019-09-09 17:16:46 +00:00
+									u32 gtt_cache;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 									u32 nfence;
 									u64 fence[I915_MAX_NUM_FENCES];
 									struct intel_overlay_error_state *overlay;
 									struct intel_display_error_state *display;
 									struct drm_i915_error_engine {
-												drm/i915: Only include active engines in the capture state

Skip printing out idle engines that did not contribute to the GPU hang.
As the number of engines gets ever larger, we have increasing noise in
the error state where typically there is only one guilty request on one
engine that we need to inspect.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190808144511.32269-1-chris@chris-wilson.co.uk

											
										
										
											2019-08-08 14:45:11 +00:00
+										const struct intel_engine_cs *engine;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+										/* Software tracked state */
 										bool idle;
 										unsigned long hangcheck_timestamp;
 										int num_requests;
 										u32 reset_count;
 										/* position of active request inside the ring */
 										u32 rq_head, rq_post, rq_tail;
 										/* our own tracking of ring head and tail */
 										u32 cpu_ring_head;
 										u32 cpu_ring_tail;
 										/* Register state */
 										u32 start;
 										u32 tail;
 										u32 head;
 										u32 ctl;
 										u32 mode;
 										u32 hws;
 										u32 ipeir;
 										u32 ipehr;
 										u32 bbstate;
 										u32 instpm;
 										u32 instps;
 										u64 bbaddr;
 										u64 acthd;
 										u32 fault_reg;
 										u64 faddr;
 										u32 rc_psmi; /* sleep state */
 										struct intel_instdone instdone;
 										struct drm_i915_error_context {
 											char comm[TASK_COMM_LEN];
 											pid_t pid;
 											int active;
 											int guilty;
-												drm/i915: Pack params to engine->schedule() into a struct

Today we only want to pass along the priority to engine->schedule(), but
in the future we want to have much more control over the various aspects
of the GPU during a context's execution, for example controlling the
frequency allowed. As we need an ever growing number of parameters for
scheduling, move those into a struct for convenience.

v2: Move the anonymous struct into its own function for legibility and
ye olde gcc.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180418184052.7129-3-chris@chris-wilson.co.uk

											
										
										
											2018-04-18 18:40:52 +00:00
+											struct i915_sched_attr sched_attr;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+										} context;
 										struct drm_i915_error_object {
 											u64 gtt_offset;
 											u64 gtt_size;
-												drm/i915: include GTT page-size info in error state

It might prove useful in the future to know if the vma is utilising
huge-GTT-pages. Related to this is the GTT cache, where there is some HW
"quirkiness" where it must be disabled if using 2M pages, so include
that for good measure.

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20190909171646.22090-1-matthew.auld@intel.com

											
										
										
											2019-09-09 17:16:46 +00:00
+											u32 gtt_page_sizes;
-												drm/i915: Handle incomplete Z_FINISH for compressed error states

The final call to zlib_deflate(Z_FINISH) may require more output
space to be allocated and so needs to re-invoked. Failure to do so in
the current code leads to incomplete zlib streams (albeit intact due to
the use of Z_SYNC_FLUSH) resulting in the occasional short object
capture.

v2: Check against overrunning our pre-allocated page array
v3: Drop Z_SYNC_FLUSH entirely

Testcase: igt/i915-error-capture.js
Fixes: 0a97015d45ee ("drm/i915: Compress GPU objects in error state")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: <stable@vger.kernel.org> # v4.10+
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181003082422.23214-1-chris@chris-wilson.co.uk

											
										
										
											2018-10-03 08:24:22 +00:00
+											int num_pages;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+											int page_count;
 											int unused;
 											u32 *pages[0];
 										} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 										struct drm_i915_error_object **user_bo;
 										long user_bo_count;
 										struct drm_i915_error_object *wa_ctx;
 										struct drm_i915_error_object *default_state;
 										struct drm_i915_error_request {
-												drm/i915: Replace global breadcrumbs with per-context interrupt tracking

A few years ago, see commit 688e6c725816 ("drm/i915: Slaughter the
thundering i915_wait_request herd"), the issue of handling multiple
clients waiting in parallel was brought to our attention. The
requirement was that every client should be woken immediately upon its
request being signaled, without incurring any cpu overhead.

To handle certain fragility of our hw meant that we could not do a
simple check inside the irq handler (some generations required almost
unbounded delays before we could be sure of seqno coherency) and so
request completion checking required delegation.

Before commit 688e6c725816, the solution was simple. Every client
waiting on a request would be woken on every interrupt and each would do
a heavyweight check to see if their request was complete. Commit
688e6c725816 introduced an rbtree so that only the earliest waiter on
the global timeline would woken, and would wake the next and so on.
(Along with various complications to handle requests being reordered
along the global timeline, and also a requirement for kthread to provide
a delegate for fence signaling that had no process context.)

The global rbtree depends on knowing the execution timeline (and global
seqno). Without knowing that order, we must instead check all contexts
queued to the HW to see which may have advanced. We trim that list by
only checking queued contexts that are being waited on, but still we
keep a list of all active contexts and their active signalers that we
inspect from inside the irq handler. By moving the waiters onto the fence
signal list, we can combine the client wakeup with the dma_fence
signaling (a dramatic reduction in complexity, but does require the HW
being coherent, the seqno must be visible from the cpu before the
interrupt is raised - we keep a timer backup just in case).

Having previously fixed all the issues with irq-seqno serialisation (by
inserting delays onto the GPU after each request instead of random delays
on the CPU after each interrupt), we can rely on the seqno state to
perfom direct wakeups from the interrupt handler. This allows us to
preserve our single context switch behaviour of the current routine,
with the only downside that we lose the RT priority sorting of wakeups.
In general, direct wakeup latency of multiple clients is about the same
(about 10% better in most cases) with a reduction in total CPU time spent
in the waiter (about 20-50% depending on gen). Average herd behaviour is
improved, but at the cost of not delegating wakeups on task_prio.

v2: Capture fence signaling state for error state and add comments to
warm even the most cold of hearts.
v3: Check if the request is still active before busywaiting
v4: Reduce the amount of pointer misdirection with list_for_each_safe
and using a local i915_request variable inside the loops
v5: Add a missing pluralisation to a purely informative selftest message.

References: 688e6c725816 ("drm/i915: Slaughter the thundering i915_wait_request herd")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190129205230.19056-2-chris@chris-wilson.co.uk

											
										
										
											2019-01-29 20:52:29 +00:00
+											unsigned long flags;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+											long jiffies;
 											pid_t pid;
 											u32 context;
 											u32 seqno;
-												drm/i915: Show ring->start for the ELSP context/request queue

Since the advent of execlists, the HW no longer executes from a single
statically assigned ring, but instead switches to a different ring for
each context (logical ringbuffer contexts as it is called). So a good way
to tally the executing context against what we have queued is by
comparing the RING_START register against our requests. Make it so.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180502104150.29874-1-chris@chris-wilson.co.uk

											
										
										
											2018-05-02 10:41:50 +00:00
+											u32 start;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+											u32 head;
 											u32 tail;
-												drm/i915: Pack params to engine->schedule() into a struct

Today we only want to pass along the priority to engine->schedule(), but
in the future we want to have much more control over the various aspects
of the GPU during a context's execution, for example controlling the
frequency allowed. As we need an ever growing number of parameters for
scheduling, move those into a struct for convenience.

v2: Move the anonymous struct into its own function for legibility and
ye olde gcc.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180418184052.7129-3-chris@chris-wilson.co.uk

											
										
										
											2018-04-18 18:40:52 +00:00
+											struct i915_sched_attr sched_attr;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+										} *requests, execlist[EXECLIST_MAX_PORTS];
 										unsigned int num_ports;
 										struct {
 											u32 gfx_mode;
 											union {
 												u64 pdp[4];
 												u32 pp_dir_base;
 											};
 										} vm_info;
-												drm/i915: Only include active engines in the capture state

Skip printing out idle engines that did not contribute to the GPU hang.
As the number of engines gets ever larger, we have increasing noise in
the error state where typically there is only one guilty request on one
engine that we need to inspect.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190808144511.32269-1-chris@chris-wilson.co.uk

											
										
										
											2019-08-08 14:45:11 +00:00
 										struct drm_i915_error_engine *next;
 									} *engine;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
-												drm/i915: Cache the error string

Currently, we convert the error state into a string every time we read
from sysfs (and sysfs reads in page size (4KiB) chunks). We do try to
window the string and only capture the portion that is being read, but
that means that we must always convert up to the window to find the
start. For a very large error state bordering on EXEC_OBJECT_CAPTURE
abuse, this is noticeable as it degrades to O(N^2)!

As we do not have a convenient hook for sysfs open(), and we would like
to keep the lazy conversion into a string, do the conversion of the
whole string on the first read and keep the string until the error state
is freed.

v2: Don't double advance simple_read_from_buffer
v3: Due to extreme pain of lack of vrealloc, use a scatterlist
v4: Keep the forward iterator loosely cached
v5: Stylistic improvements to reduce patch size

Reported-by: Jason Ekstrand <jason@jlekstrand.net>
Testcase: igt/gem_exec_capture/many*
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181123132325.26541-1-chris@chris-wilson.co.uk

											
										
										
											2018-11-23 13:23:25 +00:00
+									struct scatterlist *sgl, *fit;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								};
 								struct i915_gpu_error {
 									/* For reset and error_state handling. */
 									spinlock_t lock;
 									/* Protected by the above dev->gpu_error.lock. */
 									struct i915_gpu_state *first_error;
 									atomic_t pending_fb_pin;
-												drm/i915: Revoke mmaps and prevent access to fence registers across reset

Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers with them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, the dilemma on how to serialise
access to a global resource on one side was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    1  int readside(void) {
    2      int idx;
    3      rcu_read_lock();
    4	   if (nomoresrcu) {
    5          rcu_read_unlock();
    6	       return -EINVAL;
    7      }
    8	   idx = srcu_read_lock(&ss);
    9	   rcu_read_unlock();
    10	   /* SRCU read-side critical section. */
    11	   srcu_read_unlock(&ss, idx);
    12	   return 0;
    13 }
    14
    15 void cleanup(void)
    16 {
    17     nomoresrcu = 1;
    18     synchronize_rcu();
    19     synchronize_srcu(&ss);
    20     cleanup_srcu_struct(&ss);
    21 }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.

v2: Use expedited rcu barriers to match our earlier timing
characteristics.
v3: Try to annotate locking contexts for sparse
v4: Reduce selftest lock duration to avoid a reset deadlock with fences
v5: s/srcu/reset_backoff_srcu/
v6: Remove more stale comments

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk

											
										
										
											2019-02-08 15:37:03 +00:00
+									/** Number of times the device has been reset (global) */
-												drm/i915/gt: Use intel_gt as the primary object for handling resets

Having taken the first step in encapsulating the functionality by moving
the related files under gt/, the next step is to start encapsulating by
passing around the relevant structs rather than the global
drm_i915_private. In this step, we pass intel_gt to intel_reset.c

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190712192953.9187-1-chris@chris-wilson.co.uk

											
										
										
											2019-07-12 19:29:53 +00:00
+									atomic_t reset_count;
-												drm/i915: Revoke mmaps and prevent access to fence registers across reset

Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers with them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, the dilemma on how to serialise
access to a global resource on one side was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    1  int readside(void) {
    2      int idx;
    3      rcu_read_lock();
    4	   if (nomoresrcu) {
    5          rcu_read_unlock();
    6	       return -EINVAL;
    7      }
    8	   idx = srcu_read_lock(&ss);
    9	   rcu_read_unlock();
    10	   /* SRCU read-side critical section. */
    11	   srcu_read_unlock(&ss, idx);
    12	   return 0;
    13 }
    14
    15 void cleanup(void)
    16 {
    17     nomoresrcu = 1;
    18     synchronize_rcu();
    19     synchronize_srcu(&ss);
    20     cleanup_srcu_struct(&ss);
    21 }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.

v2: Use expedited rcu barriers to match our earlier timing
characteristics.
v3: Try to annotate locking contexts for sparse
v4: Reduce selftest lock duration to avoid a reset deadlock with fences
v5: s/srcu/reset_backoff_srcu/
v6: Remove more stale comments

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk

											
										
										
											2019-02-08 15:37:03 +00:00
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+									/** Number of times an engine has been reset */
-												drm/i915/gt: Use intel_gt as the primary object for handling resets

Having taken the first step in encapsulating the functionality by moving
the related files under gt/, the next step is to start encapsulating by
passing around the relevant structs rather than the global
drm_i915_private. In this step, we pass intel_gt to intel_reset.c

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190712192953.9187-1-chris@chris-wilson.co.uk

											
										
										
											2019-07-12 19:29:53 +00:00
+									atomic_t reset_engine_count[I915_NUM_ENGINES];
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								};
 								struct drm_i915_error_state_buf {
 									struct drm_i915_private *i915;
-												drm/i915: Cache the error string

Currently, we convert the error state into a string every time we read
from sysfs (and sysfs reads in page size (4KiB) chunks). We do try to
window the string and only capture the portion that is being read, but
that means that we must always convert up to the window to find the
start. For a very large error state bordering on EXEC_OBJECT_CAPTURE
abuse, this is noticeable as it degrades to O(N^2)!

As we do not have a convenient hook for sysfs open(), and we would like
to keep the lazy conversion into a string, do the conversion of the
whole string on the first read and keep the string until the error state
is freed.

v2: Don't double advance simple_read_from_buffer
v3: Due to extreme pain of lack of vrealloc, use a scatterlist
v4: Keep the forward iterator loosely cached
v5: Stylistic improvements to reduce patch size

Reported-by: Jason Ekstrand <jason@jlekstrand.net>
Testcase: igt/gem_exec_capture/many*
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181123132325.26541-1-chris@chris-wilson.co.uk

											
										
										
											2018-11-23 13:23:25 +00:00
+									struct scatterlist *sgl, *cur, *end;
 									char *buf;
 									size_t bytes;
 									size_t size;
 									loff_t iter;
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+									int err;
 								};
 								#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 								__printf(2, 3)
 								void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 								struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
 								void i915_capture_error_state(struct drm_i915_private *dev_priv,
-												drm/i915: Move intel_engine_mask_t around for use by i915_request_types.h

We want to use intel_engine_mask_t inside i915_request.h, which means
extracting it from the general header file mess and placing it inside a
types.h. A knock on effect is that the compiler wants to warn about
type-contraction of ALL_ENGINES into intel_engine_maskt_t, so prepare
for the worst.

v2: Use intel_engine_mask_t consistently
v3: Move I915_NUM_ENGINES to its natural home at the end of the enum

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190401162641.10963-1-chris@chris-wilson.co.uk
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

											
										
										
											2019-04-01 16:26:39 +00:00
+											      intel_engine_mask_t engine_mask,
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+											      const char *error_msg);
 								static inline struct i915_gpu_state *
 								i915_gpu_state_get(struct i915_gpu_state *gpu)
 								{
 									kref_get(&gpu->ref);
 									return gpu;
 								}
-												drm/i915: Cache the error string

Currently, we convert the error state into a string every time we read
from sysfs (and sysfs reads in page size (4KiB) chunks). We do try to
window the string and only capture the portion that is being read, but
that means that we must always convert up to the window to find the
start. For a very large error state bordering on EXEC_OBJECT_CAPTURE
abuse, this is noticeable as it degrades to O(N^2)!

As we do not have a convenient hook for sysfs open(), and we would like
to keep the lazy conversion into a string, do the conversion of the
whole string on the first read and keep the string until the error state
is freed.

v2: Don't double advance simple_read_from_buffer
v3: Due to extreme pain of lack of vrealloc, use a scatterlist
v4: Keep the forward iterator loosely cached
v5: Stylistic improvements to reduce patch size

Reported-by: Jason Ekstrand <jason@jlekstrand.net>
Testcase: igt/gem_exec_capture/many*
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181123132325.26541-1-chris@chris-wilson.co.uk

											
										
										
											2018-11-23 13:23:25 +00:00
+								ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
 												      char *buf, loff_t offset, size_t count);
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								void __i915_gpu_state_free(struct kref *kref);
 								static inline void i915_gpu_state_put(struct i915_gpu_state *gpu)
 								{
 									if (gpu)
 										kref_put(&gpu->ref, __i915_gpu_state_free);
 								}
 								struct i915_gpu_state *i915_first_error_state(struct drm_i915_private *i915);
 								void i915_reset_error_state(struct drm_i915_private *i915);
-												drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture

Since capturing the error state requires fiddling around with the GGTT
to read arbitrary buffers and is itself run under stop_machine(), it
deadlocks the machine (effectively a hard hang) when run in conjunction
with Broxton's VTd workaround to serialize GGTT access.

v2: Store the ERR_PTR in first_error so that the error can be reported
to the user via sysfs.
v3: Mention the quirk in dmesg (using info as per usual)

Fixes: 0ef34ad6222a ("drm/i915: Serialize GTT/Aperture accesses on BXT")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: John Harrison <john.C.Harrison@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181102161232.17742-5-chris@chris-wilson.co.uk

											
										
										
											2018-11-02 16:12:12 +00:00
+								void i915_disable_error_state(struct drm_i915_private *i915, int err);
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
 								#else
 								static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
 													    u32 engine_mask,
 													    const char *error_msg)
 								{
 								}
 								static inline struct i915_gpu_state *
 								i915_first_error_state(struct drm_i915_private *i915)
 								{
-												drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture

Since capturing the error state requires fiddling around with the GGTT
to read arbitrary buffers and is itself run under stop_machine(), it
deadlocks the machine (effectively a hard hang) when run in conjunction
with Broxton's VTd workaround to serialize GGTT access.

v2: Store the ERR_PTR in first_error so that the error can be reported
to the user via sysfs.
v3: Mention the quirk in dmesg (using info as per usual)

Fixes: 0ef34ad6222a ("drm/i915: Serialize GTT/Aperture accesses on BXT")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: John Harrison <john.C.Harrison@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181102161232.17742-5-chris@chris-wilson.co.uk

											
										
										
											2018-11-02 16:12:12 +00:00
+									return ERR_PTR(-ENODEV);
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								}
 								static inline void i915_reset_error_state(struct drm_i915_private *i915)
 								{
 								}
-												drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture

Since capturing the error state requires fiddling around with the GGTT
to read arbitrary buffers and is itself run under stop_machine(), it
deadlocks the machine (effectively a hard hang) when run in conjunction
with Broxton's VTd workaround to serialize GGTT access.

v2: Store the ERR_PTR in first_error so that the error can be reported
to the user via sysfs.
v3: Mention the quirk in dmesg (using info as per usual)

Fixes: 0ef34ad6222a ("drm/i915: Serialize GTT/Aperture accesses on BXT")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: John Harrison <john.C.Harrison@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20181102161232.17742-5-chris@chris-wilson.co.uk

											
										
										
											2018-11-02 16:12:12 +00:00
+								static inline void i915_disable_error_state(struct drm_i915_private *i915,
 													    int err)
 								{
 								}
-												drm/i915: Move i915_gpu_error into its own header

Error state management code was moved into separate .c unit
but we didn't move related definitions into own header.

v2: move also intel_display_error_state forward decl
    fix ("Prefer 'unsigned int' to bare use of 'unsigned'")
    warnings detected by checkpatch in moved code (Michal)

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180308095037.18264-5-michal.wajdeczko@intel.com

											
										
										
											2018-03-08 09:50:37 +00:00
+								#endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
 								#endif /* _I915_GPU_ERROR_H_ */