xhuff
diff --git a/‎drivers/gpu/drm/i915/i915_debugfs.c‎
Lines changed: 7 additions & 31 deletions b/‎drivers/gpu/drm/i915/i915_debugfs.c‎
Lines changed: 7 additions & 31 deletions
diff --git a/‎drivers/gpu/drm/i915/i915_drv.h‎
Lines changed: 0 additions & 5 deletions b/‎drivers/gpu/drm/i915/i915_drv.h‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎drivers/gpu/drm/i915/i915_gem.c‎
Lines changed: 7 additions & 11 deletions b/‎drivers/gpu/drm/i915/i915_gem.c‎
Lines changed: 7 additions & 11 deletions
diff --git a/‎drivers/gpu/drm/i915/i915_gem_fence_reg.h‎
Lines changed: 0 additions & 1 deletion b/‎drivers/gpu/drm/i915/i915_gem_fence_reg.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎drivers/gpu/drm/i915/i915_gem_gtt.h‎
Lines changed: 1 addition & 0 deletions b/‎drivers/gpu/drm/i915/i915_gem_gtt.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎drivers/gpu/drm/i915/i915_gpu_error.c‎
Lines changed: 49 additions & 55 deletions b/‎drivers/gpu/drm/i915/i915_gpu_error.c‎
Lines changed: 49 additions & 55 deletions
@@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_puts(m, "Wedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
 		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: reset handoff to waiter\n");
 	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
 		seq_puts(m, "Waiter holding struct mutex\n");
 	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -1321,15 +1319,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		struct rb_node *rb;
 
 		seq_printf(m, "%s:\n", engine->name);
-		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
+		seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
 			   engine->hangcheck.seqno, seqno[id],
-			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
+			   intel_engine_last_submit(engine),
+			   jiffies_to_msecs(jiffies -
+					    engine->hangcheck.action_timestamp));
+		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)),
-			   yesno(engine->hangcheck.stalled),
-			   yesno(engine->hangcheck.wedged));
+					  &dev_priv->gpu_error.missed_irq_rings)));
 
 		spin_lock_irq(&b->rb_lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1343,11 +1341,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
-		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
-			   hangcheck_action_to_str(engine->hangcheck.action),
-			   engine->hangcheck.action,
-			   jiffies_to_msecs(jiffies -
-					    engine->hangcheck.action_timestamp));
 
 		if (engine->id == RCS) {
 			seq_puts(m, "\tinstdone read =\n");
@@ -3911,8 +3904,6 @@ static int
 i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
-	struct intel_engine_cs *engine;
-	unsigned int tmp;
 
 	/*
 	 * There is no safeguard against this debugfs entry colliding
@@ -3925,18 +3916,8 @@ i915_wedged_set(void *data, u64 val)
 	if (i915_reset_backoff(&i915->gpu_error))
 		return -EAGAIN;
 
-	for_each_engine_masked(engine, i915, val, tmp) {
-		engine->hangcheck.seqno = intel_engine_get_seqno(engine);
-		engine->hangcheck.stalled = true;
-	}
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-
-	wait_on_bit(&i915->gpu_error.flags,
-		    I915_RESET_HANDOFF,
-		    TASK_UNINTERRUPTIBLE);
-
 	return 0;
 }
 
@@ -4091,13 +4072,8 @@ i915_drop_caches_set(void *data, u64 val)
 		mutex_unlock(&i915->drm.struct_mutex);
 	}
 
-	if (val & DROP_RESET_ACTIVE &&
-	    i915_terminally_wedged(&i915->gpu_error)) {
+	if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
 		i915_handle_error(i915, ALL_ENGINES, 0, NULL);
-		wait_on_bit(&i915->gpu_error.flags,
-			    I915_RESET_HANDOFF,
-			    TASK_UNINTERRUPTIBLE);
-	}
 
 	fs_reclaim_acquire(GFP_KERNEL);
 	if (val & DROP_BOUND)
 
@@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
 	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
 }
 
-static inline bool i915_reset_handoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
 
@@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
 		     struct intel_rps_client *rps_client)
 {
 	might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(debug_locks &&
-		   !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
-		   !!(flags & I915_WAIT_LOCKED));
-#endif
 	GEM_BUG_ON(timeout < 0);
 
 	timeout = i915_gem_object_wait_reservation(obj->resv,
@@ -4493,8 +4488,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 
 	GEM_TRACE("\n");
 
-	mutex_lock(&i915->drm.struct_mutex);
-
 	wakeref = intel_runtime_pm_get(i915);
 	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
 
@@ -4520,6 +4513,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 	intel_runtime_pm_put(i915, wakeref);
 
+	mutex_lock(&i915->drm.struct_mutex);
 	i915_gem_contexts_lost(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 }
@@ -4534,6 +4528,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	wakeref = intel_runtime_pm_get(i915);
 	intel_suspend_gt_powersave(i915);
 
+	flush_workqueue(i915->wq);
+
 	mutex_lock(&i915->drm.struct_mutex);
 
 	/*
@@ -4563,18 +4559,18 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	i915_retire_requests(i915); /* ensure we flush after wedging */
 
 	mutex_unlock(&i915->drm.struct_mutex);
+	i915_reset_flush(i915);
 
-	intel_uc_suspend(i915);
-
-	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
-	cancel_delayed_work_sync(&i915->gt.retire_work);
+	drain_delayed_work(&i915->gt.retire_work);
 
 	/*
 	 * As the idle_work is rearming if it detects a race, play safe and
 	 * repeat the flush until it is definitely idle.
 	 */
 	drain_delayed_work(&i915->gt.idle_work);
 
+	intel_uc_suspend(i915);
+
 	/*
 	 * Assert that we successfully flushed all the work and
 	 * reset the GPU back to its idle, low power state.
 
@@ -50,4 +50,3 @@ struct drm_i915_fence_reg {
 };
 
 #endif
-
@@ -39,6 +39,7 @@
 #include <linux/pagevec.h>
 
 #include "i915_request.h"
+#include "i915_reset.h"
 #include "i915_selftest.h"
 #include "i915_timeline.h"
 
 
@@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
-	err_printf(m, "  hangcheck action: %s\n",
-		   hangcheck_action_to_str(ee->hangcheck_action));
-	err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
+	err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
 		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
 		   ee->hangcheck_timestamp,
 		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
@@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 		   jiffies_to_msecs(error->capture - error->epoch));
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_stalled &&
-		    error->engine[i].context.pid) {
-			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
-				   engine_name(m->i915, i),
-				   error->engine[i].context.comm,
-				   error->engine[i].context.pid,
-				   error->engine[i].context.ban_score,
-				   bannable(&error->engine[i].context));
-		}
+		if (!error->engine[i].context.pid)
+			continue;
+
+		err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
+			   engine_name(m->i915, i),
+			   error->engine[i].context.comm,
+			   error->engine[i].context.pid,
+			   error->engine[i].context.ban_score,
+			   bannable(&error->engine[i].context));
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
 	err_printf(m, "Suspend count: %u\n", error->suspend_count);
@@ -1144,7 +1141,8 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
 	return i;
 }
 
-/* Generate a semi-unique error code. The code is not meant to have meaning, The
+/*
+ * Generate a semi-unique error code. The code is not meant to have meaning, The
  * code's only purpose is to try to prevent false duplicated bug reports by
  * grossly estimating a GPU error state.
  *
@@ -1153,29 +1151,23 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
  *
  * It's only a small step better than a random number in its current form.
  */
-static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
-				    struct i915_gpu_state *error,
-				    int *engine_id)
+static u32 i915_error_generate_code(struct i915_gpu_state *error,
+				    unsigned long engine_mask)
 {
-	u32 error_code = 0;
-	int i;
-
-	/* IPEHR would be an ideal way to detect errors, as it's the gross
+	/*
+	 * IPEHR would be an ideal way to detect errors, as it's the gross
 	 * measure of "the command that hung." However, has some very common
 	 * synchronization commands which almost always appear in the case
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
-	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		if (error->engine[i].hangcheck_stalled) {
-			if (engine_id)
-				*engine_id = i;
+	if (engine_mask) {
+		struct drm_i915_error_engine *ee =
+			&error->engine[ffs(engine_mask)];
 
-			return error->engine[i].ipehr ^
-			       error->engine[i].instdone.instdone;
-		}
+		return ee->ipehr ^ ee->instdone.instdone;
 	}
 
-	return error_code;
+	return 0;
 }
 
 static void gem_record_fences(struct i915_gpu_state *error)
@@ -1338,9 +1330,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 	}
 
 	ee->idle = intel_engine_is_idle(engine);
-	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
-	ee->hangcheck_action = engine->hangcheck.action;
-	ee->hangcheck_stalled = engine->hangcheck.stalled;
+	if (!ee->idle)
+		ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
 						  engine);
 
@@ -1783,31 +1774,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 }
 
-static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
-				   struct i915_gpu_state *error,
-				   u32 engine_mask,
-				   const char *error_msg)
+static const char *
+error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
 {
-	u32 ecode;
-	int engine_id = -1, len;
+	int len;
+	int i;
 
-	ecode = i915_error_generate_code(dev_priv, error, &engine_id);
+	for (i = 0; i < ARRAY_SIZE(error->engine); i++)
+		if (!error->engine[i].context.pid)
+			engines &= ~BIT(i);
 
 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
-			"GPU HANG: ecode %d:%d:0x%08x",
-			INTEL_GEN(dev_priv), engine_id, ecode);
-
-	if (engine_id != -1 && error->engine[engine_id].context.pid)
+			"GPU HANG: ecode %d:%lx:0x%08x",
+			INTEL_GEN(error->i915), engines,
+			i915_error_generate_code(error, engines));
+	if (engines) {
+		/* Just show the first executing process, more is confusing */
+		i = ffs(engines);
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
 				 ", in %s [%d]",
-				 error->engine[engine_id].context.comm,
-				 error->engine[engine_id].context.pid);
+				 error->engine[i].context.comm,
+				 error->engine[i].context.pid);
+	}
+	if (msg)
+		len += scnprintf(error->error_msg + len,
+				 sizeof(error->error_msg) - len,
+				 ", %s", msg);
 
-	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
-		  ", reason: %s, action: %s",
-		  error_msg,
-		  engine_mask ? "reset" : "continue");
+	return error->error_msg;
 }
 
 static void capture_gen_state(struct i915_gpu_state *error)
@@ -1847,7 +1842,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		const struct drm_i915_error_engine *ee = &error->engine[i];
 
-		if (ee->hangcheck_stalled &&
+		if (ee->hangcheck_timestamp &&
 		    time_before(ee->hangcheck_timestamp, epoch))
 			epoch = ee->hangcheck_timestamp;
 	}
@@ -1921,16 +1916,16 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * i915_capture_error_state - capture an error record for later analysis
  * @i915: i915 device
  * @engine_mask: the mask of engines triggering the hang
- * @error_msg: a message to insert into the error capture header
+ * @msg: a message to insert into the error capture header
  *
  * Should be called when an error is detected (either a hang or an error
  * interrupt) to capture error state from the time of the error.  Fills
  * out a structure which becomes available in debugfs for user level tools
  * to pick up.
  */
 void i915_capture_error_state(struct drm_i915_private *i915,
-			      u32 engine_mask,
-			      const char *error_msg)
+			      unsigned long engine_mask,
+			      const char *msg)
 {
 	static bool warned;
 	struct i915_gpu_state *error;
@@ -1946,8 +1941,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
 	if (IS_ERR(error))
 		return;
 
-	i915_error_capture_msg(i915, error, engine_mask, error_msg);
-	DRM_INFO("%s\n", error->error_msg);
+	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
 
 	if (!error->simulated) {
 		spin_lock_irqsave(&i915->gpu_error.lock, flags);
Original file line number	Diff line number	Diff line change
`@@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)`
`3001`	`3001`	`return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));`
`3002`	`3002`	`}`
`3003`	`3003`
`3004`		`-static inline bool i915_reset_handoff(struct i915_gpu_error *error)`
`3005`		`-{`
`3006`		`- return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));`
`3007`		`-}`
`3008`		`-`
`3009`	`3004`	`static inline bool i915_terminally_wedged(struct i915_gpu_error *error)`
`3010`	`3005`	`{`
`3011`	`3006`	`return unlikely(test_bit(I915_WEDGED, &error->flags));`
Original file line number	Diff line number	Diff line change
`@@ -50,4 +50,3 @@ struct drm_i915_fence_reg {`
`50`	`50`	`};`
`51`	`51`
`52`	`52`	`#endif`
`53`		`-`