Skip to content

Commit eb8d0f5

Browse files
committed
drm/i915: Remove GPU reset dependence on struct_mutex
Now that the submission backends are controlled via their own spinlocks, with a wave of a magic wand we can lift the struct_mutex requirement around GPU reset. That is we allow the submission frontend (userspace) to keep on submitting while we process the GPU reset as we can suspend the backend independently. The major change is around the backoff/handoff strategy for performing the reset. With no mutex deadlock, we no longer have to coordinate with any waiter, and just perform the reset immediately. Testcase: igt/gem_mmap_gtt/hang # regresses Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190125132230.22221-3-chris@chris-wilson.co.uk
1 parent fe62365 commit eb8d0f5

20 files changed

Lines changed: 404 additions & 537 deletions

drivers/gpu/drm/i915/i915_debugfs.c

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
12841284
seq_puts(m, "Wedged\n");
12851285
if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
12861286
seq_puts(m, "Reset in progress: struct_mutex backoff\n");
1287-
if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
1288-
seq_puts(m, "Reset in progress: reset handoff to waiter\n");
12891287
if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
12901288
seq_puts(m, "Waiter holding struct mutex\n");
12911289
if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -1321,15 +1319,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
13211319
struct rb_node *rb;
13221320

13231321
seq_printf(m, "%s:\n", engine->name);
1324-
seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
1322+
seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
13251323
engine->hangcheck.seqno, seqno[id],
1326-
intel_engine_last_submit(engine));
1327-
seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
1324+
intel_engine_last_submit(engine),
1325+
jiffies_to_msecs(jiffies -
1326+
engine->hangcheck.action_timestamp));
1327+
seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
13281328
yesno(intel_engine_has_waiter(engine)),
13291329
yesno(test_bit(engine->id,
1330-
&dev_priv->gpu_error.missed_irq_rings)),
1331-
yesno(engine->hangcheck.stalled),
1332-
yesno(engine->hangcheck.wedged));
1330+
&dev_priv->gpu_error.missed_irq_rings)));
13331331

13341332
spin_lock_irq(&b->rb_lock);
13351333
for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1343,11 +1341,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
13431341
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
13441342
(long long)engine->hangcheck.acthd,
13451343
(long long)acthd[id]);
1346-
seq_printf(m, "\taction = %s(%d) %d ms ago\n",
1347-
hangcheck_action_to_str(engine->hangcheck.action),
1348-
engine->hangcheck.action,
1349-
jiffies_to_msecs(jiffies -
1350-
engine->hangcheck.action_timestamp));
13511344

13521345
if (engine->id == RCS) {
13531346
seq_puts(m, "\tinstdone read =\n");
@@ -3911,8 +3904,6 @@ static int
39113904
i915_wedged_set(void *data, u64 val)
39123905
{
39133906
struct drm_i915_private *i915 = data;
3914-
struct intel_engine_cs *engine;
3915-
unsigned int tmp;
39163907

39173908
/*
39183909
* There is no safeguard against this debugfs entry colliding
@@ -3925,18 +3916,8 @@ i915_wedged_set(void *data, u64 val)
39253916
if (i915_reset_backoff(&i915->gpu_error))
39263917
return -EAGAIN;
39273918

3928-
for_each_engine_masked(engine, i915, val, tmp) {
3929-
engine->hangcheck.seqno = intel_engine_get_seqno(engine);
3930-
engine->hangcheck.stalled = true;
3931-
}
3932-
39333919
i915_handle_error(i915, val, I915_ERROR_CAPTURE,
39343920
"Manually set wedged engine mask = %llx", val);
3935-
3936-
wait_on_bit(&i915->gpu_error.flags,
3937-
I915_RESET_HANDOFF,
3938-
TASK_UNINTERRUPTIBLE);
3939-
39403921
return 0;
39413922
}
39423923

@@ -4091,13 +4072,8 @@ i915_drop_caches_set(void *data, u64 val)
40914072
mutex_unlock(&i915->drm.struct_mutex);
40924073
}
40934074

4094-
if (val & DROP_RESET_ACTIVE &&
4095-
i915_terminally_wedged(&i915->gpu_error)) {
4075+
if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
40964076
i915_handle_error(i915, ALL_ENGINES, 0, NULL);
4097-
wait_on_bit(&i915->gpu_error.flags,
4098-
I915_RESET_HANDOFF,
4099-
TASK_UNINTERRUPTIBLE);
4100-
}
41014077

41024078
fs_reclaim_acquire(GFP_KERNEL);
41034079
if (val & DROP_BOUND)

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
30013001
return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
30023002
}
30033003

3004-
static inline bool i915_reset_handoff(struct i915_gpu_error *error)
3005-
{
3006-
return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
3007-
}
3008-
30093004
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
30103005
{
30113006
return unlikely(test_bit(I915_WEDGED, &error->flags));

drivers/gpu/drm/i915/i915_gem.c

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
657657
struct intel_rps_client *rps_client)
658658
{
659659
might_sleep();
660-
#if IS_ENABLED(CONFIG_LOCKDEP)
661-
GEM_BUG_ON(debug_locks &&
662-
!!lockdep_is_held(&obj->base.dev->struct_mutex) !=
663-
!!(flags & I915_WAIT_LOCKED));
664-
#endif
665660
GEM_BUG_ON(timeout < 0);
666661

667662
timeout = i915_gem_object_wait_reservation(obj->resv,
@@ -4493,8 +4488,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
44934488

44944489
GEM_TRACE("\n");
44954490

4496-
mutex_lock(&i915->drm.struct_mutex);
4497-
44984491
wakeref = intel_runtime_pm_get(i915);
44994492
intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
45004493

@@ -4520,6 +4513,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
45204513
intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
45214514
intel_runtime_pm_put(i915, wakeref);
45224515

4516+
mutex_lock(&i915->drm.struct_mutex);
45234517
i915_gem_contexts_lost(i915);
45244518
mutex_unlock(&i915->drm.struct_mutex);
45254519
}
@@ -4534,6 +4528,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
45344528
wakeref = intel_runtime_pm_get(i915);
45354529
intel_suspend_gt_powersave(i915);
45364530

4531+
flush_workqueue(i915->wq);
4532+
45374533
mutex_lock(&i915->drm.struct_mutex);
45384534

45394535
/*
@@ -4563,18 +4559,18 @@ int i915_gem_suspend(struct drm_i915_private *i915)
45634559
i915_retire_requests(i915); /* ensure we flush after wedging */
45644560

45654561
mutex_unlock(&i915->drm.struct_mutex);
4562+
i915_reset_flush(i915);
45664563

4567-
intel_uc_suspend(i915);
4568-
4569-
cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
4570-
cancel_delayed_work_sync(&i915->gt.retire_work);
4564+
drain_delayed_work(&i915->gt.retire_work);
45714565

45724566
/*
45734567
* As the idle_work is rearming if it detects a race, play safe and
45744568
* repeat the flush until it is definitely idle.
45754569
*/
45764570
drain_delayed_work(&i915->gt.idle_work);
45774571

4572+
intel_uc_suspend(i915);
4573+
45784574
/*
45794575
* Assert that we successfully flushed all the work and
45804576
* reset the GPU back to its idle, low power state.

drivers/gpu/drm/i915/i915_gem_fence_reg.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,3 @@ struct drm_i915_fence_reg {
5050
};
5151

5252
#endif
53-

drivers/gpu/drm/i915/i915_gem_gtt.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include <linux/pagevec.h>
4040

4141
#include "i915_request.h"
42+
#include "i915_reset.h"
4243
#include "i915_selftest.h"
4344
#include "i915_timeline.h"
4445

drivers/gpu/drm/i915/i915_gpu_error.c

Lines changed: 49 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
533533
err_printf(m, " waiting: %s\n", yesno(ee->waiting));
534534
err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
535535
err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
536-
err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
537-
err_printf(m, " hangcheck action: %s\n",
538-
hangcheck_action_to_str(ee->hangcheck_action));
539-
err_printf(m, " hangcheck action timestamp: %dms (%lu%s)\n",
536+
err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n",
540537
jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
541538
ee->hangcheck_timestamp,
542539
ee->hangcheck_timestamp == epoch ? "; epoch" : "");
@@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
684681
jiffies_to_msecs(error->capture - error->epoch));
685682

686683
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
687-
if (error->engine[i].hangcheck_stalled &&
688-
error->engine[i].context.pid) {
689-
err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
690-
engine_name(m->i915, i),
691-
error->engine[i].context.comm,
692-
error->engine[i].context.pid,
693-
error->engine[i].context.ban_score,
694-
bannable(&error->engine[i].context));
695-
}
684+
if (!error->engine[i].context.pid)
685+
continue;
686+
687+
err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
688+
engine_name(m->i915, i),
689+
error->engine[i].context.comm,
690+
error->engine[i].context.pid,
691+
error->engine[i].context.ban_score,
692+
bannable(&error->engine[i].context));
696693
}
697694
err_printf(m, "Reset count: %u\n", error->reset_count);
698695
err_printf(m, "Suspend count: %u\n", error->suspend_count);
@@ -1144,7 +1141,8 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
11441141
return i;
11451142
}
11461143

1147-
/* Generate a semi-unique error code. The code is not meant to have meaning, The
1144+
/*
1145+
* Generate a semi-unique error code. The code is not meant to have meaning, The
11481146
* code's only purpose is to try to prevent false duplicated bug reports by
11491147
* grossly estimating a GPU error state.
11501148
*
@@ -1153,29 +1151,23 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
11531151
*
11541152
* It's only a small step better than a random number in its current form.
11551153
*/
1156-
static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
1157-
struct i915_gpu_state *error,
1158-
int *engine_id)
1154+
static u32 i915_error_generate_code(struct i915_gpu_state *error,
1155+
unsigned long engine_mask)
11591156
{
1160-
u32 error_code = 0;
1161-
int i;
1162-
1163-
/* IPEHR would be an ideal way to detect errors, as it's the gross
1157+
/*
1158+
* IPEHR would be an ideal way to detect errors, as it's the gross
11641159
* measure of "the command that hung." However, has some very common
11651160
* synchronization commands which almost always appear in the case
11661161
* strictly a client bug. Use instdone to differentiate those some.
11671162
*/
1168-
for (i = 0; i < I915_NUM_ENGINES; i++) {
1169-
if (error->engine[i].hangcheck_stalled) {
1170-
if (engine_id)
1171-
*engine_id = i;
1163+
if (engine_mask) {
1164+
struct drm_i915_error_engine *ee =
1165+
&error->engine[ffs(engine_mask)];
11721166

1173-
return error->engine[i].ipehr ^
1174-
error->engine[i].instdone.instdone;
1175-
}
1167+
return ee->ipehr ^ ee->instdone.instdone;
11761168
}
11771169

1178-
return error_code;
1170+
return 0;
11791171
}
11801172

11811173
static void gem_record_fences(struct i915_gpu_state *error)
@@ -1338,9 +1330,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
13381330
}
13391331

13401332
ee->idle = intel_engine_is_idle(engine);
1341-
ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
1342-
ee->hangcheck_action = engine->hangcheck.action;
1343-
ee->hangcheck_stalled = engine->hangcheck.stalled;
1333+
if (!ee->idle)
1334+
ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
13441335
ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
13451336
engine);
13461337

@@ -1783,31 +1774,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
17831774
error->pgtbl_er = I915_READ(PGTBL_ER);
17841775
}
17851776

1786-
static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
1787-
struct i915_gpu_state *error,
1788-
u32 engine_mask,
1789-
const char *error_msg)
1777+
static const char *
1778+
error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
17901779
{
1791-
u32 ecode;
1792-
int engine_id = -1, len;
1780+
int len;
1781+
int i;
17931782

1794-
ecode = i915_error_generate_code(dev_priv, error, &engine_id);
1783+
for (i = 0; i < ARRAY_SIZE(error->engine); i++)
1784+
if (!error->engine[i].context.pid)
1785+
engines &= ~BIT(i);
17951786

17961787
len = scnprintf(error->error_msg, sizeof(error->error_msg),
1797-
"GPU HANG: ecode %d:%d:0x%08x",
1798-
INTEL_GEN(dev_priv), engine_id, ecode);
1799-
1800-
if (engine_id != -1 && error->engine[engine_id].context.pid)
1788+
"GPU HANG: ecode %d:%lx:0x%08x",
1789+
INTEL_GEN(error->i915), engines,
1790+
i915_error_generate_code(error, engines));
1791+
if (engines) {
1792+
/* Just show the first executing process, more is confusing */
1793+
i = ffs(engines);
18011794
len += scnprintf(error->error_msg + len,
18021795
sizeof(error->error_msg) - len,
18031796
", in %s [%d]",
1804-
error->engine[engine_id].context.comm,
1805-
error->engine[engine_id].context.pid);
1797+
error->engine[i].context.comm,
1798+
error->engine[i].context.pid);
1799+
}
1800+
if (msg)
1801+
len += scnprintf(error->error_msg + len,
1802+
sizeof(error->error_msg) - len,
1803+
", %s", msg);
18061804

1807-
scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
1808-
", reason: %s, action: %s",
1809-
error_msg,
1810-
engine_mask ? "reset" : "continue");
1805+
return error->error_msg;
18111806
}
18121807

18131808
static void capture_gen_state(struct i915_gpu_state *error)
@@ -1847,7 +1842,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
18471842
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
18481843
const struct drm_i915_error_engine *ee = &error->engine[i];
18491844

1850-
if (ee->hangcheck_stalled &&
1845+
if (ee->hangcheck_timestamp &&
18511846
time_before(ee->hangcheck_timestamp, epoch))
18521847
epoch = ee->hangcheck_timestamp;
18531848
}
@@ -1921,16 +1916,16 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
19211916
* i915_capture_error_state - capture an error record for later analysis
19221917
* @i915: i915 device
19231918
* @engine_mask: the mask of engines triggering the hang
1924-
* @error_msg: a message to insert into the error capture header
1919+
* @msg: a message to insert into the error capture header
19251920
*
19261921
* Should be called when an error is detected (either a hang or an error
19271922
* interrupt) to capture error state from the time of the error. Fills
19281923
* out a structure which becomes available in debugfs for user level tools
19291924
* to pick up.
19301925
*/
19311926
void i915_capture_error_state(struct drm_i915_private *i915,
1932-
u32 engine_mask,
1933-
const char *error_msg)
1927+
unsigned long engine_mask,
1928+
const char *msg)
19341929
{
19351930
static bool warned;
19361931
struct i915_gpu_state *error;
@@ -1946,8 +1941,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
19461941
if (IS_ERR(error))
19471942
return;
19481943

1949-
i915_error_capture_msg(i915, error, engine_mask, error_msg);
1950-
DRM_INFO("%s\n", error->error_msg);
1944+
dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
19511945

19521946
if (!error->simulated) {
19531947
spin_lock_irqsave(&i915->gpu_error.lock, flags);

0 commit comments

Comments
 (0)