@@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
533533 err_printf (m , " waiting: %s\n" , yesno (ee -> waiting ));
534534 err_printf (m , " ring->head: 0x%08x\n" , ee -> cpu_ring_head );
535535 err_printf (m , " ring->tail: 0x%08x\n" , ee -> cpu_ring_tail );
536- err_printf (m , " hangcheck stall: %s\n" , yesno (ee -> hangcheck_stalled ));
537- err_printf (m , " hangcheck action: %s\n" ,
538- hangcheck_action_to_str (ee -> hangcheck_action ));
539- err_printf (m , " hangcheck action timestamp: %dms (%lu%s)\n" ,
536+ err_printf (m , " hangcheck timestamp: %dms (%lu%s)\n" ,
540537 jiffies_to_msecs (ee -> hangcheck_timestamp - epoch ),
541538 ee -> hangcheck_timestamp ,
542539 ee -> hangcheck_timestamp == epoch ? "; epoch" : "" );
@@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
684681 jiffies_to_msecs (error -> capture - error -> epoch ));
685682
686683 for (i = 0 ; i < ARRAY_SIZE (error -> engine ); i ++ ) {
687- if (error -> engine [i ].hangcheck_stalled &&
688- error -> engine [ i ]. context . pid ) {
689- err_printf ( m , "Active process (on ring %s): %s [%d], score %d%s\n" ,
690- engine_name ( m -> i915 , i ) ,
691- error -> engine [ i ]. context . comm ,
692- error -> engine [i ].context .pid ,
693- error -> engine [i ].context .ban_score ,
694- bannable ( & error -> engine [i ].context ));
695- }
684+ if (! error -> engine [i ].context . pid )
685+ continue ;
686+
687+ err_printf ( m , "Active process (on ring %s): %s [%d], score %d%s\n" ,
688+ engine_name ( m -> i915 , i ) ,
689+ error -> engine [i ].context .comm ,
690+ error -> engine [i ].context .pid ,
691+ error -> engine [i ].context . ban_score ,
692+ bannable ( & error -> engine [ i ]. context ));
696693 }
697694 err_printf (m , "Reset count: %u\n" , error -> reset_count );
698695 err_printf (m , "Suspend count: %u\n" , error -> suspend_count );
@@ -1144,7 +1141,8 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
11441141 return i ;
11451142}
11461143
1147- /* Generate a semi-unique error code. The code is not meant to have meaning, The
1144+ /*
1145+ * Generate a semi-unique error code. The code is not meant to have meaning; the
11481146 * code's only purpose is to try to prevent false duplicated bug reports by
11491147 * grossly estimating a GPU error state.
11501148 *
@@ -1153,29 +1151,23 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
11531151 *
11541152 * It's only a small step better than a random number in its current form.
11551153 */
1156- static u32 i915_error_generate_code (struct drm_i915_private * dev_priv ,
1157- struct i915_gpu_state * error ,
1158- int * engine_id )
1154+ static u32 i915_error_generate_code (struct i915_gpu_state * error ,
1155+ unsigned long engine_mask )
11591156{
1160- u32 error_code = 0 ;
1161- int i ;
1162-
1163- /* IPEHR would be an ideal way to detect errors, as it's the gross
1157+ /*
1158+ * IPEHR would be an ideal way to detect errors, as it's the gross
11641159 * measure of "the command that hung." However, has some very common
11651160 * synchronization commands which almost always appear in the case
11661161 * strictly a client bug. Use instdone to differentiate those some.
11671162 */
1168- for (i = 0 ; i < I915_NUM_ENGINES ; i ++ ) {
1169- if (error -> engine [i ].hangcheck_stalled ) {
1170- if (engine_id )
1171- * engine_id = i ;
1163+ if (engine_mask ) {
1164+ struct drm_i915_error_engine * ee =
1165+ & error -> engine [ffs (engine_mask )];
11721166
1173- return error -> engine [i ].ipehr ^
1174- error -> engine [i ].instdone .instdone ;
1175- }
1167+ return ee -> ipehr ^ ee -> instdone .instdone ;
11761168 }
11771169
1178- return error_code ;
1170+ return 0 ;
11791171}
11801172
11811173static void gem_record_fences (struct i915_gpu_state * error )
@@ -1338,9 +1330,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
13381330 }
13391331
13401332 ee -> idle = intel_engine_is_idle (engine );
1341- ee -> hangcheck_timestamp = engine -> hangcheck .action_timestamp ;
1342- ee -> hangcheck_action = engine -> hangcheck .action ;
1343- ee -> hangcheck_stalled = engine -> hangcheck .stalled ;
1333+ if (!ee -> idle )
1334+ ee -> hangcheck_timestamp = engine -> hangcheck .action_timestamp ;
13441335 ee -> reset_count = i915_reset_engine_count (& dev_priv -> gpu_error ,
13451336 engine );
13461337
@@ -1783,31 +1774,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
17831774 error -> pgtbl_er = I915_READ (PGTBL_ER );
17841775}
17851776
1786- static void i915_error_capture_msg (struct drm_i915_private * dev_priv ,
1787- struct i915_gpu_state * error ,
1788- u32 engine_mask ,
1789- const char * error_msg )
1777+ static const char *
1778+ error_msg (struct i915_gpu_state * error , unsigned long engines , const char * msg )
17901779{
1791- u32 ecode ;
1792- int engine_id = -1 , len ;
1780+ int len ;
1781+ int i ;
17931782
1794- ecode = i915_error_generate_code (dev_priv , error , & engine_id );
1783+ for (i = 0 ; i < ARRAY_SIZE (error -> engine ); i ++ )
1784+ if (!error -> engine [i ].context .pid )
1785+ engines &= ~BIT (i );
17951786
17961787 len = scnprintf (error -> error_msg , sizeof (error -> error_msg ),
1797- "GPU HANG: ecode %d:%d:0x%08x" ,
1798- INTEL_GEN (dev_priv ), engine_id , ecode );
1799-
1800- if (engine_id != -1 && error -> engine [engine_id ].context .pid )
1788+ "GPU HANG: ecode %d:%lx:0x%08x" ,
1789+ INTEL_GEN (error -> i915 ), engines ,
1790+ i915_error_generate_code (error , engines ));
1791+ if (engines ) {
1792+ /* Just show the first executing process, more is confusing */
1793+ i = ffs (engines );
18011794 len += scnprintf (error -> error_msg + len ,
18021795 sizeof (error -> error_msg ) - len ,
18031796 ", in %s [%d]" ,
1804- error -> engine [engine_id ].context .comm ,
1805- error -> engine [engine_id ].context .pid );
1797+ error -> engine [i ].context .comm ,
1798+ error -> engine [i ].context .pid );
1799+ }
1800+ if (msg )
1801+ len += scnprintf (error -> error_msg + len ,
1802+ sizeof (error -> error_msg ) - len ,
1803+ ", %s" , msg );
18061804
1807- scnprintf (error -> error_msg + len , sizeof (error -> error_msg ) - len ,
1808- ", reason: %s, action: %s" ,
1809- error_msg ,
1810- engine_mask ? "reset" : "continue" );
1805+ return error -> error_msg ;
18111806}
18121807
18131808static void capture_gen_state (struct i915_gpu_state * error )
@@ -1847,7 +1842,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
18471842 for (i = 0 ; i < ARRAY_SIZE (error -> engine ); i ++ ) {
18481843 const struct drm_i915_error_engine * ee = & error -> engine [i ];
18491844
1850- if (ee -> hangcheck_stalled &&
1845+ if (ee -> hangcheck_timestamp &&
18511846 time_before (ee -> hangcheck_timestamp , epoch ))
18521847 epoch = ee -> hangcheck_timestamp ;
18531848 }
@@ -1921,16 +1916,16 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
19211916 * i915_capture_error_state - capture an error record for later analysis
19221917 * @i915: i915 device
19231918 * @engine_mask: the mask of engines triggering the hang
1924- * @error_msg : a message to insert into the error capture header
1919+ * @msg : a message to insert into the error capture header
19251920 *
19261921 * Should be called when an error is detected (either a hang or an error
19271922 * interrupt) to capture error state from the time of the error. Fills
19281923 * out a structure which becomes available in debugfs for user level tools
19291924 * to pick up.
19301925 */
19311926void i915_capture_error_state (struct drm_i915_private * i915 ,
1932- u32 engine_mask ,
1933- const char * error_msg )
1927+ unsigned long engine_mask ,
1928+ const char * msg )
19341929{
19351930 static bool warned ;
19361931 struct i915_gpu_state * error ;
@@ -1946,8 +1941,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
19461941 if (IS_ERR (error ))
19471942 return ;
19481943
1949- i915_error_capture_msg (i915 , error , engine_mask , error_msg );
1950- DRM_INFO ("%s\n" , error -> error_msg );
1944+ dev_info (i915 -> drm .dev , "%s\n" , error_msg (error , engine_mask , msg ));
19511945
19521946 if (!error -> simulated ) {
19531947 spin_lock_irqsave (& i915 -> gpu_error .lock , flags );
0 commit comments