drm/i915/gt: Push engine stopping into reset-prepare

Push the engine stop into the back reset_prepare (where it already was!)
This allows us to avoid dangerously setting the RING registers to 0 for
logical contexts. If we clear the register on a live context, those
invalid register values are recorded in the logical context state and
replayed (with hilarious results).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190716124931.5870-2-chris@chris-wilson.co.uk
This commit is contained in:
Chris Wilson 2019-07-16 13:49:28 +01:00
parent fff8102aae
commit c30d5dc653
3 changed files with 53 additions and 61 deletions

View File

@ -2183,11 +2183,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
__tasklet_disable_sync_once(&execlists->tasklet);
GEM_BUG_ON(!reset_in_progress(execlists));
intel_engine_stop_cs(engine);
/* And flush any current direct submission. */
spin_lock_irqsave(&engine->active.lock, flags);
spin_unlock_irqrestore(&engine->active.lock, flags);
/*
* We stop engines, otherwise we might get failed reset and a
* dead gpu (on elk). Also as modern gpu as kbl can suffer
* from system hang if batchbuffer is progressing when
* the reset is issued, regardless of READY_TO_RESET ack.
* Thus assume it is best to stop engines on all gens
* where we have a gpu reset.
*
* WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
*
* FIXME: Wa for more modern gens needs to be validated
*/
intel_engine_stop_cs(engine);
}
static void reset_csb_pointers(struct intel_engine_cs *engine)

View File

@ -135,47 +135,6 @@ void __i915_request_reset(struct i915_request *rq, bool guilty)
}
}
static void gen3_stop_engine(struct intel_engine_cs *engine)
{
struct intel_uncore *uncore = engine->uncore;
const u32 base = engine->mmio_base;
GEM_TRACE("%s\n", engine->name);
if (intel_engine_stop_cs(engine))
GEM_TRACE("%s: timed out on STOP_RING\n", engine->name);
intel_uncore_write_fw(uncore,
RING_HEAD(base),
intel_uncore_read_fw(uncore, RING_TAIL(base)));
intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */
intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
intel_uncore_posting_read_fw(uncore, RING_TAIL(base));
/* The ring must be empty before it is disabled */
intel_uncore_write_fw(uncore, RING_CTL(base), 0);
/* Check acts as a post */
if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
GEM_TRACE("%s: ring head [%x] not parked\n",
engine->name,
intel_uncore_read_fw(uncore, RING_HEAD(base)));
}
static void stop_engines(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
struct intel_engine_cs *engine;
intel_engine_mask_t tmp;
if (INTEL_GEN(gt->i915) < 3)
return;
for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
gen3_stop_engine(engine);
}
static bool i915_in_reset(struct pci_dev *pdev)
{
u8 gdrst;
@ -607,23 +566,6 @@ int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
*/
intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
/*
* We stop engines, otherwise we might get failed reset and a
* dead gpu (on elk). Also as modern gpu as kbl can suffer
* from system hang if batchbuffer is progressing when
* the reset is issued, regardless of READY_TO_RESET ack.
* Thus assume it is best to stop engines on all gens
* where we have a gpu reset.
*
* WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
*
* WaMediaResetMainRingCleanup:ctg,elk (presumably)
*
* FIXME: Wa for more modern gens needs to be validated
*/
if (retry)
stop_engines(gt, engine_mask);
GEM_TRACE("engine_mask=%x\n", engine_mask);
preempt_disable();
ret = reset(gt, engine_mask, retry);

View File

@ -739,7 +739,45 @@ static int xcs_resume(struct intel_engine_cs *engine)
static void reset_prepare(struct intel_engine_cs *engine)
{
intel_engine_stop_cs(engine);
struct intel_uncore *uncore = engine->uncore;
const u32 base = engine->mmio_base;
/*
* We stop engines, otherwise we might get failed reset and a
* dead gpu (on elk). Also as modern gpu as kbl can suffer
* from system hang if batchbuffer is progressing when
* the reset is issued, regardless of READY_TO_RESET ack.
* Thus assume it is best to stop engines on all gens
* where we have a gpu reset.
*
* WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
*
* WaMediaResetMainRingCleanup:ctg,elk (presumably)
*
* FIXME: Wa for more modern gens needs to be validated
*/
GEM_TRACE("%s\n", engine->name);
if (intel_engine_stop_cs(engine))
GEM_TRACE("%s: timed out on STOP_RING\n", engine->name);
intel_uncore_write_fw(uncore,
RING_HEAD(base),
intel_uncore_read_fw(uncore, RING_TAIL(base)));
intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */
intel_uncore_write_fw(uncore, RING_HEAD(base), 0);
intel_uncore_write_fw(uncore, RING_TAIL(base), 0);
intel_uncore_posting_read_fw(uncore, RING_TAIL(base));
/* The ring must be empty before it is disabled */
intel_uncore_write_fw(uncore, RING_CTL(base), 0);
/* Check acts as a post */
if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
GEM_TRACE("%s: ring head [%x] not parked\n",
engine->name,
intel_uncore_read_fw(uncore, RING_HEAD(base)));
}
static void reset_ring(struct intel_engine_cs *engine, bool stalled)