intel: Throttle harder

Filling the rings is a very unpleasant user experience, so cap the
number of batches we allow to be inflight at any one time.

Interestingly, as also found with SNA, throttling can improve
performance by reducing RSS. However, typically throughput is improved
(at the expense of latency) by oversubscribing work to the GPU and a
10-20% slowdown is commonplace for cairo-traces. Notably, x11perf is
less affected and in particular application level benchmarks show no
change.

Note that this exposes another bug in libdrm-intel 2.4.40 on gen2/3.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
This commit is contained in:
Chris Wilson 2013-01-10 19:14:21 +00:00
parent a37d56f338
commit 441ef916ae
2 changed files with 32 additions and 11 deletions

View File

@ -182,7 +182,7 @@ typedef struct intel_screen_private {
unsigned int batch_emit_start;
/** Number of bytes to be emitted in the current BEGIN_BATCH. */
uint32_t batch_emitting;
dri_bo *batch_bo;
dri_bo *batch_bo, *last_batch_bo[2];
/** Whether we're in a section of code that can't tolerate flushing */
Bool in_batch_atomic;
/** Ending batch_used that was verified by intel_start_batch_atomic() */

View File

@ -67,17 +67,26 @@ void intel_next_vertex(intel_screen_private *intel)
dri_bo_alloc(intel->bufmgr, "vertex", sizeof (intel->vertex_ptr), 4096);
}
static void intel_next_batch(ScrnInfoPtr scrn)
/*
 * Allocate a fresh batch buffer object.
 *
 * Returns a new 16 KiB (4 page) buffer, except on the i865 where it is
 * capped to a single 4 KiB page. The caller owns the returned reference
 * and must drop it with dri_bo_unreference().
 */
static dri_bo *bo_alloc(ScrnInfoPtr scrn)
{
	intel_screen_private *intel = intel_get_screen_private(scrn);
	int size = 4 * 4096;

	/* The 865 has issues with larger-than-page-sized batch buffers. */
	if (IS_I865G(intel))
		size = 4096;

	return dri_bo_alloc(intel->bufmgr, "batch", size, 4096);
}
static void intel_next_batch(ScrnInfoPtr scrn, int mode)
{
intel_screen_private *intel = intel_get_screen_private(scrn);
dri_bo *tmp;
drm_intel_gem_bo_clear_relocs(intel->batch_bo, 0);
tmp = intel->last_batch_bo[mode];
intel->last_batch_bo[mode] = intel->batch_bo;
intel->batch_bo = tmp;
intel->batch_used = 0;
@ -95,12 +104,25 @@ void intel_batch_init(ScrnInfoPtr scrn)
intel->batch_emitting = 0;
intel->vertex_id = 0;
intel_next_batch(scrn);
intel->last_batch_bo[0] = bo_alloc(scrn);
intel->last_batch_bo[1] = bo_alloc(scrn);
intel->batch_bo = bo_alloc(scrn);
intel->batch_used = 0;
intel->last_3d = LAST_3D_OTHER;
}
void intel_batch_teardown(ScrnInfoPtr scrn)
{
intel_screen_private *intel = intel_get_screen_private(scrn);
int i;
for (i = 0; i < ARRAY_SIZE(intel->last_batch_bo); i++) {
if (intel->last_batch_bo[i] != NULL) {
dri_bo_unreference(intel->last_batch_bo[i]);
intel->last_batch_bo[i] = NULL;
}
}
if (intel->batch_bo != NULL) {
dri_bo_unreference(intel->batch_bo);
@ -273,8 +295,7 @@ void intel_batch_submit(ScrnInfoPtr scrn)
if (intel->debug_flush & DEBUG_FLUSH_WAIT)
drm_intel_bo_wait_rendering(intel->batch_bo);
dri_bo_unreference(intel->batch_bo);
intel_next_batch(scrn);
intel_next_batch(scrn, intel->current_batch == I915_EXEC_BLT);
if (intel->batch_commit_notify)
intel->batch_commit_notify(intel);