Use pwrite to upload the batch buffer
By using pwrite() instead of dri_bo_map() we can write to the batch buffer
through the GTT and are not forced to map it back into the CPU domain and out
again, eliminating a double clflush.

Measuring x11perf text performance on PineView:

Before:
16000000 trep @ 0.0020 msec (511000.0/sec): Char in 80-char aa line (Charter 10)
16000000 trep @ 0.0021 msec (480000.0/sec): Char in 80-char rgb line (Charter 10)

After:
16000000 trep @ 0.0019 msec (532000.0/sec): Char in 80-char aa line (Charter 10)
16000000 trep @ 0.0020 msec (496000.0/sec): Char in 80-char rgb line (Charter 10)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
commit 2b050f330f
parent dcef703a7c
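To make the mechanism concrete, here is a minimal standalone sketch of the new submission path (illustrative only, not the driver's exact code): dwords are staged in a plain CPU-side array and uploaded with dri_bo_subdata(), i.e. pwrite, immediately before execution, so the batch buffer object is never mapped into the CPU domain. dri_bo_subdata() and dri_bo_exec() are the libdrm calls used in the diff below; the batch struct, helper names and sizes are hypothetical.

/*
 * Sketch of the pwrite-based batch submission pattern (assumptions noted
 * above; struct batch, batch_emit() and batch_submit() are illustrative).
 */
#include <stdint.h>
#include <intel_bufmgr.h>	/* libdrm_intel: dri_bo_subdata, dri_bo_exec */

#define BATCH_DWORDS 4096

struct batch {
	dri_bo *bo;			/* GPU buffer object, BATCH_DWORDS*4 bytes */
	uint32_t ptr[BATCH_DWORDS];	/* CPU-side staging array */
	unsigned int used;		/* counted in dwords, not bytes */
};

static void batch_emit(struct batch *b, uint32_t dword)
{
	b->ptr[b->used++] = dword;	/* emit without mapping the BO */
}

static int batch_submit(struct batch *b)
{
	/* Upload the staged dwords via pwrite, then execute the batch. */
	int ret = dri_bo_subdata(b->bo, 0, b->used * 4, b->ptr);
	if (ret == 0)
		ret = dri_bo_exec(b->bo, b->used * 4, NULL, 0, 0xffffffff);
	b->used = 0;
	return ret;
}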
@@ -252,7 +252,7 @@ typedef struct intel_screen_private {
 	dri_bufmgr *bufmgr;
 
-	uint8_t *batch_ptr;
+	uint32_t batch_ptr[4096];
 	/** Byte offset in batch_ptr for the next dword to be emitted. */
 	unsigned int batch_used;
 	/** Position in batch_ptr at the start of the current BEGIN_BATCH */
@@ -42,7 +42,6 @@
 static void intel_next_batch(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
-	int ret;
 
 	/* The 865 has issues with larger-than-page-sized batch buffers. */
 	if (IS_I865G(intel))
@@ -52,12 +51,7 @@ static void intel_next_batch(ScrnInfoPtr scrn)
 	intel->batch_bo =
 	    dri_bo_alloc(intel->bufmgr, "batch", 4096 * 4, 4096);
 
-	ret = dri_bo_map(intel->batch_bo, 1);
-	if (ret != 0)
-		FatalError("Failed to map batchbuffer: %s\n", strerror(-ret));
-
 	intel->batch_used = 0;
-	intel->batch_ptr = intel->batch_bo->virtual;
 
 	/* We don't know when another client has executed, so we have
 	 * to reinitialize our 3D state per batch.
@@ -80,9 +74,6 @@ void intel_batch_teardown(ScrnInfoPtr scrn)
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
 	if (intel->batch_ptr != NULL) {
-		dri_bo_unmap(intel->batch_bo);
-		intel->batch_ptr = NULL;
-
 		dri_bo_unreference(intel->batch_bo);
 		intel->batch_bo = NULL;
@@ -168,31 +159,24 @@ void intel_batch_submit(ScrnInfoPtr scrn)
 	if (intel->vertex_flush)
 		intel->vertex_flush(intel);
 
-	/* Emit a padding dword if we aren't going to be quad-word aligned. */
-	if ((intel->batch_used & 4) == 0) {
-		*(uint32_t *) (intel->batch_ptr + intel->batch_used) = MI_NOOP;
-		intel->batch_used += 4;
-	}
-
 	/* Mark the end of the batchbuffer. */
-	*(uint32_t *) (intel->batch_ptr + intel->batch_used) =
-	    MI_BATCH_BUFFER_END;
-	intel->batch_used += 4;
+	OUT_BATCH(MI_BATCH_BUFFER_END);
+	/* Emit a padding dword if we aren't going to be quad-word aligned. */
+	if (intel->batch_used & 1)
+		OUT_BATCH(MI_NOOP);
 
 	if (DUMP_BATCHBUFFERS) {
 		FILE *file = fopen(DUMP_BATCHBUFFERS, "a");
 		if (file) {
-			fwrite (intel->batch_ptr, intel->batch_used, 1, file);
+			fwrite (intel->batch_ptr, intel->batch_used*4, 1, file);
 			fclose(file);
 		}
 	}
 
-	dri_bo_unmap(intel->batch_bo);
-	intel->batch_ptr = NULL;
-
-	ret =
-	    dri_bo_exec(intel->batch_bo, intel->batch_used, NULL, 0,
-			0xffffffff);
+	ret = dri_bo_subdata(intel->batch_bo, 0, intel->batch_used*4, intel->batch_ptr);
+	if (ret == 0)
+		ret = dri_bo_exec(intel->batch_bo, intel->batch_used*4,
+				  NULL, 0, 0xffffffff);
 	if (ret != 0) {
 		static int once;
@@ -269,6 +253,6 @@ void intel_batch_wait_last(ScrnInfoPtr scrn)
 	/* Map it CPU write, which guarantees it's done. This is a completely
 	 * non performance path, so we don't need anything better.
 	 */
-	drm_intel_bo_map(intel->last_batch_bo, TRUE);
-	drm_intel_bo_unmap(intel->last_batch_bo);
+	drm_intel_gem_bo_map_gtt(intel->last_batch_bo);
+	drm_intel_gem_bo_unmap_gtt(intel->last_batch_bo);
 }
@@ -41,7 +41,7 @@ void intel_batch_wait_last(ScrnInfoPtr scrn);
 
 static inline int intel_batch_space(intel_screen_private *intel)
 {
-	return (intel->batch_bo->size - BATCH_RESERVED) - (intel->batch_used);
+	return (intel->batch_bo->size - BATCH_RESERVED) - (4*intel->batch_used);
 }
 
 static inline void
@@ -60,7 +60,7 @@ static inline void intel_batch_start_atomic(ScrnInfoPtr scrn, unsigned int sz)
 	intel_batch_require_space(scrn, intel, sz * 4);
 
 	intel->in_batch_atomic = TRUE;
-	intel->batch_atomic_limit = intel->batch_used + sz * 4;
+	intel->batch_atomic_limit = intel->batch_used + sz;
 }
 
 static inline void intel_batch_end_atomic(ScrnInfoPtr scrn)
@@ -74,19 +74,19 @@ static inline void intel_batch_end_atomic(ScrnInfoPtr scrn)
 
 static inline void intel_batch_emit_dword(intel_screen_private *intel, uint32_t dword)
 {
-	*(uint32_t *) (intel->batch_ptr + intel->batch_used) = dword;
-	intel->batch_used += 4;
+	intel->batch_ptr[intel->batch_used++] = dword;
 }
 
 static inline void intel_batch_align(intel_screen_private *intel, uint32_t align)
 {
 	uint32_t delta;
 
+	align /= 4;
 	assert(align);
 
 	if ((delta = intel->batch_used & (align - 1))) {
 		delta = align - delta;
-		memset (intel->batch_ptr + intel->batch_used, 0, delta);
+		memset (intel->batch_ptr + intel->batch_used, 0, 4*delta);
 		intel->batch_used += delta;
 	}
 }
@@ -99,11 +99,11 @@ intel_batch_emit_reloc(intel_screen_private *intel,
 {
 	if (needs_fence)
 		drm_intel_bo_emit_reloc_fence(intel->batch_bo,
-					      intel->batch_used,
+					      intel->batch_used * 4,
 					      bo, delta,
 					      read_domains, write_domains);
 	else
-		drm_intel_bo_emit_reloc(intel->batch_bo, intel->batch_used,
+		drm_intel_bo_emit_reloc(intel->batch_bo, intel->batch_used * 4,
					bo, delta,
					read_domains, write_domains);
@@ -175,7 +175,7 @@ do { \
 		      "ADVANCE_BATCH\n", __FUNCTION__); \
 	assert(!intel->in_batch_atomic); \
 	intel_batch_require_space(scrn, intel, (n) * 4); \
-	intel->batch_emitting = (n) * 4; \
+	intel->batch_emitting = (n); \
 	intel->batch_emit_start = intel->batch_used; \
 } while (0)
@@ -423,8 +423,7 @@ do { \
 
 #define FS_BEGIN() \
 do { \
-	_shader_offset = intel->batch_used; \
-	intel->batch_used += 4; \
+	_shader_offset = intel->batch_used++; \
 } while (0)
 
 #define FS_OUT(_shaderop) \
@@ -436,7 +435,7 @@ do { \
 
 #define FS_END() \
 do { \
-	*(uint32_t *)(intel->batch_ptr + _shader_offset) = \
-	    (_3DSTATE_PIXEL_SHADER_PROGRAM | \
-	     ((intel->batch_used - _shader_offset) / 4 - 2)); \
+	intel->batch_ptr[_shader_offset] = \
+	    _3DSTATE_PIXEL_SHADER_PROGRAM | \
+	    (intel->batch_used - _shader_offset - 2); \
 } while (0);
@@ -1067,7 +1067,7 @@ void
 i915_vertex_flush(intel_screen_private *intel)
 {
 	if (intel->prim_offset) {
-		*(uint32_t *) (intel->batch_ptr + intel->prim_offset) |= intel->prim_count - 1;
+		intel->batch_ptr[intel->prim_offset] |= intel->prim_count - 1;
 		intel->prim_offset = 0;
 	}
 }