Use pwrite to upload the batch buffer
By using pwrite() instead of dri_bo_map() we can write to the batch buffer
through the GTT and are not forced to map it back into the CPU domain and out
again, eliminating a double clflush.

Measuring x11perf text performance on PineView:

Before:
16000000 trep @ 0.0020 msec (511000.0/sec): Char in 80-char aa line (Charter 10)
16000000 trep @ 0.0021 msec (480000.0/sec): Char in 80-char rgb line (Charter 10)

After:
16000000 trep @ 0.0019 msec (532000.0/sec): Char in 80-char aa line (Charter 10)
16000000 trep @ 0.0020 msec (496000.0/sec): Char in 80-char rgb line (Charter 10)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
commit 2b050f330f
parent dcef703a7c
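To make the mechanism concrete, here is a minimal standalone sketch of the new submission path (illustrative only, not the driver's exact code): dwords are staged in a plain CPU-side array and uploaded with dri_bo_subdata(), i.e. pwrite, immediately before execution, so the batch buffer object is never mapped into the CPU domain. dri_bo_subdata() and dri_bo_exec() are the libdrm calls used in the diff below; the batch struct, helper names and sizes are hypothetical.

/*
 * Sketch of the pwrite-based batch submission pattern (assumptions noted
 * above; struct batch, batch_emit() and batch_submit() are illustrative).
 */
#include <stdint.h>
#include <intel_bufmgr.h>	/* libdrm_intel: dri_bo_subdata, dri_bo_exec */

#define BATCH_DWORDS 4096

struct batch {
	dri_bo *bo;			/* GPU buffer object, BATCH_DWORDS*4 bytes */
	uint32_t ptr[BATCH_DWORDS];	/* CPU-side staging array */
	unsigned int used;		/* counted in dwords, not bytes */
};

static void batch_emit(struct batch *b, uint32_t dword)
{
	b->ptr[b->used++] = dword;	/* emit without mapping the BO */
}

static int batch_submit(struct batch *b)
{
	/* Upload the staged dwords via pwrite, then execute the batch. */
	int ret = dri_bo_subdata(b->bo, 0, b->used * 4, b->ptr);
	if (ret == 0)
		ret = dri_bo_exec(b->bo, b->used * 4, NULL, 0, 0xffffffff);
	b->used = 0;
	return ret;
}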
@@ -252,7 +252,7 @@ typedef struct intel_screen_private {
 	dri_bufmgr *bufmgr;
 
-	uint8_t *batch_ptr;
+	uint32_t batch_ptr[4096];
 	/** Byte offset in batch_ptr for the next dword to be emitted. */
 	unsigned int batch_used;
 	/** Position in batch_ptr at the start of the current BEGIN_BATCH */
@@ -42,7 +42,6 @@
 static void intel_next_batch(ScrnInfoPtr scrn)
 {
 	intel_screen_private *intel = intel_get_screen_private(scrn);
-	int ret;
 
 	/* The 865 has issues with larger-than-page-sized batch buffers. */
 	if (IS_I865G(intel))
@@ -52,12 +51,7 @@ static void intel_next_batch(ScrnInfoPtr scrn)
 	intel->batch_bo =
 	    dri_bo_alloc(intel->bufmgr, "batch", 4096 * 4, 4096);
 
-	ret = dri_bo_map(intel->batch_bo, 1);
-	if (ret != 0)
-		FatalError("Failed to map batchbuffer: %s\n", strerror(-ret));
-
 	intel->batch_used = 0;
-	intel->batch_ptr = intel->batch_bo->virtual;
 
 	/* We don't know when another client has executed, so we have
 	 * to reinitialize our 3D state per batch.
@@ -80,9 +74,6 @@ void intel_batch_teardown(ScrnInfoPtr scrn)
 	intel_screen_private *intel = intel_get_screen_private(scrn);
 
 	if (intel->batch_ptr != NULL) {
-		dri_bo_unmap(intel->batch_bo);
-		intel->batch_ptr = NULL;
-
 		dri_bo_unreference(intel->batch_bo);
 		intel->batch_bo = NULL;
@@ -168,31 +159,24 @@ void intel_batch_submit(ScrnInfoPtr scrn)
 	if (intel->vertex_flush)
 		intel->vertex_flush(intel);
 
-	/* Emit a padding dword if we aren't going to be quad-word aligned. */
-	if ((intel->batch_used & 4) == 0) {
-		*(uint32_t *) (intel->batch_ptr + intel->batch_used) = MI_NOOP;
-		intel->batch_used += 4;
-	}
-
 	/* Mark the end of the batchbuffer. */
-	*(uint32_t *) (intel->batch_ptr + intel->batch_used) =
-	    MI_BATCH_BUFFER_END;
-	intel->batch_used += 4;
+	OUT_BATCH(MI_BATCH_BUFFER_END);
+	/* Emit a padding dword if we aren't going to be quad-word aligned. */
+	if (intel->batch_used & 1)
+		OUT_BATCH(MI_NOOP);
 
 	if (DUMP_BATCHBUFFERS) {
 		FILE *file = fopen(DUMP_BATCHBUFFERS, "a");
 		if (file) {
-			fwrite (intel->batch_ptr, intel->batch_used, 1, file);
+			fwrite (intel->batch_ptr, intel->batch_used*4, 1, file);
 			fclose(file);
 		}
 	}
 
-	dri_bo_unmap(intel->batch_bo);
-	intel->batch_ptr = NULL;
-
-	ret =
-	    dri_bo_exec(intel->batch_bo, intel->batch_used, NULL, 0,
-			0xffffffff);
+	ret = dri_bo_subdata(intel->batch_bo, 0, intel->batch_used*4, intel->batch_ptr);
+	if (ret == 0)
+		ret = dri_bo_exec(intel->batch_bo, intel->batch_used*4,
+				  NULL, 0, 0xffffffff);
 	if (ret != 0) {
 		static int once;
@@ -269,6 +253,6 @@ void intel_batch_wait_last(ScrnInfoPtr scrn)
 	/* Map it CPU write, which guarantees it's done. This is a completely
 	 * non performance path, so we don't need anything better.
 	 */
-	drm_intel_bo_map(intel->last_batch_bo, TRUE);
-	drm_intel_bo_unmap(intel->last_batch_bo);
+	drm_intel_gem_bo_map_gtt(intel->last_batch_bo);
+	drm_intel_gem_bo_unmap_gtt(intel->last_batch_bo);
 }
@@ -41,7 +41,7 @@ void intel_batch_wait_last(ScrnInfoPtr scrn);
 
 static inline int intel_batch_space(intel_screen_private *intel)
 {
-	return (intel->batch_bo->size - BATCH_RESERVED) - (intel->batch_used);
+	return (intel->batch_bo->size - BATCH_RESERVED) - (4*intel->batch_used);
 }
 
 static inline void
@@ -60,7 +60,7 @@ static inline void intel_batch_start_atomic(ScrnInfoPtr scrn, unsigned int sz)
 	intel_batch_require_space(scrn, intel, sz * 4);
 
 	intel->in_batch_atomic = TRUE;
-	intel->batch_atomic_limit = intel->batch_used + sz * 4;
+	intel->batch_atomic_limit = intel->batch_used + sz;
 }
 
 static inline void intel_batch_end_atomic(ScrnInfoPtr scrn)
@@ -74,19 +74,19 @@ static inline void intel_batch_end_atomic(ScrnInfoPtr scrn)
 
 static inline void intel_batch_emit_dword(intel_screen_private *intel, uint32_t dword)
 {
-	*(uint32_t *) (intel->batch_ptr + intel->batch_used) = dword;
-	intel->batch_used += 4;
+	intel->batch_ptr[intel->batch_used++] = dword;
 }
 
 static inline void intel_batch_align(intel_screen_private *intel, uint32_t align)
 {
 	uint32_t delta;
 
+	align /= 4;
 	assert(align);
 
 	if ((delta = intel->batch_used & (align - 1))) {
 		delta = align - delta;
-		memset (intel->batch_ptr + intel->batch_used, 0, delta);
+		memset (intel->batch_ptr + intel->batch_used, 0, 4*delta);
 		intel->batch_used += delta;
 	}
 }
@@ -99,11 +99,11 @@ intel_batch_emit_reloc(intel_screen_private *intel,
 {
 	if (needs_fence)
 		drm_intel_bo_emit_reloc_fence(intel->batch_bo,
-					      intel->batch_used,
+					      intel->batch_used * 4,
 					      bo, delta,
 					      read_domains, write_domains);
 	else
-		drm_intel_bo_emit_reloc(intel->batch_bo, intel->batch_used,
+		drm_intel_bo_emit_reloc(intel->batch_bo, intel->batch_used * 4,
					bo, delta,
					read_domains, write_domains);
@@ -175,7 +175,7 @@ do { \
 		      "ADVANCE_BATCH\n", __FUNCTION__); \
 	assert(!intel->in_batch_atomic); \
 	intel_batch_require_space(scrn, intel, (n) * 4); \
-	intel->batch_emitting = (n) * 4; \
+	intel->batch_emitting = (n); \
 	intel->batch_emit_start = intel->batch_used; \
 } while (0)
@@ -423,8 +423,7 @@ do { \
 
 #define FS_BEGIN() \
 do { \
-	_shader_offset = intel->batch_used; \
-	intel->batch_used += 4; \
+	_shader_offset = intel->batch_used++; \
 } while (0)
 
 #define FS_OUT(_shaderop) \
@@ -436,7 +435,7 @@ do { \
 
 #define FS_END() \
 do { \
-	*(uint32_t *)(intel->batch_ptr + _shader_offset) = \
-	    (_3DSTATE_PIXEL_SHADER_PROGRAM | \
-	     ((intel->batch_used - _shader_offset) / 4 - 2)); \
+	intel->batch_ptr[_shader_offset] = \
+	    _3DSTATE_PIXEL_SHADER_PROGRAM | \
+	    (intel->batch_used - _shader_offset - 2); \
 } while (0);
@@ -1067,7 +1067,7 @@ void
 i915_vertex_flush(intel_screen_private *intel)
 {
 	if (intel->prim_offset) {
-		*(uint32_t *) (intel->batch_ptr + intel->prim_offset) |= intel->prim_count - 1;
+		intel->batch_ptr[intel->prim_offset] |= intel->prim_count - 1;
 		intel->prim_offset = 0;
 	}
 }