Keep static copies of the 965 render programs in video memory.

This reduces the CPU overhead of memcpying them in every time, for a speedup
in aa24text of around 30%.  This is based on work by Carl Worth which is
in the intel-batchbuffer branch.
This commit is contained in:
Eric Anholt 2008-04-10 13:17:58 -07:00
parent bd41d2bb85
commit b606278db8
5 changed files with 210 additions and 110 deletions

View File

@ -85,7 +85,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#ifdef I830_USE_EXA
#include "exa.h"
Bool I830EXAInit(ScreenPtr pScreen);
#define EXA_LINEAR_EXTRA (64*1024)
unsigned long long I830TexOffsetStart(PixmapPtr pPix);
#endif
@ -398,7 +397,7 @@ typedef struct _I830Rec {
i830_memory *xaa_scratch_2;
#ifdef I830_USE_EXA
i830_memory *exa_offscreen;
i830_memory *exa_965_state;
i830_memory *gen4_render_state_mem;
#endif
/* Regions allocated either from the above pools, or from agpgart. */
I830RingBuffer *LpRing;
@ -531,6 +530,9 @@ typedef struct _I830Rec {
uint32_t mapstate[6];
uint32_t samplerstate[6];
/* 965 render acceleration state */
struct gen4_render_state *gen4_render_state;
Bool directRenderingDisabled; /* DRI disabled in PreInit. */
Bool directRenderingEnabled; /* DRI enabled this generation. */
@ -824,6 +826,10 @@ Bool i915_prepare_composite(int op, PicturePtr pSrc, PicturePtr pMask,
PicturePtr pDst, PixmapPtr pSrcPixmap,
PixmapPtr pMaskPixmap, PixmapPtr pDstPixmap);
/* i965_render.c */
unsigned int gen4_render_state_size(ScrnInfoPtr pScrn);
void gen4_render_state_init(ScrnInfoPtr pScrn);
void gen4_render_state_cleanup(ScrnInfoPtr pScrn);
void gen4_render_state_reset(ScrnInfoPtr pScrn);
Bool i965_check_composite(int op, PicturePtr pSrc, PicturePtr pMask,
PicturePtr pDst);
Bool i965_prepare_composite(int op, PicturePtr pSrc, PicturePtr pMask,

View File

@ -205,6 +205,9 @@ I830Sync(ScrnInfoPtr pScrn)
pI830->LpRing->space = pI830->LpRing->mem->size - 8;
pI830->nextColorExpandBuf = 0;
if (IS_I965G(pI830))
gen4_render_state_reset(pScrn);
}
void

View File

@ -3190,6 +3190,9 @@ I830LeaveVT(int scrnIndex, int flags)
}
#endif /* XF86DRI_MM */
if (IS_I965G(pI830))
gen4_render_state_cleanup(pScrn);
if (pI830->AccelInfoRec)
pI830->AccelInfoRec->NeedToSync = FALSE;
}
@ -3236,6 +3239,9 @@ I830EnterVT(int scrnIndex, int flags)
/* Update the screen pixmap in case the buffer moved */
i830_update_front_offset(pScrn);
if (IS_I965G(pI830))
gen4_render_state_init(pScrn);
if (i830_check_error_state(pScrn)) {
xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
"Existing errors found in hardware state.\n");

View File

@ -334,7 +334,7 @@ i830_reset_allocations(ScrnInfoPtr pScrn)
pI830->xaa_scratch = NULL;
pI830->xaa_scratch_2 = NULL;
pI830->exa_offscreen = NULL;
pI830->exa_965_state = NULL;
pI830->gen4_render_state_mem = NULL;
pI830->overlay_regs = NULL;
pI830->logical_context = NULL;
#ifdef XF86DRI
@ -1370,11 +1370,14 @@ i830_allocate_2d_memory(ScrnInfoPtr pScrn)
}
/* even in XAA, 965G needs state mem buffer for rendering */
if (IS_I965G(pI830) && !pI830->noAccel && pI830->exa_965_state == NULL) {
pI830->exa_965_state =
if (IS_I965G(pI830) && !pI830->noAccel &&
pI830->gen4_render_state_mem == NULL)
{
pI830->gen4_render_state_mem =
i830_allocate_memory(pScrn, "exa G965 state buffer",
EXA_LINEAR_EXTRA, GTT_PAGE_SIZE, 0);
if (pI830->exa_965_state == NULL) {
gen4_render_state_size(pScrn),
GTT_PAGE_SIZE, 0);
if (pI830->gen4_render_state_mem == NULL) {
xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
"Failed to allocate exa state buffer for 965.\n");
return FALSE;

View File

@ -113,6 +113,12 @@ static struct formatinfo i965_tex_formats[] = {
{PICT_a8, BRW_SURFACEFORMAT_A8_UNORM },
};
/** Private data for gen4 render accel implementation. */
struct gen4_render_state {
unsigned char *state_addr;
unsigned int state_offset;
};
static void i965_get_blend_cntl(int op, PicturePtr pMask, uint32_t dst_format,
uint32_t *sblend, uint32_t *dblend)
{
@ -267,17 +273,12 @@ static struct brw_wm_unit_state *wm_state, wm_state_local;
static struct brw_cc_unit_state *cc_state, cc_state_local;
static struct brw_cc_viewport *cc_viewport;
static struct brw_instruction *sf_kernel;
static struct brw_instruction *ps_kernel;
static struct brw_instruction *sip_kernel;
static uint32_t *binding_table;
static int binding_table_entries;
static int dest_surf_offset, src_surf_offset, mask_surf_offset;
static int src_sampler_offset, mask_sampler_offset,vs_offset;
static int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
static int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
static int wm_scratch_offset;
static int binding_table_offset;
static int default_color_offset;
@ -324,7 +325,7 @@ static const uint32_t sf_kernel_static[][4] = {
#include "exa_sf.g4b"
};
static const uint32_t sf_kernel_static_mask[][4] = {
static const uint32_t sf_kernel_mask_static[][4] = {
#include "exa_sf_mask.g4b"
};
@ -334,21 +335,21 @@ static const uint32_t sf_kernel_static_mask[][4] = {
#define PS_SCRATCH_SPACE 1024
#define PS_SCRATCH_SPACE_LOG 0 /* log2 (PS_SCRATCH_SPACE) - 10 (1024 is 0, 2048 is 1) */
static const uint32_t ps_kernel_static_nomask_affine [][4] = {
static const uint32_t ps_kernel_nomask_affine_static [][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_argb.g4b"
#include "exa_wm_write.g4b"
};
static const uint32_t ps_kernel_static_nomask_projective [][4] = {
static const uint32_t ps_kernel_nomask_projective_static [][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_projective.g4b"
#include "exa_wm_src_sample_argb.g4b"
#include "exa_wm_write.g4b"
};
static const uint32_t ps_kernel_static_maskca_affine [][4] = {
static const uint32_t ps_kernel_maskca_affine_static [][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_argb.g4b"
@ -358,7 +359,7 @@ static const uint32_t ps_kernel_static_maskca_affine [][4] = {
#include "exa_wm_write.g4b"
};
static const uint32_t ps_kernel_static_maskca_projective [][4] = {
static const uint32_t ps_kernel_maskca_projective_static [][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_projective.g4b"
#include "exa_wm_src_sample_argb.g4b"
@ -368,7 +369,7 @@ static const uint32_t ps_kernel_static_maskca_projective [][4] = {
#include "exa_wm_write.g4b"
};
static const uint32_t ps_kernel_static_maskca_srcalpha_affine [][4] = {
static const uint32_t ps_kernel_maskca_srcalpha_affine_static [][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_a.g4b"
@ -378,7 +379,7 @@ static const uint32_t ps_kernel_static_maskca_srcalpha_affine [][4] = {
#include "exa_wm_write.g4b"
};
static const uint32_t ps_kernel_static_maskca_srcalpha_projective [][4] = {
static const uint32_t ps_kernel_maskca_srcalpha_projective_static [][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_projective.g4b"
#include "exa_wm_src_sample_a.g4b"
@ -388,7 +389,7 @@ static const uint32_t ps_kernel_static_maskca_srcalpha_projective [][4] = {
#include "exa_wm_write.g4b"
};
static const uint32_t ps_kernel_static_masknoca_affine [][4] = {
static const uint32_t ps_kernel_masknoca_affine_static [][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_affine.g4b"
#include "exa_wm_src_sample_argb.g4b"
@ -398,7 +399,7 @@ static const uint32_t ps_kernel_static_masknoca_affine [][4] = {
#include "exa_wm_write.g4b"
};
static const uint32_t ps_kernel_static_masknoca_projective [][4] = {
static const uint32_t ps_kernel_masknoca_projective_static [][4] = {
#include "exa_wm_xy.g4b"
#include "exa_wm_src_projective.g4b"
#include "exa_wm_src_sample_argb.g4b"
@ -408,6 +409,62 @@ static const uint32_t ps_kernel_static_masknoca_projective [][4] = {
#include "exa_wm_write.g4b"
};
/**
* Storage for the static kernel data with template name, rounded to 64 bytes.
*/
#define KERNEL_DECL(template) \
uint32_t template [((sizeof (template ## _static) + 63) & ~63) / 16][4];
/**
* Gen4 rendering state buffer structure.
*
* Ideally this structure would contain static data for all of the
* combinations of state that we use for Render acceleration, and another
* buffer would be the use-and-throw-away surface and vertex data. See the
* intel-batchbuffer branch for an implementation of that. For now, it
* has the static program data, and then a changing buffer containing all
* the rest.
*/
typedef struct _gen4_state {
KERNEL_DECL (sip_kernel);
KERNEL_DECL (sf_kernel);
KERNEL_DECL (sf_kernel_mask);
KERNEL_DECL (ps_kernel_nomask_affine);
KERNEL_DECL (ps_kernel_nomask_projective);
KERNEL_DECL (ps_kernel_maskca_affine);
KERNEL_DECL (ps_kernel_maskca_projective);
KERNEL_DECL (ps_kernel_maskca_srcalpha_affine);
KERNEL_DECL (ps_kernel_maskca_srcalpha_projective);
KERNEL_DECL (ps_kernel_masknoca_affine);
KERNEL_DECL (ps_kernel_masknoca_projective);
uint8_t other_state[65536];
} gen4_state_t;
/**
* Called at EnterVT to fill in our state buffer with any static information.
*/
static void
gen4_state_init (gen4_state_t *state)
{
#define KERNEL_COPY(kernel) \
memcpy(state->kernel, kernel ## _static, sizeof(kernel ## _static))
KERNEL_COPY (sip_kernel);
KERNEL_COPY (sf_kernel);
KERNEL_COPY (sf_kernel_mask);
KERNEL_COPY (ps_kernel_nomask_affine);
KERNEL_COPY (ps_kernel_nomask_projective);
KERNEL_COPY (ps_kernel_maskca_affine);
KERNEL_COPY (ps_kernel_maskca_projective);
KERNEL_COPY (ps_kernel_maskca_srcalpha_affine);
KERNEL_COPY (ps_kernel_maskca_srcalpha_projective);
KERNEL_COPY (ps_kernel_masknoca_affine);
KERNEL_COPY (ps_kernel_masknoca_projective);
#undef KERNEL_COPY
}
static uint32_t
i965_get_card_format(PicturePtr pPict)
{
@ -484,7 +541,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
binding_table_entries = 2; /* default no mask */
/* Set up our layout of state in framebuffer. First the general state: */
next_offset = 0;
next_offset = offsetof(gen4_state_t, other_state);
vs_offset = ALIGN(next_offset, 64);
next_offset = vs_offset + sizeof(*vs_state);
@ -500,46 +557,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
cc_offset = ALIGN(next_offset, 32);
next_offset = cc_offset + sizeof(*cc_state);
/* keep current sf_kernel, which will send one setup urb entry to
* PS kernel
*/
sf_kernel_offset = ALIGN(next_offset, 64);
if (pMask)
next_offset = sf_kernel_offset + sizeof (sf_kernel_static_mask);
else
next_offset = sf_kernel_offset + sizeof (sf_kernel_static);
ps_kernel_offset = ALIGN(next_offset, 64);
if (pMask) {
if (pMaskPicture->componentAlpha &&
PICT_FORMAT_RGB(pMaskPicture->format)) {
if (i965_blend_op[op].src_alpha) {
if (is_affine)
next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca_srcalpha_affine);
else
next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca_srcalpha_projective);
} else {
if (is_affine)
next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca_affine);
else
next_offset = ps_kernel_offset + sizeof(ps_kernel_static_maskca_projective);
}
} else {
if (is_affine)
next_offset = ps_kernel_offset + sizeof(ps_kernel_static_masknoca_affine);
else
next_offset = ps_kernel_offset + sizeof(ps_kernel_static_masknoca_projective);
}
} else {
if (is_affine)
next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask_affine);
else
next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask_projective);
}
sip_kernel_offset = ALIGN(next_offset, 64);
next_offset = sip_kernel_offset + sizeof (sip_kernel_static);
/* needed? */
cc_viewport_offset = ALIGN(next_offset, 32);
next_offset = cc_viewport_offset + sizeof(*cc_viewport);
@ -576,16 +593,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
next_offset = default_color_offset + sizeof(*default_color_state);
total_state_size = next_offset;
assert(total_state_size < pI830->exa_965_state->size);
assert(total_state_size < sizeof(gen4_state_t));
state_base_offset = pI830->exa_965_state->offset;
state_base_offset = ALIGN(state_base_offset, 64);
state_base_offset = pI830->gen4_render_state_mem->offset;
assert((state_base_offset & 63) == 0);
state_base = (char *)(pI830->FbBase + state_base_offset);
sf_kernel = (void *)(state_base + sf_kernel_offset);
ps_kernel = (void *)(state_base + ps_kernel_offset);
sip_kernel = (void *)(state_base + sip_kernel_offset);
cc_viewport = (void *)(state_base + cc_viewport_offset);
binding_table = (void *)(state_base + binding_table_offset);
@ -664,9 +677,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
cc_state = (void *)(state_base + cc_offset);
memcpy (cc_state, &cc_state_local, sizeof (cc_state_local));
/* Upload system kernel */
memcpy (sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
/* Set up the state buffer for the destination surface */
dest_surf_state = &dest_surf_state_local;
memset(dest_surf_state, 0, sizeof(*dest_surf_state));
@ -857,16 +867,15 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
* calculate dA/dx and dA/dy. Hand these interpolation coefficients
* back to SF which then hands pixels off to WM.
*/
if (pMask)
memcpy(sf_kernel, sf_kernel_static_mask,
sizeof (sf_kernel_static_mask));
else
memcpy(sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));
sf_state = &sf_state_local;
memset(sf_state, 0, sizeof(*sf_state));
sf_state->thread0.kernel_start_pointer =
(state_base_offset + sf_kernel_offset) >> 6;
if (pMask) {
sf_state->thread0.kernel_start_pointer = (state_base_offset +
offsetof(gen4_state_t, sf_kernel_mask)) >> 6;
} else {
sf_state->thread0.kernel_start_pointer = (state_base_offset +
offsetof(gen4_state_t, sf_kernel)) >> 6;
}
sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
sf_state->sf1.single_program_flow = 1;
sf_state->sf1.binding_table_entry_count = 0;
@ -899,37 +908,64 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
memcpy (sf_state, &sf_state_local, sizeof (sf_state_local));
/* Set up the PS kernel (dispatched by WM) */
if (pMask) {
if (pMaskPicture->componentAlpha &&
PICT_FORMAT_RGB(pMaskPicture->format)) {
if (i965_blend_op[op].src_alpha) {
if (is_affine)
memcpy(ps_kernel, ps_kernel_static_maskca_srcalpha_affine, sizeof (ps_kernel_static_maskca_srcalpha_affine));
else
memcpy(ps_kernel, ps_kernel_static_maskca_srcalpha_projective, sizeof (ps_kernel_static_maskca_srcalpha_projective));
} else {
if (is_affine)
memcpy(ps_kernel, ps_kernel_static_maskca_affine, sizeof (ps_kernel_static_maskca_affine));
else
memcpy(ps_kernel, ps_kernel_static_maskca_projective, sizeof (ps_kernel_static_maskca_projective));
}
} else {
if (is_affine)
memcpy(ps_kernel, ps_kernel_static_masknoca_affine, sizeof (ps_kernel_static_masknoca_affine));
else
memcpy(ps_kernel, ps_kernel_static_masknoca_projective, sizeof (ps_kernel_static_masknoca_projective));
}
} else {
if (is_affine)
memcpy(ps_kernel, ps_kernel_static_nomask_affine, sizeof (ps_kernel_static_nomask_affine));
else
memcpy(ps_kernel, ps_kernel_static_nomask_projective, sizeof (ps_kernel_static_nomask_projective));
}
wm_state = &wm_state_local;
memset(wm_state, 0, sizeof (*wm_state));
wm_state->thread0.kernel_start_pointer =
(state_base_offset + ps_kernel_offset) >> 6;
if (pMask) {
if (pMaskPicture->componentAlpha &&
PICT_FORMAT_RGB(pMaskPicture->format))
{
if (i965_blend_op[op].src_alpha) {
if (is_affine) {
wm_state->thread0.kernel_start_pointer =
(state_base_offset +
offsetof(gen4_state_t,
ps_kernel_maskca_srcalpha_affine)) >> 6;
} else {
wm_state->thread0.kernel_start_pointer =
(state_base_offset +
offsetof(gen4_state_t,
ps_kernel_maskca_srcalpha_projective)) >> 6;
}
} else {
if (is_affine) {
wm_state->thread0.kernel_start_pointer =
(state_base_offset +
offsetof(gen4_state_t,
ps_kernel_maskca_affine)) >> 6;
} else {
wm_state->thread0.kernel_start_pointer =
(state_base_offset +
offsetof(gen4_state_t,
ps_kernel_maskca_projective)) >> 6;
}
}
} else {
if (is_affine) {
wm_state->thread0.kernel_start_pointer =
(state_base_offset +
offsetof(gen4_state_t,
ps_kernel_masknoca_affine)) >> 6;
} else {
wm_state->thread0.kernel_start_pointer =
(state_base_offset +
offsetof(gen4_state_t,
ps_kernel_masknoca_projective)) >> 6;
}
}
} else {
if (is_affine) {
wm_state->thread0.kernel_start_pointer =
(state_base_offset +
offsetof(gen4_state_t,
ps_kernel_nomask_affine)) >> 6;
} else {
wm_state->thread0.kernel_start_pointer =
(state_base_offset +
offsetof(gen4_state_t,
ps_kernel_nomask_projective)) >> 6;
}
}
wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
wm_state->thread1.single_program_flow = 0;
if (!pMask)
@ -1006,7 +1042,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
/* Set system instruction pointer */
OUT_BATCH(BRW_STATE_SIP | 0);
OUT_BATCH(state_base_offset + sip_kernel_offset);
OUT_BATCH(state_base_offset + offsetof(gen4_state_t, sip_kernel));
OUT_BATCH(MI_NOOP);
ADVANCE_BATCH();
}
@ -1328,3 +1364,49 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
*/
i830MarkSync(pScrn);
}
/**
* Called at EnterVT so we can set up our offsets into the state buffer.
*/
void
gen4_render_state_init(ScrnInfoPtr pScrn)
{
I830Ptr pI830 = I830PTR(pScrn);
struct gen4_render_state *state;
if (pI830->gen4_render_state == NULL)
pI830->gen4_render_state = calloc(sizeof(*state), 1);
state = pI830->gen4_render_state;
state->state_offset = pI830->gen4_render_state_mem->offset;
state->state_addr = pI830->FbBase + pI830->gen4_render_state_mem->offset;
gen4_state_init((gen4_state_t *)state->state_addr);
}
/**
* Called at LeaveVT.
*/
void
gen4_render_state_cleanup(ScrnInfoPtr pScrn)
{
I830Ptr pI830 = I830PTR(pScrn);
pI830->gen4_render_state->state_addr = NULL;
}
/**
* Called when the hardware is idled and flushed, so we know we can
* reuse the buffer contents.
*/
void
gen4_render_state_reset(ScrnInfoPtr pScrn)
{
}
unsigned int
gen4_render_state_size(ScrnInfoPtr pScrn)
{
return sizeof(gen4_state_t);
}