EXA: try to enable rotation for G965

The new sf/wm should handle the texture sampling only in rotated case. Also fix possible hole in VUE slot.
2007-03-21 14:50:45 +08:00 · 2007-03-21 14:50:45 +08:00 · 3025fa0fb2
parent 223944878c
commit 3025fa0fb2
6 changed files with 333 additions and 6 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -120,20 +120,24 @@ INTEL_G4A =				\
 	packed_yuv_wm.g4a		\
 	exa_sf.g4a 			\
 	exa_sf_mask.g4a 		\
+	exa_sf_rotation.g4a		\
 	exa_wm_maskca.g4a 		\
 	exa_wm_maskca_srcalpha.g4a 	\
 	exa_wm_masknoca.g4a 		\
-	exa_wm_nomask.g4a
+	exa_wm_nomask.g4a		\
+	exa_wm_rotation.g4a

 INTEL_G4H = 				\
 	sf_prog.h			\
 	wm_prog.h 			\
 	exa_sf_mask_prog.h		\
 	exa_sf_prog.h 			\
+	exa_sf_rotation_prog.h		\
 	exa_wm_maskca_prog.h		\
 	exa_wm_maskca_srcalpha_prog.h	\
 	exa_wm_masknoca_prog.h		\
-	exa_wm_nomask_prog.h
+	exa_wm_nomask_prog.h		\
+	exa_wm_rotation_prog.h

 EXTRA_DIST = 		\
 	$(XMODE_SRCS)	\
@ -154,6 +158,9 @@ exa_sf_mask_prog.h: exa_sf_mask.g4a
 	
 exa_sf_prog.h: exa_sf.g4a
 	intel-gen4asm -o exa_sf_prog.h exa_sf.g4a
+
+exa_sf_rotation_prog.h: exa_sf_rotation.g4a
+	intel-gen4asm -o exa_sf_rotation_prog.h exa_sf_rotation.g4a
 	
 exa_wm_maskca_prog.h: exa_wm_maskca.g4a
 	intel-gen4asm -o exa_wm_maskca_prog.h exa_wm_maskca.g4a
@ -166,6 +173,9 @@ exa_wm_masknoca_prog.h: exa_wm_masknoca.g4a
 	
 exa_wm_nomask_prog.h: exa_wm_nomask.g4a
 	intel-gen4asm -o exa_wm_nomask_prog.h exa_wm_nomask.g4a
+
+exa_wm_rotation_prog.h: exa_wm_rotation.g4a
+	intel-gen4asm -o exa_wm_rotation_prog.h exa_wm_rotation.g4a
 	
 endif

--- a/src/exa_sf_rotation.g4a
+++ b/src/exa_sf_rotation.g4a
@ -0,0 +1,29 @@
+/* g6.0 = 1/dx: math-unit reciprocal of the float at g1.12 */
+send (1) 0 g6<1>F g1.12<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+/* g6.4 = 1/dy: math-unit reciprocal of the float at g1.20 */
+send (1) 0 g6.4<1>F g1.20<0,1,0>F math inv scalar mlen 1 rlen 1 { align1 };
+/* (du, dv) = g4.{0,4} - g3.{0,4}, formed as g3 * -1.0 followed by + g4 */
+mul (1) g7<1>F g3<0,1,0>F -1.0F { align1 };
+mul (1) g7.4<1>F g3.4<0,1,0>F -1.0F { align1 };
+add (1) g7<1>F g4<0,1,0>F g7<0,1,0>F { align1 };
+add (1) g7.4<1>F g4.4<0,1,0>F g7.4<0,1,0>F { align1 };

+/* g7.0 = du/dy: du multiplied by the 1/dy held in g6.4 */
+mul (1) g7<1>F g7<0,1,0>F g6.4<0,1,0>F { align1 };
+/* g7.4 = dv/dx: dv multiplied by the 1/dx held in g6.0 */
+mul (1) g7.4<1>F g7.4<0,1,0>F g6<0,1,0>F { align1 };
+/* Cx: m1 = du/dy replicated into all 8 channels */
+mov (8) m1<1>F g7<0,1,0>F { align1 };
+/* Cy: m2 = dv/dx replicated into all 8 channels */
+mov (8) m2<1>F g7.4<0,1,0>F { align1 };
+/* Co: m3 = the 8 floats of g3, copied unchanged */
+mov (8) m3<1>F g3<8,8,1>F { align1 };
+send (8) 0 null g0<8,8,1>F urb 0 transpose used complete mlen 4 rlen 0 { align1 EOT }; /* URB write, 4 message registers from m0, no reply; EOT set */
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
--- a/src/exa_sf_rotation_prog.h
+++ b/src/exa_sf_rotation_prog.h
@ -0,0 +1,20 @@
+   { 0x00000031, 0x20c01fbd, 0x0000002c, 0x01110081 },
+   { 0x00000031, 0x20c41fbd, 0x00000034, 0x01110081 },
+   { 0x00000041, 0x20e07fbd, 0x00000060, 0xbf800000 },
+   { 0x00000041, 0x20e47fbd, 0x00000064, 0xbf800000 },
+   { 0x00000040, 0x20e077bd, 0x00000080, 0x000000e0 },
+   { 0x00000040, 0x20e477bd, 0x00000084, 0x000000e4 },
+   { 0x00000041, 0x20e077bd, 0x000000e0, 0x000000c4 },
+   { 0x00000041, 0x20e477bd, 0x000000e4, 0x000000c0 },
+   { 0x00600001, 0x202003be, 0x000000e0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x000000e4, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0060, 0x00000000 },
+   { 0x00600031, 0x20001fbc, 0x008d0000, 0x8640c800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
--- a/src/exa_wm_rotation.g4a
+++ b/src/exa_wm_rotation.g4a
@ -0,0 +1,158 @@
+/*
+ * This is for the EXA composite operation with no mask picture and a
+ * rotated source: sample the src picture and send the result to dst.
+ */

+/* I think this should be the same as the g4a program for texture video,
+   as we also use 16-pixel dispatch, and the SF scale in g3 is useful for us. */

+/* The initial payload of the thread is always g0.
+ * WM_URB (incoming URB entries) is g3
+ * X0_R is g4
+ * X1_R is g5
+ * Y0_R is g6
+ * Y1_R is g7
+ */

+    /* Set up the X/Y screen coordinates of the pixels in our 4 subspans.  Each
+     * subspan is a 2x2 rectangle, and the screen x/y of the upper left of each
+     * subspan are given in GRF register 1.2 through 1.5 (which, with the byte
+     * addressing of the UW operands below, are 1.8 through 1.22).
+     *
+     * The result is WM_X*_R and WM_Y*_R being:
+     *
+     * X0: {ss0.x, ss0.x+1, ss0.x,   ss0.x+1, ss1.x, ss1.x+1, ss1.x,   ss1.x+1}
+     * Y0: {ss0.y, ss0.y,   ss0.y+1, ss0.y+1, ss1.y, ss1.y,   ss1.y+1, ss1.y+1}
+     * X1: {ss2.x, ss2.x+1, ss2.x,   ss2.x+1, ss3.x, ss3.x+1, ss3.x,   ss3.x+1}
+     * Y1: {ss2.y, ss2.y,   ss2.y+1, ss2.y+1, ss3.y, ss3.y,   ss3.y+1, ss3.y+1}
+     */
+    /* ss0.x: g4.0..g4.12 = {x, x+1, x, x+1}, x = UW at g1.8 written as float */
+mov (1) g4<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.4<1>F g1.8<0,1,0>UW 1UB { align1 };
+mov (1) g4.8<1>F g1.8<0,1,0>UW { align1 };
+add (1) g4.12<1>F g1.8<0,1,0>UW 1UB { align1 };
+    /* ss0.y: g6.0..g6.12 = {y, y, y+1, y+1}, y = UW at g1.10 written as float */
+mov (1) g6<1>F g1.10<0,1,0>UW { align1 };
+mov (1) g6.4<1>F g1.10<0,1,0>UW { align1 };
+add (1) g6.8<1>F g1.10<0,1,0>UW 1UB { align1 };
+add (1) g6.12<1>F g1.10<0,1,0>UW 1UB { align1 };
+    /* ss1.x: g4.16..g4.28 = {x, x+1, x, x+1}, x = UW at g1.12 */
+mov (1) g4.16<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.20<1>F g1.12<0,1,0>UW 1UB { align1 };
+mov (1) g4.24<1>F g1.12<0,1,0>UW { align1 };
+add (1) g4.28<1>F g1.12<0,1,0>UW 1UB { align1 };
+    /* ss1.y: g6.16..g6.28 = {y, y, y+1, y+1}, y = UW at g1.14 */
+mov (1) g6.16<1>F g1.14<0,1,0>UW { align1 };
+mov (1) g6.20<1>F g1.14<0,1,0>UW { align1 };
+add (1) g6.24<1>F g1.14<0,1,0>UW 1UB { align1 };
+add (1) g6.28<1>F g1.14<0,1,0>UW 1UB { align1 };
+    /* ss2.x: g5.0..g5.12 = {x, x+1, x, x+1}, x = UW at g1.16 */
+mov (1) g5<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.4<1>F g1.16<0,1,0>UW 1UB { align1 };
+mov (1) g5.8<1>F g1.16<0,1,0>UW { align1 };
+add (1) g5.12<1>F g1.16<0,1,0>UW 1UB { align1 };
+    /* ss2.y: g7.0..g7.12 = {y, y, y+1, y+1}, y = UW at g1.18 */
+mov (1) g7<1>F g1.18<0,1,0>UW { align1 };
+mov (1) g7.4<1>F g1.18<0,1,0>UW { align1 };
+add (1) g7.8<1>F g1.18<0,1,0>UW 1UB { align1 };
+add (1) g7.12<1>F g1.18<0,1,0>UW 1UB { align1 };
+    /* ss3.x: g5.16..g5.28 = {x, x+1, x, x+1}, x = UW at g1.20 */
+mov (1) g5.16<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.20<1>F g1.20<0,1,0>UW 1UB { align1 };
+mov (1) g5.24<1>F g1.20<0,1,0>UW { align1 };
+add (1) g5.28<1>F g1.20<0,1,0>UW 1UB { align1 };
+    /* ss3.y: g7.16..g7.28 = {y, y, y+1, y+1}, y = UW at g1.22 */
+mov (1) g7.16<1>F g1.22<0,1,0>UW { align1 };
+mov (1) g7.20<1>F g1.22<0,1,0>UW { align1 };
+add (1) g7.24<1>F g1.22<0,1,0>UW 1UB { align1 };
+add (1) g7.28<1>F g1.22<0,1,0>UW 1UB { align1 };
+
+    /* Now, map these screen space coordinates into texture coordinates. */
+    /* subtract screen-space X origin of vertex 0 (the float at g1.0). */
+/* for rotation, texture Y (v) comes from ssX.x, so g4,g5 end up holding v */
+add (8) g4<1>F g4<8,8,1>F -g1<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F -g1<0,1,0>F { align1 };
+    /* scale by the increment taken from the WM_URB data at g3.20 */
+mul (8) g4<1>F g4<8,8,1>F g3.20<0,1,0>F { align1 };
+mul (8) g5<1>F g5<8,8,1>F g3.20<0,1,0>F { align1 };
+    /* add in the offset taken from the WM_URB data at g3.28 */
+add (8) g4<1>F g4<8,8,1>F g3.28<0,1,0>F { align1 };
+add (8) g5<1>F g5<8,8,1>F g3.28<0,1,0>F { align1 };

+/* texture X (u) comes from ssX.y, so g6,g7 end up holding u */
+    /* subtract screen-space Y origin of vertex 0 (the float at g1.4). */
+add (8) g6<1>F g6<8,8,1>F -g1.4<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F -g1.4<0,1,0>F { align1 };
+    /* scale by the increment taken from the WM_URB data at g3.16 */
+mul (8) g6<1>F g6<8,8,1>F g3.16<0,1,0>F { align1 };
+mul (8) g7<1>F g7<8,8,1>F g3.16<0,1,0>F { align1 };
+    /* add in the offset taken from the WM_URB data at g3.12 */
+add (8) g6<1>F g6<8,8,1>F g3.12<0,1,0>F { align1 };
+add (8) g7<1>F g7<8,8,1>F g3.12<0,1,0>F { align1 };
+
+/* sample the source; the readback registers g12-g19 are copied into the write payload afterwards */

+/* use simd16 sampler, param 0 is u, param 1 is v. */
+/* payload: m1,m2 = u from g6,g7 (screen y); m3,m4 = v from g4,g5 (screen x) */
+mov (8) m1<1>F g6<8,8,1>F { align1 };
+mov (8) m2<1>F g7<8,8,1>F { align1 };  
+mov (8) m3<1>F g4<8,8,1>F { align1 };
+mov (8) m4<1>F g5<8,8,1>F { align1 }; 

+/* m0 will be copied from g0, as it contains the send descriptor */
+/* emit sampler 'send' cmd */
+send (16) 0 		/* msg reg index */
+	g12<1>UW 	/* readback */
+	g0<8,8,1>UW  	/* copy to msg start reg*/
+	sampler (1,0,F)  /* sampler message description, (binding_table,sampler_index,datatype) */
+			 /* here(src->dst) we should use src_sampler and src_surface */
+	mlen 5 rlen 8 { align1 };   /* required message len 5, readback len 8 */

+mov (8) g19<1>UD g19<8,8,1>UD { align1 };  /* wait for the sampler return by touching g19, the last of the 8 readback registers */
+/* with the read-back registers filled in, emit the dataport write 'send' cmd with EOT */

+/* m0, m1 are passed straight from the PS thread payload. NOTE(review): m1 is copied again below with mask_disable; this copy looks redundant - confirm */
+mov (8) m1<1>F g1<8,8,1>F { align1 };

+/* prepare data in m2-m5 for subspan(1,0), m6-m9 for subspan(3,2), then it's ready to write */
+/* g12 -> m2
+   g13 -> m6
+   g14 -> m3
+   g15 -> m7
+   g16 -> m4
+   g17 -> m8
+   g18 -> m5
+   g19 -> m9
+*/
+mov (8) m2<1>F g12<8,8,1>F { align1 };
+mov (8) m3<1>F g14<8,8,1>F { align1 };
+mov (8) m4<1>F g16<8,8,1>F { align1 };
+mov (8) m5<1>F g18<8,8,1>F { align1 };
+mov (8) m6<1>F g13<8,8,1>F { align1 };
+mov (8) m7<1>F g15<8,8,1>F { align1 };
+mov (8) m8<1>F g17<8,8,1>F { align1 };
+mov (8) m9<1>F g19<8,8,1>F { align1 };

+/* m1 = g1 from the PS thread payload, copied as raw dwords with mask_disable set */
+mov (8) m1<1>UD g1<8,8,1>UD { align1 mask_disable };

+/* write: message m0..m9 (mlen 10), no reply (rlen 0), EOT set */
+send (16) 0 acc0<1>UW g0<8,8,1>UW write (
+	0,  /* binding_table */
+	8,  /* pixel scoreboard clear, msg type simd16 single source */
+	4,  /* render target write */
+	0   /* no write commit message */
+	) 
+	mlen 10
+	rlen 0
+	{ align1 EOT };

+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
+nop;
--- a/src/exa_wm_rotation_prog.h
+++ b/src/exa_wm_rotation_prog.h
@ -0,0 +1,70 @@
+   { 0x00000001, 0x2080013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x20840d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x2088013d, 0x00000028, 0x00000000 },
+   { 0x00000040, 0x208c0d3d, 0x00000028, 0x00000001 },
+   { 0x00000001, 0x20c0013d, 0x0000002a, 0x00000000 },
+   { 0x00000001, 0x20c4013d, 0x0000002a, 0x00000000 },
+   { 0x00000040, 0x20c80d3d, 0x0000002a, 0x00000001 },
+   { 0x00000040, 0x20cc0d3d, 0x0000002a, 0x00000001 },
+   { 0x00000001, 0x2090013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x20940d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x2098013d, 0x0000002c, 0x00000000 },
+   { 0x00000040, 0x209c0d3d, 0x0000002c, 0x00000001 },
+   { 0x00000001, 0x20d0013d, 0x0000002e, 0x00000000 },
+   { 0x00000001, 0x20d4013d, 0x0000002e, 0x00000000 },
+   { 0x00000040, 0x20d80d3d, 0x0000002e, 0x00000001 },
+   { 0x00000040, 0x20dc0d3d, 0x0000002e, 0x00000001 },
+   { 0x00000001, 0x20a0013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20a40d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20a8013d, 0x00000030, 0x00000000 },
+   { 0x00000040, 0x20ac0d3d, 0x00000030, 0x00000001 },
+   { 0x00000001, 0x20e0013d, 0x00000032, 0x00000000 },
+   { 0x00000001, 0x20e4013d, 0x00000032, 0x00000000 },
+   { 0x00000040, 0x20e80d3d, 0x00000032, 0x00000001 },
+   { 0x00000040, 0x20ec0d3d, 0x00000032, 0x00000001 },
+   { 0x00000001, 0x20b0013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20b40d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20b8013d, 0x00000034, 0x00000000 },
+   { 0x00000040, 0x20bc0d3d, 0x00000034, 0x00000001 },
+   { 0x00000001, 0x20f0013d, 0x00000036, 0x00000000 },
+   { 0x00000001, 0x20f4013d, 0x00000036, 0x00000000 },
+   { 0x00000040, 0x20f80d3d, 0x00000036, 0x00000001 },
+   { 0x00000040, 0x20fc0d3d, 0x00000036, 0x00000001 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x00004020 },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x00004020 },
+   { 0x00600041, 0x208077bd, 0x008d0080, 0x00000074 },
+   { 0x00600041, 0x20a077bd, 0x008d00a0, 0x00000074 },
+   { 0x00600040, 0x208077bd, 0x008d0080, 0x0000007c },
+   { 0x00600040, 0x20a077bd, 0x008d00a0, 0x0000007c },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x00004024 },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x00004024 },
+   { 0x00600041, 0x20c077bd, 0x008d00c0, 0x00000070 },
+   { 0x00600041, 0x20e077bd, 0x008d00e0, 0x00000070 },
+   { 0x00600040, 0x20c077bd, 0x008d00c0, 0x0000006c },
+   { 0x00600040, 0x20e077bd, 0x008d00e0, 0x0000006c },
+   { 0x00600001, 0x202003be, 0x008d00c0, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d00e0, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d0080, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d00a0, 0x00000000 },
+   { 0x00800031, 0x21801d29, 0x008d0000, 0x02580001 },
+   { 0x00600001, 0x22600021, 0x008d0260, 0x00000000 },
+   { 0x00600001, 0x202003be, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x204003be, 0x008d0180, 0x00000000 },
+   { 0x00600001, 0x206003be, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x208003be, 0x008d0200, 0x00000000 },
+   { 0x00600001, 0x20a003be, 0x008d0240, 0x00000000 },
+   { 0x00600001, 0x20c003be, 0x008d01a0, 0x00000000 },
+   { 0x00600001, 0x20e003be, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x210003be, 0x008d0220, 0x00000000 },
+   { 0x00600001, 0x212003be, 0x008d0260, 0x00000000 },
+   { 0x00600201, 0x20200022, 0x008d0020, 0x00000000 },
+   { 0x00800031, 0x24001d28, 0x008d0000, 0x85a04800 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
--- a/src/i965_render.c
+++ b/src/i965_render.c
@ -335,6 +335,10 @@ static const CARD32 sf_kernel_static_mask[][4] = {
 #include "exa_sf_mask_prog.h"
 };

+static const CARD32 sf_kernel_static_rotation[][4] = { /* SF kernel for the rotated-source path */
+#include "exa_sf_rotation_prog.h" /* generated from exa_sf_rotation.g4a by intel-gen4asm */
+};
+
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
@ -355,7 +359,12 @@ static const CARD32 ps_kernel_static_masknoca [][4] = {
 #include "exa_wm_masknoca_prog.h"
 };

-static CARD32 i965_get_card_format(PicturePtr pPict)
+static const CARD32 ps_kernel_static_rotation [][4] = { /* PS (WM) kernel for the rotated-source path */
+#include "exa_wm_rotation_prog.h" /* generated from exa_wm_rotation.g4a by intel-gen4asm */
+};
+
+static CARD32 
+i965_get_card_format(PicturePtr pPict)
 {
    int i;

@ -368,6 +377,21 @@ static CARD32 i965_get_card_format(PicturePtr pPict)
    return i965_tex_formats[i].card_fmt;
 }

+/* TRUE only for an exact +/-90 degree rotation: zero diagonal and off-diagonal
+ * terms of exactly -1/+1 or +1/-1.  Raw 16.16 values are compared because
+ * xFixedToInt() drops the fraction bits, so e.g. -0.5 would read as -1. */
+static Bool
+i965_check_rotation_transform(PictTransformPtr t)
+{
+    xFixed a = t->matrix[0][1];
+    xFixed b = t->matrix[1][0];
+
+    if (t->matrix[0][0] != 0 || t->matrix[1][1] != 0)
+	return FALSE;	/* rotation kernels take u only from y and v only from x */
+    /* (a, b) must be (-1, +1) or (+1, -1) in 16.16 fixed point. */
+    return (a == -xFixed1 && b == xFixed1) || (a == xFixed1 && b == -xFixed1);
+}
+
 Bool
 i965_prepare_composite(int op, PicturePtr pSrcPicture,
 		       PicturePtr pMaskPicture, PicturePtr pDstPicture,
@ -378,6 +402,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    CARD32 src_offset, src_pitch;
    CARD32 mask_offset = 0, mask_pitch = 0;
    CARD32 dst_format, dst_offset, dst_pitch;
+    Bool rotation_program = FALSE;

 #ifdef XF86DRI
    if (pI830->directRenderingEnabled) {
@ -406,6 +431,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	pI830->transform[1] = NULL;
 	pI830->scale_units[1][0] = -1;
 	pI830->scale_units[1][1] = -1;
+	if (pI830->transform[0] && 
+		i965_check_rotation_transform(pI830->transform[0]))
+	    rotation_program = TRUE;
    } else {
 	pI830->transform[1] = pMaskPicture->transform;
 	pI830->scale_units[1][0] = pMask->drawable.width;
@ -442,7 +470,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    sf_kernel_offset = ALIGN(next_offset, 64);
    if (pMask)
 	next_offset = sf_kernel_offset + sizeof (sf_kernel_static_mask);
-    else
+    else if (rotation_program)
+	next_offset = sf_kernel_offset + sizeof (sf_kernel_static_rotation);
+    else 
 	next_offset = sf_kernel_offset + sizeof (sf_kernel_static);

    ps_kernel_offset = ALIGN(next_offset, 64);
@ -459,6 +489,8 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
        } else
 	    next_offset = ps_kernel_offset + 
                          sizeof(ps_kernel_static_masknoca);
+    } else if (rotation_program) {
+   	next_offset = ps_kernel_offset + sizeof (ps_kernel_static_rotation);
    } else {
   	next_offset = ps_kernel_offset + sizeof (ps_kernel_static_nomask);
    }
@ -762,6 +794,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     */
    if (pMask)
 	memcpy(sf_kernel, sf_kernel_static_mask, sizeof (sf_kernel_static));
+    else if (rotation_program)
+	memcpy(sf_kernel, sf_kernel_static_rotation, 
+		sizeof (sf_kernel_static_rotation));
    else
 	memcpy(sf_kernel, sf_kernel_static, sizeof (sf_kernel_static));

@ -808,6 +843,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
        } else
   	    memcpy(ps_kernel, ps_kernel_static_masknoca,
 		   sizeof (ps_kernel_static_masknoca));
+    } else if (rotation_program) {
+   	memcpy(ps_kernel, ps_kernel_static_rotation,
+	       sizeof (ps_kernel_static_rotation));
    } else {
   	memcpy(ps_kernel, ps_kernel_static_nomask,
 	       sizeof (ps_kernel_static_nomask));
@ -973,8 +1011,10 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    	 (0 << VE0_OFFSET_SHIFT));
   	OUT_RING((BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
 	    	 (BRW_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
-	     	 (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_2_SHIFT) |
-	    	 (BRW_VFCOMPONENT_NOSTORE << VE1_VFCOMPONENT_3_SHIFT) |
+	     	 ((pMask ? BRW_VFCOMPONENT_NOSTORE: BRW_VFCOMPONENT_STORE_1_FLT)
+		  << VE1_VFCOMPONENT_2_SHIFT) |
+	    	 ((pMask ? BRW_VFCOMPONENT_NOSTORE: BRW_VFCOMPONENT_STORE_1_FLT)
+		  << VE1_VFCOMPONENT_3_SHIFT) |
 	    	 (0 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT));
   	if (pMask) {
 	    OUT_RING((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |