diff --git a/src/sna/compiler.h b/src/sna/compiler.h
index 23ec31c3..fe2e3214 100644
--- a/src/sna/compiler.h
+++ b/src/sna/compiler.h
@@ -52,6 +52,16 @@
 #define flatten
 #endif
 
+#if defined(__GNUC__) && (__GNUC__ >= 4) /* 4.4 */
+#define sse2 __attribute__((target("sse2")))
+#define sse4_2 __attribute__((target("sse4.2,sse2")))
+#define avx2 __attribute__((target("avx2,sse4.2,sse2")))
+#else
+#define sse2
+#define sse4_2
+#define avx2
+#endif
+
 #ifdef HAVE_VALGRIND
 #define VG(x) x
 #else
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index d08d762b..92802469 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -1945,7 +1945,7 @@ gen4_render_composite(struct sna *sna,
 					     tmp->mask.bo != NULL,
 					     tmp->has_component_alpha,
 					     tmp->is_affine);
-	tmp->u.gen4.ve_id = gen4_choose_composite_emitter(tmp);
+	tmp->u.gen4.ve_id = gen4_choose_composite_emitter(sna, tmp);
 
 	tmp->blt   = gen4_render_composite_blt;
 	tmp->box   = gen4_render_composite_box;
@@ -2186,7 +2186,7 @@ gen4_render_composite_spans(struct sna *sna,
 	tmp->base.has_component_alpha = false;
 	tmp->base.need_magic_ca_pass = false;
 
-	tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(tmp);
+	tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(sna, tmp);
 	tmp->base.u.gen4.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
 
 	tmp->box   = gen4_render_composite_spans_box;
diff --git a/src/sna/gen4_vertex.c b/src/sna/gen4_vertex.c
index 5062ebdf..20f85b37 100644
--- a/src/sna/gen4_vertex.c
+++ b/src/sna/gen4_vertex.c
@@ -272,10 +272,10 @@ emit_texcoord(struct sna *sna,
 
 inline static void
 emit_vertex(struct sna *sna,
-		      const struct sna_composite_op *op,
-		      int16_t srcX, int16_t srcY,
-		      int16_t mskX, int16_t mskY,
-		      int16_t dstX, int16_t dstY)
+	    const struct sna_composite_op *op,
+	    int16_t srcX, int16_t srcY,
+	    int16_t mskX, int16_t mskY,
+	    int16_t dstX, int16_t dstY)
 {
 	OUT_VERTEX(dstX, dstY);
 	emit_texcoord(sna, &op->src, srcX, srcY);
@@ -414,6 +414,66 @@ emit_primitive_linear(struct sna *sna,
 	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
 }
 
+sse4_2 fastcall static void
+emit_primitive_linear__sse4_2(struct sna *sna,
+			      const struct sna_composite_op *op,
+			      const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	assert(op->floats_per_rect == 6);
+	assert((sna->render.vertex_used % 2) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 6;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	dst.p.x = r->dst.x;
+	v[2] = dst.f;
+	dst.p.y = r->dst.y;
+	v[4] = dst.f;
+
+	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
+	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
+	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
+}
+
+avx2 fastcall static void
+emit_primitive_linear__avx2(struct sna *sna,
+			    const struct sna_composite_op *op,
+			    const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	assert(op->floats_per_rect == 6);
+	assert((sna->render.vertex_used % 2) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 6;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	dst.p.x = r->dst.x;
+	v[2] = dst.f;
+	dst.p.y = r->dst.y;
+	v[4] = dst.f;
+
+	v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
+	v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
+	v[5] = compute_linear(&op->src, r->src.x, r->src.y);
+}
+
 fastcall static void
 emit_boxes_linear(const struct sna_composite_op *op,
 		  const BoxRec *box, int nbox,
@@ -442,6 +502,62 @@ emit_boxes_linear(const struct sna_composite_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_boxes_linear__sse4_2(const struct sna_composite_op *op,
+			  const BoxRec *box, int nbox,
+			  float *v)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	do {
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[2] = dst.f;
+		dst.p.y = box->y1;
+		v[4] = dst.f;
+
+		v[1] = compute_linear(&op->src, box->x2, box->y2);
+		v[3] = compute_linear(&op->src, box->x1, box->y2);
+		v[5] = compute_linear(&op->src, box->x1, box->y1);
+
+		v += 6;
+		box++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_boxes_linear__avx2(const struct sna_composite_op *op,
+			const BoxRec *box, int nbox,
+			float *v)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	do {
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[2] = dst.f;
+		dst.p.y = box->y1;
+		v[4] = dst.f;
+
+		v[1] = compute_linear(&op->src, box->x2, box->y2);
+		v[3] = compute_linear(&op->src, box->x1, box->y2);
+		v[5] = compute_linear(&op->src, box->x1, box->y1);
+
+		v += 6;
+		box++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_primitive_identity_source(struct sna *sna,
 			       const struct sna_composite_op *op,
@@ -473,6 +589,68 @@ emit_primitive_identity_source(struct sna *sna,
 	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
 }
 
+sse4_2 fastcall static void
+emit_primitive_identity_source__sse4_2(struct sna *sna,
+				       const struct sna_composite_op *op,
+				       const struct sna_composite_rectangles *r)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+
+	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
+	v[1] = v[4] + r->width * op->src.scale[0];
+
+	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
+	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
+}
+
+avx2 fastcall static void
+emit_primitive_identity_source__avx2(struct sna *sna,
+				     const struct sna_composite_op *op,
+				     const struct sna_composite_rectangles *r)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+
+	v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
+	v[1] = v[4] + r->width * op->src.scale[0];
+
+	v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
+	v[5] = v[2] = v[8] + r->height * op->src.scale[1];
+}
+
 fastcall static void
 emit_boxes_identity_source(const struct sna_composite_op *op,
 			   const BoxRec *box, int nbox,
@@ -503,6 +681,66 @@ emit_boxes_identity_source(const struct sna_composite_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_boxes_identity_source__sse4_2(const struct sna_composite_op *op,
+				   const BoxRec *box, int nbox,
+				   float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+
+		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_boxes_identity_source__avx2(const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox,
+				 float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+
+		v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+		v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+		v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+		v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_primitive_simple_source(struct sna *sna,
 			     const struct sna_composite_op *op,
@@ -543,6 +781,86 @@ emit_primitive_simple_source(struct sna *sna,
 	v[8] = ((r->src.y + ty) * yy + y0) * sy;
 }
 
+sse4_2 fastcall static void
+emit_primitive_simple_source__sse4_2(struct sna *sna,
+				     const struct sna_composite_op *op,
+				     const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	assert(op->floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*3;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
+	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
+
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
+
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+	v[8] = ((r->src.y + ty) * yy + y0) * sy;
+}
+
+avx2 fastcall static void
+emit_primitive_simple_source__avx2(struct sna *sna,
+				   const struct sna_composite_op *op,
+				   const struct sna_composite_rectangles *r)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	assert(op->floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*3;
+
+	dst.p.x = r->dst.x + r->width;
+	dst.p.y = r->dst.y + r->height;
+	v[0] = dst.f;
+	v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
+	v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
+
+	dst.p.x = r->dst.x;
+	v[3] = dst.f;
+	v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
+
+	dst.p.y = r->dst.y;
+	v[6] = dst.f;
+	v[8] = ((r->src.y + ty) * yy + y0) * sy;
+}
+
 fastcall static void
 emit_boxes_simple_source(const struct sna_composite_op *op,
 			 const BoxRec *box, int nbox,
@@ -582,6 +900,84 @@ emit_boxes_simple_source(const struct sna_composite_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_boxes_simple_source__sse4_2(const struct sna_composite_op *op,
+				 const BoxRec *box, int nbox,
+				 float *v)
+{
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		v[1] = ((box->x2 + tx) * xx + x0) * sx;
+		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+		v[8] = ((box->y1 + ty) * yy + y0) * sy;
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_boxes_simple_source__avx2(const struct sna_composite_op *op,
+			       const BoxRec *box, int nbox,
+			       float *v)
+{
+	float xx = op->src.transform->matrix[0][0];
+	float x0 = op->src.transform->matrix[0][2];
+	float yy = op->src.transform->matrix[1][1];
+	float y0 = op->src.transform->matrix[1][2];
+	float sx = op->src.scale[0];
+	float sy = op->src.scale[1];
+	int16_t tx = op->src.offset[0];
+	int16_t ty = op->src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = box->x2;
+		dst.p.y = box->y2;
+		v[0] = dst.f;
+		v[1] = ((box->x2 + tx) * xx + x0) * sx;
+		v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = box->x1;
+		v[3] = dst.f;
+		v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = box->y1;
+		v[6] = dst.f;
+		v[8] = ((box->y1 + ty) * yy + y0) * sy;
+
+		v += 9;
+		box++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_primitive_affine_source(struct sna *sna,
 			     const struct sna_composite_op *op,
@@ -981,7 +1377,7 @@ emit_composite_texcoord_affine(struct sna *sna,
 }
 
 
-unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
+unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp)
 {
 	unsigned vb;
 
@@ -1060,14 +1456,30 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 			vb = 1;
 		} else if (tmp->src.is_linear) {
 			DBG(("%s: linear, no mask\n", __FUNCTION__));
-			tmp->prim_emit = emit_primitive_linear;
-			tmp->emit_boxes = emit_boxes_linear;
+			if (sna->cpu_features & AVX2) {
+				tmp->prim_emit = emit_primitive_linear__avx2;
+				tmp->emit_boxes = emit_boxes_linear__avx2;
+			} else  if (sna->cpu_features & SSE4_2) {
+				tmp->prim_emit = emit_primitive_linear__sse4_2;
+				tmp->emit_boxes = emit_boxes_linear__sse4_2;
+			} else {
+				tmp->prim_emit = emit_primitive_linear;
+				tmp->emit_boxes = emit_boxes_linear;
+			}
 			tmp->floats_per_vertex = 2;
 			vb = 1;
 		} else if (tmp->src.transform == NULL) {
 			DBG(("%s: identity src, no mask\n", __FUNCTION__));
-			tmp->prim_emit = emit_primitive_identity_source;
-			tmp->emit_boxes = emit_boxes_identity_source;
+			if (sna->cpu_features & AVX2) {
+				tmp->prim_emit = emit_primitive_identity_source__avx2;
+				tmp->emit_boxes = emit_boxes_identity_source__avx2;
+			} else if (sna->cpu_features & SSE4_2) {
+				tmp->prim_emit = emit_primitive_identity_source__sse4_2;
+				tmp->emit_boxes = emit_boxes_identity_source__sse4_2;
+			} else {
+				tmp->prim_emit = emit_primitive_identity_source;
+				tmp->emit_boxes = emit_boxes_identity_source;
+			}
 			tmp->floats_per_vertex = 3;
 			vb = 2;
 		} else if (tmp->src.is_affine) {
@@ -1075,8 +1487,16 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
 			tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
 			if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
 				DBG(("%s: simple src, no mask\n", __FUNCTION__));
-				tmp->prim_emit = emit_primitive_simple_source;
-				tmp->emit_boxes = emit_boxes_simple_source;
+				if (sna->cpu_features & AVX2) {
+					tmp->prim_emit = emit_primitive_simple_source__avx2;
+					tmp->emit_boxes = emit_boxes_simple_source__avx2;
+				} else if (sna->cpu_features & SSE4_2) {
+					tmp->prim_emit = emit_primitive_simple_source__sse4_2;
+					tmp->emit_boxes = emit_boxes_simple_source__sse4_2;
+				} else {
+					tmp->prim_emit = emit_primitive_simple_source;
+					tmp->emit_boxes = emit_boxes_simple_source;
+				}
 			} else {
 				DBG(("%s: affine src, no mask\n", __FUNCTION__));
 				tmp->prim_emit = emit_primitive_affine_source;
@@ -1222,6 +1642,86 @@ emit_span_identity(struct sna *sna,
 	v[11] = v[7] = v[3] = opacity;
 }
 
+sse4_2 fastcall static void
+emit_span_identity__sse4_2(struct sna *sna,
+			   const struct sna_composite_spans_op *op,
+			   const BoxRec *box,
+			   float opacity)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*4;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	v[1] = (box->x2 + tx) * sx;
+	v[6] = v[2] = (box->y2 + ty) * sy;
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	v[9] = v[5] = (box->x1 + tx) * sx;
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	v[10] = (box->y1 + ty) * sy;
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
+avx2 fastcall static void
+emit_span_identity__avx2(struct sna *sna,
+			 const struct sna_composite_spans_op *op,
+			 const BoxRec *box,
+			 float opacity)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*4;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	v[1] = (box->x2 + tx) * sx;
+	v[6] = v[2] = (box->y2 + ty) * sy;
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	v[9] = v[5] = (box->x1 + tx) * sx;
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	v[10] = (box->y1 + ty) * sy;
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
 fastcall static void
 emit_span_boxes_identity(const struct sna_composite_spans_op *op,
 			 const struct sna_opacity_box *b, int nbox,
@@ -1259,11 +1759,173 @@ emit_span_boxes_identity(const struct sna_composite_spans_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_span_boxes_identity__sse4_2(const struct sna_composite_spans_op *op,
+				 const struct sna_opacity_box *b, int nbox,
+				 float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		float sx = op->base.src.scale[0];
+		float sy = op->base.src.scale[1];
+		int16_t tx = op->base.src.offset[0];
+		int16_t ty = op->base.src.offset[1];
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = (b->box.x2 + tx) * sx;
+		v[6] = v[2] = (b->box.y2 + ty) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = (b->box.x1 + tx) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = (b->box.y1 + ty) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_span_boxes_identity__avx2(const struct sna_composite_spans_op *op,
+			       const struct sna_opacity_box *b, int nbox,
+			       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		float sx = op->base.src.scale[0];
+		float sy = op->base.src.scale[1];
+		int16_t tx = op->base.src.offset[0];
+		int16_t ty = op->base.src.offset[1];
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = (b->box.x2 + tx) * sx;
+		v[6] = v[2] = (b->box.y2 + ty) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = (b->box.x1 + tx) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = (b->box.y1 + ty) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_span_simple(struct sna *sna,
-		  const struct sna_composite_spans_op *op,
-		  const BoxRec *box,
-		  float opacity)
+		 const struct sna_composite_spans_op *op,
+		 const BoxRec *box,
+		 float opacity)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*4;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	v[1] = ((box->x2 + tx) * xx + x0) * sx;
+	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	v[10] = ((box->y1 + ty) * yy + y0) * sy;
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
+sse4_2 fastcall static void
+emit_span_simple__sse4_2(struct sna *sna,
+			 const struct sna_composite_spans_op *op,
+			 const BoxRec *box,
+			 float opacity)
+{
+	float *v;
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 3*4;
+	assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	v[1] = ((box->x2 + tx) * xx + x0) * sx;
+	v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	v[10] = ((box->y1 + ty) * yy + y0) * sy;
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
+avx2 fastcall static void
+emit_span_simple__avx2(struct sna *sna,
+		       const struct sna_composite_spans_op *op,
+		       const BoxRec *box,
+		       float opacity)
 {
 	float *v;
 	union {
@@ -1344,6 +2006,88 @@ emit_span_boxes_simple(const struct sna_composite_spans_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_span_boxes_simple__sse4_2(const struct sna_composite_spans_op *op,
+			       const struct sna_opacity_box *b, int nbox,
+			       float *v)
+{
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
+		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_span_boxes_simple__avx2(const struct sna_composite_spans_op *op,
+			     const struct sna_opacity_box *b, int nbox,
+			     float *v)
+{
+	float xx = op->base.src.transform->matrix[0][0];
+	float x0 = op->base.src.transform->matrix[0][2];
+	float yy = op->base.src.transform->matrix[1][1];
+	float y0 = op->base.src.transform->matrix[1][2];
+	float sx = op->base.src.scale[0];
+	float sy = op->base.src.scale[1];
+	int16_t tx = op->base.src.offset[0];
+	int16_t ty = op->base.src.offset[1];
+
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
+		v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_span_affine(struct sna *sna,
 		  const struct sna_composite_spans_op *op,
@@ -1389,6 +2133,96 @@ emit_span_affine(struct sna *sna,
 	v[11] = v[7] = v[3] = opacity;
 }
 
+sse4_2 fastcall static void
+emit_span_affine__sse4_2(struct sna *sna,
+			 const struct sna_composite_spans_op *op,
+			 const BoxRec *box,
+			 float opacity)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 12;
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
+				    op->base.src.offset[1] + box->y2,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[1], &v[2]);
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+				    op->base.src.offset[1] + box->y2,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[5], &v[6]);
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+				    op->base.src.offset[1] + box->y1,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[9], &v[10]);
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
+avx2 fastcall static void
+emit_span_affine__avx2(struct sna *sna,
+		       const struct sna_composite_spans_op *op,
+		       const BoxRec *box,
+		       float opacity)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->base.floats_per_rect == 12);
+	assert((sna->render.vertex_used % 4) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 12;
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
+				    op->base.src.offset[1] + box->y2,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[1], &v[2]);
+
+	dst.p.x = box->x1;
+	v[4] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+				    op->base.src.offset[1] + box->y2,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[5], &v[6]);
+
+	dst.p.y = box->y1;
+	v[8] = dst.f;
+	_sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+				    op->base.src.offset[1] + box->y1,
+				    op->base.src.transform,
+				    op->base.src.scale,
+				    &v[9], &v[10]);
+
+	v[11] = v[7] = v[3] = opacity;
+}
+
 fastcall static void
 emit_span_boxes_affine(const struct sna_composite_spans_op *op,
 		       const struct sna_opacity_box *b, int nbox,
@@ -1432,11 +2266,161 @@ emit_span_boxes_affine(const struct sna_composite_spans_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_span_boxes_affine__sse4_2(const struct sna_composite_spans_op *op,
+			       const struct sna_opacity_box *b, int nbox,
+			       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[1], &v[2]);
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[5], &v[6]);
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y1,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[9], &v[10]);
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_span_boxes_affine__avx2(const struct sna_composite_spans_op *op,
+			     const struct sna_opacity_box *b, int nbox,
+			     float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[1], &v[2]);
+
+		dst.p.x = b->box.x1;
+		v[4] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y2,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[5], &v[6]);
+
+		dst.p.y = b->box.y1;
+		v[8] = dst.f;
+		_sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+					    op->base.src.offset[1] + b->box.y1,
+					    op->base.src.transform,
+					    op->base.src.scale,
+					    &v[9], &v[10]);
+
+		v[11] = v[7] = v[3] = b->alpha;
+
+		v += 12;
+		b++;
+	} while (--nbox);
+}
+
 fastcall static void
 emit_span_linear(struct sna *sna,
-		  const struct sna_composite_spans_op *op,
-		  const BoxRec *box,
-		  float opacity)
+		 const struct sna_composite_spans_op *op,
+		 const BoxRec *box,
+		 float opacity)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->base.floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	dst.p.x = box->x1;
+	v[3] = dst.f;
+	dst.p.y = box->y1;
+	v[6] = dst.f;
+
+	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
+	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
+	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
+
+	v[8] = v[5] = v[2] = opacity;
+}
+
+sse4_2 fastcall static void
+emit_span_linear__sse4_2(struct sna *sna,
+			 const struct sna_composite_spans_op *op,
+			 const BoxRec *box,
+			 float opacity)
+{
+	union {
+		struct sna_coordinate p;
+		float f;
+	} dst;
+	float *v;
+
+	assert(op->base.floats_per_rect == 9);
+	assert((sna->render.vertex_used % 3) == 0);
+	v = sna->render.vertices + sna->render.vertex_used;
+	sna->render.vertex_used += 9;
+
+	dst.p.x = box->x2;
+	dst.p.y = box->y2;
+	v[0] = dst.f;
+	dst.p.x = box->x1;
+	v[3] = dst.f;
+	dst.p.y = box->y1;
+	v[6] = dst.f;
+
+	v[1] = compute_linear(&op->base.src, box->x2, box->y2);
+	v[4] = compute_linear(&op->base.src, box->x1, box->y2);
+	v[7] = compute_linear(&op->base.src, box->x1, box->y1);
+
+	v[8] = v[5] = v[2] = opacity;
+}
+
+avx2 fastcall static void
+emit_span_linear__avx2(struct sna *sna,
+		       const struct sna_composite_spans_op *op,
+		       const BoxRec *box,
+		       float opacity)
 {
 	union {
 		struct sna_coordinate p;
@@ -1494,6 +2478,66 @@ emit_span_boxes_linear(const struct sna_composite_spans_op *op,
 	} while (--nbox);
 }
 
+sse4_2 fastcall static void
+emit_span_boxes_linear__sse4_2(const struct sna_composite_spans_op *op,
+			       const struct sna_opacity_box *b, int nbox,
+			       float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		dst.p.x = b->box.x1;
+		v[3] = dst.f;
+		dst.p.y = b->box.y1;
+		v[6] = dst.f;
+
+		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
+		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
+		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
+
+		v[8] = v[5] = v[2] = b->alpha;
+
+		v += 9;
+		b++;
+	} while (--nbox);
+}
+
+avx2 fastcall static void
+emit_span_boxes_linear__avx2(const struct sna_composite_spans_op *op,
+			     const struct sna_opacity_box *b, int nbox,
+			     float *v)
+{
+	do {
+		union {
+			struct sna_coordinate p;
+			float f;
+		} dst;
+
+		dst.p.x = b->box.x2;
+		dst.p.y = b->box.y2;
+		v[0] = dst.f;
+		dst.p.x = b->box.x1;
+		v[3] = dst.f;
+		dst.p.y = b->box.y1;
+		v[6] = dst.f;
+
+		v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
+		v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
+		v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
+
+		v[8] = v[5] = v[2] = b->alpha;
+
+		v += 9;
+		b++;
+	} while (--nbox);
+}
+
 inline inline static uint32_t
 gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op)
 {
@@ -1502,7 +2546,8 @@ gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op)
 	return 1 << 2 | id;
 }
 
-unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp)
+unsigned gen4_choose_spans_emitter(struct sna *sna,
+				   struct sna_composite_spans_op *tmp)
 {
 	unsigned vb;
 
@@ -1512,24 +2557,56 @@ unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp)
 		tmp->base.floats_per_vertex = 3;
 		vb = 1 << 2 | 1;
 	} else if (tmp->base.src.is_linear) {
-		tmp->prim_emit = emit_span_linear;
-		tmp->emit_boxes = emit_span_boxes_linear;
+		if (sna->cpu_features & AVX2) {
+			tmp->prim_emit = emit_span_linear__avx2;
+			tmp->emit_boxes = emit_span_boxes_linear__avx2;
+		} else if (sna->cpu_features & SSE4_2) {
+			tmp->prim_emit = emit_span_linear__sse4_2;
+			tmp->emit_boxes = emit_span_boxes_linear__sse4_2;
+		} else {
+			tmp->prim_emit = emit_span_linear;
+			tmp->emit_boxes = emit_span_boxes_linear;
+		}
 		tmp->base.floats_per_vertex = 3;
 		vb = 1 << 2 | 1;
 	} else if (tmp->base.src.transform == NULL) {
-		tmp->prim_emit = emit_span_identity;
-		tmp->emit_boxes = emit_span_boxes_identity;
+		if (sna->cpu_features & AVX2) {
+			tmp->prim_emit = emit_span_identity__avx2;
+			tmp->emit_boxes = emit_span_boxes_identity__avx2;
+		} else if (sna->cpu_features & SSE4_2) {
+			tmp->prim_emit = emit_span_identity__sse4_2;
+			tmp->emit_boxes = emit_span_boxes_identity__sse4_2;
+		} else {
+			tmp->prim_emit = emit_span_identity;
+			tmp->emit_boxes = emit_span_boxes_identity;
+		}
 		tmp->base.floats_per_vertex = 4;
 		vb = 1 << 2 | 2;
 	} else if (tmp->base.is_affine) {
 		tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
 		tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
 		if (!sna_affine_transform_is_rotation(tmp->base.src.transform)) {
-			tmp->prim_emit = emit_span_simple;
-			tmp->emit_boxes = emit_span_boxes_simple;
+			if (sna->cpu_features & AVX2) {
+				tmp->prim_emit = emit_span_simple__avx2;
+				tmp->emit_boxes = emit_span_boxes_simple__avx2;
+			} else if (sna->cpu_features & SSE4_2) {
+				tmp->prim_emit = emit_span_simple__sse4_2;
+				tmp->emit_boxes = emit_span_boxes_simple__sse4_2;
+			} else {
+				tmp->prim_emit = emit_span_simple;
+				tmp->emit_boxes = emit_span_boxes_simple;
+			}
 		} else {
-			tmp->prim_emit = emit_span_affine;
-			tmp->emit_boxes = emit_span_boxes_affine;
+			if (sna->cpu_features & AVX2) {
+				tmp->prim_emit = emit_span_affine__avx2;
+				tmp->emit_boxes = emit_span_boxes_affine__avx2;
+			} else if (sna->cpu_features & SSE4_2) {
+				tmp->prim_emit = emit_span_affine__sse4_2;
+				tmp->emit_boxes = emit_span_boxes_affine__sse4_2;
+			} else {
+				tmp->prim_emit = emit_span_affine;
+				tmp->emit_boxes = emit_span_boxes_affine;
+			}
 		}
 		tmp->base.floats_per_vertex = 4;
 		vb = 1 << 2 | 2;
diff --git a/src/sna/gen4_vertex.h b/src/sna/gen4_vertex.h
index 431b545e..1494ba14 100644
--- a/src/sna/gen4_vertex.h
+++ b/src/sna/gen4_vertex.h
@@ -10,7 +10,7 @@ void gen4_vertex_flush(struct sna *sna);
 int gen4_vertex_finish(struct sna *sna);
 void gen4_vertex_close(struct sna *sna);
 
-unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp);
-unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp);
+unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp);
+unsigned gen4_choose_spans_emitter(struct sna *sna, struct sna_composite_spans_op *tmp);
 
 #endif /* GEN4_VERTEX_H */
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index f2368773..8b9eaac1 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -1924,7 +1924,7 @@ gen5_render_composite(struct sna *sna,
 					     tmp->mask.bo != NULL,
 					     tmp->has_component_alpha,
 					     tmp->is_affine);
-	tmp->u.gen5.ve_id = gen4_choose_composite_emitter(tmp);
+	tmp->u.gen5.ve_id = gen4_choose_composite_emitter(sna, tmp);
 
 	tmp->blt   = gen5_render_composite_blt;
 	tmp->box   = gen5_render_composite_box;
@@ -2152,7 +2152,7 @@ gen5_render_composite_spans(struct sna *sna,
 	tmp->base.has_component_alpha = false;
 	tmp->base.need_magic_ca_pass = false;
 
-	tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(tmp);
+	tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(sna, tmp);
 	tmp->base.u.gen5.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
 
 	tmp->box   = gen5_render_composite_spans_box;
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index fa4c47b1..d4105141 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -2272,7 +2272,7 @@ gen6_render_composite(struct sna *sna,
 							    tmp->mask.bo != NULL,
 							    tmp->has_component_alpha,
 							    tmp->is_affine),
-			       gen4_choose_composite_emitter(tmp));
+			       gen4_choose_composite_emitter(sna, tmp));
 
 	tmp->blt   = gen6_render_composite_blt;
 	tmp->box   = gen6_render_composite_box;
@@ -2508,7 +2508,7 @@ gen6_render_composite_spans(struct sna *sna,
 					      SAMPLER_EXTEND_PAD),
 			       gen6_get_blend(tmp->base.op, false, tmp->base.dst.format),
 			       GEN6_WM_KERNEL_OPACITY | !tmp->base.is_affine,
-			       gen4_choose_spans_emitter(tmp));
+			       gen4_choose_spans_emitter(sna, tmp));
 
 	tmp->box   = gen6_render_composite_spans_box;
 	tmp->boxes = gen6_render_composite_spans_boxes;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index bd14d90f..7984cf19 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -2412,7 +2412,7 @@ gen7_render_composite(struct sna *sna,
 							    tmp->mask.bo != NULL,
 							    tmp->has_component_alpha,
 							    tmp->is_affine),
-			       gen4_choose_composite_emitter(tmp));
+			       gen4_choose_composite_emitter(sna, tmp));
 
 	tmp->blt   = gen7_render_composite_blt;
 	tmp->box   = gen7_render_composite_box;
@@ -2628,7 +2628,7 @@ gen7_render_composite_spans(struct sna *sna,
 					      SAMPLER_EXTEND_PAD),
 			       gen7_get_blend(tmp->base.op, false, tmp->base.dst.format),
 			       GEN7_WM_KERNEL_OPACITY | !tmp->base.is_affine,
-			       gen4_choose_spans_emitter(tmp));
+			       gen4_choose_spans_emitter(sna, tmp));
 
 	tmp->box   = gen7_render_composite_spans_box;
 	tmp->boxes = gen7_render_composite_spans_boxes;