diff --git a/src/sna/compiler.h b/src/sna/compiler.h index 23ec31c3..fe2e3214 100644 --- a/src/sna/compiler.h +++ b/src/sna/compiler.h @@ -52,6 +52,16 @@ #define flatten #endif +#if defined(__GNUC__) && (__GNUC__ >= 4) /* 4.4 */ +#define sse2 __attribute__((target("sse2"))) +#define sse4_2 __attribute__((target("sse4.2,sse2"))) +#define avx2 __attribute__((target("avx2,sse4.2,sse2"))) +#else +#define sse2 +#define sse4_2 +#define avx2 +#endif + #ifdef HAVE_VALGRIND #define VG(x) x #else diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c index d08d762b..92802469 100644 --- a/src/sna/gen4_render.c +++ b/src/sna/gen4_render.c @@ -1945,7 +1945,7 @@ gen4_render_composite(struct sna *sna, tmp->mask.bo != NULL, tmp->has_component_alpha, tmp->is_affine); - tmp->u.gen4.ve_id = gen4_choose_composite_emitter(tmp); + tmp->u.gen4.ve_id = gen4_choose_composite_emitter(sna, tmp); tmp->blt = gen4_render_composite_blt; tmp->box = gen4_render_composite_box; @@ -2186,7 +2186,7 @@ gen4_render_composite_spans(struct sna *sna, tmp->base.has_component_alpha = false; tmp->base.need_magic_ca_pass = false; - tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(tmp); + tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(sna, tmp); tmp->base.u.gen4.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine; tmp->box = gen4_render_composite_spans_box; diff --git a/src/sna/gen4_vertex.c b/src/sna/gen4_vertex.c index 5062ebdf..20f85b37 100644 --- a/src/sna/gen4_vertex.c +++ b/src/sna/gen4_vertex.c @@ -272,10 +272,10 @@ emit_texcoord(struct sna *sna, inline static void emit_vertex(struct sna *sna, - const struct sna_composite_op *op, - int16_t srcX, int16_t srcY, - int16_t mskX, int16_t mskY, - int16_t dstX, int16_t dstY) + const struct sna_composite_op *op, + int16_t srcX, int16_t srcY, + int16_t mskX, int16_t mskY, + int16_t dstX, int16_t dstY) { OUT_VERTEX(dstX, dstY); emit_texcoord(sna, &op->src, srcX, srcY); @@ -414,6 +414,66 @@ emit_primitive_linear(struct sna *sna, v[5] = compute_linear(&op->src, r->src.x, r->src.y); } +sse4_2 fastcall static void +emit_primitive_linear__sse4_2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + assert(op->floats_per_rect == 6); + assert((sna->render.vertex_used % 2) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 6; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + dst.p.x = r->dst.x; + v[2] = dst.f; + dst.p.y = r->dst.y; + v[4] = dst.f; + + v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height); + v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height); + v[5] = compute_linear(&op->src, r->src.x, r->src.y); +} + +avx2 fastcall static void +emit_primitive_linear__avx2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + assert(op->floats_per_rect == 6); + assert((sna->render.vertex_used % 2) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 6; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + dst.p.x = r->dst.x; + v[2] = dst.f; + dst.p.y = r->dst.y; + v[4] = dst.f; + + v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height); + v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height); + v[5] = compute_linear(&op->src, r->src.x, r->src.y); +} + fastcall static void emit_boxes_linear(const struct sna_composite_op *op, const BoxRec *box, int nbox, @@ -442,6 +502,62 @@ emit_boxes_linear(const struct sna_composite_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_boxes_linear__sse4_2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + union { + struct sna_coordinate p; + float f; + } dst; + + do { + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[2] = dst.f; + dst.p.y = box->y1; + v[4] = dst.f; + + v[1] = compute_linear(&op->src, box->x2, box->y2); + v[3] = compute_linear(&op->src, box->x1, box->y2); + v[5] = compute_linear(&op->src, box->x1, box->y1); + + v += 6; + box++; + } while (--nbox); +} + +avx2 fastcall static void +emit_boxes_linear__avx2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + union { + struct sna_coordinate p; + float f; + } dst; + + do { + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[2] = dst.f; + dst.p.y = box->y1; + v[4] = dst.f; + + v[1] = compute_linear(&op->src, box->x2, box->y2); + v[3] = compute_linear(&op->src, box->x1, box->y2); + v[5] = compute_linear(&op->src, box->x1, box->y1); + + v += 6; + box++; + } while (--nbox); +} + fastcall static void emit_primitive_identity_source(struct sna *sna, const struct sna_composite_op *op, @@ -473,6 +589,68 @@ emit_primitive_identity_source(struct sna *sna, v[5] = v[2] = v[8] + r->height * op->src.scale[1]; } +sse4_2 fastcall static void +emit_primitive_identity_source__sse4_2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + dst.p.x = r->dst.x; + v[3] = dst.f; + dst.p.y = r->dst.y; + v[6] = dst.f; + + v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0]; + v[1] = v[4] + r->width * op->src.scale[0]; + + v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1]; + v[5] = v[2] = v[8] + r->height * op->src.scale[1]; +} + +avx2 fastcall static void +emit_primitive_identity_source__avx2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + dst.p.x = r->dst.x; + v[3] = dst.f; + dst.p.y = r->dst.y; + v[6] = dst.f; + + v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0]; + v[1] = v[4] + r->width * op->src.scale[0]; + + v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1]; + v[5] = v[2] = v[8] + r->height * op->src.scale[1]; +} + fastcall static void emit_boxes_identity_source(const struct sna_composite_op *op, const BoxRec *box, int nbox, @@ -503,6 +681,66 @@ emit_boxes_identity_source(const struct sna_composite_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_boxes_identity_source__sse4_2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[3] = dst.f; + dst.p.y = box->y1; + v[6] = dst.f; + + v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0]; + v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0]; + + v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1]; + v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1]; + + v += 9; + box++; + } while (--nbox); +} + +avx2 fastcall static void +emit_boxes_identity_source__avx2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[3] = dst.f; + dst.p.y = box->y1; + v[6] = dst.f; + + v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0]; + v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0]; + + v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1]; + v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1]; + + v += 9; + box++; + } while (--nbox); +} + fastcall static void emit_primitive_simple_source(struct sna *sna, const struct sna_composite_op *op, @@ -543,6 +781,86 @@ emit_primitive_simple_source(struct sna *sna, v[8] = ((r->src.y + ty) * yy + y0) * sy; } +sse4_2 fastcall static void +emit_primitive_simple_source__sse4_2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float xx = op->src.transform->matrix[0][0]; + float x0 = op->src.transform->matrix[0][2]; + float yy = op->src.transform->matrix[1][1]; + float y0 = op->src.transform->matrix[1][2]; + float sx = op->src.scale[0]; + float sy = op->src.scale[1]; + int16_t tx = op->src.offset[0]; + int16_t ty = op->src.offset[1]; + + assert(op->floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*3; + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx; + v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy; + + dst.p.x = r->dst.x; + v[3] = dst.f; + v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx; + + dst.p.y = r->dst.y; + v[6] = dst.f; + v[8] = ((r->src.y + ty) * yy + y0) * sy; +} + +avx2 fastcall static void +emit_primitive_simple_source__avx2(struct sna *sna, + const struct sna_composite_op *op, + const struct sna_composite_rectangles *r) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float xx = op->src.transform->matrix[0][0]; + float x0 = op->src.transform->matrix[0][2]; + float yy = op->src.transform->matrix[1][1]; + float y0 = op->src.transform->matrix[1][2]; + float sx = op->src.scale[0]; + float sy = op->src.scale[1]; + int16_t tx = op->src.offset[0]; + int16_t ty = op->src.offset[1]; + + assert(op->floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*3; + + dst.p.x = r->dst.x + r->width; + dst.p.y = r->dst.y + r->height; + v[0] = dst.f; + v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx; + v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy; + + dst.p.x = r->dst.x; + v[3] = dst.f; + v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx; + + dst.p.y = r->dst.y; + v[6] = dst.f; + v[8] = ((r->src.y + ty) * yy + y0) * sy; +} + fastcall static void emit_boxes_simple_source(const struct sna_composite_op *op, const BoxRec *box, int nbox, @@ -582,6 +900,84 @@ emit_boxes_simple_source(const struct sna_composite_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_boxes_simple_source__sse4_2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + float xx = op->src.transform->matrix[0][0]; + float x0 = op->src.transform->matrix[0][2]; + float yy = op->src.transform->matrix[1][1]; + float y0 = op->src.transform->matrix[1][2]; + float sx = op->src.scale[0]; + float sy = op->src.scale[1]; + int16_t tx = op->src.offset[0]; + int16_t ty = op->src.offset[1]; + + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = ((box->x2 + tx) * xx + x0) * sx; + v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy; + + dst.p.x = box->x1; + v[3] = dst.f; + v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx; + + dst.p.y = box->y1; + v[6] = dst.f; + v[8] = ((box->y1 + ty) * yy + y0) * sy; + + v += 9; + box++; + } while (--nbox); +} + +avx2 fastcall static void +emit_boxes_simple_source__avx2(const struct sna_composite_op *op, + const BoxRec *box, int nbox, + float *v) +{ + float xx = op->src.transform->matrix[0][0]; + float x0 = op->src.transform->matrix[0][2]; + float yy = op->src.transform->matrix[1][1]; + float y0 = op->src.transform->matrix[1][2]; + float sx = op->src.scale[0]; + float sy = op->src.scale[1]; + int16_t tx = op->src.offset[0]; + int16_t ty = op->src.offset[1]; + + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = ((box->x2 + tx) * xx + x0) * sx; + v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy; + + dst.p.x = box->x1; + v[3] = dst.f; + v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx; + + dst.p.y = box->y1; + v[6] = dst.f; + v[8] = ((box->y1 + ty) * yy + y0) * sy; + + v += 9; + box++; + } while (--nbox); +} + fastcall static void emit_primitive_affine_source(struct sna *sna, const struct sna_composite_op *op, @@ -981,7 +1377,7 @@ emit_composite_texcoord_affine(struct sna *sna, } -unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp) +unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp) { unsigned vb; @@ -1060,14 +1456,30 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp) vb = 1; } else if (tmp->src.is_linear) { DBG(("%s: linear, no mask\n", __FUNCTION__)); - tmp->prim_emit = emit_primitive_linear; - tmp->emit_boxes = emit_boxes_linear; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_primitive_linear__avx2; + tmp->emit_boxes = emit_boxes_linear__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_primitive_linear__sse4_2; + tmp->emit_boxes = emit_boxes_linear__sse4_2; + } else { + tmp->prim_emit = emit_primitive_linear; + tmp->emit_boxes = emit_boxes_linear; + } tmp->floats_per_vertex = 2; vb = 1; } else if (tmp->src.transform == NULL) { DBG(("%s: identity src, no mask\n", __FUNCTION__)); - tmp->prim_emit = emit_primitive_identity_source; - tmp->emit_boxes = emit_boxes_identity_source; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_primitive_identity_source__avx2; + tmp->emit_boxes = emit_boxes_identity_source__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_primitive_identity_source__sse4_2; + tmp->emit_boxes = emit_boxes_identity_source__sse4_2; + } else { + tmp->prim_emit = emit_primitive_identity_source; + tmp->emit_boxes = emit_boxes_identity_source; + } tmp->floats_per_vertex = 3; vb = 2; } else if (tmp->src.is_affine) { @@ -1075,8 +1487,16 @@ unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp) tmp->src.scale[1] /= tmp->src.transform->matrix[2][2]; if (!sna_affine_transform_is_rotation(tmp->src.transform)) { DBG(("%s: simple src, no mask\n", __FUNCTION__)); - tmp->prim_emit = emit_primitive_simple_source; - tmp->emit_boxes = emit_boxes_simple_source; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_primitive_simple_source__avx2; + tmp->emit_boxes = emit_boxes_simple_source__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_primitive_simple_source__sse4_2; + tmp->emit_boxes = emit_boxes_simple_source__sse4_2; + } else { + tmp->prim_emit = emit_primitive_simple_source; + tmp->emit_boxes = emit_boxes_simple_source; + } } else { DBG(("%s: affine src, no mask\n", __FUNCTION__)); tmp->prim_emit = emit_primitive_affine_source; @@ -1222,6 +1642,86 @@ emit_span_identity(struct sna *sna, v[11] = v[7] = v[3] = opacity; } +sse4_2 fastcall static void +emit_span_identity__sse4_2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*4; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = (box->x2 + tx) * sx; + v[6] = v[2] = (box->y2 + ty) * sy; + + dst.p.x = box->x1; + v[4] = dst.f; + v[9] = v[5] = (box->x1 + tx) * sx; + + dst.p.y = box->y1; + v[8] = dst.f; + v[10] = (box->y1 + ty) * sy; + + v[11] = v[7] = v[3] = opacity; +} + +avx2 fastcall static void +emit_span_identity__avx2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*4; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = (box->x2 + tx) * sx; + v[6] = v[2] = (box->y2 + ty) * sy; + + dst.p.x = box->x1; + v[4] = dst.f; + v[9] = v[5] = (box->x1 + tx) * sx; + + dst.p.y = box->y1; + v[8] = dst.f; + v[10] = (box->y1 + ty) * sy; + + v[11] = v[7] = v[3] = opacity; +} + fastcall static void emit_span_boxes_identity(const struct sna_composite_spans_op *op, const struct sna_opacity_box *b, int nbox, @@ -1259,11 +1759,173 @@ emit_span_boxes_identity(const struct sna_composite_spans_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_span_boxes_identity__sse4_2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + v[1] = (b->box.x2 + tx) * sx; + v[6] = v[2] = (b->box.y2 + ty) * sy; + + dst.p.x = b->box.x1; + v[4] = dst.f; + v[9] = v[5] = (b->box.x1 + tx) * sx; + + dst.p.y = b->box.y1; + v[8] = dst.f; + v[10] = (b->box.y1 + ty) * sy; + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + +avx2 fastcall static void +emit_span_boxes_identity__avx2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + v[1] = (b->box.x2 + tx) * sx; + v[6] = v[2] = (b->box.y2 + ty) * sy; + + dst.p.x = b->box.x1; + v[4] = dst.f; + v[9] = v[5] = (b->box.x1 + tx) * sx; + + dst.p.y = b->box.y1; + v[8] = dst.f; + v[10] = (b->box.y1 + ty) * sy; + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + fastcall static void emit_span_simple(struct sna *sna, - const struct sna_composite_spans_op *op, - const BoxRec *box, - float opacity) + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float xx = op->base.src.transform->matrix[0][0]; + float x0 = op->base.src.transform->matrix[0][2]; + float yy = op->base.src.transform->matrix[1][1]; + float y0 = op->base.src.transform->matrix[1][2]; + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*4; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = ((box->x2 + tx) * xx + x0) * sx; + v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy; + + dst.p.x = box->x1; + v[4] = dst.f; + v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx; + + dst.p.y = box->y1; + v[8] = dst.f; + v[10] = ((box->y1 + ty) * yy + y0) * sy; + + v[11] = v[7] = v[3] = opacity; +} + +sse4_2 fastcall static void +emit_span_simple__sse4_2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + float *v; + union { + struct sna_coordinate p; + float f; + } dst; + + float xx = op->base.src.transform->matrix[0][0]; + float x0 = op->base.src.transform->matrix[0][2]; + float yy = op->base.src.transform->matrix[1][1]; + float y0 = op->base.src.transform->matrix[1][2]; + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 3*4; + assert(sna->render.vertex_used <= sna->render.vertex_size); + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + v[1] = ((box->x2 + tx) * xx + x0) * sx; + v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy; + + dst.p.x = box->x1; + v[4] = dst.f; + v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx; + + dst.p.y = box->y1; + v[8] = dst.f; + v[10] = ((box->y1 + ty) * yy + y0) * sy; + + v[11] = v[7] = v[3] = opacity; +} + +avx2 fastcall static void +emit_span_simple__avx2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) { float *v; union { @@ -1344,6 +2006,88 @@ emit_span_boxes_simple(const struct sna_composite_spans_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_span_boxes_simple__sse4_2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + float xx = op->base.src.transform->matrix[0][0]; + float x0 = op->base.src.transform->matrix[0][2]; + float yy = op->base.src.transform->matrix[1][1]; + float y0 = op->base.src.transform->matrix[1][2]; + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + v[1] = ((b->box.x2 + tx) * xx + x0) * sx; + v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy; + + dst.p.x = b->box.x1; + v[4] = dst.f; + v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx; + + dst.p.y = b->box.y1; + v[8] = dst.f; + v[10] = ((b->box.y1 + ty) * yy + y0) * sy; + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + +avx2 fastcall static void +emit_span_boxes_simple__avx2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + float xx = op->base.src.transform->matrix[0][0]; + float x0 = op->base.src.transform->matrix[0][2]; + float yy = op->base.src.transform->matrix[1][1]; + float y0 = op->base.src.transform->matrix[1][2]; + float sx = op->base.src.scale[0]; + float sy = op->base.src.scale[1]; + int16_t tx = op->base.src.offset[0]; + int16_t ty = op->base.src.offset[1]; + + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + v[1] = ((b->box.x2 + tx) * xx + x0) * sx; + v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy; + + dst.p.x = b->box.x1; + v[4] = dst.f; + v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx; + + dst.p.y = b->box.y1; + v[8] = dst.f; + v[10] = ((b->box.y1 + ty) * yy + y0) * sy; + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + fastcall static void emit_span_affine(struct sna *sna, const struct sna_composite_spans_op *op, @@ -1389,6 +2133,96 @@ emit_span_affine(struct sna *sna, v[11] = v[7] = v[3] = opacity; } +sse4_2 fastcall static void +emit_span_affine__sse4_2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 12; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x2, + op->base.src.offset[1] + box->y2, + op->base.src.transform, + op->base.src.scale, + &v[1], &v[2]); + + dst.p.x = box->x1; + v[4] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y2, + op->base.src.transform, + op->base.src.scale, + &v[5], &v[6]); + + dst.p.y = box->y1; + v[8] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y1, + op->base.src.transform, + op->base.src.scale, + &v[9], &v[10]); + + v[11] = v[7] = v[3] = opacity; +} + +avx2 fastcall static void +emit_span_affine__avx2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->base.floats_per_rect == 12); + assert((sna->render.vertex_used % 4) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 12; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x2, + op->base.src.offset[1] + box->y2, + op->base.src.transform, + op->base.src.scale, + &v[1], &v[2]); + + dst.p.x = box->x1; + v[4] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y2, + op->base.src.transform, + op->base.src.scale, + &v[5], &v[6]); + + dst.p.y = box->y1; + v[8] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1, + op->base.src.offset[1] + box->y1, + op->base.src.transform, + op->base.src.scale, + &v[9], &v[10]); + + v[11] = v[7] = v[3] = opacity; +} + fastcall static void emit_span_boxes_affine(const struct sna_composite_spans_op *op, const struct sna_opacity_box *b, int nbox, @@ -1432,11 +2266,161 @@ emit_span_boxes_affine(const struct sna_composite_spans_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_span_boxes_affine__sse4_2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2, + op->base.src.offset[1] + b->box.y2, + op->base.src.transform, + op->base.src.scale, + &v[1], &v[2]); + + dst.p.x = b->box.x1; + v[4] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y2, + op->base.src.transform, + op->base.src.scale, + &v[5], &v[6]); + + dst.p.y = b->box.y1; + v[8] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y1, + op->base.src.transform, + op->base.src.scale, + &v[9], &v[10]); + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + +avx2 fastcall static void +emit_span_boxes_affine__avx2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2, + op->base.src.offset[1] + b->box.y2, + op->base.src.transform, + op->base.src.scale, + &v[1], &v[2]); + + dst.p.x = b->box.x1; + v[4] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y2, + op->base.src.transform, + op->base.src.scale, + &v[5], &v[6]); + + dst.p.y = b->box.y1; + v[8] = dst.f; + _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1, + op->base.src.offset[1] + b->box.y1, + op->base.src.transform, + op->base.src.scale, + &v[9], &v[10]); + + v[11] = v[7] = v[3] = b->alpha; + + v += 12; + b++; + } while (--nbox); +} + fastcall static void emit_span_linear(struct sna *sna, - const struct sna_composite_spans_op *op, - const BoxRec *box, - float opacity) + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->base.floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[3] = dst.f; + dst.p.y = box->y1; + v[6] = dst.f; + + v[1] = compute_linear(&op->base.src, box->x2, box->y2); + v[4] = compute_linear(&op->base.src, box->x1, box->y2); + v[7] = compute_linear(&op->base.src, box->x1, box->y1); + + v[8] = v[5] = v[2] = opacity; +} + +sse4_2 fastcall static void +emit_span_linear__sse4_2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) +{ + union { + struct sna_coordinate p; + float f; + } dst; + float *v; + + assert(op->base.floats_per_rect == 9); + assert((sna->render.vertex_used % 3) == 0); + v = sna->render.vertices + sna->render.vertex_used; + sna->render.vertex_used += 9; + + dst.p.x = box->x2; + dst.p.y = box->y2; + v[0] = dst.f; + dst.p.x = box->x1; + v[3] = dst.f; + dst.p.y = box->y1; + v[6] = dst.f; + + v[1] = compute_linear(&op->base.src, box->x2, box->y2); + v[4] = compute_linear(&op->base.src, box->x1, box->y2); + v[7] = compute_linear(&op->base.src, box->x1, box->y1); + + v[8] = v[5] = v[2] = opacity; +} + +avx2 fastcall static void +emit_span_linear__avx2(struct sna *sna, + const struct sna_composite_spans_op *op, + const BoxRec *box, + float opacity) { union { struct sna_coordinate p; @@ -1494,6 +2478,66 @@ emit_span_boxes_linear(const struct sna_composite_spans_op *op, } while (--nbox); } +sse4_2 fastcall static void +emit_span_boxes_linear__sse4_2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + dst.p.x = b->box.x1; + v[3] = dst.f; + dst.p.y = b->box.y1; + v[6] = dst.f; + + v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2); + v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2); + v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1); + + v[8] = v[5] = v[2] = b->alpha; + + v += 9; + b++; + } while (--nbox); +} + +avx2 fastcall static void +emit_span_boxes_linear__avx2(const struct sna_composite_spans_op *op, + const struct sna_opacity_box *b, int nbox, + float *v) +{ + do { + union { + struct sna_coordinate p; + float f; + } dst; + + dst.p.x = b->box.x2; + dst.p.y = b->box.y2; + v[0] = dst.f; + dst.p.x = b->box.x1; + v[3] = dst.f; + dst.p.y = b->box.y1; + v[6] = dst.f; + + v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2); + v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2); + v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1); + + v[8] = v[5] = v[2] = b->alpha; + + v += 9; + b++; + } while (--nbox); +} + inline inline static uint32_t gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op) { @@ -1502,7 +2546,8 @@ gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op) return 1 << 2 | id; } -unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp) +unsigned gen4_choose_spans_emitter(struct sna *sna, + struct sna_composite_spans_op *tmp) { unsigned vb; @@ -1512,24 +2557,56 @@ unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp) tmp->base.floats_per_vertex = 3; vb = 1 << 2 | 1; } else if (tmp->base.src.is_linear) { - tmp->prim_emit = emit_span_linear; - tmp->emit_boxes = emit_span_boxes_linear; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_span_linear__avx2; + tmp->emit_boxes = emit_span_boxes_linear__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_span_linear__sse4_2; + tmp->emit_boxes = emit_span_boxes_linear__sse4_2; + } else { + tmp->prim_emit = emit_span_linear; + tmp->emit_boxes = emit_span_boxes_linear; + } tmp->base.floats_per_vertex = 3; vb = 1 << 2 | 1; } else if (tmp->base.src.transform == NULL) { - tmp->prim_emit = emit_span_identity; - tmp->emit_boxes = emit_span_boxes_identity; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_span_identity__avx2; + tmp->emit_boxes = emit_span_boxes_identity__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_span_identity__sse4_2; + tmp->emit_boxes = emit_span_boxes_identity__sse4_2; + } else { + tmp->prim_emit = emit_span_identity; + tmp->emit_boxes = emit_span_boxes_identity; + } tmp->base.floats_per_vertex = 4; vb = 1 << 2 | 2; } else if (tmp->base.is_affine) { tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2]; tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2]; if (!sna_affine_transform_is_rotation(tmp->base.src.transform)) { - tmp->prim_emit = emit_span_simple; - tmp->emit_boxes = emit_span_boxes_simple; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_span_simple__avx2; + tmp->emit_boxes = emit_span_boxes_simple__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_span_simple__sse4_2; + tmp->emit_boxes = emit_span_boxes_simple__sse4_2; + } else { + tmp->prim_emit = emit_span_simple; + tmp->emit_boxes = emit_span_boxes_simple; + } } else { - tmp->prim_emit = emit_span_affine; - tmp->emit_boxes = emit_span_boxes_affine; + if (sna->cpu_features & AVX2) { + tmp->prim_emit = emit_span_affine__avx2; + tmp->emit_boxes = emit_span_boxes_affine__avx2; + } else if (sna->cpu_features & SSE4_2) { + tmp->prim_emit = emit_span_affine__sse4_2; + tmp->emit_boxes = emit_span_boxes_affine__sse4_2; + } else { + tmp->prim_emit = emit_span_affine; + tmp->emit_boxes = emit_span_boxes_affine; + } } tmp->base.floats_per_vertex = 4; vb = 1 << 2 | 2; diff --git a/src/sna/gen4_vertex.h b/src/sna/gen4_vertex.h index 431b545e..1494ba14 100644 --- a/src/sna/gen4_vertex.h +++ b/src/sna/gen4_vertex.h @@ -10,7 +10,7 @@ void gen4_vertex_flush(struct sna *sna); int gen4_vertex_finish(struct sna *sna); void gen4_vertex_close(struct sna *sna); -unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp); -unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp); +unsigned gen4_choose_composite_emitter(struct sna *sna, struct sna_composite_op *tmp); +unsigned gen4_choose_spans_emitter(struct sna *sna, struct sna_composite_spans_op *tmp); #endif /* GEN4_VERTEX_H */ diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c index f2368773..8b9eaac1 100644 --- a/src/sna/gen5_render.c +++ b/src/sna/gen5_render.c @@ -1924,7 +1924,7 @@ gen5_render_composite(struct sna *sna, tmp->mask.bo != NULL, tmp->has_component_alpha, tmp->is_affine); - tmp->u.gen5.ve_id = gen4_choose_composite_emitter(tmp); + tmp->u.gen5.ve_id = gen4_choose_composite_emitter(sna, tmp); tmp->blt = gen5_render_composite_blt; tmp->box = gen5_render_composite_box; @@ -2152,7 +2152,7 @@ gen5_render_composite_spans(struct sna *sna, tmp->base.has_component_alpha = false; tmp->base.need_magic_ca_pass = false; - tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(tmp); + tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(sna, tmp); tmp->base.u.gen5.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine; tmp->box = gen5_render_composite_spans_box; diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c index fa4c47b1..d4105141 100644 --- a/src/sna/gen6_render.c +++ b/src/sna/gen6_render.c @@ -2272,7 +2272,7 @@ gen6_render_composite(struct sna *sna, tmp->mask.bo != NULL, tmp->has_component_alpha, tmp->is_affine), - gen4_choose_composite_emitter(tmp)); + gen4_choose_composite_emitter(sna, tmp)); tmp->blt = gen6_render_composite_blt; tmp->box = gen6_render_composite_box; @@ -2508,7 +2508,7 @@ gen6_render_composite_spans(struct sna *sna, SAMPLER_EXTEND_PAD), gen6_get_blend(tmp->base.op, false, tmp->base.dst.format), GEN6_WM_KERNEL_OPACITY | !tmp->base.is_affine, - gen4_choose_spans_emitter(tmp)); + gen4_choose_spans_emitter(sna, tmp)); tmp->box = gen6_render_composite_spans_box; tmp->boxes = gen6_render_composite_spans_boxes; diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c index bd14d90f..7984cf19 100644 --- a/src/sna/gen7_render.c +++ b/src/sna/gen7_render.c @@ -2412,7 +2412,7 @@ gen7_render_composite(struct sna *sna, tmp->mask.bo != NULL, tmp->has_component_alpha, tmp->is_affine), - gen4_choose_composite_emitter(tmp)); + gen4_choose_composite_emitter(sna, tmp)); tmp->blt = gen7_render_composite_blt; tmp->box = gen7_render_composite_box; @@ -2628,7 +2628,7 @@ gen7_render_composite_spans(struct sna *sna, SAMPLER_EXTEND_PAD), gen7_get_blend(tmp->base.op, false, tmp->base.dst.format), GEN7_WM_KERNEL_OPACITY | !tmp->base.is_affine, - gen4_choose_spans_emitter(tmp)); + gen4_choose_spans_emitter(sna, tmp)); tmp->box = gen7_render_composite_spans_box; tmp->boxes = gen7_render_composite_spans_boxes;