From 57d7d5de78bcf01d75d7a7de03fe50a2a9bd1b7e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 6 Jun 2012 00:08:17 +0100 Subject: [PATCH] sna: Use GPU for readback onto CPU bo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Time to blt from GTT to LLC 16384 bytes: 125.000µs (snb) Time to blt from GTT to LLC 16384 bytes: 71.000µs (ivb) Time to blt from GTT to LLC 1048576 bytes: 1400.000µs (snb) Time to blt from GTT to LLC 1048576 bytes: 938.000µs (ivb) Time to copy from GTT to LLC 16384 bytes: 118.000µs (snb) Time to copy from GTT to LLC 16384 bytes: 134.000µs (ivb) Time to copy from GTT to LLC 1048576 bytes: 6723.000µs (snb) Time to copy from GTT to LLC 1048576 bytes: 7424.000µs (ivb) And conversely, Time to blt from LLC to GTT 16384 bytes: 10.000µs (snb) Time to blt from LLC to GTT 16384 bytes: 8.000µs (ivb) Time to blt from LLC to GTT 1048576 bytes: 217.000µs (snb) Time to blt from LLC to GTT 1048576 bytes: 135.000µs (ivb) Time to copy from LLC to GTT 16384 bytes: 4.000µs (snb) Time to copy from LLC to GTT 16384 bytes: 4.000µs (ivb) Time to copy from LLC to GTT 1048576 bytes: 270.000µs (snb) Time to copy from LLC to GTT 1048576 bytes: 179.500µs (ivb) It seems clear then that even with the extra synchronisation cost, copying from the GTT with the GPU is much preferable to using uncached reads by the CPU. Streaming write-combines from the CPU into the GTT seem about as efficient as we can manage, so continue to use the mapping unless busy. 
Signed-off-by: Chris Wilson --- src/sna/sna_accel.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c index afd9ed77..e2a2c127 100644 --- a/src/sna/sna_accel.c +++ b/src/sna/sna_accel.c @@ -930,8 +930,19 @@ sna_pixmap_create_mappable_gpu(PixmapPtr pixmap) return priv->gpu_bo && kgem_bo_is_mappable(&sna->kgem, priv->gpu_bo); } -static bool use_cpu_bo_for_xfer(struct sna_pixmap *priv) +static inline bool use_cpu_bo_for_write(struct sna *sna, + struct sna_pixmap *priv) { + return priv->cpu_bo != NULL && sna->kgem.gen >= 30; +} + +static inline bool use_cpu_bo_for_read(struct sna_pixmap *priv) +{ +#if 0 + if (pixmap->devPrivate.ptr == NULL) + return TRUE; +#endif + if (priv->cpu_bo == NULL) return FALSE; @@ -1112,7 +1123,7 @@ skip_inplace_map: if (n) { Bool ok = FALSE; - if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_write(sna, priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->gpu_bo, 0, 0, pixmap, priv->cpu_bo, 0, 0, @@ -1503,7 +1514,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable, assert(pixmap_contains_damage(pixmap, priv->gpu_damage)); ok = FALSE; - if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_write(sna, priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->gpu_bo, 0, 0, pixmap, priv->cpu_bo, 0, 0, @@ -1604,7 +1615,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable, if (n) { Bool ok = FALSE; - if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_write(sna, priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->gpu_bo, 0, 0, pixmap, priv->cpu_bo, 0, 0, @@ -1626,7 +1637,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable, int n = REGION_NUM_RECTS(r); Bool ok = FALSE; - if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_write(sna, priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->gpu_bo, 0, 0, pixmap, priv->cpu_bo, 
0, 0, @@ -1648,7 +1659,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable, int n = REGION_NUM_RECTS(&need); Bool ok = FALSE; - if (sna->kgem.gen >= 30 && use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_write(sna, priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->gpu_bo, 0, 0, pixmap, priv->cpu_bo, 0, 0, @@ -1878,7 +1889,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags) if (n) { Bool ok = FALSE; - if (pixmap->devPrivate.ptr == NULL || use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_read(priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->cpu_bo, 0, 0, pixmap, priv->gpu_bo, 0, 0, @@ -1916,7 +1927,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags) } else if (DAMAGE_IS_ALL(priv->cpu_damage) || sna_damage_contains_box__no_reduce(priv->cpu_damage, box)) { Bool ok = FALSE; - if (pixmap->devPrivate.ptr == NULL || use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_read(priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->cpu_bo, 0, 0, pixmap, priv->gpu_bo, 0, 0, @@ -1945,7 +1956,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags) box = REGION_RECTS(&i); ok = FALSE; - if (pixmap->devPrivate.ptr == NULL || use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_read(priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->cpu_bo, 0, 0, pixmap, priv->gpu_bo, 0, 0, @@ -2441,7 +2452,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags) DBG(("%s: uploading %d damage boxes\n", __FUNCTION__, n)); ok = FALSE; - if (pixmap->devPrivate.ptr == NULL || use_cpu_bo_for_xfer(priv)) + if (use_cpu_bo_for_read(priv)) ok = sna->render.copy_boxes(sna, GXcopy, pixmap, priv->cpu_bo, 0, 0, pixmap, priv->gpu_bo, 0, 0,