Hi,

When an application launches several hundred processes that each issue
only a few small synchronous I/O requests, CFQ can cause heavy latencies
(10+ seconds in the worst case), even though the request rate is low
enough for the disk to handle without waiting. This is because CFQ waits
for slice_idle (default: 8ms) before processing each request, until the
queues' thinktimes have been evaluated.

This scenario can be reproduced with fio using the parameters below:

  fio -filename=/tmp/test -rw=randread -size=5G -runtime=15 -name=file1 \
      -bs=4k -numjobs=500 -thinktime=1000000

In this case, 500 processes each issue a random read request every
second. The problem can be avoided by setting slice_idle to 0, but that
risks hurting throughput on SATA disks.
This patch automatically reduces the effect of slice_idle when many busy
queues are waiting in the idle window.

It adds to cfq_data a counter (busy_idle_queues) of queues that are in
the idle window and have I/O requests pending. If (busy_idle_queues *
slice_idle) exceeds the slice allocated to the group, the idle wait time
is limited to (group_slice / busy_idle_queues).
Without this patch, the fio benchmark above against an ext4 partition on
a SATA HDD results in:

  read : io=20140KB, bw=1258.5KB/s, iops=314 , runt= 16004msec
    clat (usec): min=4 , max=6494.9K, avg=541264.54, stdev=993834.12

With this patch:

  read : io=28040KB, bw=1750.1KB/s, iops=437 , runt= 16014msec
    clat (usec): min=4 , max=2837.2K, avg=110236.79, stdev=303351.72

Average latency is reduced by 80%, and maximum latency by 56%.
Any comments are appreciated.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama@hds.com>
---
 block/cfq-iosched.c | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5cd313..77ac27e80 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -329,6 +329,7 @@ struct cfq_data {
 	unsigned int busy_queues;
 	unsigned int busy_sync_queues;
+	unsigned int busy_idle_queues;	/* busy but with idle window */

 	int rq_in_driver;
 	int rq_in_flight[2];

@@ -446,6 +447,20 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
+static inline void cfq_set_cfqq_idle_window(struct cfq_data *cfqd,
+					    struct cfq_queue *cfqq, bool idle)
+{
+	if (idle) {
+		cfq_mark_cfqq_idle_window(cfqq);
+		if (cfq_cfqq_on_rr(cfqq))
+			cfqd->busy_idle_queues++;
+	} else {
+		cfq_clear_cfqq_idle_window(cfqq);
+		if (cfq_cfqq_on_rr(cfqq))
+			cfqd->busy_idle_queues--;
+	}
+}
+
 static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 {
 	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
@@ -2164,6 +2179,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfqd->busy_queues++;
 	if (cfq_cfqq_sync(cfqq))
 		cfqd->busy_sync_queues++;
+	if (cfq_cfqq_idle_window(cfqq))
+		cfqd->busy_idle_queues++;

 	cfq_resort_rr_list(cfqd, cfqq);
 }
@@ -2192,6 +2209,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	cfqd->busy_queues--;
 	if (cfq_cfqq_sync(cfqq))
 		cfqd->busy_sync_queues--;
+	if (cfq_cfqq_idle_window(cfqq))
+		cfqd->busy_idle_queues--;
 }

 /*
@@ -2761,6 +2780,16 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	else
 		sl = cfqd->cfq_slice_idle;

+	/*
+	 * If there are too many queues with the idle window set, slice idle
+	 * can cause unacceptable latency, so reduce the idle wait here.
+	 */
+	if (cfqd->busy_idle_queues) {
+		unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
+		unsigned long limit = group_slice / cfqd->busy_idle_queues;
+		sl = min(sl, limit);
+	}
+
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 	cfqg_stats_set_start_idle_time(cfqq->cfqg);
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
@@ -3091,7 +3120,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	    (cfq_cfqq_slice_new(cfqq) ||
 	     (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
 		cfq_clear_cfqq_deep(cfqq);
-		cfq_clear_cfqq_idle_window(cfqq);
+		cfq_set_cfqq_idle_window(cfqd, cfqq, false);
 	}

 	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {

@@ -3742,10 +3771,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,

 	if (old_idle != enable_idle) {
 		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
-		if (enable_idle)
-			cfq_mark_cfqq_idle_window(cfqq);
-		else
-			cfq_clear_cfqq_idle_window(cfqq);
+		cfq_set_cfqq_idle_window(cfqd, cfqq, enable_idle);
 	}
 }
--