diff -aru linux-3.0-orig/block/blk-ioc.c linux-3.0/block/blk-ioc.c
--- linux-3.0-orig/block/blk-ioc.c 2011-07-21 23:17:23.000000000 -0300
+++ linux-3.0/block/blk-ioc.c 2011-08-23 17:28:32.202013966 -0300
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
+#include <linux/bitmap.h>
#include <linux/blkdev.h>
#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
#include <linux/slab.h>
@@ -16,13 +17,12 @@
*/
static struct kmem_cache *iocontext_cachep;
-static void cfq_dtor(struct io_context *ioc)
+static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list)
{
- if (!hlist_empty(&ioc->cic_list)) {
+ if (!hlist_empty(list)) { /* only the first entry's dtor is invoked */
struct cfq_io_context *cic;
- cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
- cic_list);
+ cic = hlist_entry(list->first, struct cfq_io_context, cic_list);
cic->dtor(ioc);
}
}
@@ -40,7 +40,9 @@
if (atomic_long_dec_and_test(&ioc->refcount)) {
rcu_read_lock();
- cfq_dtor(ioc);
+
+ hlist_sched_dtor(ioc, &ioc->cic_list);
+ hlist_sched_dtor(ioc, &ioc->bfq_cic_list);
rcu_read_unlock();
kmem_cache_free(iocontext_cachep, ioc);
@@ -50,15 +52,14 @@
}
EXPORT_SYMBOL(put_io_context);
-static void cfq_exit(struct io_context *ioc)
+static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list)
{
rcu_read_lock();
- if (!hlist_empty(&ioc->cic_list)) {
+ if (!hlist_empty(list)) { /* only the first entry's exit hook is invoked */
struct cfq_io_context *cic;
- cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
- cic_list);
+ cic = hlist_entry(list->first, struct cfq_io_context, cic_list);
cic->exit(ioc);
}
rcu_read_unlock();
@@ -74,9 +75,10 @@
task->io_context = NULL;
task_unlock(task);
- if (atomic_dec_and_test(&ioc->nr_tasks))
- cfq_exit(ioc);
-
+ if (atomic_dec_and_test(&ioc->nr_tasks)) {
+ hlist_sched_exit(ioc, &ioc->cic_list);
+ hlist_sched_exit(ioc, &ioc->bfq_cic_list);
+ }
put_io_context(ioc);
}
@@ -89,12 +91,14 @@
atomic_long_set(&ret->refcount, 1);
atomic_set(&ret->nr_tasks, 1);
spin_lock_init(&ret->lock);
- ret->ioprio_changed = 0;
+ bitmap_zero(ret->ioprio_changed, IOC_IOPRIO_CHANGED_BITS);
ret->ioprio = 0;
ret->last_waited = 0; /* doesn't matter... */
ret->nr_batch_requests = 0; /* because this is 0 */
INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
INIT_HLIST_HEAD(&ret->cic_list);
+ INIT_RADIX_TREE(&ret->bfq_radix_root, GFP_ATOMIC | __GFP_HIGH);
+ INIT_HLIST_HEAD(&ret->bfq_cic_list);
ret->ioc_data = NULL;
#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
ret->cgroup_changed = 0;
diff -aru linux-3.0-orig/block/cfq-iosched.c linux-3.0/block/cfq-iosched.c
--- linux-3.0-orig/block/cfq-iosched.c 2011-07-21 23:17:23.000000000 -0300
+++ linux-3.0/block/cfq-iosched.c 2011-08-23 17:27:13.670247768 -0300
@@ -2919,7 +2919,6 @@
static void cfq_ioc_set_ioprio(struct io_context *ioc)
{
call_for_each_cic(ioc, changed_ioprio);
- ioc->ioprio_changed = 0;
}
static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -3204,8 +3203,13 @@
goto err_free;
out:
- smp_read_barrier_depends();
- if (unlikely(ioc->ioprio_changed))
+ /*
+ * test_and_clear_bit() implies a memory barrier, paired with
+ * the wmb() in fs/ioprio.c, so the value seen for ioprio is the
+ * new one.
+ */
+ if (unlikely(test_and_clear_bit(IOC_CFQ_IOPRIO_CHANGED,
+ ioc->ioprio_changed)))
cfq_ioc_set_ioprio(ioc);
#ifdef CONFIG_CFQ_GROUP_IOSCHED
diff -aru linux-3.0-orig/block/Kconfig.iosched linux-3.0/block/Kconfig.iosched
--- linux-3.0-orig/block/Kconfig.iosched 2011-07-21 23:17:23.000000000 -0300
+++ linux-3.0/block/Kconfig.iosched 2011-08-23 17:27:13.670247768 -0300
@@ -43,6 +43,28 @@
---help---
Enable group IO scheduling in CFQ.
+config IOSCHED_BFQ
+ tristate "BFQ I/O scheduler"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ The BFQ I/O scheduler tries to distribute bandwidth among
+ all processes according to their weights.
+ It aims at distributing the bandwidth as desired, independently of
+ the disk parameters and with any workload. It also tries to
+ guarantee low latency to interactive and soft real-time
+ applications. If compiled built-in (saying Y here), BFQ can
+ be configured to support hierarchical scheduling.
+
+config CGROUP_BFQIO
+ bool "BFQ hierarchical scheduling support"
+ depends on CGROUPS && IOSCHED_BFQ=y
+ default n
+ ---help---
+ Enable hierarchical scheduling in BFQ, using the cgroups
+ filesystem interface. The name of the subsystem will be
+ bfqio.
+
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
@@ -56,6 +78,9 @@
config DEFAULT_CFQ
bool "CFQ" if IOSCHED_CFQ=y
+ config DEFAULT_BFQ
+ bool "BFQ" if IOSCHED_BFQ=y
+
config DEFAULT_NOOP
bool "No-op"
@@ -65,6 +90,7 @@
string
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
+ default "bfq" if DEFAULT_BFQ
default "noop" if DEFAULT_NOOP
endmenu
diff -aru linux-3.0-orig/fs/ioprio.c linux-3.0/fs/ioprio.c
--- linux-3.0-orig/fs/ioprio.c 2011-07-21 23:17:23.000000000 -0300
+++ linux-3.0/fs/ioprio.c 2011-08-23 17:27:13.670247768 -0300
@@ -30,7 +30,7 @@
int set_task_ioprio(struct task_struct *task, int ioprio)
{
- int err;
+ int err, i;
struct io_context *ioc;
const struct cred *cred = current_cred(), *tcred;
@@ -60,12 +60,17 @@
err = -ENOMEM;
break;
}
+ /* let other ioc users see the new values */
+ smp_wmb();
task->io_context = ioc;
} while (1);
if (!err) {
ioc->ioprio = ioprio;
- ioc->ioprio_changed = 1;
+ /* make sure schedulers see the new ioprio value */
+ wmb();
+ for (i = 0; i < IOC_IOPRIO_CHANGED_BITS; i++)
+ set_bit(i, ioc->ioprio_changed);
}
task_unlock(task);
diff -aru linux-3.0-orig/include/linux/iocontext.h linux-3.0/include/linux/iocontext.h
--- linux-3.0-orig/include/linux/iocontext.h 2011-07-21 23:17:23.000000000 -0300
+++ linux-3.0/include/linux/iocontext.h 2011-08-23 17:27:13.670247768 -0300
@@ -1,14 +1,14 @@
#ifndef IOCONTEXT_H
#define IOCONTEXT_H
+#include <linux/bitmap.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
-struct cfq_queue;
struct cfq_io_context {
void *key;
- struct cfq_queue *cfqq[2];
+ void *cfqq[2];
struct io_context *ioc;
@@ -28,6 +28,16 @@
};
/*
+ * Indexes into the ioprio_changed bitmap. A bit set indicates that
+ * the corresponding I/O scheduler needs to see an ioprio update.
+ */
+enum {
+ IOC_CFQ_IOPRIO_CHANGED, /* bit tested and cleared by CFQ */
+ IOC_BFQ_IOPRIO_CHANGED, /* presumably BFQ's bit; its consumer is not in this patch */
+ IOC_IOPRIO_CHANGED_BITS /* bit count; sizes the ioprio_changed bitmap */
+};
+
+/*
* I/O subsystem state of the associated processes. It is refcounted
* and kmalloc'ed. These could be shared between processes.
*/
@@ -39,7 +49,7 @@
spinlock_t lock;
unsigned short ioprio;
- unsigned short ioprio_changed;
+ DECLARE_BITMAP(ioprio_changed, IOC_IOPRIO_CHANGED_BITS);
#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
unsigned short cgroup_changed;
@@ -53,6 +63,8 @@
struct radix_tree_root radix_root;
struct hlist_head cic_list;
+ struct radix_tree_root bfq_radix_root;
+ struct hlist_head bfq_cic_list;
void __rcu *ioc_data;
};