- diff -Naur linux-4.4.6-gentoo-orig/arch/um/drivers/ubd_kern.c linux-4.4.6-gentoo-patched/arch/um/drivers/ubd_kern.c
- --- linux-4.4.6-gentoo-orig/arch/um/drivers/ubd_kern.c 2016-05-04 11:19:37.591649829 +0300
- +++ linux-4.4.6-gentoo-patched/arch/um/drivers/ubd_kern.c 2016-05-04 11:02:48.599733982 +0300
- @@ -866,7 +866,7 @@
- goto out;
- }
- ubd_dev->queue->queuedata = ubd_dev;
- - blk_queue_flush(ubd_dev->queue, REQ_FLUSH);
- + blk_queue_write_cache(ubd_dev->queue, true, false);
- blk_queue_max_segments(ubd_dev->queue, MAX_SG);
- err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
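The hunk above is the pattern the rest of this series follows: a driver that used to call blk_queue_flush() with REQ_FLUSH (and optionally REQ_FUA) now calls blk_queue_write_cache() with two booleans, "has a writeback cache" and "supports FUA". A minimal sketch of that mapping, not part of the patch and assuming the blk_queue_write_cache() prototype this series adds to linux/blkdev.h:

#include <linux/blkdev.h>

/* Hypothetical helper, for illustration only: translate the old
 * blk_queue_flush() flag word into blk_queue_write_cache() arguments. */
static inline void compat_blk_queue_flush(struct request_queue *q,
                                          unsigned int flush)
{
        /* REQ_FLUSH means the device has a volatile writeback cache,
         * REQ_FUA means it honours forced-unit-access writes. */
        blk_queue_write_cache(q, flush & REQ_FLUSH, flush & REQ_FUA);
}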
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-core.c linux-4.4.6-gentoo-patched/block/blk-core.c
- --- linux-4.4.6-gentoo-orig/block/blk-core.c 2016-05-04 11:19:37.593649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-core.c 2016-05-04 11:02:48.599733982 +0300
- @@ -33,6 +33,7 @@
- #include <linux/ratelimit.h>
- #include <linux/pm_runtime.h>
- #include <linux/blk-cgroup.h>
- +#include <linux/wbt.h>
- #define CREATE_TRACE_POINTS
- #include <trace/events/block.h>
- @@ -872,6 +873,8 @@
- fail:
- blk_free_flush_queue(q->fq);
- + wbt_exit(q->rq_wb);
- + q->rq_wb = NULL;
- return NULL;
- }
- EXPORT_SYMBOL(blk_init_allocated_queue);
- @@ -1385,6 +1388,7 @@
- blk_delete_timer(rq);
- blk_clear_rq_complete(rq);
- trace_block_rq_requeue(q, rq);
- + wbt_requeue(q->rq_wb, &rq->wb_stat);
- if (rq->cmd_flags & REQ_QUEUED)
- blk_queue_end_tag(q, rq);
- @@ -1475,6 +1479,8 @@
- /* this is a bio leak */
- WARN_ON(req->bio != NULL);
- + wbt_done(q->rq_wb, &req->wb_stat);
- +
- /*
- * Request may not have originated from ll_rw_blk. if not,
- * it didn't come out of our reserved rq pools
- @@ -1704,6 +1710,7 @@
- int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
- struct request *req;
- unsigned int request_count = 0;
- + bool wb_acct;
- /*
- * low level driver can indicate that it wants pages above a
- @@ -1756,6 +1763,8 @@
- }
- get_rq:
- + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, q->queue_lock);
- +
- /*
- * This sync check and mask will be re-done in init_request_from_bio(),
- * but we need to set it earlier to expose the sync flag to the
- @@ -1771,11 +1780,16 @@
- */
- req = get_request(q, rw_flags, bio, GFP_NOIO);
- if (IS_ERR(req)) {
- + if (wb_acct)
- + __wbt_done(q->rq_wb);
- bio->bi_error = PTR_ERR(req);
- bio_endio(bio);
- goto out_unlock;
- }
- + if (wb_acct)
- + wbt_mark_tracked(&req->wb_stat);
- +
- /*
- * After dropping the lock and possibly sleeping here, our request
- * may now be mergeable after it had proven unmergeable (above).
- @@ -1953,7 +1967,8 @@
- * drivers without flush support don't have to worry
- * about them.
- */
- - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
- + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
- + !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
- bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
- if (!nr_sectors) {
- err = 0;
- @@ -2502,6 +2517,8 @@
- {
- blk_dequeue_request(req);
- + wbt_issue(req->q->rq_wb, &req->wb_stat);
- +
- /*
- * We are now handing the request to the hardware, initialize
- * resid_len to full count and add the timeout handler.
- @@ -2569,6 +2586,8 @@
- trace_block_rq_complete(req->q, req, nr_bytes);
- + blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
- +
- if (!req->bio)
- return false;
- @@ -2736,9 +2755,10 @@
- blk_account_io_done(req);
- - if (req->end_io)
- + if (req->end_io) {
- + wbt_done(req->q->rq_wb, &req->wb_stat);
- req->end_io(req, error);
- - else {
- + } else {
- if (blk_bidi_rq(req))
- __blk_put_request(req->next_rq->q, req->next_rq);
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-flush.c linux-4.4.6-gentoo-patched/block/blk-flush.c
- --- linux-4.4.6-gentoo-orig/block/blk-flush.c 2016-05-04 11:19:37.593649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-flush.c 2016-05-04 11:02:48.599733982 +0300
- @@ -95,17 +95,18 @@
- static bool blk_kick_flush(struct request_queue *q,
- struct blk_flush_queue *fq);
- -static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
- +static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
- {
- unsigned int policy = 0;
- if (blk_rq_sectors(rq))
- policy |= REQ_FSEQ_DATA;
- - if (fflags & REQ_FLUSH) {
- + if (fflags & (1UL << QUEUE_FLAG_WC)) {
- if (rq->cmd_flags & REQ_FLUSH)
- policy |= REQ_FSEQ_PREFLUSH;
- - if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
- + if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
- + (rq->cmd_flags & REQ_FUA))
- policy |= REQ_FSEQ_POSTFLUSH;
- }
- return policy;
- @@ -384,7 +385,7 @@
- void blk_insert_flush(struct request *rq)
- {
- struct request_queue *q = rq->q;
- - unsigned int fflags = q->flush_flags; /* may change, cache */
- + unsigned long fflags = q->queue_flags; /* may change, cache */
- unsigned int policy = blk_flush_policy(fflags, rq);
- struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
- @@ -393,7 +394,7 @@
- * REQ_FLUSH and FUA for the driver.
- */
- rq->cmd_flags &= ~REQ_FLUSH;
- - if (!(fflags & REQ_FUA))
- + if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
- rq->cmd_flags &= ~REQ_FUA;
- /*
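With q->flush_flags gone, blk_flush_policy() and blk_insert_flush() above read the cache capabilities straight from q->queue_flags, and the driver hunks later in this patch do the same. A minimal sketch of that test, not part of the patch and assuming the QUEUE_FLAG_WC/QUEUE_FLAG_FUA definitions this series adds to linux/blkdev.h:

#include <linux/blkdev.h>

/* Illustration only: how consumers now ask about the write cache. */
static bool example_has_writeback_cache(struct request_queue *q)
{
        return test_bit(QUEUE_FLAG_WC, &q->queue_flags);
}

static bool example_supports_fua(struct request_queue *q)
{
        return test_bit(QUEUE_FLAG_FUA, &q->queue_flags);
}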
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq.c linux-4.4.6-gentoo-patched/block/blk-mq.c
- --- linux-4.4.6-gentoo-orig/block/blk-mq.c 2016-05-04 11:19:37.594649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-mq.c 2016-05-04 11:02:48.600733982 +0300
- @@ -22,6 +22,7 @@
- #include <linux/sched/sysctl.h>
- #include <linux/delay.h>
- #include <linux/crash_dump.h>
- +#include <linux/wbt.h>
- #include <trace/events/block.h>
- @@ -29,6 +30,7 @@
- #include "blk.h"
- #include "blk-mq.h"
- #include "blk-mq-tag.h"
- +#include "blk-stat.h"
- static DEFINE_MUTEX(all_q_mutex);
- static LIST_HEAD(all_q_list);
- @@ -276,6 +278,8 @@
- if (rq->cmd_flags & REQ_MQ_INFLIGHT)
- atomic_dec(&hctx->nr_active);
- +
- + wbt_done(q->rq_wb, &rq->wb_stat);
- rq->cmd_flags = 0;
- clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- @@ -308,6 +312,7 @@
- blk_account_io_done(rq);
- if (rq->end_io) {
- + wbt_done(rq->q->rq_wb, &rq->wb_stat);
- rq->end_io(rq, error);
- } else {
- if (unlikely(blk_bidi_rq(rq)))
- @@ -358,10 +363,19 @@
- put_cpu();
- }
- +static void blk_mq_stat_add(struct request *rq)
- +{
- + struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
- +
- + blk_stat_add(stat, rq);
- +}
- +
- static void __blk_mq_complete_request(struct request *rq)
- {
- struct request_queue *q = rq->q;
- + blk_mq_stat_add(rq);
- +
- if (!q->softirq_done_fn)
- blk_mq_end_request(rq, rq->errors);
- else
- @@ -405,6 +419,8 @@
- if (unlikely(blk_bidi_rq(rq)))
- rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
- + wbt_issue(q->rq_wb, &rq->wb_stat);
- +
- blk_add_timer(rq);
- /*
- @@ -440,6 +456,7 @@
- struct request_queue *q = rq->q;
- trace_block_rq_requeue(q, rq);
- + wbt_requeue(q->rq_wb, &rq->wb_stat);
- if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
- if (q->dma_drain_size && blk_rq_bytes(rq))
- @@ -1249,6 +1266,7 @@
- struct blk_plug *plug;
- struct request *same_queue_rq = NULL;
- blk_qc_t cookie;
- + bool wb_acct;
- blk_queue_bounce(q, &bio);
- @@ -1266,9 +1284,17 @@
- } else
- request_count = blk_plug_queued_count(q);
- + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL);
- +
- rq = blk_mq_map_request(q, bio, &data);
- - if (unlikely(!rq))
- + if (unlikely(!rq)) {
- + if (wb_acct)
- + __wbt_done(q->rq_wb);
- return BLK_QC_T_NONE;
- + }
- +
- + if (wb_acct)
- + wbt_mark_tracked(&rq->wb_stat);
- cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
- @@ -1345,6 +1371,7 @@
- struct blk_map_ctx data;
- struct request *rq;
- blk_qc_t cookie;
- + bool wb_acct;
- blk_queue_bounce(q, &bio);
- @@ -1359,9 +1386,17 @@
- blk_attempt_plug_merge(q, bio, &request_count, NULL))
- return BLK_QC_T_NONE;
- + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL);
- +
- rq = blk_mq_map_request(q, bio, &data);
- - if (unlikely(!rq))
- + if (unlikely(!rq)) {
- + if (wb_acct)
- + __wbt_done(q->rq_wb);
- return BLK_QC_T_NONE;
- + }
- +
- + if (wb_acct)
- + wbt_mark_tracked(&rq->wb_stat);
- cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
- @@ -1782,6 +1817,8 @@
- spin_lock_init(&__ctx->lock);
- INIT_LIST_HEAD(&__ctx->rq_list);
- __ctx->queue = q;
- + blk_stat_init(&__ctx->stat[0]);
- + blk_stat_init(&__ctx->stat[1]);
- /* If the cpu isn't online, the cpu is mapped to first hctx */
- if (!cpu_online(i))
- @@ -2095,6 +2132,9 @@
- list_del_init(&q->all_q_node);
- mutex_unlock(&all_q_mutex);
- + wbt_exit(q->rq_wb);
- + q->rq_wb = NULL;
- +
- blk_mq_del_queue_tag_set(q);
- blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq.h linux-4.4.6-gentoo-patched/block/blk-mq.h
- --- linux-4.4.6-gentoo-orig/block/blk-mq.h 2016-05-04 11:19:37.594649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-mq.h 2016-05-04 11:02:48.600733982 +0300
- @@ -1,6 +1,8 @@
- #ifndef INT_BLK_MQ_H
- #define INT_BLK_MQ_H
- +#include "blk-stat.h"
- +
- struct blk_mq_tag_set;
- struct blk_mq_ctx {
- @@ -20,6 +22,7 @@
- /* incremented at completion time */
- unsigned long ____cacheline_aligned_in_smp rq_completed[2];
- + struct blk_rq_stat stat[2];
- struct request_queue *queue;
- struct kobject kobj;
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq-sysfs.c linux-4.4.6-gentoo-patched/block/blk-mq-sysfs.c
- --- linux-4.4.6-gentoo-orig/block/blk-mq-sysfs.c 2016-05-04 11:19:37.595649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-mq-sysfs.c 2016-05-04 11:02:48.599733982 +0300
- @@ -247,6 +247,47 @@
- return ret;
- }
- +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
- +{
- + struct blk_mq_ctx *ctx;
- + unsigned int i;
- +
- + hctx_for_each_ctx(hctx, ctx, i) {
- + blk_stat_init(&ctx->stat[0]);
- + blk_stat_init(&ctx->stat[1]);
- + }
- +}
- +
- +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
- + const char *page, size_t count)
- +{
- + blk_mq_stat_clear(hctx);
- + return count;
- +}
- +
- +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
- +{
- + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
- + pre, (long long) stat->nr_samples,
- + (long long) stat->mean, (long long) stat->min,
- + (long long) stat->max);
- +}
- +
- +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
- +{
- + struct blk_rq_stat stat[2];
- + ssize_t ret;
- +
- + blk_stat_init(&stat[0]);
- + blk_stat_init(&stat[1]);
- +
- + blk_hctx_stat_get(hctx, stat);
- +
- + ret = print_stat(page, &stat[0], "read :");
- + ret += print_stat(page + ret, &stat[1], "write:");
- + return ret;
- +}
- +
- static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
- .attr = {.name = "dispatched", .mode = S_IRUGO },
- .show = blk_mq_sysfs_dispatched_show,
- @@ -304,6 +345,11 @@
- .attr = {.name = "io_poll", .mode = S_IRUGO },
- .show = blk_mq_hw_sysfs_poll_show,
- };
- +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
- + .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
- + .show = blk_mq_hw_sysfs_stat_show,
- + .store = blk_mq_hw_sysfs_stat_store,
- +};
- static struct attribute *default_hw_ctx_attrs[] = {
- &blk_mq_hw_sysfs_queued.attr,
- @@ -314,6 +360,7 @@
- &blk_mq_hw_sysfs_cpus.attr,
- &blk_mq_hw_sysfs_active.attr,
- &blk_mq_hw_sysfs_poll.attr,
- + &blk_mq_hw_sysfs_stat.attr,
- NULL,
- };
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-settings.c linux-4.4.6-gentoo-patched/block/blk-settings.c
- --- linux-4.4.6-gentoo-orig/block/blk-settings.c 2016-05-04 11:19:37.595649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-settings.c 2016-05-04 11:02:48.600733982 +0300
- @@ -820,31 +820,54 @@
- }
- EXPORT_SYMBOL(blk_queue_update_dma_alignment);
- +void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
- +{
- + spin_lock_irq(q->queue_lock);
- + if (queueable)
- + clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
- + else
- + set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
- + spin_unlock_irq(q->queue_lock);
- +}
- +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
- +
- /**
- - * blk_queue_flush - configure queue's cache flush capability
- + * blk_set_queue_depth - tell the block layer about the device queue depth
- * @q: the request queue for the device
- - * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
- + * @depth: queue depth
- *
- - * Tell block layer cache flush capability of @q. If it supports
- - * flushing, REQ_FLUSH should be set. If it supports bypassing
- - * write cache for individual writes, REQ_FUA should be set.
- */
- -void blk_queue_flush(struct request_queue *q, unsigned int flush)
- +void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
- {
- - WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
- -
- - if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
- - flush &= ~REQ_FUA;
- -
- - q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
- + q->queue_depth = depth;
- + wbt_set_queue_depth(q->rq_wb, depth);
- }
- -EXPORT_SYMBOL_GPL(blk_queue_flush);
- +EXPORT_SYMBOL(blk_set_queue_depth);
- -void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
- +/**
- + * blk_queue_write_cache - configure queue's write cache
- + * @q: the request queue for the device
- + * @wc: write back cache on or off
- + * @fua: device supports FUA writes, if true
- + *
- + * Tell the block layer about the write cache of @q.
- + */
- +void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
- {
- - q->flush_not_queueable = !queueable;
- + spin_lock_irq(q->queue_lock);
- + if (wc)
- + queue_flag_set(QUEUE_FLAG_WC, q);
- + else
- + queue_flag_clear(QUEUE_FLAG_WC, q);
- + if (fua)
- + queue_flag_set(QUEUE_FLAG_FUA, q);
- + else
- + queue_flag_clear(QUEUE_FLAG_FUA, q);
- + spin_unlock_irq(q->queue_lock);
- +
- + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
- }
- -EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
- +EXPORT_SYMBOL_GPL(blk_queue_write_cache);
- static int __init blk_settings_init(void)
- {
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-stat.c linux-4.4.6-gentoo-patched/block/blk-stat.c
- --- linux-4.4.6-gentoo-orig/block/blk-stat.c 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-stat.c 2016-05-04 11:02:48.600733982 +0300
- @@ -0,0 +1,185 @@
- +/*
- + * Block stat tracking code
- + *
- + * Copyright (C) 2016 Jens Axboe
- + */
- +#include <linux/kernel.h>
- +#include <linux/blk-mq.h>
- +
- +#include "blk-stat.h"
- +#include "blk-mq.h"
- +
- +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
- +{
- + if (!src->nr_samples)
- + return;
- +
- + dst->min = min(dst->min, src->min);
- + dst->max = max(dst->max, src->max);
- +
- + if (!dst->nr_samples)
- + dst->mean = src->mean;
- + else {
- + dst->mean = div64_s64((src->mean * src->nr_samples) +
- + (dst->mean * dst->nr_samples),
- + dst->nr_samples + src->nr_samples);
- + }
- + dst->nr_samples += src->nr_samples;
- +}
- +
- +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
- +{
- + struct blk_mq_hw_ctx *hctx;
- + struct blk_mq_ctx *ctx;
- + int i, j, nr;
- +
- + blk_stat_init(&dst[0]);
- + blk_stat_init(&dst[1]);
- +
- + nr = 0;
- + do {
- + uint64_t newest = 0;
- +
- + queue_for_each_hw_ctx(q, hctx, i) {
- + hctx_for_each_ctx(hctx, ctx, j) {
- + if (!ctx->stat[0].nr_samples &&
- + !ctx->stat[1].nr_samples)
- + continue;
- + if (ctx->stat[0].time > newest)
- + newest = ctx->stat[0].time;
- + if (ctx->stat[1].time > newest)
- + newest = ctx->stat[1].time;
- + }
- + }
- +
- + /*
- + * No samples
- + */
- + if (!newest)
- + break;
- +
- + queue_for_each_hw_ctx(q, hctx, i) {
- + hctx_for_each_ctx(hctx, ctx, j) {
- + if (ctx->stat[0].time == newest) {
- + blk_stat_sum(&dst[0], &ctx->stat[0]);
- + nr++;
- + }
- + if (ctx->stat[1].time == newest) {
- + blk_stat_sum(&dst[1], &ctx->stat[1]);
- + nr++;
- + }
- + }
- + }
- + /*
- + * If we race on finding an entry, just loop back again.
- + * Should be very rare.
- + */
- + } while (!nr);
- +}
- +
- +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
- +{
- + if (q->mq_ops)
- + blk_mq_stat_get(q, dst);
- + else {
- + memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
- + memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
- + }
- +}
- +
- +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
- +{
- + struct blk_mq_ctx *ctx;
- + unsigned int i, nr;
- +
- + nr = 0;
- + do {
- + uint64_t newest = 0;
- +
- + hctx_for_each_ctx(hctx, ctx, i) {
- + if (!ctx->stat[0].nr_samples &&
- + !ctx->stat[1].nr_samples)
- + continue;
- +
- + if (ctx->stat[0].time > newest)
- + newest = ctx->stat[0].time;
- + if (ctx->stat[1].time > newest)
- + newest = ctx->stat[1].time;
- + }
- +
- + if (!newest)
- + break;
- +
- + hctx_for_each_ctx(hctx, ctx, i) {
- + if (ctx->stat[0].time == newest) {
- + blk_stat_sum(&dst[0], &ctx->stat[0]);
- + nr++;
- + }
- + if (ctx->stat[1].time == newest) {
- + blk_stat_sum(&dst[1], &ctx->stat[1]);
- + nr++;
- + }
- + }
- + /*
- + * If we race on finding an entry, just loop back again.
- + * Should be very rare, as the window is only updated
- + * occasionally
- + */
- + } while (!nr);
- +}
- +
- +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
- +{
- + stat->min = -1ULL;
- + stat->max = stat->nr_samples = stat->mean = 0;
- + stat->time = time_now & BLK_STAT_MASK;
- +}
- +
- +void blk_stat_init(struct blk_rq_stat *stat)
- +{
- + __blk_stat_init(stat, ktime_to_ns(ktime_get()));
- +}
- +
- +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
- +{
- + s64 delta, now, value;
- + u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat);
- +
- + now = ktime_to_ns(ktime_get());
- + if (now < rq_time)
- + return;
- +
- + if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK))
- + __blk_stat_init(stat, now);
- +
- + value = now - rq_time;
- + if (value > stat->max)
- + stat->max = value;
- + if (value < stat->min)
- + stat->min = value;
- +
- + delta = value - stat->mean;
- + if (delta)
- + stat->mean += div64_s64(delta, stat->nr_samples + 1);
- +
- + stat->nr_samples++;
- +}
- +
- +void blk_stat_clear(struct request_queue *q)
- +{
- + if (q->mq_ops) {
- + struct blk_mq_hw_ctx *hctx;
- + struct blk_mq_ctx *ctx;
- + int i, j;
- +
- + queue_for_each_hw_ctx(q, hctx, i) {
- + hctx_for_each_ctx(hctx, ctx, j) {
- + blk_stat_init(&ctx->stat[0]);
- + blk_stat_init(&ctx->stat[1]);
- + }
- + }
- + } else {
- + blk_stat_init(&q->rq_stats[0]);
- + blk_stat_init(&q->rq_stats[1]);
- + }
- +}
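blk_stat_add() above keeps a running arithmetic mean without storing samples: for each new value it does mean += (value - mean) / (n + 1). A small self-contained userspace sketch of the same update, not part of the patch, showing that the incremental form reproduces the plain average (integer division truncates, just like div64_s64()):

#include <stdio.h>

int main(void)
{
        long long samples[] = { 100, 300, 200, 400 };
        long long mean = 0;
        long long n = 0;

        for (int i = 0; i < 4; i++) {
                long long delta = samples[i] - mean;
                /* Same update blk_stat_add() performs per completed request. */
                mean += delta / (n + 1);
                n++;
        }
        printf("running mean of %lld samples: %lld\n", n, mean); /* prints 250 */
        return 0;
}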
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-stat.h linux-4.4.6-gentoo-patched/block/blk-stat.h
- --- linux-4.4.6-gentoo-orig/block/blk-stat.h 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-stat.h 2016-05-04 11:02:48.600733982 +0300
- @@ -0,0 +1,17 @@
- +#ifndef BLK_STAT_H
- +#define BLK_STAT_H
- +
- +/*
- + * ~0.13s window as a power-of-2 (2^27 nsecs)
- + */
- +#define BLK_STAT_NSEC 134217728ULL
- +#define BLK_STAT_MASK ~(BLK_STAT_NSEC - 1)
- +
- +void blk_stat_add(struct blk_rq_stat *, struct request *);
- +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
- +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
- +void blk_stat_clear(struct request_queue *q);
- +void blk_stat_init(struct blk_rq_stat *);
- +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
- +
- +#endif
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-sysfs.c linux-4.4.6-gentoo-patched/block/blk-sysfs.c
- --- linux-4.4.6-gentoo-orig/block/blk-sysfs.c 2016-05-04 11:19:37.596649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-sysfs.c 2016-05-04 11:02:48.600733982 +0300
- @@ -10,6 +10,7 @@
- #include <linux/blktrace_api.h>
- #include <linux/blk-mq.h>
- #include <linux/blk-cgroup.h>
- +#include <linux/wbt.h>
- #include "blk.h"
- #include "blk-mq.h"
- @@ -41,6 +42,19 @@
- return count;
- }
- +static ssize_t queue_var_store64(u64 *var, const char *page)
- +{
- + int err;
- + u64 v;
- +
- + err = kstrtou64(page, 10, &v);
- + if (err < 0)
- + return err;
- +
- + *var = v;
- + return 0;
- +}
- +
- static ssize_t queue_requests_show(struct request_queue *q, char *page)
- {
- return queue_var_show(q->nr_requests, (page));
- @@ -348,6 +362,110 @@
- return ret;
- }
- +static ssize_t queue_wb_win_show(struct request_queue *q, char *page)
- +{
- + if (!q->rq_wb)
- + return -EINVAL;
- +
- + return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000));
- +}
- +
- +static ssize_t queue_wb_win_store(struct request_queue *q, const char *page,
- + size_t count)
- +{
- + ssize_t ret;
- + u64 val;
- +
- + if (!q->rq_wb)
- + return -EINVAL;
- +
- + ret = queue_var_store64(&val, page);
- + if (ret < 0)
- + return ret;
- +
- + q->rq_wb->win_nsec = val * 1000ULL;
- + wbt_update_limits(q->rq_wb);
- + return count;
- +}
- +
- +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
- +{
- + if (!q->rq_wb)
- + return -EINVAL;
- +
- + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
- +}
- +
- +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
- + size_t count)
- +{
- + ssize_t ret;
- + u64 val;
- +
- + if (!q->rq_wb)
- + return -EINVAL;
- +
- + ret = queue_var_store64(&val, page);
- + if (ret < 0)
- + return ret;
- +
- + q->rq_wb->min_lat_nsec = val * 1000ULL;
- + wbt_update_limits(q->rq_wb);
- + return count;
- +}
- +
- +static ssize_t queue_wc_show(struct request_queue *q, char *page)
- +{
- + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- + return sprintf(page, "write back\n");
- +
- + return sprintf(page, "write through\n");
- +}
- +
- +static ssize_t queue_wc_store(struct request_queue *q, const char *page,
- + size_t count)
- +{
- + int set = -1;
- +
- + if (!strncmp(page, "write back", 10))
- + set = 1;
- + else if (!strncmp(page, "write through", 13) ||
- + !strncmp(page, "none", 4))
- + set = 0;
- +
- + if (set == -1)
- + return -EINVAL;
- +
- + spin_lock_irq(q->queue_lock);
- + if (set)
- + queue_flag_set(QUEUE_FLAG_WC, q);
- + else
- + queue_flag_clear(QUEUE_FLAG_WC, q);
- + spin_unlock_irq(q->queue_lock);
- +
- + return count;
- +}
- +
- +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
- +{
- + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
- + pre, (long long) stat->nr_samples,
- + (long long) stat->mean, (long long) stat->min,
- + (long long) stat->max);
- +}
- +
- +static ssize_t queue_stats_show(struct request_queue *q, char *page)
- +{
- + struct blk_rq_stat stat[2];
- + ssize_t ret;
- +
- + blk_queue_stat_get(q, stat);
- +
- + ret = print_stat(page, &stat[0], "read :");
- + ret += print_stat(page + ret, &stat[1], "write:");
- + return ret;
- +}
- +
- static struct queue_sysfs_entry queue_requests_entry = {
- .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
- .show = queue_requests_show,
- @@ -479,6 +597,29 @@
- .store = queue_poll_store,
- };
- +static struct queue_sysfs_entry queue_wc_entry = {
- + .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
- + .show = queue_wc_show,
- + .store = queue_wc_store,
- +};
- +
- +static struct queue_sysfs_entry queue_stats_entry = {
- + .attr = {.name = "stats", .mode = S_IRUGO },
- + .show = queue_stats_show,
- +};
- +
- +static struct queue_sysfs_entry queue_wb_lat_entry = {
- + .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
- + .show = queue_wb_lat_show,
- + .store = queue_wb_lat_store,
- +};
- +
- +static struct queue_sysfs_entry queue_wb_win_entry = {
- + .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR },
- + .show = queue_wb_win_show,
- + .store = queue_wb_win_store,
- +};
- +
- static struct attribute *default_attrs[] = {
- &queue_requests_entry.attr,
- &queue_ra_entry.attr,
- @@ -504,6 +645,10 @@
- &queue_iostats_entry.attr,
- &queue_random_entry.attr,
- &queue_poll_entry.attr,
- + &queue_wc_entry.attr,
- + &queue_stats_entry.attr,
- + &queue_wb_lat_entry.attr,
- + &queue_wb_win_entry.attr,
- NULL,
- };
- @@ -618,6 +763,43 @@
- .release = blk_release_queue,
- };
- +static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
- +{
- + blk_queue_stat_get(data, stat);
- +}
- +
- +static void blk_wb_stat_clear(void *data)
- +{
- + blk_stat_clear(data);
- +}
- +
- +static struct wb_stat_ops wb_stat_ops = {
- + .get = blk_wb_stat_get,
- + .clear = blk_wb_stat_clear,
- +};
- +
- +static void blk_wb_init(struct request_queue *q)
- +{
- + struct rq_wb *rwb;
- +
- + rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q);
- +
- + /*
- + * If this fails, we don't get throttling
- + */
- + if (IS_ERR(rwb))
- + return;
- +
- + if (blk_queue_nonrot(q))
- + rwb->min_lat_nsec = 2000000ULL;
- + else
- + rwb->min_lat_nsec = 75000000ULL;
- +
- + wbt_set_queue_depth(rwb, blk_queue_depth(q));
- + wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
- + q->rq_wb = rwb;
- +}
- +
- int blk_register_queue(struct gendisk *disk)
- {
- int ret;
- @@ -657,6 +839,8 @@
- if (q->mq_ops)
- blk_mq_register_disk(disk);
- + blk_wb_init(q);
- +
- if (!q->request_fn)
- return 0;
- diff -Naur linux-4.4.6-gentoo-orig/block/Kconfig linux-4.4.6-gentoo-patched/block/Kconfig
- --- linux-4.4.6-gentoo-orig/block/Kconfig 2016-05-04 11:19:37.596649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/Kconfig 2016-05-04 11:02:48.599733982 +0300
- @@ -4,6 +4,7 @@
- menuconfig BLOCK
- bool "Enable the block layer" if EXPERT
- default y
- + select WBT
- help
- Provide block layer support for the kernel.
- diff -Naur linux-4.4.6-gentoo-orig/block/Makefile linux-4.4.6-gentoo-patched/block/Makefile
- --- linux-4.4.6-gentoo-orig/block/Makefile 2016-05-04 11:19:37.596649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/Makefile 2016-05-04 11:10:18.790696435 +0300
- @@ -5,7 +5,7 @@
- obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
- blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- - blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
- + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
- blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
- genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
- partitions/
- diff -Naur linux-4.4.6-gentoo-orig/Documentation/block/queue-sysfs.txt linux-4.4.6-gentoo-patched/Documentation/block/queue-sysfs.txt
- --- linux-4.4.6-gentoo-orig/Documentation/block/queue-sysfs.txt 2016-05-04 11:19:37.597649829 +0300
- +++ linux-4.4.6-gentoo-patched/Documentation/block/queue-sysfs.txt 2016-05-04 11:02:48.598733982 +0300
- @@ -141,6 +141,28 @@
- an IO scheduler name to this file will attempt to load that IO scheduler
- module, if it isn't already present in the system.
- +write_cache (RW)
- +----------------
- +When read, this file will display whether the device has write back
- +caching enabled or not. It will return "write back" for the former
- +case, and "write through" for the latter. Writing to this file can
- +change the kernel's view of the device, but it doesn't alter the
- +device state. This means that it might not be safe to toggle the
- +setting from "write back" to "write through", since that will also
- +eliminate cache flushes issued by the kernel.
- +
- +wbt_lat_usec (RW)
- +-----------------
- +If the device is registered for writeback throttling, then this file shows
- +the target minimum read latency. If this latency is exceeded in a given
- +window of time (see wbt_window_usec), then the writeback throttling will start
- +scaling back writes.
- +
- +wbt_window_usec (RW)
- +--------------------
- +If the device is registered for writeback throttling, then this file shows
- +the value of the monitoring window in which we'll look at the target
- +latency. See wbt_lat_usec.
- Jens Axboe <jens.axboe@oracle.com>, February 2009
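The new attributes live under /sys/block/<dev>/queue/. A minimal userspace sketch, not part of the patch, that reads the writeback-throttling latency target and tightens it (the device name is just an example; the store path added in blk-sysfs.c earlier converts the microsecond value to nanoseconds and calls wbt_update_limits()):

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/block/sda/queue/wbt_lat_usec";
        unsigned long long lat_usec;
        FILE *f = fopen(path, "r");

        if (!f || fscanf(f, "%llu", &lat_usec) != 1)
                return 1;
        fclose(f);

        f = fopen(path, "w");
        if (!f)
                return 1;
        /* Halve the target minimum read latency. */
        fprintf(f, "%llu\n", lat_usec / 2);
        fclose(f);
        return 0;
}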
- diff -Naur linux-4.4.6-gentoo-orig/Documentation/block/writeback_cache_control.txt linux-4.4.6-gentoo-patched/Documentation/block/writeback_cache_control.txt
- --- linux-4.4.6-gentoo-orig/Documentation/block/writeback_cache_control.txt 2016-05-04 11:19:37.597649829 +0300
- +++ linux-4.4.6-gentoo-patched/Documentation/block/writeback_cache_control.txt 2016-05-04 11:02:48.598733982 +0300
- @@ -71,7 +71,7 @@
- driver needs to tell the block layer that it supports flushing caches by
- doing:
- - blk_queue_flush(sdkp->disk->queue, REQ_FLUSH);
- + blk_queue_write_cache(sdkp->disk->queue, true, false);
- and handle empty REQ_FLUSH requests in its prep_fn/request_fn. Note that
- REQ_FLUSH requests with a payload are automatically turned into a sequence
- @@ -79,7 +79,7 @@
- layer. For devices that also support the FUA bit the block layer needs
- to be told to pass through the REQ_FUA bit using:
- - blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(sdkp->disk->queue, true, true);
- and the driver must handle write requests that have the REQ_FUA bit set
- in prep_fn/request_fn. If the FUA bit is not natively supported the block
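Driver-side, the conversion described above boils down to one call at queue setup time. A minimal sketch, not part of the patch; the two capability flags are hypothetical stand-ins for whatever the real driver learns from its hardware:

#include <linux/blkdev.h>

static void example_setup_write_cache(struct request_queue *q,
                                      bool volatile_cache, bool fua)
{
        /* Advertise FUA only together with a writeback cache, matching
         * the REQ_FLUSH | REQ_FUA pairing the old interface enforced. */
        blk_queue_write_cache(q, volatile_cache, volatile_cache && fua);
}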
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/drbd/drbd_main.c linux-4.4.6-gentoo-patched/drivers/block/drbd/drbd_main.c
- --- linux-4.4.6-gentoo-orig/drivers/block/drbd/drbd_main.c 2016-05-04 11:19:37.598649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/drbd/drbd_main.c 2016-05-04 11:02:48.601733981 +0300
- @@ -2769,7 +2769,7 @@
- q->backing_dev_info.congested_data = device;
- blk_queue_make_request(q, drbd_make_request);
- - blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(q, true, true);
- /* Setting the max_hw_sectors to an odd value of 8kibyte here
- This triggers a max_bio_size message upon first attach or connect */
- blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/loop.c linux-4.4.6-gentoo-patched/drivers/block/loop.c
- --- linux-4.4.6-gentoo-orig/drivers/block/loop.c 2016-05-04 11:19:37.598649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/loop.c 2016-05-04 11:02:48.601733981 +0300
- @@ -937,7 +937,7 @@
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
- if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
- - blk_queue_flush(lo->lo_queue, REQ_FLUSH);
- + blk_queue_write_cache(lo->lo_queue, true, false);
- loop_update_dio(lo);
- set_capacity(lo->lo_disk, size);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/mtip32xx/mtip32xx.c linux-4.4.6-gentoo-patched/drivers/block/mtip32xx/mtip32xx.c
- --- linux-4.4.6-gentoo-orig/drivers/block/mtip32xx/mtip32xx.c 2016-05-04 11:19:37.599649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/mtip32xx/mtip32xx.c 2016-05-04 11:02:48.602733981 +0300
- @@ -3913,12 +3913,6 @@
- blk_queue_io_min(dd->queue, 4096);
- blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask);
- - /*
- - * write back cache is not supported in the device. FUA depends on
- - * write back cache support, hence setting flush support to zero.
- - */
- - blk_queue_flush(dd->queue, 0);
- -
- /* Signal trim support */
- if (dd->trim_supp == true) {
- set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/nbd.c linux-4.4.6-gentoo-patched/drivers/block/nbd.c
- --- linux-4.4.6-gentoo-orig/drivers/block/nbd.c 2016-05-04 11:19:37.600649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/nbd.c 2016-05-04 11:02:48.602733981 +0300
- @@ -750,9 +750,9 @@
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
- nbd->disk->queue);
- if (nbd->flags & NBD_FLAG_SEND_FLUSH)
- - blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
- + blk_queue_write_cache(nbd->disk->queue, true, false);
- else
- - blk_queue_flush(nbd->disk->queue, 0);
- + blk_queue_write_cache(nbd->disk->queue, false, false);
- thread = kthread_run(nbd_thread_send, nbd, "%s",
- nbd_name(nbd));
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/osdblk.c linux-4.4.6-gentoo-patched/drivers/block/osdblk.c
- --- linux-4.4.6-gentoo-orig/drivers/block/osdblk.c 2016-05-04 11:19:37.600649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/osdblk.c 2016-05-04 11:02:48.602733981 +0300
- @@ -437,7 +437,7 @@
- blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
- blk_queue_prep_rq(q, blk_queue_start_tag);
- - blk_queue_flush(q, REQ_FLUSH);
- + blk_queue_write_cache(q, true, false);
- disk->queue = q;
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/ps3disk.c linux-4.4.6-gentoo-patched/drivers/block/ps3disk.c
- --- linux-4.4.6-gentoo-orig/drivers/block/ps3disk.c 2016-05-04 11:19:37.601649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/ps3disk.c 2016-05-04 11:02:48.602733981 +0300
- @@ -468,7 +468,7 @@
- blk_queue_dma_alignment(queue, dev->blk_size-1);
- blk_queue_logical_block_size(queue, dev->blk_size);
- - blk_queue_flush(queue, REQ_FLUSH);
- + blk_queue_write_cache(queue, true, false);
- blk_queue_max_segments(queue, -1);
- blk_queue_max_segment_size(queue, dev->bounce_size);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/skd_main.c linux-4.4.6-gentoo-patched/drivers/block/skd_main.c
- --- linux-4.4.6-gentoo-orig/drivers/block/skd_main.c 2016-05-04 11:19:37.601649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/skd_main.c 2016-05-04 11:02:48.603733981 +0300
- @@ -4412,7 +4412,7 @@
- disk->queue = q;
- q->queuedata = skdev;
- - blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(q, true, true);
- blk_queue_max_segments(q, skdev->sgs_per_request);
- blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/virtio_blk.c linux-4.4.6-gentoo-patched/drivers/block/virtio_blk.c
- --- linux-4.4.6-gentoo-orig/drivers/block/virtio_blk.c 2016-05-04 11:19:37.602649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/virtio_blk.c 2016-05-04 11:02:48.603733981 +0300
- @@ -488,11 +488,7 @@
- u8 writeback = virtblk_get_cache_mode(vdev);
- struct virtio_blk *vblk = vdev->priv;
- - if (writeback)
- - blk_queue_flush(vblk->disk->queue, REQ_FLUSH);
- - else
- - blk_queue_flush(vblk->disk->queue, 0);
- -
- + blk_queue_write_cache(vblk->disk->queue, writeback, false);
- revalidate_disk(vblk->disk);
- }
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/xen-blkback/xenbus.c linux-4.4.6-gentoo-patched/drivers/block/xen-blkback/xenbus.c
- --- linux-4.4.6-gentoo-orig/drivers/block/xen-blkback/xenbus.c 2016-05-04 11:19:37.603649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/xen-blkback/xenbus.c 2016-05-04 11:02:48.603733981 +0300
- @@ -413,7 +413,7 @@
- vbd->type |= VDISK_REMOVABLE;
- q = bdev_get_queue(bdev);
- - if (q && q->flush_flags)
- + if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- vbd->flush_support = true;
- if (q && blk_queue_secdiscard(q))
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/xen-blkfront.c linux-4.4.6-gentoo-patched/drivers/block/xen-blkfront.c
- --- linux-4.4.6-gentoo-orig/drivers/block/xen-blkfront.c 2016-05-04 11:19:37.603649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/xen-blkfront.c 2016-05-04 11:02:48.603733981 +0300
- @@ -851,7 +851,8 @@
- static void xlvbd_flush(struct blkfront_info *info)
- {
- - blk_queue_flush(info->rq, info->feature_flush);
- + blk_queue_write_cache(info->rq, info->feature_flush & REQ_FLUSH,
- + info->feature_flush & REQ_FUA);
- pr_info("blkfront: %s: %s %s %s %s %s\n",
- info->gd->disk_name, flush_info(info->feature_flush),
- "persistent grants:", info->feature_persistent ?
- diff -Naur linux-4.4.6-gentoo-orig/drivers/ide/ide-disk.c linux-4.4.6-gentoo-patched/drivers/ide/ide-disk.c
- --- linux-4.4.6-gentoo-orig/drivers/ide/ide-disk.c 2016-05-04 11:19:37.604649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/ide/ide-disk.c 2016-05-04 11:02:48.603733981 +0300
- @@ -522,7 +522,7 @@
- static void update_flush(ide_drive_t *drive)
- {
- u16 *id = drive->id;
- - unsigned flush = 0;
- + bool wc = false;
- if (drive->dev_flags & IDE_DFLAG_WCACHE) {
- unsigned long long capacity;
- @@ -546,12 +546,12 @@
- drive->name, barrier ? "" : "not ");
- if (barrier) {
- - flush = REQ_FLUSH;
- + wc = true;
- blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
- }
- }
- - blk_queue_flush(drive->queue, flush);
- + blk_queue_write_cache(drive->queue, wc, false);
- }
- ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/md/bcache/super.c linux-4.4.6-gentoo-patched/drivers/md/bcache/super.c
- --- linux-4.4.6-gentoo-orig/drivers/md/bcache/super.c 2016-05-04 11:19:37.604649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/md/bcache/super.c 2016-05-04 11:02:48.604733981 +0300
- @@ -816,7 +816,7 @@
- clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
- set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
- - blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
- + blk_queue_write_cache(q, true, true);
- return 0;
- }
- diff -Naur linux-4.4.6-gentoo-orig/drivers/md/dm-table.c linux-4.4.6-gentoo-patched/drivers/md/dm-table.c
- --- linux-4.4.6-gentoo-orig/drivers/md/dm-table.c 2016-05-04 11:19:37.605649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/md/dm-table.c 2016-05-04 11:02:48.604733981 +0300
- @@ -1312,13 +1312,13 @@
- static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
- {
- - unsigned flush = (*(unsigned *)data);
- + unsigned long flush = (unsigned long) data;
- struct request_queue *q = bdev_get_queue(dev->bdev);
- - return q && (q->flush_flags & flush);
- + return q && (q->queue_flags & flush);
- }
- -static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
- +static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
- {
- struct dm_target *ti;
- unsigned i = 0;
- @@ -1339,7 +1339,7 @@
- return true;
- if (ti->type->iterate_devices &&
- - ti->type->iterate_devices(ti, device_flush_capable, &flush))
- + ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
- return true;
- }
- @@ -1470,7 +1470,7 @@
- void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
- struct queue_limits *limits)
- {
- - unsigned flush = 0;
- + bool wc = false, fua = false;
- /*
- * Copy table's limits to the DM device's request_queue
- @@ -1482,12 +1482,12 @@
- else
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
- - if (dm_table_supports_flush(t, REQ_FLUSH)) {
- - flush |= REQ_FLUSH;
- - if (dm_table_supports_flush(t, REQ_FUA))
- - flush |= REQ_FUA;
- + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
- + wc = true;
- + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
- + fua = true;
- }
- - blk_queue_flush(q, flush);
- + blk_queue_write_cache(q, wc, fua);
- if (!dm_table_discard_zeroes_data(t))
- q->limits.discard_zeroes_data = 0;
- diff -Naur linux-4.4.6-gentoo-orig/drivers/md/md.c linux-4.4.6-gentoo-patched/drivers/md/md.c
- --- linux-4.4.6-gentoo-orig/drivers/md/md.c 2016-05-04 11:19:37.606649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/md/md.c 2016-05-04 11:02:48.605733981 +0300
- @@ -5037,7 +5037,7 @@
- disk->fops = &md_fops;
- disk->private_data = mddev;
- disk->queue = mddev->queue;
- - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(mddev->queue, true, true);
- /* Allow extended partitions. This makes the
- * 'mdp' device redundant, but we can't really
- * remove it now.
- diff -Naur linux-4.4.6-gentoo-orig/drivers/md/raid5-cache.c linux-4.4.6-gentoo-patched/drivers/md/raid5-cache.c
- --- linux-4.4.6-gentoo-orig/drivers/md/raid5-cache.c 2016-05-04 11:19:37.607649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/md/raid5-cache.c 2016-05-04 11:02:48.605733981 +0300
- @@ -1133,6 +1133,7 @@
- int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
- {
- + struct request_queue *q = bdev_get_queue(rdev->bdev);
- struct r5l_log *log;
- if (PAGE_SIZE != 4096)
- @@ -1142,7 +1143,7 @@
- return -ENOMEM;
- log->rdev = rdev;
- - log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
- + log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
- log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
- sizeof(rdev->mddev->uuid));
- diff -Naur linux-4.4.6-gentoo-orig/drivers/mmc/card/block.c linux-4.4.6-gentoo-patched/drivers/mmc/card/block.c
- --- linux-4.4.6-gentoo-orig/drivers/mmc/card/block.c 2016-05-04 11:19:37.608649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/mmc/card/block.c 2016-05-04 11:02:48.605733981 +0300
- @@ -2282,7 +2282,7 @@
- ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) ||
- card->ext_csd.rel_sectors)) {
- md->flags |= MMC_BLK_REL_WR;
- - blk_queue_flush(md->queue.queue, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(md->queue.queue, true, true);
- }
- if (mmc_card_mmc(card) &&
- diff -Naur linux-4.4.6-gentoo-orig/drivers/mtd/mtd_blkdevs.c linux-4.4.6-gentoo-patched/drivers/mtd/mtd_blkdevs.c
- --- linux-4.4.6-gentoo-orig/drivers/mtd/mtd_blkdevs.c 2016-05-04 11:19:37.608649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/mtd/mtd_blkdevs.c 2016-05-04 11:02:48.605733981 +0300
- @@ -409,7 +409,7 @@
- goto error3;
- if (tr->flush)
- - blk_queue_flush(new->rq, REQ_FLUSH);
- + blk_queue_write_cache(new->rq, true, false);
- new->rq->queuedata = new;
- blk_queue_logical_block_size(new->rq, tr->blksize);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/nvme/host/pci.c linux-4.4.6-gentoo-patched/drivers/nvme/host/pci.c
- --- linux-4.4.6-gentoo-orig/drivers/nvme/host/pci.c 2016-01-11 02:01:32.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/nvme/host/pci.c 2016-05-04 11:48:03.179507579 +0300
- @@ -2272,6 +2272,7 @@
- list_add_tail(&ns->list, &dev->namespaces);
- blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
- + bool vwc = false;
- if (dev->max_hw_sectors) {
- blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
- blk_queue_max_segments(ns->queue,
- @@ -2279,8 +2280,10 @@
- }
- if (dev->stripe_size)
- blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
- - if (dev->vwc & NVME_CTRL_VWC_PRESENT)
- - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
- + if (dev->vwc & NVME_CTRL_VWC_PRESENT)
- + vwc = true;
- + blk_queue_write_cache(ns->queue, vwc, vwc);
- +
- blk_queue_virt_boundary(ns->queue, dev->page_size - 1);
- disk->major = nvme_major;
- diff -Naur linux-4.4.6-gentoo-orig/drivers/scsi/scsi.c linux-4.4.6-gentoo-patched/drivers/scsi/scsi.c
- --- linux-4.4.6-gentoo-orig/drivers/scsi/scsi.c 2016-05-04 11:19:37.609649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/scsi/scsi.c 2016-05-04 11:03:27.408730745 +0300
- @@ -621,6 +621,9 @@
- wmb();
- }
- + if (sdev->request_queue)
- + blk_set_queue_depth(sdev->request_queue, depth);
- +
- return sdev->queue_depth;
- }
- EXPORT_SYMBOL(scsi_change_queue_depth);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/scsi/sd.c linux-4.4.6-gentoo-patched/drivers/scsi/sd.c
- --- linux-4.4.6-gentoo-orig/drivers/scsi/sd.c 2016-05-04 11:19:37.609649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/scsi/sd.c 2016-05-04 11:03:27.408730745 +0300
- @@ -137,15 +137,15 @@
- static void sd_set_flush_flag(struct scsi_disk *sdkp)
- {
- - unsigned flush = 0;
- + bool wc = false, fua = false;
- if (sdkp->WCE) {
- - flush |= REQ_FLUSH;
- + wc = true;
- if (sdkp->DPOFUA)
- - flush |= REQ_FUA;
- + fua = true;
- }
- - blk_queue_flush(sdkp->disk->queue, flush);
- + blk_queue_write_cache(sdkp->disk->queue, wc, fua);
- }
- static ssize_t
- diff -Naur linux-4.4.6-gentoo-orig/drivers/target/target_core_iblock.c linux-4.4.6-gentoo-patched/drivers/target/target_core_iblock.c
- --- linux-4.4.6-gentoo-orig/drivers/target/target_core_iblock.c 2016-05-04 11:19:37.610649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/target/target_core_iblock.c 2016-05-04 11:03:27.409730745 +0300
- @@ -653,10 +653,10 @@
- * Force writethrough using WRITE_FUA if a volatile write cache
- * is not enabled, or if initiator set the Force Unit Access bit.
- */
- - if (q->flush_flags & REQ_FUA) {
- + if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) {
- if (cmd->se_cmd_flags & SCF_FUA)
- rw = WRITE_FUA;
- - else if (!(q->flush_flags & REQ_FLUSH))
- + else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- rw = WRITE_FUA;
- else
- rw = WRITE;
- @@ -802,7 +802,7 @@
- struct block_device *bd = ib_dev->ibd_bd;
- struct request_queue *q = bdev_get_queue(bd);
- - return q->flush_flags & REQ_FLUSH;
- + return test_bit(QUEUE_FLAG_WC, &q->queue_flags);
- }
- static const struct target_backend_ops iblock_ops = {
- diff -Naur linux-4.4.6-gentoo-orig/fs/block_dev.c linux-4.4.6-gentoo-patched/fs/block_dev.c
- --- linux-4.4.6-gentoo-orig/fs/block_dev.c 2016-05-04 11:19:37.610649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/block_dev.c 2016-05-04 11:03:27.409730745 +0300
- @@ -427,7 +427,7 @@
- struct page *page, struct writeback_control *wbc)
- {
- int result;
- - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
- + int rw = wbc_to_write_cmd(wbc);
- const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page || bdev_get_integrity(bdev))
- diff -Naur linux-4.4.6-gentoo-orig/fs/buffer.c linux-4.4.6-gentoo-patched/fs/buffer.c
- --- linux-4.4.6-gentoo-orig/fs/buffer.c 2016-05-04 11:19:37.611649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/buffer.c 2016-05-04 11:03:27.409730745 +0300
- @@ -1708,7 +1708,7 @@
- struct buffer_head *bh, *head;
- unsigned int blocksize, bbits;
- int nr_underway = 0;
- - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
- + int write_op = wbc_to_write_cmd(wbc);
- head = create_page_buffers(page, inode,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- diff -Naur linux-4.4.6-gentoo-orig/fs/f2fs/data.c linux-4.4.6-gentoo-patched/fs/f2fs/data.c
- --- linux-4.4.6-gentoo-orig/fs/f2fs/data.c 2016-05-04 11:19:37.612649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/f2fs/data.c 2016-05-04 11:03:27.409730745 +0300
- @@ -1115,7 +1115,7 @@
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .type = DATA,
- - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
- + .rw = wbc_to_write_cmd(wbc),
- .page = page,
- .encrypted_page = NULL,
- };
- diff -Naur linux-4.4.6-gentoo-orig/fs/f2fs/node.c linux-4.4.6-gentoo-patched/fs/f2fs/node.c
- --- linux-4.4.6-gentoo-orig/fs/f2fs/node.c 2016-05-04 11:19:37.612649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/f2fs/node.c 2016-05-04 11:03:27.409730745 +0300
- @@ -1305,7 +1305,7 @@
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .type = NODE,
- - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
- + .rw = wbc_to_write_cmd(wbc),
- .page = page,
- .encrypted_page = NULL,
- };
- diff -Naur linux-4.4.6-gentoo-orig/fs/fs-writeback.c.orig linux-4.4.6-gentoo-patched/fs/fs-writeback.c.orig
- --- linux-4.4.6-gentoo-orig/fs/fs-writeback.c.orig 2016-05-04 11:19:37.613649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/fs-writeback.c.orig 1970-01-01 03:00:00.000000000 +0300
- @@ -1,2394 +0,0 @@
- -/*
- - * fs/fs-writeback.c
- - *
- - * Copyright (C) 2002, Linus Torvalds.
- - *
- - * Contains all the functions related to writing back and waiting
- - * upon dirty inodes against superblocks, and writing back dirty
- - * pages against inodes. ie: data writeback. Writeout of the
- - * inode itself is not handled here.
- - *
- - * 10Apr2002 Andrew Morton
- - * Split out of fs/inode.c
- - * Additions for address_space-based writeback
- - */
- -
- -#include <linux/kernel.h>
- -#include <linux/export.h>
- -#include <linux/spinlock.h>
- -#include <linux/slab.h>
- -#include <linux/sched.h>
- -#include <linux/fs.h>
- -#include <linux/mm.h>
- -#include <linux/pagemap.h>
- -#include <linux/kthread.h>
- -#include <linux/writeback.h>
- -#include <linux/blkdev.h>
- -#include <linux/backing-dev.h>
- -#include <linux/tracepoint.h>
- -#include <linux/device.h>
- -#include <linux/memcontrol.h>
- -#include "internal.h"
- -
- -/*
- - * 4MB minimal write chunk size
- - */
- -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
- -
- -struct wb_completion {
- - atomic_t cnt;
- -};
- -
- -/*
- - * Passed into wb_writeback(), essentially a subset of writeback_control
- - */
- -struct wb_writeback_work {
- - long nr_pages;
- - struct super_block *sb;
- - unsigned long *older_than_this;
- - enum writeback_sync_modes sync_mode;
- - unsigned int tagged_writepages:1;
- - unsigned int for_kupdate:1;
- - unsigned int range_cyclic:1;
- - unsigned int for_background:1;
- - unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
- - unsigned int auto_free:1; /* free on completion */
- - enum wb_reason reason; /* why was writeback initiated? */
- -
- - struct list_head list; /* pending work list */
- - struct wb_completion *done; /* set if the caller waits */
- -};
- -
- -/*
- - * If one wants to wait for one or more wb_writeback_works, each work's
- - * ->done should be set to a wb_completion defined using the following
- - * macro. Once all work items are issued with wb_queue_work(), the caller
- - * can wait for the completion of all using wb_wait_for_completion(). Work
- - * items which are waited upon aren't freed automatically on completion.
- - */
- -#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \
- - struct wb_completion cmpl = { \
- - .cnt = ATOMIC_INIT(1), \
- - }
- -
- -
- -/*
- - * If an inode is constantly having its pages dirtied, but then the
- - * updates stop dirtytime_expire_interval seconds in the past, it's
- - * possible for the worst case time between when an inode has its
- - * timestamps updated and when they finally get written out to be two
- - * dirtytime_expire_intervals. We set the default to 12 hours (in
- - * seconds), which means most of the time inodes will have their
- - * timestamps written to disk after 12 hours, but in the worst case a
- - * few inodes might not their timestamps updated for 24 hours.
- - */
- -unsigned int dirtytime_expire_interval = 12 * 60 * 60;
- -
- -static inline struct inode *wb_inode(struct list_head *head)
- -{
- - return list_entry(head, struct inode, i_io_list);
- -}
- -
- -/*
- - * Include the creation of the trace points after defining the
- - * wb_writeback_work structure and inline functions so that the definition
- - * remains local to this file.
- - */
- -#define CREATE_TRACE_POINTS
- -#include <trace/events/writeback.h>
- -
- -EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
- -
- -static bool wb_io_lists_populated(struct bdi_writeback *wb)
- -{
- - if (wb_has_dirty_io(wb)) {
- - return false;
- - } else {
- - set_bit(WB_has_dirty_io, &wb->state);
- - WARN_ON_ONCE(!wb->avg_write_bandwidth);
- - atomic_long_add(wb->avg_write_bandwidth,
- - &wb->bdi->tot_write_bandwidth);
- - return true;
- - }
- -}
- -
- -static void wb_io_lists_depopulated(struct bdi_writeback *wb)
- -{
- - if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
- - list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
- - clear_bit(WB_has_dirty_io, &wb->state);
- - WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
- - &wb->bdi->tot_write_bandwidth) < 0);
- - }
- -}
- -
- -/**
- - * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
- - * @inode: inode to be moved
- - * @wb: target bdi_writeback
- - * @head: one of @wb->b_{dirty|io|more_io}
- - *
- - * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
- - * Returns %true if @inode is the first occupant of the !dirty_time IO
- - * lists; otherwise, %false.
- - */
- -static bool inode_io_list_move_locked(struct inode *inode,
- - struct bdi_writeback *wb,
- - struct list_head *head)
- -{
- - assert_spin_locked(&wb->list_lock);
- -
- - list_move(&inode->i_io_list, head);
- -
- - /* dirty_time doesn't count as dirty_io until expiration */
- - if (head != &wb->b_dirty_time)
- - return wb_io_lists_populated(wb);
- -
- - wb_io_lists_depopulated(wb);
- - return false;
- -}
- -
- -/**
- - * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
- - * @inode: inode to be removed
- - * @wb: bdi_writeback @inode is being removed from
- - *
- - * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
- - * clear %WB_has_dirty_io if all are empty afterwards.
- - */
- -static void inode_io_list_del_locked(struct inode *inode,
- - struct bdi_writeback *wb)
- -{
- - assert_spin_locked(&wb->list_lock);
- -
- - list_del_init(&inode->i_io_list);
- - wb_io_lists_depopulated(wb);
- -}
- -
- -static void wb_wakeup(struct bdi_writeback *wb)
- -{
- - spin_lock_bh(&wb->work_lock);
- - if (test_bit(WB_registered, &wb->state))
- - mod_delayed_work(bdi_wq, &wb->dwork, 0);
- - spin_unlock_bh(&wb->work_lock);
- -}
- -
- -static void wb_queue_work(struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - trace_writeback_queue(wb, work);
- -
- - spin_lock_bh(&wb->work_lock);
- - if (!test_bit(WB_registered, &wb->state))
- - goto out_unlock;
- - if (work->done)
- - atomic_inc(&work->done->cnt);
- - list_add_tail(&work->list, &wb->work_list);
- - mod_delayed_work(bdi_wq, &wb->dwork, 0);
- -out_unlock:
- - spin_unlock_bh(&wb->work_lock);
- -}
- -
- -/**
- - * wb_wait_for_completion - wait for completion of bdi_writeback_works
- - * @bdi: bdi work items were issued to
- - * @done: target wb_completion
- - *
- - * Wait for one or more work items issued to @bdi with their ->done field
- - * set to @done, which should have been defined with
- - * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
- - * work items are completed. Work items which are waited upon aren't freed
- - * automatically on completion.
- - */
- -static void wb_wait_for_completion(struct backing_dev_info *bdi,
- - struct wb_completion *done)
- -{
- - atomic_dec(&done->cnt); /* put down the initial count */
- - wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
- -}
- -
- -#ifdef CONFIG_CGROUP_WRITEBACK
- -
- -/* parameters for foreign inode detection, see wb_detach_inode() */
- -#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */
- -#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */
- -#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */
- -#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */
- -
- -#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */
- -#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
- - /* each slot's duration is 2s / 16 */
- -#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
- - /* if foreign slots >= 8, switch */
- -#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
- - /* one round can affect upto 5 slots */
- -
- -static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
- -static struct workqueue_struct *isw_wq;
- -
- -void __inode_attach_wb(struct inode *inode, struct page *page)
- -{
- - struct backing_dev_info *bdi = inode_to_bdi(inode);
- - struct bdi_writeback *wb = NULL;
- -
- - if (inode_cgwb_enabled(inode)) {
- - struct cgroup_subsys_state *memcg_css;
- -
- - if (page) {
- - memcg_css = mem_cgroup_css_from_page(page);
- - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
- - } else {
- - /* must pin memcg_css, see wb_get_create() */
- - memcg_css = task_get_css(current, memory_cgrp_id);
- - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
- - css_put(memcg_css);
- - }
- - }
- -
- - if (!wb)
- - wb = &bdi->wb;
- -
- - /*
- - * There may be multiple instances of this function racing to
- - * update the same inode. Use cmpxchg() to tell the winner.
- - */
- - if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
- - wb_put(wb);
- -}
- -
- -/**
- - * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
- - * @inode: inode of interest with i_lock held
- - *
- - * Returns @inode's wb with its list_lock held. @inode->i_lock must be
- - * held on entry and is released on return. The returned wb is guaranteed
- - * to stay @inode's associated wb until its list_lock is released.
- - */
- -static struct bdi_writeback *
- -locked_inode_to_wb_and_lock_list(struct inode *inode)
- - __releases(&inode->i_lock)
- - __acquires(&wb->list_lock)
- -{
- - while (true) {
- - struct bdi_writeback *wb = inode_to_wb(inode);
- -
- - /*
- - * inode_to_wb() association is protected by both
- - * @inode->i_lock and @wb->list_lock but list_lock nests
- - * outside i_lock. Drop i_lock and verify that the
- - * association hasn't changed after acquiring list_lock.
- - */
- - wb_get(wb);
- - spin_unlock(&inode->i_lock);
- - spin_lock(&wb->list_lock);
- - wb_put(wb); /* not gonna deref it anymore */
- -
- - /* i_wb may have changed inbetween, can't use inode_to_wb() */
- - if (likely(wb == inode->i_wb))
- - return wb; /* @inode already has ref */
- -
- - spin_unlock(&wb->list_lock);
- - cpu_relax();
- - spin_lock(&inode->i_lock);
- - }
- -}
- -
- -/**
- - * inode_to_wb_and_lock_list - determine an inode's wb and lock it
- - * @inode: inode of interest
- - *
- - * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
- - * on entry.
- - */
- -static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
- - __acquires(&wb->list_lock)
- -{
- - spin_lock(&inode->i_lock);
- - return locked_inode_to_wb_and_lock_list(inode);
- -}
- -
- -struct inode_switch_wbs_context {
- - struct inode *inode;
- - struct bdi_writeback *new_wb;
- -
- - struct rcu_head rcu_head;
- - struct work_struct work;
- -};
- -
- -static void inode_switch_wbs_work_fn(struct work_struct *work)
- -{
- - struct inode_switch_wbs_context *isw =
- - container_of(work, struct inode_switch_wbs_context, work);
- - struct inode *inode = isw->inode;
- - struct address_space *mapping = inode->i_mapping;
- - struct bdi_writeback *old_wb = inode->i_wb;
- - struct bdi_writeback *new_wb = isw->new_wb;
- - struct radix_tree_iter iter;
- - bool switched = false;
- - void **slot;
- -
- - /*
- - * By the time control reaches here, RCU grace period has passed
- - * since I_WB_SWITCH assertion and all wb stat update transactions
- - * between unlocked_inode_to_wb_begin/end() are guaranteed to be
- - * synchronizing against mapping->tree_lock.
- - *
- - * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
- - * gives us exclusion against all wb related operations on @inode
- - * including IO list manipulations and stat updates.
- - */
- - if (old_wb < new_wb) {
- - spin_lock(&old_wb->list_lock);
- - spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
- - } else {
- - spin_lock(&new_wb->list_lock);
- - spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
- - }
- - spin_lock(&inode->i_lock);
- - spin_lock_irq(&mapping->tree_lock);
- -
- - /*
- - * Once I_FREEING is visible under i_lock, the eviction path owns
- - * the inode and we shouldn't modify ->i_io_list.
- - */
- - if (unlikely(inode->i_state & I_FREEING))
- - goto skip_switch;
- -
- - /*
- - * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
- - * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
- - * pages actually under writeback.
- - */
- - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
- - PAGECACHE_TAG_DIRTY) {
- - struct page *page = radix_tree_deref_slot_protected(slot,
- - &mapping->tree_lock);
- - if (likely(page) && PageDirty(page)) {
- - __dec_wb_stat(old_wb, WB_RECLAIMABLE);
- - __inc_wb_stat(new_wb, WB_RECLAIMABLE);
- - }
- - }
- -
- - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
- - PAGECACHE_TAG_WRITEBACK) {
- - struct page *page = radix_tree_deref_slot_protected(slot,
- - &mapping->tree_lock);
- - if (likely(page)) {
- - WARN_ON_ONCE(!PageWriteback(page));
- - __dec_wb_stat(old_wb, WB_WRITEBACK);
- - __inc_wb_stat(new_wb, WB_WRITEBACK);
- - }
- - }
- -
- - wb_get(new_wb);
- -
- - /*
- - * Transfer to @new_wb's IO list if necessary. The specific list
- - * @inode was on is ignored and the inode is put on ->b_dirty which
- - * is always correct including from ->b_dirty_time. The transfer
- - * preserves @inode->dirtied_when ordering.
- - */
- - if (!list_empty(&inode->i_io_list)) {
- - struct inode *pos;
- -
- - inode_io_list_del_locked(inode, old_wb);
- - inode->i_wb = new_wb;
- - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
- - if (time_after_eq(inode->dirtied_when,
- - pos->dirtied_when))
- - break;
- - inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
- - } else {
- - inode->i_wb = new_wb;
- - }
- -
- - /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
- - inode->i_wb_frn_winner = 0;
- - inode->i_wb_frn_avg_time = 0;
- - inode->i_wb_frn_history = 0;
- - switched = true;
- -skip_switch:
- - /*
- - * Paired with load_acquire in unlocked_inode_to_wb_begin() and
- - * ensures that the new wb is visible if they see !I_WB_SWITCH.
- - */
- - smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
- -
- - spin_unlock_irq(&mapping->tree_lock);
- - spin_unlock(&inode->i_lock);
- - spin_unlock(&new_wb->list_lock);
- - spin_unlock(&old_wb->list_lock);
- -
- - if (switched) {
- - wb_wakeup(new_wb);
- - wb_put(old_wb);
- - }
- - wb_put(new_wb);
- -
- - iput(inode);
- - kfree(isw);
- -
- - atomic_dec(&isw_nr_in_flight);
- -}
- -
- -static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
- -{
- - struct inode_switch_wbs_context *isw = container_of(rcu_head,
- - struct inode_switch_wbs_context, rcu_head);
- -
- - /* needs to grab bh-unsafe locks, bounce to work item */
- - INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- - queue_work(isw_wq, &isw->work);
- -}
- -
- -/**
- - * inode_switch_wbs - change the wb association of an inode
- - * @inode: target inode
- - * @new_wb_id: ID of the new wb
- - *
- - * Switch @inode's wb association to the wb identified by @new_wb_id. The
- - * switching is performed asynchronously and may fail silently.
- - */
- -static void inode_switch_wbs(struct inode *inode, int new_wb_id)
- -{
- - struct backing_dev_info *bdi = inode_to_bdi(inode);
- - struct cgroup_subsys_state *memcg_css;
- - struct inode_switch_wbs_context *isw;
- -
- - /* noop if seems to be already in progress */
- - if (inode->i_state & I_WB_SWITCH)
- - return;
- -
- - isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
- - if (!isw)
- - return;
- -
- - /* find and pin the new wb */
- - rcu_read_lock();
- - memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
- - if (memcg_css)
- - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
- - rcu_read_unlock();
- - if (!isw->new_wb)
- - goto out_free;
- -
- - /* while holding I_WB_SWITCH, no one else can update the association */
- - spin_lock(&inode->i_lock);
- - if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
- - inode->i_state & (I_WB_SWITCH | I_FREEING) ||
- - inode_to_wb(inode) == isw->new_wb) {
- - spin_unlock(&inode->i_lock);
- - goto out_free;
- - }
- - inode->i_state |= I_WB_SWITCH;
- - spin_unlock(&inode->i_lock);
- -
- - ihold(inode);
- - isw->inode = inode;
- -
- - atomic_inc(&isw_nr_in_flight);
- -
- - /*
- - * In addition to synchronizing among switchers, I_WB_SWITCH tells
- - * the RCU protected stat update paths to grab the mapping's
- - * tree_lock so that stat transfer can synchronize against them.
- - * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- - */
- - call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
- - return;
- -
- -out_free:
- - if (isw->new_wb)
- - wb_put(isw->new_wb);
- - kfree(isw);
- -}
- -
- -/**
- - * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
- - * @wbc: writeback_control of interest
- - * @inode: target inode
- - *
- - * @inode is locked and about to be written back under the control of @wbc.
- - * Record @inode's writeback context into @wbc and unlock the i_lock. On
- - * writeback completion, wbc_detach_inode() should be called. This is used
- - * to track the cgroup writeback context.
- - */
- -void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- - struct inode *inode)
- -{
- - if (!inode_cgwb_enabled(inode)) {
- - spin_unlock(&inode->i_lock);
- - return;
- - }
- -
- - wbc->wb = inode_to_wb(inode);
- - wbc->inode = inode;
- -
- - wbc->wb_id = wbc->wb->memcg_css->id;
- - wbc->wb_lcand_id = inode->i_wb_frn_winner;
- - wbc->wb_tcand_id = 0;
- - wbc->wb_bytes = 0;
- - wbc->wb_lcand_bytes = 0;
- - wbc->wb_tcand_bytes = 0;
- -
- - wb_get(wbc->wb);
- - spin_unlock(&inode->i_lock);
- -
- - /*
- - * A dying wb indicates that the memcg-blkcg mapping has changed
- - * and a new wb is already serving the memcg. Switch immediately.
- - */
- - if (unlikely(wb_dying(wbc->wb)))
- - inode_switch_wbs(inode, wbc->wb_id);
- -}
- -
- -/**
- - * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
- - * @wbc: writeback_control of the just finished writeback
- - *
- - * To be called after a writeback attempt of an inode finishes and undoes
- - * wbc_attach_and_unlock_inode(). Can be called under any context.
- - *
- - * As concurrent write sharing of an inode is expected to be very rare and
- - * memcg only tracks page ownership on first-use basis severely confining
- - * the usefulness of such sharing, cgroup writeback tracks ownership
- - * per-inode. While the support for concurrent write sharing of an inode
- - * is deemed unnecessary, an inode being written to by different cgroups at
- - * different points in time is a lot more common, and, more importantly,
- - * charging only by first-use can too readily lead to grossly incorrect
- - * behaviors (single foreign page can lead to gigabytes of writeback to be
- - * incorrectly attributed).
- - *
- - * To resolve this issue, cgroup writeback detects the majority dirtier of
- - * an inode and transfers the ownership to it. To avoid unnecessary
- - * oscillation, the detection mechanism keeps track of history and gives
- - * out the switch verdict only if the foreign usage pattern is stable over
- - * a certain amount of time and/or writeback attempts.
- - *
- - * On each writeback attempt, @wbc tries to detect the majority writer
- - * using Boyer-Moore majority vote algorithm. In addition to the byte
- - * count from the majority voting, it also counts the bytes written for the
- - * current wb and the last round's winner wb (max of last round's current
- - * wb, the winner from two rounds ago, and the last round's majority
- - * candidate). Keeping track of the historical winner helps the algorithm
- - * to semi-reliably detect the most active writer even when it's not the
- - * absolute majority.
- - *
- - * Once the winner of the round is determined, whether the winner is
- - * foreign or not and how much IO time the round consumed is recorded in
- - * inode->i_wb_frn_history. If the amount of recorded foreign IO time is
- - * over a certain threshold, the switch verdict is given.
- - */
- -void wbc_detach_inode(struct writeback_control *wbc)
- -{
- - struct bdi_writeback *wb = wbc->wb;
- - struct inode *inode = wbc->inode;
- - unsigned long avg_time, max_bytes, max_time;
- - u16 history;
- - int max_id;
- -
- - if (!wb)
- - return;
- -
- - history = inode->i_wb_frn_history;
- - avg_time = inode->i_wb_frn_avg_time;
- -
- - /* pick the winner of this round */
- - if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
- - wbc->wb_bytes >= wbc->wb_tcand_bytes) {
- - max_id = wbc->wb_id;
- - max_bytes = wbc->wb_bytes;
- - } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
- - max_id = wbc->wb_lcand_id;
- - max_bytes = wbc->wb_lcand_bytes;
- - } else {
- - max_id = wbc->wb_tcand_id;
- - max_bytes = wbc->wb_tcand_bytes;
- - }
- -
- - /*
- - * Calculate the amount of IO time the winner consumed and fold it
- - * into the running average kept per inode. If the consumed IO
- - * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
- - * deciding whether to switch or not. This is to prevent one-off
- - * small dirtiers from skewing the verdict.
- - */
- - max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
- - wb->avg_write_bandwidth);
- - if (avg_time)
- - avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
- - (avg_time >> WB_FRN_TIME_AVG_SHIFT);
- - else
- - avg_time = max_time; /* immediate catch up on first run */
- -
- - if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
- - int slots;
- -
- - /*
- - * The switch verdict is reached if foreign wb's consume
- - * more than a certain proportion of IO time in a
- - * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot
- - * history mask where each bit represents one sixteenth of
- - * the period. Determine the number of slots to shift into
- - * history from @max_time.
- - */
- - slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
- - (unsigned long)WB_FRN_HIST_MAX_SLOTS);
- - history <<= slots;
- - if (wbc->wb_id != max_id)
- - history |= (1U << slots) - 1;
- -
- - /*
- - * Switch if the current wb isn't the consistent winner.
- - * If there are multiple closely competing dirtiers, the
- - * inode may switch across them repeatedly over time, which
- - * is okay. The main goal is avoiding keeping an inode on
- - * the wrong wb for an extended period of time.
- - */
- - if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
- - inode_switch_wbs(inode, max_id);
- - }
- -
- - /*
- - * Multiple instances of this function may race to update the
- - * following fields but we don't mind occasional inaccuracies.
- - */
- - inode->i_wb_frn_winner = max_id;
- - inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
- - inode->i_wb_frn_history = history;
- -
- - wb_put(wbc->wb);
- - wbc->wb = NULL;
- -}
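- To make the verdict arithmetic above concrete: a small stand-alone sketch of the running average and 16-slot history update (constants mirror the WB_FRN_* defines, helper names are made up, and this only approximates the kernel logic):
- #include <stdbool.h>
- #include <stdint.h>
- #include <stdio.h>
-
- #define FRN_TIME_AVG_SHIFT      3       /* avg = avg * 7/8 + new * 1/8 */
- #define FRN_TIME_CUT_DIV        2       /* ignore rounds shorter than avg / 2 */
- #define FRN_HIST_THR_SLOTS      8       /* switch once > 8 of 16 slots are foreign */
- #define FRN_HIST_MAX_SLOTS      5       /* one round can affect at most 5 slots */
-
- /* fold one writeback round into the average and history; true means "switch" */
- static bool fold_round(uint16_t *history, unsigned long *avg,
-                        unsigned long round_time, int slots, bool foreign)
- {
-         if (*avg)
-                 *avg += (round_time >> FRN_TIME_AVG_SHIFT) -
-                         (*avg >> FRN_TIME_AVG_SHIFT);
-         else
-                 *avg = round_time;              /* catch up on the first round */
-
-         if (round_time < *avg / FRN_TIME_CUT_DIV)
-                 return false;                   /* one-off small dirtier, ignore */
-
-         if (slots > FRN_HIST_MAX_SLOTS)
-                 slots = FRN_HIST_MAX_SLOTS;
-         *history <<= slots;
-         if (foreign)
-                 *history |= (1U << slots) - 1;
-
-         return __builtin_popcount(*history) > FRN_HIST_THR_SLOTS;
- }
-
- int main(void)
- {
-         uint16_t history = 0;
-         unsigned long avg = 0;
-         int round;
-
-         /* a foreign cgroup keeps claiming 3 of the 16 slots every round */
-         for (round = 0; round < 5; round++) {
-                 bool switch_now = fold_round(&history, &avg, 1000, 3, true);
-
-                 printf("round %d: switch=%d history=%#x\n",
-                        round, switch_now, (unsigned int)history);
-         }
-         return 0;
- }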
- -
- -/**
- - * wbc_account_io - account IO issued during writeback
- - * @wbc: writeback_control of the writeback in progress
- - * @page: page being written out
- - * @bytes: number of bytes being written out
- - *
- - * @bytes from @page are about to be written out during the writeback
- - * controlled by @wbc. Keep the book for foreign inode detection. See
- - * wbc_detach_inode().
- - */
- -void wbc_account_io(struct writeback_control *wbc, struct page *page,
- - size_t bytes)
- -{
- - int id;
- -
- - /*
- - * pageout() path doesn't attach @wbc to the inode being written
- - * out. This is intentional as we don't want the function to block
- - * behind a slow cgroup. Ultimately, we want pageout() to kick off
- - * regular writeback instead of writing things out itself.
- - */
- - if (!wbc->wb)
- - return;
- -
- - rcu_read_lock();
- - id = mem_cgroup_css_from_page(page)->id;
- - rcu_read_unlock();
- -
- - if (id == wbc->wb_id) {
- - wbc->wb_bytes += bytes;
- - return;
- - }
- -
- - if (id == wbc->wb_lcand_id)
- - wbc->wb_lcand_bytes += bytes;
- -
- - /* Boyer-Moore majority vote algorithm */
- - if (!wbc->wb_tcand_bytes)
- - wbc->wb_tcand_id = id;
- - if (id == wbc->wb_tcand_id)
- - wbc->wb_tcand_bytes += bytes;
- - else
- - wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
- -}
- -EXPORT_SYMBOL_GPL(wbc_account_io);
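- The byte-weighted Boyer-Moore vote above is easy to exercise in isolation; a minimal user-space sketch (hypothetical struct and field names, not part of the patch):
- #include <stddef.h>
- #include <stdio.h>
-
- struct vote {
-         int     owner_id;       /* wb currently owning the inode */
-         size_t  owner_bytes;
-         int     cand_id;        /* running majority candidate */
-         size_t  cand_bytes;
- };
-
- static void account(struct vote *v, int id, size_t bytes)
- {
-         if (id == v->owner_id) {
-                 v->owner_bytes += bytes;
-                 return;
-         }
-         /* Boyer-Moore: a drained counter lets a new candidate take over */
-         if (!v->cand_bytes)
-                 v->cand_id = id;
-         if (id == v->cand_id)
-                 v->cand_bytes += bytes;
-         else
-                 v->cand_bytes -= bytes < v->cand_bytes ? bytes : v->cand_bytes;
- }
-
- int main(void)
- {
-         struct vote v = { .owner_id = 1 };
-
-         account(&v, 2, 4096);   /* foreign cgroup 2 writes one page */
-         account(&v, 3, 4096);   /* cgroup 3 cancels it out */
-         account(&v, 2, 8192);   /* cgroup 2 takes the candidacy back */
-         printf("candidate %d with %zu bytes\n", v.cand_id, v.cand_bytes);
-         return 0;
- }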
- -
- -/**
- - * inode_congested - test whether an inode is congested
- - * @inode: inode to test for congestion (may be NULL)
- - * @cong_bits: mask of WB_[a]sync_congested bits to test
- - *
- - * Tests whether @inode is congested. @cong_bits is the mask of congestion
- - * bits to test and the return value is the mask of set bits.
- - *
- - * If cgroup writeback is enabled for @inode, the congestion state is
- - * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
- - * associated with @inode is congested; otherwise, the root wb's congestion
- - * state is used.
- - *
- - * @inode is allowed to be NULL as this function is often called on
- - * mapping->host which is NULL for the swapper space.
- - */
- -int inode_congested(struct inode *inode, int cong_bits)
- -{
- - /*
- - * Once set, ->i_wb never becomes NULL while the inode is alive.
- - * Start transaction iff ->i_wb is visible.
- - */
- - if (inode && inode_to_wb_is_valid(inode)) {
- - struct bdi_writeback *wb;
- - bool locked, congested;
- -
- - wb = unlocked_inode_to_wb_begin(inode, &locked);
- - congested = wb_congested(wb, cong_bits);
- - unlocked_inode_to_wb_end(inode, locked);
- - return congested;
- - }
- -
- - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
- -}
- -EXPORT_SYMBOL_GPL(inode_congested);
- -
- -/**
- - * wb_split_bdi_pages - split nr_pages to write according to bandwidth
- - * @wb: target bdi_writeback to split @nr_pages to
- - * @nr_pages: number of pages to write for the whole bdi
- - *
- - * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
- - * relation to the total write bandwidth of all wb's w/ dirty inodes on
- - * @wb->bdi.
- - */
- -static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
- -{
- - unsigned long this_bw = wb->avg_write_bandwidth;
- - unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
- -
- - if (nr_pages == LONG_MAX)
- - return LONG_MAX;
- -
- - /*
- - * This may be called on clean wb's and proportional distribution
- - * may not make sense, just use the original @nr_pages in those
- - * cases. In general, we wanna err on the side of writing more.
- - */
- - if (!tot_bw || this_bw >= tot_bw)
- - return nr_pages;
- - else
- - return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
- -}
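- A quick stand-alone check of the proportional split above, with DIV_ROUND_UP_ULL written out as plain round-up division (the bandwidth numbers are made up):
- #include <stdio.h>
-
- static long split_pages(long nr_pages, unsigned long this_bw,
-                         unsigned long tot_bw)
- {
-         if (!tot_bw || this_bw >= tot_bw)
-                 return nr_pages;        /* err on the side of writing more */
-         return (long)(((unsigned long long)nr_pages * this_bw + tot_bw - 1) /
-                       tot_bw);
- }
-
- int main(void)
- {
-         /* a wb with a quarter of the bdi's bandwidth gets a quarter of the pages */
-         printf("%ld\n", split_pages(1024, 25, 100));    /* prints 256 */
-         return 0;
- }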
- -
- -/**
- - * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
- - * @bdi: target backing_dev_info
- - * @base_work: wb_writeback_work to issue
- - * @skip_if_busy: skip wb's which already have writeback in progress
- - *
- - * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
- - * have dirty inodes. If @base_work->nr_pages isn't %LONG_MAX, it's
- - * distributed to the busy wbs according to each wb's proportion in the
- - * total active write bandwidth of @bdi.
- - */
- -static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
- - struct wb_writeback_work *base_work,
- - bool skip_if_busy)
- -{
- - struct bdi_writeback *last_wb = NULL;
- - struct bdi_writeback *wb = list_entry(&bdi->wb_list,
- - struct bdi_writeback, bdi_node);
- -
- - might_sleep();
- -restart:
- - rcu_read_lock();
- - list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
- - DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
- - struct wb_writeback_work fallback_work;
- - struct wb_writeback_work *work;
- - long nr_pages;
- -
- - if (last_wb) {
- - wb_put(last_wb);
- - last_wb = NULL;
- - }
- -
- - /* SYNC_ALL writes out I_DIRTY_TIME too */
- - if (!wb_has_dirty_io(wb) &&
- - (base_work->sync_mode == WB_SYNC_NONE ||
- - list_empty(&wb->b_dirty_time)))
- - continue;
- - if (skip_if_busy && writeback_in_progress(wb))
- - continue;
- -
- - nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
- -
- - work = kmalloc(sizeof(*work), GFP_ATOMIC);
- - if (work) {
- - *work = *base_work;
- - work->nr_pages = nr_pages;
- - work->auto_free = 1;
- - wb_queue_work(wb, work);
- - continue;
- - }
- -
- - /* alloc failed, execute synchronously using on-stack fallback */
- - work = &fallback_work;
- - *work = *base_work;
- - work->nr_pages = nr_pages;
- - work->auto_free = 0;
- - work->done = &fallback_work_done;
- -
- - wb_queue_work(wb, work);
- -
- - /*
- - * Pin @wb so that it stays on @bdi->wb_list. This allows
- - * continuing iteration from @wb after dropping and
- - * regrabbing rcu read lock.
- - */
- - wb_get(wb);
- - last_wb = wb;
- -
- - rcu_read_unlock();
- - wb_wait_for_completion(bdi, &fallback_work_done);
- - goto restart;
- - }
- - rcu_read_unlock();
- -
- - if (last_wb)
- - wb_put(last_wb);
- -}
- -
- -/**
- - * cgroup_writeback_umount - flush inode wb switches for umount
- - *
- - * This function is called when a super_block is about to be destroyed and
- - * flushes in-flight inode wb switches. An inode wb switch goes through
- - * RCU and then workqueue, so the two need to be flushed in order to ensure
- - * that all previously scheduled switches are finished. As wb switches are
- - * rare occurrences and synchronize_rcu() can take a while, perform
- - * flushing iff wb switches are in flight.
- - */
- -void cgroup_writeback_umount(void)
- -{
- - if (atomic_read(&isw_nr_in_flight)) {
- - synchronize_rcu();
- - flush_workqueue(isw_wq);
- - }
- -}
- -
- -static int __init cgroup_writeback_init(void)
- -{
- - isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
- - if (!isw_wq)
- - return -ENOMEM;
- - return 0;
- -}
- -fs_initcall(cgroup_writeback_init);
- -
- -#else /* CONFIG_CGROUP_WRITEBACK */
- -
- -static struct bdi_writeback *
- -locked_inode_to_wb_and_lock_list(struct inode *inode)
- - __releases(&inode->i_lock)
- - __acquires(&wb->list_lock)
- -{
- - struct bdi_writeback *wb = inode_to_wb(inode);
- -
- - spin_unlock(&inode->i_lock);
- - spin_lock(&wb->list_lock);
- - return wb;
- -}
- -
- -static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
- - __acquires(&wb->list_lock)
- -{
- - struct bdi_writeback *wb = inode_to_wb(inode);
- -
- - spin_lock(&wb->list_lock);
- - return wb;
- -}
- -
- -static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
- -{
- - return nr_pages;
- -}
- -
- -static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
- - struct wb_writeback_work *base_work,
- - bool skip_if_busy)
- -{
- - might_sleep();
- -
- - if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
- - base_work->auto_free = 0;
- - wb_queue_work(&bdi->wb, base_work);
- - }
- -}
- -
- -#endif /* CONFIG_CGROUP_WRITEBACK */
- -
- -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
- - bool range_cyclic, enum wb_reason reason)
- -{
- - struct wb_writeback_work *work;
- -
- - if (!wb_has_dirty_io(wb))
- - return;
- -
- - /*
- - * This is WB_SYNC_NONE writeback, so if allocation fails just
- - * wakeup the thread for old dirty data writeback
- - */
- - work = kzalloc(sizeof(*work), GFP_ATOMIC);
- - if (!work) {
- - trace_writeback_nowork(wb);
- - wb_wakeup(wb);
- - return;
- - }
- -
- - work->sync_mode = WB_SYNC_NONE;
- - work->nr_pages = nr_pages;
- - work->range_cyclic = range_cyclic;
- - work->reason = reason;
- - work->auto_free = 1;
- -
- - wb_queue_work(wb, work);
- -}
- -
- -/**
- - * wb_start_background_writeback - start background writeback
- - * @wb: bdi_writback to write from
- - *
- - * Description:
- - * This makes sure WB_SYNC_NONE background writeback happens. When
- - * this function returns, it is only guaranteed that for given wb
- - * some IO is happening if we are over background dirty threshold.
- - * Caller need not hold sb s_umount semaphore.
- - */
- -void wb_start_background_writeback(struct bdi_writeback *wb)
- -{
- - /*
- - * We just wake up the flusher thread. It will perform background
- - * writeback as soon as there is no other work to do.
- - */
- - trace_writeback_wake_background(wb);
- - wb_wakeup(wb);
- -}
- -
- -/*
- - * Remove the inode from the writeback list it is on.
- - */
- -void inode_io_list_del(struct inode *inode)
- -{
- - struct bdi_writeback *wb;
- -
- - wb = inode_to_wb_and_lock_list(inode);
- - inode_io_list_del_locked(inode, wb);
- - spin_unlock(&wb->list_lock);
- -}
- -
- -/*
- - * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
- - * furthest end of its superblock's dirty-inode list.
- - *
- - * Before stamping the inode's ->dirtied_when, we check to see whether it is
- - * already the most-recently-dirtied inode on the b_dirty list. If that is
- - * the case then the inode must have been redirtied while it was being written
- - * out and we don't reset its dirtied_when.
- - */
- -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
- -{
- - if (!list_empty(&wb->b_dirty)) {
- - struct inode *tail;
- -
- - tail = wb_inode(wb->b_dirty.next);
- - if (time_before(inode->dirtied_when, tail->dirtied_when))
- - inode->dirtied_when = jiffies;
- - }
- - inode_io_list_move_locked(inode, wb, &wb->b_dirty);
- -}
- -
- -/*
- - * requeue inode for re-scanning after bdi->b_io list is exhausted.
- - */
- -static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
- -{
- - inode_io_list_move_locked(inode, wb, &wb->b_more_io);
- -}
- -
- -static void inode_sync_complete(struct inode *inode)
- -{
- - inode->i_state &= ~I_SYNC;
- - /* If inode is clean and unused, put it into LRU now... */
- - inode_add_lru(inode);
- - /* Waiters must see I_SYNC cleared before being woken up */
- - smp_mb();
- - wake_up_bit(&inode->i_state, __I_SYNC);
- -}
- -
- -static bool inode_dirtied_after(struct inode *inode, unsigned long t)
- -{
- - bool ret = time_after(inode->dirtied_when, t);
- -#ifndef CONFIG_64BIT
- - /*
- - * For inodes being constantly redirtied, dirtied_when can get stuck.
- - * It _appears_ to be in the future, but is actually in distant past.
- - * This test is necessary to prevent such wrapped-around relative times
- - * from permanently stopping the whole bdi writeback.
- - */
- - ret = ret && time_before_eq(inode->dirtied_when, jiffies);
- -#endif
- - return ret;
- -}
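- The wraparound the #ifndef CONFIG_64BIT clause above guards against is easy to reproduce in user space with the usual time_after()/time_before_eq() definitions (re-implemented here for 32-bit jiffies; illustrative only):
- #include <stdint.h>
- #include <stdio.h>
-
- #define time_after(a, b)        ((int32_t)((b) - (a)) < 0)
- #define time_before_eq(a, b)    ((int32_t)((a) - (b)) <= 0)
-
- int main(void)
- {
-         uint32_t jiffies = 0x10000000;
-         uint32_t dirtied = jiffies - 0x90000000;        /* dirtied ages ago, wrapped */
-
-         /* the naive check mistakes the ancient timestamp for the future */
-         printf("naive:   %d\n", time_after(dirtied, jiffies));
-         /* the extra clause filters the wrapped value out */
-         printf("guarded: %d\n", time_after(dirtied, jiffies) &&
-                                 time_before_eq(dirtied, jiffies));
-         return 0;
- }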
- -
- -#define EXPIRE_DIRTY_ATIME 0x0001
- -
- -/*
- - * Move expired (dirtied before work->older_than_this) dirty inodes from
- - * @delaying_queue to @dispatch_queue.
- - */
- -static int move_expired_inodes(struct list_head *delaying_queue,
- - struct list_head *dispatch_queue,
- - int flags,
- - struct wb_writeback_work *work)
- -{
- - unsigned long *older_than_this = NULL;
- - unsigned long expire_time;
- - LIST_HEAD(tmp);
- - struct list_head *pos, *node;
- - struct super_block *sb = NULL;
- - struct inode *inode;
- - int do_sb_sort = 0;
- - int moved = 0;
- -
- - if ((flags & EXPIRE_DIRTY_ATIME) == 0)
- - older_than_this = work->older_than_this;
- - else if (!work->for_sync) {
- - expire_time = jiffies - (dirtytime_expire_interval * HZ);
- - older_than_this = &expire_time;
- - }
- - while (!list_empty(delaying_queue)) {
- - inode = wb_inode(delaying_queue->prev);
- - if (older_than_this &&
- - inode_dirtied_after(inode, *older_than_this))
- - break;
- - list_move(&inode->i_io_list, &tmp);
- - moved++;
- - if (flags & EXPIRE_DIRTY_ATIME)
- - set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
- - if (sb_is_blkdev_sb(inode->i_sb))
- - continue;
- - if (sb && sb != inode->i_sb)
- - do_sb_sort = 1;
- - sb = inode->i_sb;
- - }
- -
- - /* just one sb in list, splice to dispatch_queue and we're done */
- - if (!do_sb_sort) {
- - list_splice(&tmp, dispatch_queue);
- - goto out;
- - }
- -
- - /* Move inodes from one superblock together */
- - while (!list_empty(&tmp)) {
- - sb = wb_inode(tmp.prev)->i_sb;
- - list_for_each_prev_safe(pos, node, &tmp) {
- - inode = wb_inode(pos);
- - if (inode->i_sb == sb)
- - list_move(&inode->i_io_list, dispatch_queue);
- - }
- - }
- -out:
- - return moved;
- -}
- -
- -/*
- - * Queue all expired dirty inodes for io, eldest first.
- - * Before
- - *         newly dirtied     b_dirty    b_io    b_more_io
- - *         =============>    gf         edc     BA
- - * After
- - *         newly dirtied     b_dirty    b_io    b_more_io
- - *         =============>    g          fBAedc
- - *                                           |
- - *                                           +--> dequeue for IO
- - */
- -static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
- -{
- - int moved;
- -
- - assert_spin_locked(&wb->list_lock);
- - list_splice_init(&wb->b_more_io, &wb->b_io);
- - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
- - moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
- - EXPIRE_DIRTY_ATIME, work);
- - if (moved)
- - wb_io_lists_populated(wb);
- - trace_writeback_queue_io(wb, work, moved);
- -}
- -
- -static int write_inode(struct inode *inode, struct writeback_control *wbc)
- -{
- - int ret;
- -
- - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
- - trace_writeback_write_inode_start(inode, wbc);
- - ret = inode->i_sb->s_op->write_inode(inode, wbc);
- - trace_writeback_write_inode(inode, wbc);
- - return ret;
- - }
- - return 0;
- -}
- -
- -/*
- - * Wait for writeback on an inode to complete. Called with i_lock held.
- - * Caller must make sure inode cannot go away when we drop i_lock.
- - */
- -static void __inode_wait_for_writeback(struct inode *inode)
- - __releases(inode->i_lock)
- - __acquires(inode->i_lock)
- -{
- - DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
- - wait_queue_head_t *wqh;
- -
- - wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
- - while (inode->i_state & I_SYNC) {
- - spin_unlock(&inode->i_lock);
- - __wait_on_bit(wqh, &wq, bit_wait,
- - TASK_UNINTERRUPTIBLE);
- - spin_lock(&inode->i_lock);
- - }
- -}
- -
- -/*
- - * Wait for writeback on an inode to complete. Caller must have inode pinned.
- - */
- -void inode_wait_for_writeback(struct inode *inode)
- -{
- - spin_lock(&inode->i_lock);
- - __inode_wait_for_writeback(inode);
- - spin_unlock(&inode->i_lock);
- -}
- -
- -/*
- - * Sleep until I_SYNC is cleared. This function must be called with i_lock
- - * held and drops it. It is aimed for callers not holding any inode reference
- - * so once i_lock is dropped, inode can go away.
- - */
- -static void inode_sleep_on_writeback(struct inode *inode)
- - __releases(inode->i_lock)
- -{
- - DEFINE_WAIT(wait);
- - wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
- - int sleep;
- -
- - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- - sleep = inode->i_state & I_SYNC;
- - spin_unlock(&inode->i_lock);
- - if (sleep)
- - schedule();
- - finish_wait(wqh, &wait);
- -}
- -
- -/*
- - * Find proper writeback list for the inode depending on its current state and
- - * possibly also change of its state while we were doing writeback. Here we
- - * handle things such as livelock prevention or fairness of writeback among
- - * inodes. This function can be called only by the flusher thread - no one else
- - * processes all inodes in writeback lists and requeueing inodes behind flusher
- - * thread's back can have unexpected consequences.
- - */
- -static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
- - struct writeback_control *wbc)
- -{
- - if (inode->i_state & I_FREEING)
- - return;
- -
- - /*
- - * Sync livelock prevention. Each inode is tagged and synced in one
- - * shot. If still dirty, it will be redirty_tail()'ed below. Update
- - * the dirty time to prevent enqueue and sync it again.
- - */
- - if ((inode->i_state & I_DIRTY) &&
- - (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
- - inode->dirtied_when = jiffies;
- -
- - if (wbc->pages_skipped) {
- - /*
- - * writeback is not making progress due to locked
- - * buffers. Skip this inode for now.
- - */
- - redirty_tail(inode, wb);
- - return;
- - }
- -
- - if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
- - /*
- - * We didn't write back all the pages. nfs_writepages()
- - * sometimes bails out without doing anything.
- - */
- - if (wbc->nr_to_write <= 0) {
- - /* Slice used up. Queue for next turn. */
- - requeue_io(inode, wb);
- - } else {
- - /*
- - * Writeback blocked by something other than
- - * congestion. Delay the inode for some time to
- - * avoid spinning on the CPU (100% iowait)
- - * retrying writeback of the dirty page/inode
- - * that cannot be performed immediately.
- - */
- - redirty_tail(inode, wb);
- - }
- - } else if (inode->i_state & I_DIRTY) {
- - /*
- - * Filesystems can dirty the inode during writeback operations,
- - * such as delayed allocation during submission or metadata
- - * updates after data IO completion.
- - */
- - redirty_tail(inode, wb);
- - } else if (inode->i_state & I_DIRTY_TIME) {
- - inode->dirtied_when = jiffies;
- - inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
- - } else {
- - /* The inode is clean. Remove from writeback lists. */
- - inode_io_list_del_locked(inode, wb);
- - }
- -}
- -
- -/*
- - * Write out an inode and its dirty pages. Do not update the writeback list
- - * linkage. That is left to the caller. The caller is also responsible for
- - * setting I_SYNC flag and calling inode_sync_complete() to clear it.
- - */
- -static int
- -__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
- -{
- - struct address_space *mapping = inode->i_mapping;
- - long nr_to_write = wbc->nr_to_write;
- - unsigned dirty;
- - int ret;
- -
- - WARN_ON(!(inode->i_state & I_SYNC));
- -
- - trace_writeback_single_inode_start(inode, wbc, nr_to_write);
- -
- - ret = do_writepages(mapping, wbc);
- -
- - /*
- - * Make sure to wait on the data before writing out the metadata.
- - * This is important for filesystems that modify metadata on data
- - * I/O completion. We don't do it for sync(2) writeback because it has a
- - * separate, external IO completion path and ->sync_fs for guaranteeing
- - * inode metadata is written back correctly.
- - */
- - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
- - int err = filemap_fdatawait(mapping);
- - if (ret == 0)
- - ret = err;
- - }
- -
- - /*
- - * Some filesystems may redirty the inode during the writeback
- - * due to delalloc, clear dirty metadata flags right before
- - * write_inode()
- - */
- - spin_lock(&inode->i_lock);
- -
- - dirty = inode->i_state & I_DIRTY;
- - if (inode->i_state & I_DIRTY_TIME) {
- - if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
- - unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
- - unlikely(time_after(jiffies,
- - (inode->dirtied_time_when +
- - dirtytime_expire_interval * HZ)))) {
- - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
- - trace_writeback_lazytime(inode);
- - }
- - } else
- - inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
- - inode->i_state &= ~dirty;
- -
- - /*
- - * Paired with smp_mb() in __mark_inode_dirty(). This allows
- - * __mark_inode_dirty() to test i_state without grabbing i_lock -
- - * either they see the I_DIRTY bits cleared or we see the dirtied
- - * inode.
- - *
- - * I_DIRTY_PAGES is always cleared together above even if @mapping
- - * still has dirty pages. The flag is reinstated after smp_mb() if
- - * necessary. This guarantees that either __mark_inode_dirty()
- - * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
- - */
- - smp_mb();
- -
- - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- - inode->i_state |= I_DIRTY_PAGES;
- -
- - spin_unlock(&inode->i_lock);
- -
- - if (dirty & I_DIRTY_TIME)
- - mark_inode_dirty_sync(inode);
- - /* Don't write the inode if only I_DIRTY_PAGES was set */
- - if (dirty & ~I_DIRTY_PAGES) {
- - int err = write_inode(inode, wbc);
- - if (ret == 0)
- - ret = err;
- - }
- - trace_writeback_single_inode(inode, wbc, nr_to_write);
- - return ret;
- -}
- -
- -/*
- - * Write out an inode's dirty pages. Either the caller has an active reference
- - * on the inode or the inode has I_WILL_FREE set.
- - *
- - * This function is designed to be called for writing back one inode which
- - * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
- - * and does more profound writeback list handling in writeback_sb_inodes().
- - */
- -static int
- -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
- - struct writeback_control *wbc)
- -{
- - int ret = 0;
- -
- - spin_lock(&inode->i_lock);
- - if (!atomic_read(&inode->i_count))
- - WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
- - else
- - WARN_ON(inode->i_state & I_WILL_FREE);
- -
- - if (inode->i_state & I_SYNC) {
- - if (wbc->sync_mode != WB_SYNC_ALL)
- - goto out;
- - /*
- - * It's a data-integrity sync. We must wait. Since callers hold
- - * inode reference or inode has I_WILL_FREE set, it cannot go
- - * away under us.
- - */
- - __inode_wait_for_writeback(inode);
- - }
- - WARN_ON(inode->i_state & I_SYNC);
- - /*
- - * Skip inode if it is clean and we have no outstanding writeback in
- - * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
- - * function since flusher thread may be doing for example sync in
- - * parallel and if we move the inode, it could get skipped. So here we
- - * make sure inode is on some writeback list and leave it there unless
- - * we have completely cleaned the inode.
- - */
- - if (!(inode->i_state & I_DIRTY_ALL) &&
- - (wbc->sync_mode != WB_SYNC_ALL ||
- - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
- - goto out;
- - inode->i_state |= I_SYNC;
- - wbc_attach_and_unlock_inode(wbc, inode);
- -
- - ret = __writeback_single_inode(inode, wbc);
- -
- - wbc_detach_inode(wbc);
- - spin_lock(&wb->list_lock);
- - spin_lock(&inode->i_lock);
- - /*
- - * If inode is clean, remove it from writeback lists. Otherwise don't
- - * touch it. See comment above for explanation.
- - */
- - if (!(inode->i_state & I_DIRTY_ALL))
- - inode_io_list_del_locked(inode, wb);
- - spin_unlock(&wb->list_lock);
- - inode_sync_complete(inode);
- -out:
- - spin_unlock(&inode->i_lock);
- - return ret;
- -}
- -
- -static long writeback_chunk_size(struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - long pages;
- -
- - /*
- - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
- - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
- - * here avoids calling into writeback_inodes_wb() more than once.
- - *
- - * The intended call sequence for WB_SYNC_ALL writeback is:
- - *
- - * wb_writeback()
- - * writeback_sb_inodes() <== called only once
- - * write_cache_pages() <== called once for each inode
- - * (quickly) tag currently dirty pages
- - * (maybe slowly) sync all tagged pages
- - */
- - if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
- - pages = LONG_MAX;
- - else {
- - pages = min(wb->avg_write_bandwidth / 2,
- - global_wb_domain.dirty_limit / DIRTY_SCOPE);
- - pages = min(pages, work->nr_pages);
- - pages = round_down(pages + MIN_WRITEBACK_PAGES,
- - MIN_WRITEBACK_PAGES);
- - }
- -
- - return pages;
- -}
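- As a rough illustration of the chunk sizing above: a user-space sketch with made-up stand-ins for MIN_WRITEBACK_PAGES, the bandwidth, and the dirty limit (DIRTY_SCOPE assumed to be 8 here):
- #include <stdio.h>
-
- #define MIN_WRITEBACK_PAGES     2048L   /* placeholder, not the kernel value */
-
- static long min_l(long a, long b) { return a < b ? a : b; }
-
- static long chunk_size(long avg_write_bw, long dirty_limit, long work_pages)
- {
-         long pages = min_l(avg_write_bw / 2, dirty_limit / 8 /* DIRTY_SCOPE */);
-
-         pages = min_l(pages, work_pages);
-         /* round down to a multiple of MIN_WRITEBACK_PAGES, but never below it */
-         return (pages + MIN_WRITEBACK_PAGES) / MIN_WRITEBACK_PAGES *
-                MIN_WRITEBACK_PAGES;
- }
-
- int main(void)
- {
-         printf("%ld\n", chunk_size(20000, 100000, 1000000));   /* prints 10240 */
-         return 0;
- }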
- -
- -/*
- - * Write a portion of b_io inodes which belong to @sb.
- - *
- - * Return the number of pages and/or inodes written.
- - *
- - * NOTE! This is called with wb->list_lock held, and will
- - * unlock and relock that for each inode it ends up doing
- - * IO for.
- - */
- -static long writeback_sb_inodes(struct super_block *sb,
- - struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - struct writeback_control wbc = {
- - .sync_mode = work->sync_mode,
- - .tagged_writepages = work->tagged_writepages,
- - .for_kupdate = work->for_kupdate,
- - .for_background = work->for_background,
- - .for_sync = work->for_sync,
- - .range_cyclic = work->range_cyclic,
- - .range_start = 0,
- - .range_end = LLONG_MAX,
- - };
- - unsigned long start_time = jiffies;
- - long write_chunk;
- - long wrote = 0; /* count both pages and inodes */
- -
- - while (!list_empty(&wb->b_io)) {
- - struct inode *inode = wb_inode(wb->b_io.prev);
- -
- - if (inode->i_sb != sb) {
- - if (work->sb) {
- - /*
- - * We only want to write back data for this
- - * superblock, move all inodes not belonging
- - * to it back onto the dirty list.
- - */
- - redirty_tail(inode, wb);
- - continue;
- - }
- -
- - /*
- - * The inode belongs to a different superblock.
- - * Bounce back to the caller to unpin this and
- - * pin the next superblock.
- - */
- - break;
- - }
- -
- - /*
- - * Don't bother with new inodes or inodes being freed, first
- - * kind does not need periodic writeout yet, and for the latter
- - * kind writeout is handled by the freer.
- - */
- - spin_lock(&inode->i_lock);
- - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
- - spin_unlock(&inode->i_lock);
- - redirty_tail(inode, wb);
- - continue;
- - }
- - if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
- - /*
- - * If this inode is locked for writeback and we are not
- - * doing writeback-for-data-integrity, move it to
- - * b_more_io so that writeback can proceed with the
- - * other inodes on s_io.
- - *
- - * We'll have another go at writing back this inode
- - * when we completed a full scan of b_io.
- - */
- - spin_unlock(&inode->i_lock);
- - requeue_io(inode, wb);
- - trace_writeback_sb_inodes_requeue(inode);
- - continue;
- - }
- - spin_unlock(&wb->list_lock);
- -
- - /*
- - * We already requeued the inode if it had I_SYNC set and we
- - * are doing WB_SYNC_NONE writeback. So this catches only the
- - * WB_SYNC_ALL case.
- - */
- - if (inode->i_state & I_SYNC) {
- - /* Wait for I_SYNC. This function drops i_lock... */
- - inode_sleep_on_writeback(inode);
- - /* Inode may be gone, start again */
- - spin_lock(&wb->list_lock);
- - continue;
- - }
- - inode->i_state |= I_SYNC;
- - wbc_attach_and_unlock_inode(&wbc, inode);
- -
- - write_chunk = writeback_chunk_size(wb, work);
- - wbc.nr_to_write = write_chunk;
- - wbc.pages_skipped = 0;
- -
- - /*
- - * We use I_SYNC to pin the inode in memory. While it is set
- - * evict_inode() will wait so the inode cannot be freed.
- - */
- - __writeback_single_inode(inode, &wbc);
- -
- - wbc_detach_inode(&wbc);
- - work->nr_pages -= write_chunk - wbc.nr_to_write;
- - wrote += write_chunk - wbc.nr_to_write;
- -
- - if (need_resched()) {
- - /*
- - * We're trying to balance between building up a nice
- - * long list of IOs to improve our merge rate, and
- - * getting those IOs out quickly for anyone throttling
- - * in balance_dirty_pages(). cond_resched() doesn't
- - * unplug, so get our IOs out the door before we
- - * give up the CPU.
- - */
- - blk_flush_plug(current);
- - cond_resched();
- - }
- -
- -
- - spin_lock(&wb->list_lock);
- - spin_lock(&inode->i_lock);
- - if (!(inode->i_state & I_DIRTY_ALL))
- - wrote++;
- - requeue_inode(inode, wb, &wbc);
- - inode_sync_complete(inode);
- - spin_unlock(&inode->i_lock);
- -
- - /*
- - * bail out to wb_writeback() often enough to check
- - * background threshold and other termination conditions.
- - */
- - if (wrote) {
- - if (time_is_before_jiffies(start_time + HZ / 10UL))
- - break;
- - if (work->nr_pages <= 0)
- - break;
- - }
- - }
- - return wrote;
- -}
- -
- -static long __writeback_inodes_wb(struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - unsigned long start_time = jiffies;
- - long wrote = 0;
- -
- - while (!list_empty(&wb->b_io)) {
- - struct inode *inode = wb_inode(wb->b_io.prev);
- - struct super_block *sb = inode->i_sb;
- -
- - if (!trylock_super(sb)) {
- - /*
- - * trylock_super() may fail consistently due to
- - * s_umount being grabbed by someone else. Don't use
- - * requeue_io() to avoid busy retrying the inode/sb.
- - */
- - redirty_tail(inode, wb);
- - continue;
- - }
- - wrote += writeback_sb_inodes(sb, wb, work);
- - up_read(&sb->s_umount);
- -
- - /* refer to the same tests at the end of writeback_sb_inodes */
- - if (wrote) {
- - if (time_is_before_jiffies(start_time + HZ / 10UL))
- - break;
- - if (work->nr_pages <= 0)
- - break;
- - }
- - }
- - /* Leave any unwritten inodes on b_io */
- - return wrote;
- -}
- -
- -static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
- - enum wb_reason reason)
- -{
- - struct wb_writeback_work work = {
- - .nr_pages = nr_pages,
- - .sync_mode = WB_SYNC_NONE,
- - .range_cyclic = 1,
- - .reason = reason,
- - };
- - struct blk_plug plug;
- -
- - blk_start_plug(&plug);
- - spin_lock(&wb->list_lock);
- - if (list_empty(&wb->b_io))
- - queue_io(wb, &work);
- - __writeback_inodes_wb(wb, &work);
- - spin_unlock(&wb->list_lock);
- - blk_finish_plug(&plug);
- -
- - return nr_pages - work.nr_pages;
- -}
- -
- -/*
- - * Explicit flushing or periodic writeback of "old" data.
- - *
- - * Define "old": the first time one of an inode's pages is dirtied, we mark the
- - * dirtying-time in the inode's address_space. So this periodic writeback code
- - * just walks the superblock inode list, writing back any inodes which are
- - * older than a specific point in time.
- - *
- - * Try to run once per dirty_writeback_interval. But if a writeback event
- - * takes longer than a dirty_writeback_interval interval, then leave a
- - * one-second gap.
- - *
- - * older_than_this takes precedence over nr_to_write. So we'll only write back
- - * all dirty pages if they are all attached to "old" mappings.
- - */
- -static long wb_writeback(struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - unsigned long wb_start = jiffies;
- - long nr_pages = work->nr_pages;
- - unsigned long oldest_jif;
- - struct inode *inode;
- - long progress;
- - struct blk_plug plug;
- -
- - oldest_jif = jiffies;
- - work->older_than_this = &oldest_jif;
- -
- - blk_start_plug(&plug);
- - spin_lock(&wb->list_lock);
- - for (;;) {
- - /*
- - * Stop writeback when nr_pages has been consumed
- - */
- - if (work->nr_pages <= 0)
- - break;
- -
- - /*
- - * Background writeout and kupdate-style writeback may
- - * run forever. Stop them if there is other work to do
- - * so that e.g. sync can proceed. They'll be restarted
- - * after the other works are all done.
- - */
- - if ((work->for_background || work->for_kupdate) &&
- - !list_empty(&wb->work_list))
- - break;
- -
- - /*
- - * For background writeout, stop when we are below the
- - * background dirty threshold
- - */
- - if (work->for_background && !wb_over_bg_thresh(wb))
- - break;
- -
- - /*
- - * Kupdate and background works are special and we want to
- - * include all inodes that need writing. Livelock avoidance is
- - * handled by these works yielding to any other work so we are
- - * safe.
- - */
- - if (work->for_kupdate) {
- - oldest_jif = jiffies -
- - msecs_to_jiffies(dirty_expire_interval * 10);
- - } else if (work->for_background)
- - oldest_jif = jiffies;
- -
- - trace_writeback_start(wb, work);
- - if (list_empty(&wb->b_io))
- - queue_io(wb, work);
- - if (work->sb)
- - progress = writeback_sb_inodes(work->sb, wb, work);
- - else
- - progress = __writeback_inodes_wb(wb, work);
- - trace_writeback_written(wb, work);
- -
- - wb_update_bandwidth(wb, wb_start);
- -
- - /*
- - * Did we write something? Try for more
- - *
- - * Dirty inodes are moved to b_io for writeback in batches.
- - * The completion of the current batch does not necessarily
- - * mean the overall work is done. So we keep looping as long
- - * as made some progress on cleaning pages or inodes.
- - */
- - if (progress)
- - continue;
- - /*
- - * No more inodes for IO, bail
- - */
- - if (list_empty(&wb->b_more_io))
- - break;
- - /*
- - * Nothing written. Wait for some inode to
- - * become available for writeback. Otherwise
- - * we'll just busyloop.
- - */
- - if (!list_empty(&wb->b_more_io)) {
- - trace_writeback_wait(wb, work);
- - inode = wb_inode(wb->b_more_io.prev);
- - spin_lock(&inode->i_lock);
- - spin_unlock(&wb->list_lock);
- - /* This function drops i_lock... */
- - inode_sleep_on_writeback(inode);
- - spin_lock(&wb->list_lock);
- - }
- - }
- - spin_unlock(&wb->list_lock);
- - blk_finish_plug(&plug);
- -
- - return nr_pages - work->nr_pages;
- -}
- -
- -/*
- - * Return the next wb_writeback_work struct that hasn't been processed yet.
- - */
- -static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
- -{
- - struct wb_writeback_work *work = NULL;
- -
- - spin_lock_bh(&wb->work_lock);
- - if (!list_empty(&wb->work_list)) {
- - work = list_entry(wb->work_list.next,
- - struct wb_writeback_work, list);
- - list_del_init(&work->list);
- - }
- - spin_unlock_bh(&wb->work_lock);
- - return work;
- -}
- -
- -/*
- - * Add in the number of potentially dirty inodes, because each inode
- - * write can dirty pagecache in the underlying blockdev.
- - */
- -static unsigned long get_nr_dirty_pages(void)
- -{
- - return global_page_state(NR_FILE_DIRTY) +
- - global_page_state(NR_UNSTABLE_NFS) +
- - get_nr_dirty_inodes();
- -}
- -
- -static long wb_check_background_flush(struct bdi_writeback *wb)
- -{
- - if (wb_over_bg_thresh(wb)) {
- -
- - struct wb_writeback_work work = {
- - .nr_pages = LONG_MAX,
- - .sync_mode = WB_SYNC_NONE,
- - .for_background = 1,
- - .range_cyclic = 1,
- - .reason = WB_REASON_BACKGROUND,
- - };
- -
- - return wb_writeback(wb, &work);
- - }
- -
- - return 0;
- -}
- -
- -static long wb_check_old_data_flush(struct bdi_writeback *wb)
- -{
- - unsigned long expired;
- - long nr_pages;
- -
- - /*
- - * When set to zero, disable periodic writeback
- - */
- - if (!dirty_writeback_interval)
- - return 0;
- -
- - expired = wb->last_old_flush +
- - msecs_to_jiffies(dirty_writeback_interval * 10);
- - if (time_before(jiffies, expired))
- - return 0;
- -
- - wb->last_old_flush = jiffies;
- - nr_pages = get_nr_dirty_pages();
- -
- - if (nr_pages) {
- - struct wb_writeback_work work = {
- - .nr_pages = nr_pages,
- - .sync_mode = WB_SYNC_NONE,
- - .for_kupdate = 1,
- - .range_cyclic = 1,
- - .reason = WB_REASON_PERIODIC,
- - };
- -
- - return wb_writeback(wb, &work);
- - }
- -
- - return 0;
- -}
- -
- -/*
- - * Retrieve work items and do the writeback they describe
- - */
- -static long wb_do_writeback(struct bdi_writeback *wb)
- -{
- - struct wb_writeback_work *work;
- - long wrote = 0;
- -
- - set_bit(WB_writeback_running, &wb->state);
- - while ((work = get_next_work_item(wb)) != NULL) {
- - struct wb_completion *done = work->done;
- -
- - trace_writeback_exec(wb, work);
- -
- - wrote += wb_writeback(wb, work);
- -
- - if (work->auto_free)
- - kfree(work);
- - if (done && atomic_dec_and_test(&done->cnt))
- - wake_up_all(&wb->bdi->wb_waitq);
- - }
- -
- - /*
- - * Check for periodic writeback, kupdated() style
- - */
- - wrote += wb_check_old_data_flush(wb);
- - wrote += wb_check_background_flush(wb);
- - clear_bit(WB_writeback_running, &wb->state);
- -
- - return wrote;
- -}
- -
- -/*
- - * Handle writeback of dirty data for the device backed by this bdi. Also
- - * reschedules periodically and does kupdated style flushing.
- - */
- -void wb_workfn(struct work_struct *work)
- -{
- - struct bdi_writeback *wb = container_of(to_delayed_work(work),
- - struct bdi_writeback, dwork);
- - long pages_written;
- -
- - set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
- - current->flags |= PF_SWAPWRITE;
- -
- - if (likely(!current_is_workqueue_rescuer() ||
- - !test_bit(WB_registered, &wb->state))) {
- - /*
- - * The normal path. Keep writing back @wb until its
- - * work_list is empty. Note that this path is also taken
- - * if @wb is shutting down even when we're running off the
- - * rescuer as work_list needs to be drained.
- - */
- - do {
- - pages_written = wb_do_writeback(wb);
- - trace_writeback_pages_written(pages_written);
- - } while (!list_empty(&wb->work_list));
- - } else {
- - /*
- - * bdi_wq can't get enough workers and we're running off
- - * the emergency worker. Don't hog it. Hopefully, 1024 is
- - * enough for efficient IO.
- - */
- - pages_written = writeback_inodes_wb(wb, 1024,
- - WB_REASON_FORKER_THREAD);
- - trace_writeback_pages_written(pages_written);
- - }
- -
- - if (!list_empty(&wb->work_list))
- - mod_delayed_work(bdi_wq, &wb->dwork, 0);
- - else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
- - wb_wakeup_delayed(wb);
- -
- - current->flags &= ~PF_SWAPWRITE;
- -}
- -
- -/*
- - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- - * the whole world.
- - */
- -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
- -{
- - struct backing_dev_info *bdi;
- -
- - if (!nr_pages)
- - nr_pages = get_nr_dirty_pages();
- -
- - rcu_read_lock();
- - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
- - struct bdi_writeback *wb;
- -
- - if (!bdi_has_dirty_io(bdi))
- - continue;
- -
- - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
- - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
- - false, reason);
- - }
- - rcu_read_unlock();
- -}
- -
- -/*
- - * Wake up bdi's periodically to make sure dirtytime inodes get
- - * written back periodically. We deliberately do *not* check the
- - * b_dirtytime list in wb_has_dirty_io(), since this would cause the
- - * kernel to be constantly waking up once there are any dirtytime
- - * inodes on the system. So instead we define a separate delayed work
- - * function which gets called much more rarely. (By default, only
- - * once every 12 hours.)
- - *
- - * If there is any other write activity going on in the file system,
- - * this function won't be necessary. But if the only thing that has
- - * happened on the file system is a dirtytime inode caused by an atime
- - * update, we need this infrastructure below to make sure that inode
- - * eventually gets pushed out to disk.
- - */
- -static void wakeup_dirtytime_writeback(struct work_struct *w);
- -static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
- -
- -static void wakeup_dirtytime_writeback(struct work_struct *w)
- -{
- - struct backing_dev_info *bdi;
- -
- - rcu_read_lock();
- - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
- - struct bdi_writeback *wb;
- -
- - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
- - if (!list_empty(&wb->b_dirty_time))
- - wb_wakeup(wb);
- - }
- - rcu_read_unlock();
- - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
- -}
- -
- -static int __init start_dirtytime_writeback(void)
- -{
- - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
- - return 0;
- -}
- -__initcall(start_dirtytime_writeback);
- -
- -int dirtytime_interval_handler(struct ctl_table *table, int write,
- - void __user *buffer, size_t *lenp, loff_t *ppos)
- -{
- - int ret;
- -
- - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- - if (ret == 0 && write)
- - mod_delayed_work(system_wq, &dirtytime_work, 0);
- - return ret;
- -}
- -
- -static noinline void block_dump___mark_inode_dirty(struct inode *inode)
- -{
- - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
- - struct dentry *dentry;
- - const char *name = "?";
- -
- - dentry = d_find_alias(inode);
- - if (dentry) {
- - spin_lock(&dentry->d_lock);
- - name = (const char *) dentry->d_name.name;
- - }
- - printk(KERN_DEBUG
- - "%s(%d): dirtied inode %lu (%s) on %s\n",
- - current->comm, task_pid_nr(current), inode->i_ino,
- - name, inode->i_sb->s_id);
- - if (dentry) {
- - spin_unlock(&dentry->d_lock);
- - dput(dentry);
- - }
- - }
- -}
- -
- -/**
- - * __mark_inode_dirty - internal function
- - * @inode: inode to mark
- - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- - * Mark an inode as dirty. Callers should use mark_inode_dirty or
- - * mark_inode_dirty_sync.
- - *
- - * Put the inode on the super block's dirty list.
- - *
- - * CAREFUL! We mark it dirty unconditionally, but move it onto the
- - * dirty list only if it is hashed or if it refers to a blockdev.
- - * If it was not hashed, it will never be added to the dirty list
- - * even if it is later hashed, as it will have been marked dirty already.
- - *
- - * In short, make sure you hash any inodes _before_ you start marking
- - * them dirty.
- - *
- - * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- - * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
- - * the kernel-internal blockdev inode represents the dirtying time of the
- - * blockdev's pages. This is why for I_DIRTY_PAGES we always use
- - * page->mapping->host, so the page-dirtying time is recorded in the internal
- - * blockdev inode.
- - */
- -void __mark_inode_dirty(struct inode *inode, int flags)
- -{
- -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
- - struct super_block *sb = inode->i_sb;
- - int dirtytime;
- -
- - trace_writeback_mark_inode_dirty(inode, flags);
- -
- - /*
- - * Don't do this for I_DIRTY_PAGES - that doesn't actually
- - * dirty the inode itself
- - */
- - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
- - trace_writeback_dirty_inode_start(inode, flags);
- -
- - if (sb->s_op->dirty_inode)
- - sb->s_op->dirty_inode(inode, flags);
- -
- - trace_writeback_dirty_inode(inode, flags);
- - }
- - if (flags & I_DIRTY_INODE)
- - flags &= ~I_DIRTY_TIME;
- - dirtytime = flags & I_DIRTY_TIME;
- -
- - /*
- - * Paired with smp_mb() in __writeback_single_inode() for the
- - * following lockless i_state test. See there for details.
- - */
- - smp_mb();
- -
- - if (((inode->i_state & flags) == flags) ||
- - (dirtytime && (inode->i_state & I_DIRTY_INODE)))
- - return;
- -
- - if (unlikely(block_dump))
- - block_dump___mark_inode_dirty(inode);
- -
- - spin_lock(&inode->i_lock);
- - if (dirtytime && (inode->i_state & I_DIRTY_INODE))
- - goto out_unlock_inode;
- - if ((inode->i_state & flags) != flags) {
- - const int was_dirty = inode->i_state & I_DIRTY;
- -
- - inode_attach_wb(inode, NULL);
- -
- - if (flags & I_DIRTY_INODE)
- - inode->i_state &= ~I_DIRTY_TIME;
- - inode->i_state |= flags;
- -
- - /*
- - * If the inode is being synced, just update its dirty state.
- - * The unlocker will place the inode on the appropriate
- - * superblock list, based upon its state.
- - */
- - if (inode->i_state & I_SYNC)
- - goto out_unlock_inode;
- -
- - /*
- - * Only add valid (hashed) inodes to the superblock's
- - * dirty list. Add blockdev inodes as well.
- - */
- - if (!S_ISBLK(inode->i_mode)) {
- - if (inode_unhashed(inode))
- - goto out_unlock_inode;
- - }
- - if (inode->i_state & I_FREEING)
- - goto out_unlock_inode;
- -
- - /*
- - * If the inode was already on b_dirty/b_io/b_more_io, don't
- - * reposition it (that would break b_dirty time-ordering).
- - */
- - if (!was_dirty) {
- - struct bdi_writeback *wb;
- - struct list_head *dirty_list;
- - bool wakeup_bdi = false;
- -
- - wb = locked_inode_to_wb_and_lock_list(inode);
- -
- - WARN(bdi_cap_writeback_dirty(wb->bdi) &&
- - !test_bit(WB_registered, &wb->state),
- - "bdi-%s not registered\n", wb->bdi->name);
- -
- - inode->dirtied_when = jiffies;
- - if (dirtytime)
- - inode->dirtied_time_when = jiffies;
- -
- - if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
- - dirty_list = &wb->b_dirty;
- - else
- - dirty_list = &wb->b_dirty_time;
- -
- - wakeup_bdi = inode_io_list_move_locked(inode, wb,
- - dirty_list);
- -
- - spin_unlock(&wb->list_lock);
- - trace_writeback_dirty_inode_enqueue(inode);
- -
- - /*
- - * If this is the first dirty inode for this bdi,
- - * we have to wake-up the corresponding bdi thread
- - * to make sure background write-back happens
- - * later.
- - */
- - if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
- - wb_wakeup_delayed(wb);
- - return;
- - }
- - }
- -out_unlock_inode:
- - spin_unlock(&inode->i_lock);
- -
- -#undef I_DIRTY_INODE
- -}
- -EXPORT_SYMBOL(__mark_inode_dirty);
- -
- -/*
- - * The @s_sync_lock is used to serialise concurrent sync operations
- - * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
- - * Concurrent callers will block on the s_sync_lock rather than doing contending
- - * walks. The queueing maintains sync(2) required behaviour as all the IO that
- - * has been issued up to the time this function is entered is guaranteed to be
- - * completed by the time we have gained the lock and waited for all IO that is
- - * in progress regardless of the order callers are granted the lock.
- - */
- -static void wait_sb_inodes(struct super_block *sb)
- -{
- - struct inode *inode, *old_inode = NULL;
- -
- - /*
- - * We need to be protected against the filesystem going from
- - * r/o to r/w or vice versa.
- - */
- - WARN_ON(!rwsem_is_locked(&sb->s_umount));
- -
- - mutex_lock(&sb->s_sync_lock);
- - spin_lock(&sb->s_inode_list_lock);
- -
- - /*
- - * Data integrity sync. Must wait for all pages under writeback,
- - * because there may have been pages dirtied before our sync
- - * call, but which had writeout started before we write it out.
- - * In which case, the inode may not be on the dirty list, but
- - * we still have to wait for that writeout.
- - */
- - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- - struct address_space *mapping = inode->i_mapping;
- -
- - spin_lock(&inode->i_lock);
- - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- - (mapping->nrpages == 0)) {
- - spin_unlock(&inode->i_lock);
- - continue;
- - }
- - __iget(inode);
- - spin_unlock(&inode->i_lock);
- - spin_unlock(&sb->s_inode_list_lock);
- -
- - /*
- - * We hold a reference to 'inode' so it couldn't have been
- - * removed from s_inodes list while we dropped the
- - * s_inode_list_lock. We cannot iput the inode now as we can
- - * be holding the last reference and we cannot iput it under
- - * s_inode_list_lock. So we keep the reference and iput it
- - * later.
- - */
- - iput(old_inode);
- - old_inode = inode;
- -
- - /*
- - * We keep the error status of individual mapping so that
- - * applications can catch the writeback error using fsync(2).
- - * See filemap_fdatawait_keep_errors() for details.
- - */
- - filemap_fdatawait_keep_errors(mapping);
- -
- - cond_resched();
- -
- - spin_lock(&sb->s_inode_list_lock);
- - }
- - spin_unlock(&sb->s_inode_list_lock);
- - iput(old_inode);
- - mutex_unlock(&sb->s_sync_lock);
- -}
- -
- -static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
- - enum wb_reason reason, bool skip_if_busy)
- -{
- - DEFINE_WB_COMPLETION_ONSTACK(done);
- - struct wb_writeback_work work = {
- - .sb = sb,
- - .sync_mode = WB_SYNC_NONE,
- - .tagged_writepages = 1,
- - .done = &done,
- - .nr_pages = nr,
- - .reason = reason,
- - };
- - struct backing_dev_info *bdi = sb->s_bdi;
- -
- - if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
- - return;
- - WARN_ON(!rwsem_is_locked(&sb->s_umount));
- -
- - bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
- - wb_wait_for_completion(bdi, &done);
- -}
- -
- -/**
- - * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
- - * @sb: the superblock
- - * @nr: the number of pages to write
- - * @reason: reason why some writeback work initiated
- - *
- - * Start writeback on some inodes on this super_block. No guarantees are made
- - * on how many (if any) will be written, and this function does not wait
- - * for IO completion of submitted IO.
- - */
- -void writeback_inodes_sb_nr(struct super_block *sb,
- - unsigned long nr,
- - enum wb_reason reason)
- -{
- - __writeback_inodes_sb_nr(sb, nr, reason, false);
- -}
- -EXPORT_SYMBOL(writeback_inodes_sb_nr);
- -
- -/**
- - * writeback_inodes_sb - writeback dirty inodes from given super_block
- - * @sb: the superblock
- - * @reason: reason why some writeback work was initiated
- - *
- - * Start writeback on some inodes on this super_block. No guarantees are made
- - * on how many (if any) will be written, and this function does not wait
- - * for IO completion of submitted IO.
- - */
- -void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
- -{
- - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
- -}
- -EXPORT_SYMBOL(writeback_inodes_sb);
- -
- -/**
- - * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
- - * @sb: the superblock
- - * @nr: the number of pages to write
- - * @reason: the reason of writeback
- - *
- - * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
- - * Returns 1 if writeback was started, 0 if not.
- - */
- -bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
- - enum wb_reason reason)
- -{
- - if (!down_read_trylock(&sb->s_umount))
- - return false;
- -
- - __writeback_inodes_sb_nr(sb, nr, reason, true);
- - up_read(&sb->s_umount);
- - return true;
- -}
- -EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
- -
- -/**
- - * try_to_writeback_inodes_sb - try to start writeback if none underway
- - * @sb: the superblock
- - * @reason: reason why some writeback work was initiated
- - *
- - * Implemented by try_to_writeback_inodes_sb_nr()
- - * Returns 1 if writeback was started, 0 if not.
- - */
- -bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
- -{
- - return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
- -}
- -EXPORT_SYMBOL(try_to_writeback_inodes_sb);
- -
- -/**
- - * sync_inodes_sb - sync sb inode pages
- - * @sb: the superblock
- - *
- - * This function writes and waits on any dirty inode belonging to this
- - * super_block.
- - */
- -void sync_inodes_sb(struct super_block *sb)
- -{
- - DEFINE_WB_COMPLETION_ONSTACK(done);
- - struct wb_writeback_work work = {
- - .sb = sb,
- - .sync_mode = WB_SYNC_ALL,
- - .nr_pages = LONG_MAX,
- - .range_cyclic = 0,
- - .done = &done,
- - .reason = WB_REASON_SYNC,
- - .for_sync = 1,
- - };
- - struct backing_dev_info *bdi = sb->s_bdi;
- -
- - /*
- - * Can't skip on !bdi_has_dirty() because we should wait for !dirty
- - * inodes under writeback and I_DIRTY_TIME inodes ignored by
- - * bdi_has_dirty() need to be written out too.
- - */
- - if (bdi == &noop_backing_dev_info)
- - return;
- - WARN_ON(!rwsem_is_locked(&sb->s_umount));
- -
- - bdi_split_work_to_wbs(bdi, &work, false);
- - wb_wait_for_completion(bdi, &done);
- -
- - wait_sb_inodes(sb);
- -}
- -EXPORT_SYMBOL(sync_inodes_sb);
- -
- -/**
- - * write_inode_now - write an inode to disk
- - * @inode: inode to write to disk
- - * @sync: whether the write should be synchronous or not
- - *
- - * This function commits an inode to disk immediately if it is dirty. This is
- - * primarily needed by knfsd.
- - *
- - * The caller must either have a ref on the inode or must have set I_WILL_FREE.
- - */
- -int write_inode_now(struct inode *inode, int sync)
- -{
- - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- - struct writeback_control wbc = {
- - .nr_to_write = LONG_MAX,
- - .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
- - .range_start = 0,
- - .range_end = LLONG_MAX,
- - };
- -
- - if (!mapping_cap_writeback_dirty(inode->i_mapping))
- - wbc.nr_to_write = 0;
- -
- - might_sleep();
- - return writeback_single_inode(inode, wb, &wbc);
- -}
- -EXPORT_SYMBOL(write_inode_now);
- -
- -/**
- - * sync_inode - write an inode and its pages to disk.
- - * @inode: the inode to sync
- - * @wbc: controls the writeback mode
- - *
- - * sync_inode() will write an inode and its pages to disk. It will also
- - * correctly update the inode on its superblock's dirty inode lists and will
- - * update inode->i_state.
- - *
- - * The caller must have a ref on the inode.
- - */
- -int sync_inode(struct inode *inode, struct writeback_control *wbc)
- -{
- - return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
- -}
- -EXPORT_SYMBOL(sync_inode);
- -
- -/**
- - * sync_inode_metadata - write an inode to disk
- - * @inode: the inode to sync
- - * @wait: wait for I/O to complete.
- - *
- - * Write an inode to disk and adjust its dirty state after completion.
- - *
- - * Note: only writes the actual inode, no associated data or other metadata.
- - */
- -int sync_inode_metadata(struct inode *inode, int wait)
- -{
- - struct writeback_control wbc = {
- - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
- - .nr_to_write = 0, /* metadata-only */
- - };
- -
- - return sync_inode(inode, &wbc);
- -}
- -EXPORT_SYMBOL(sync_inode_metadata);
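sync_inode_metadata() above is the usual entry point when a filesystem wants only the inode written, not its data pages. A rough usage sketch, together with the writeback_control it boils down to:

        /* write just the inode, waiting for the I/O */
        int err = sync_inode_metadata(inode, 1);

        /* equivalent expansion */
        struct writeback_control wbc = {
                .sync_mode   = WB_SYNC_ALL,
                .nr_to_write = 0,           /* metadata only, no data pages */
        };
        err = sync_inode(inode, &wbc);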
- diff -Naur linux-4.4.6-gentoo-orig/fs/gfs2/meta_io.c linux-4.4.6-gentoo-patched/fs/gfs2/meta_io.c
- --- linux-4.4.6-gentoo-orig/fs/gfs2/meta_io.c 2016-05-04 11:19:37.613649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/gfs2/meta_io.c 2016-05-04 11:03:27.410730745 +0300
- @@ -37,8 +37,7 @@
- {
- struct buffer_head *bh, *head;
- int nr_underway = 0;
- - int write_op = REQ_META | REQ_PRIO |
- - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
- + int write_op = REQ_META | REQ_PRIO | wbc_to_write_cmd(wbc);
- BUG_ON(!PageLocked(page));
- BUG_ON(!page_has_buffers(page));
- diff -Naur linux-4.4.6-gentoo-orig/fs/mpage.c linux-4.4.6-gentoo-patched/fs/mpage.c
- --- linux-4.4.6-gentoo-orig/fs/mpage.c 2016-05-04 11:19:37.614649827 +0300
- +++ linux-4.4.6-gentoo-patched/fs/mpage.c 2016-05-04 11:03:27.410730745 +0300
- @@ -485,7 +485,6 @@
- struct buffer_head map_bh;
- loff_t i_size = i_size_read(inode);
- int ret = 0;
- - int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
- @@ -594,7 +593,7 @@
- * This page will go to BIO. Do we need to send this BIO off first?
- */
- if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- - bio = mpage_bio_submit(wr, bio);
- + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
- alloc_new:
- if (bio == NULL) {
- @@ -621,7 +620,7 @@
- wbc_account_io(wbc, page, PAGE_SIZE);
- length = first_unmapped << blkbits;
- if (bio_add_page(bio, page, length, 0) < length) {
- - bio = mpage_bio_submit(wr, bio);
- + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
- goto alloc_new;
- }
- @@ -631,7 +630,7 @@
- set_page_writeback(page);
- unlock_page(page);
- if (boundary || (first_unmapped != blocks_per_page)) {
- - bio = mpage_bio_submit(wr, bio);
- + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
- if (boundary_block) {
- write_boundary_block(boundary_bdev,
- boundary_block, 1 << blkbits);
- @@ -643,7 +642,7 @@
- confused:
- if (bio)
- - bio = mpage_bio_submit(wr, bio);
- + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
- if (mpd->use_writepage) {
- ret = mapping->a_ops->writepage(page, wbc);
- diff -Naur linux-4.4.6-gentoo-orig/fs/xfs/xfs_aops.c linux-4.4.6-gentoo-patched/fs/xfs/xfs_aops.c
- --- linux-4.4.6-gentoo-orig/fs/xfs/xfs_aops.c 2016-05-04 11:19:37.614649827 +0300
- +++ linux-4.4.6-gentoo-patched/fs/xfs/xfs_aops.c 2016-05-04 11:03:27.410730745 +0300
- @@ -382,7 +382,7 @@
- atomic_inc(&ioend->io_remaining);
- bio->bi_private = ioend;
- bio->bi_end_io = xfs_end_bio;
- - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
- + submit_bio(wbc_to_write_cmd(wbc), bio);
- }
- STATIC struct bio *
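The gfs2, mpage and xfs hunks above all replace the open-coded "wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE" choice with wbc_to_write_cmd(). The helper itself is introduced elsewhere in the patch (presumably in include/linux/writeback.h); a sketch that is consistent with these call sites and with the WRITE_BG command added further down, but only an assumed shape, would be:

        static inline int wbc_to_write_cmd(struct writeback_control *wbc)
        {
                if (wbc->sync_mode == WB_SYNC_ALL)
                        return WRITE_SYNC;
                if (wbc->for_kupdate || wbc->for_background)
                        return WRITE_BG;    /* background writeback */
                return WRITE;
        }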
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/backing-dev-defs.h linux-4.4.6-gentoo-patched/include/linux/backing-dev-defs.h
- --- linux-4.4.6-gentoo-orig/include/linux/backing-dev-defs.h 2016-05-04 11:19:37.615649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/backing-dev-defs.h 2016-05-04 11:03:27.410730745 +0300
- @@ -116,6 +116,8 @@
- struct list_head work_list;
- struct delayed_work dwork; /* work item used for writeback */
- + atomic_t dirty_sleeping; /* waiting on dirty limit exceeded */
- +
- struct list_head bdi_node; /* anchored at bdi->wb_list */
- #ifdef CONFIG_CGROUP_WRITEBACK
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/blkdev.h linux-4.4.6-gentoo-patched/include/linux/blkdev.h
- --- linux-4.4.6-gentoo-orig/include/linux/blkdev.h 2016-05-04 11:19:37.615649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/blkdev.h 2016-05-04 11:03:27.410730745 +0300
- @@ -23,6 +23,7 @@
- #include <linux/rcupdate.h>
- #include <linux/percpu-refcount.h>
- #include <linux/scatterlist.h>
- +#include <linux/wbt.h>
- struct module;
- struct scsi_ioctl_command;
- @@ -36,6 +37,7 @@
- struct blkcg_gq;
- struct blk_flush_queue;
- struct pr_ops;
- +struct rq_wb;
- #define BLKDEV_MIN_RQ 4
- #define BLKDEV_MAX_RQ 128 /* Default maximum */
- @@ -152,6 +154,7 @@
- struct gendisk *rq_disk;
- struct hd_struct *part;
- unsigned long start_time;
- + struct wb_issue_stat wb_stat;
- #ifdef CONFIG_BLK_CGROUP
- struct request_list *rl; /* rl this rq is alloced from */
- unsigned long long start_time_ns;
- @@ -289,6 +292,8 @@
- int nr_rqs[2]; /* # allocated [a]sync rqs */
- int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
- + struct rq_wb *rq_wb;
- +
- /*
- * If blkcg is not used, @q->root_rl serves all requests. If blkcg
- * is used, root blkg allocates from @q->root_rl and all other
- @@ -314,6 +319,8 @@
- struct blk_mq_ctx __percpu *queue_ctx;
- unsigned int nr_queues;
- + unsigned int queue_depth;
- +
- /* hw dispatch queues */
- struct blk_mq_hw_ctx **queue_hw_ctx;
- unsigned int nr_hw_queues;
- @@ -399,6 +406,9 @@
- unsigned int nr_sorted;
- unsigned int in_flight[2];
- +
- + struct blk_rq_stat rq_stats[2];
- +
- /*
- * Number of active block driver functions for which blk_drain_queue()
- * must wait. Must be incremented around functions that unlock the
- @@ -431,8 +441,6 @@
- /*
- * for flush operations
- */
- - unsigned int flush_flags;
- - unsigned int flush_not_queueable:1;
- struct blk_flush_queue *fq;
- struct list_head requeue_list;
- @@ -489,6 +497,9 @@
- #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
- #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/
- #define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */
- +#define QUEUE_FLAG_WC 23 /* Write back caching */
- +#define QUEUE_FLAG_FUA 24 /* device supports FUA writes */
- +#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */
- #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
- (1 << QUEUE_FLAG_STACKABLE) | \
- @@ -677,6 +688,14 @@
- return false;
- }
- +static inline unsigned int blk_queue_depth(struct request_queue *q)
- +{
- + if (q->queue_depth)
- + return q->queue_depth;
- +
- + return q->nr_requests;
- +}
- +
- /*
- * q->prep_rq_fn return values
- */
- @@ -977,6 +996,7 @@
- extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
- extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
- extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
- +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
- extern void blk_set_default_limits(struct queue_limits *lim);
- extern void blk_set_stacking_limits(struct queue_limits *lim);
- extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
- @@ -1001,8 +1021,8 @@
- extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
- extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
- extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
- -extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
- extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
- +extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
- extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
- extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
- @@ -1355,7 +1375,7 @@
- static inline bool queue_flush_queueable(struct request_queue *q)
- {
- - return !q->flush_not_queueable;
- + return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
- }
- typedef struct {struct page *v;} Sector;
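With flush_flags and flush_not_queueable removed from struct request_queue, a driver now signals its cache behaviour through the queue flags, and can additionally report its real dispatch depth, which blk_queue_depth() otherwise approximates with q->nr_requests. A minimal driver-side sketch using the two setters declared in this hunk (the depth value is made up):

        /* in a driver's queue setup path */
        blk_queue_write_cache(q, true, true);   /* volatile write cache, FUA supported */
        blk_set_queue_depth(q, 64);             /* real device depth; otherwise
                                                 * blk_queue_depth() returns nr_requests */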
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/blk_types.h linux-4.4.6-gentoo-patched/include/linux/blk_types.h
- --- linux-4.4.6-gentoo-orig/include/linux/blk_types.h 2016-05-04 11:19:37.616649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/blk_types.h 2016-05-04 11:03:27.410730745 +0300
- @@ -161,6 +161,7 @@
- __REQ_INTEGRITY, /* I/O includes block integrity payload */
- __REQ_FUA, /* forced unit access */
- __REQ_FLUSH, /* request for cache flush */
- + __REQ_BG, /* background activity */
- /* bio only flags */
- __REQ_RAHEAD, /* read ahead, can fail anytime */
- @@ -209,7 +210,7 @@
- #define REQ_COMMON_MASK \
- (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
- REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
- - REQ_SECURE | REQ_INTEGRITY)
- + REQ_SECURE | REQ_INTEGRITY | REQ_BG)
- #define REQ_CLONE_MASK REQ_COMMON_MASK
- #define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME)
- @@ -236,6 +237,7 @@
- #define REQ_COPY_USER (1ULL << __REQ_COPY_USER)
- #define REQ_FLUSH (1ULL << __REQ_FLUSH)
- #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ)
- +#define REQ_BG (1ULL << __REQ_BG)
- #define REQ_IO_STAT (1ULL << __REQ_IO_STAT)
- #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE)
- #define REQ_SECURE (1ULL << __REQ_SECURE)
- @@ -268,4 +270,12 @@
- return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
- }
- +struct blk_rq_stat {
- + s64 mean;
- + u64 min;
- + u64 max;
- + s64 nr_samples;
- + s64 time;
- +};
- +
- #endif /* __LINUX_BLK_TYPES_H */
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/fs.h linux-4.4.6-gentoo-patched/include/linux/fs.h
- --- linux-4.4.6-gentoo-orig/include/linux/fs.h 2016-05-04 11:19:37.616649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/fs.h 2016-05-04 11:03:27.411730745 +0300
- @@ -189,6 +189,9 @@
- * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
- * by a cache flush and data is guaranteed to be on
- * non-volatile media on completion.
- + * WRITE_BG Background write. This is for background activity like
- + * the periodic flush and background threshold writeback
- + *
- *
- */
- #define RW_MASK REQ_WRITE
- @@ -204,6 +207,7 @@
- #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
- #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
- #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
- +#define WRITE_BG (WRITE | REQ_NOIDLE | REQ_BG)
- /*
- * Attribute flags. These should be or-ed together to figure out what
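WRITE_BG exists so that background writeback can still be told apart from application-initiated writes after the bio leaves the filesystem. A small illustrative check, where the helper name is a placeholder and not something the patch adds:

        static bool bio_is_background_write(struct bio *bio)
        {
                return (bio->bi_rw & (REQ_WRITE | REQ_BG)) == (REQ_WRITE | REQ_BG);
        }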
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/sched.h.orig linux-4.4.6-gentoo-patched/include/linux/sched.h.orig
- --- linux-4.4.6-gentoo-orig/include/linux/sched.h.orig 2016-05-04 11:19:37.618649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/sched.h.orig 1970-01-01 03:00:00.000000000 +0300
- @@ -1,3194 +0,0 @@
- -#ifndef _LINUX_SCHED_H
- -#define _LINUX_SCHED_H
- -
- -#include <uapi/linux/sched.h>
- -
- -#include <linux/sched/prio.h>
- -
- -
- -struct sched_param {
- - int sched_priority;
- -};
- -
- -#include <asm/param.h> /* for HZ */
- -
- -#include <linux/capability.h>
- -#include <linux/threads.h>
- -#include <linux/kernel.h>
- -#include <linux/types.h>
- -#include <linux/timex.h>
- -#include <linux/jiffies.h>
- -#include <linux/plist.h>
- -#include <linux/rbtree.h>
- -#include <linux/thread_info.h>
- -#include <linux/cpumask.h>
- -#include <linux/errno.h>
- -#include <linux/nodemask.h>
- -#include <linux/mm_types.h>
- -#include <linux/preempt.h>
- -
- -#include <asm/page.h>
- -#include <asm/ptrace.h>
- -#include <linux/cputime.h>
- -
- -#include <linux/smp.h>
- -#include <linux/sem.h>
- -#include <linux/shm.h>
- -#include <linux/signal.h>
- -#include <linux/compiler.h>
- -#include <linux/completion.h>
- -#include <linux/pid.h>
- -#include <linux/percpu.h>
- -#include <linux/topology.h>
- -#include <linux/proportions.h>
- -#include <linux/seccomp.h>
- -#include <linux/rcupdate.h>
- -#include <linux/rculist.h>
- -#include <linux/rtmutex.h>
- -
- -#include <linux/time.h>
- -#include <linux/param.h>
- -#include <linux/resource.h>
- -#include <linux/timer.h>
- -#include <linux/hrtimer.h>
- -#include <linux/task_io_accounting.h>
- -#include <linux/latencytop.h>
- -#include <linux/cred.h>
- -#include <linux/llist.h>
- -#include <linux/uidgid.h>
- -#include <linux/gfp.h>
- -#include <linux/magic.h>
- -#include <linux/cgroup-defs.h>
- -
- -#include <asm/processor.h>
- -
- -#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
- -
- -/*
- - * Extended scheduling parameters data structure.
- - *
- - * This is needed because the original struct sched_param can not be
- - * altered without introducing ABI issues with legacy applications
- - * (e.g., in sched_getparam()).
- - *
- - * However, the possibility of specifying more than just a priority for
- - * the tasks may be useful for a wide variety of application fields, e.g.,
- - * multimedia, streaming, automation and control, and many others.
- - *
- - * This variant (sched_attr) is meant at describing a so-called
- - * sporadic time-constrained task. In such model a task is specified by:
- - * - the activation period or minimum instance inter-arrival time;
- - * - the maximum (or average, depending on the actual scheduling
- - * discipline) computation time of all instances, a.k.a. runtime;
- - * - the deadline (relative to the actual activation time) of each
- - * instance.
- - * Very briefly, a periodic (sporadic) task asks for the execution of
- - * some specific computation --which is typically called an instance--
- - * (at most) every period. Moreover, each instance typically lasts no more
- - * than the runtime and must be completed by time instant t equal to
- - * the instance activation time + the deadline.
- - *
- - * This is reflected by the actual fields of the sched_attr structure:
- - *
- - * @size size of the structure, for fwd/bwd compat.
- - *
- - * @sched_policy task's scheduling policy
- - * @sched_flags for customizing the scheduler behaviour
- - * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
- - * @sched_priority task's static priority (SCHED_FIFO/RR)
- - * @sched_deadline representative of the task's deadline
- - * @sched_runtime representative of the task's runtime
- - * @sched_period representative of the task's period
- - *
- - * Given this task model, there are a multiplicity of scheduling algorithms
- - * and policies, that can be used to ensure all the tasks will make their
- - * timing constraints.
- - *
- - * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
- - * only user of this new interface. More information about the algorithm
- - * available in the scheduling class file or in Documentation/.
- - */
- -struct sched_attr {
- - u32 size;
- -
- - u32 sched_policy;
- - u64 sched_flags;
- -
- - /* SCHED_NORMAL, SCHED_BATCH */
- - s32 sched_nice;
- -
- - /* SCHED_FIFO, SCHED_RR */
- - u32 sched_priority;
- -
- - /* SCHED_DEADLINE */
- - u64 sched_runtime;
- - u64 sched_deadline;
- - u64 sched_period;
- -};
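From user space the structure above is consumed by the sched_setattr() system call, which has no glibc wrapper and is reached via syscall(2). A rough sketch of configuring a SCHED_DEADLINE task along the field documentation above; the policy value 6 is SCHED_DEADLINE from uapi/linux/sched.h, and the struct here is only a userspace mirror:

        #define _GNU_SOURCE
        #include <stdint.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/syscall.h>        /* needs a libc exposing __NR_sched_setattr */

        struct sched_attr_u {
                uint32_t size;
                uint32_t sched_policy;
                uint64_t sched_flags;
                int32_t  sched_nice;
                uint32_t sched_priority;
                uint64_t sched_runtime;
                uint64_t sched_deadline;
                uint64_t sched_period;
        };

        static int become_deadline_task(void)
        {
                struct sched_attr_u attr;

                memset(&attr, 0, sizeof(attr));
                attr.size           = sizeof(attr);
                attr.sched_policy   = 6;                   /* SCHED_DEADLINE */
                attr.sched_runtime  = 10 * 1000 * 1000;    /* 10 ms of runtime ... */
                attr.sched_deadline = 100 * 1000 * 1000;   /* ... due within 100 ms ... */
                attr.sched_period   = 100 * 1000 * 1000;   /* ... every 100 ms */

                return syscall(SYS_sched_setattr, 0 /* this task */, &attr, 0 /* flags */);
        }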
- -
- -struct futex_pi_state;
- -struct robust_list_head;
- -struct bio_list;
- -struct fs_struct;
- -struct perf_event_context;
- -struct blk_plug;
- -struct filename;
- -struct nameidata;
- -
- -#define VMACACHE_BITS 2
- -#define VMACACHE_SIZE (1U << VMACACHE_BITS)
- -#define VMACACHE_MASK (VMACACHE_SIZE - 1)
- -
- -/*
- - * These are the constant used to fake the fixed-point load-average
- - * counting. Some notes:
- - * - 11 bit fractions expand to 22 bits by the multiplies: this gives
- - * a load-average precision of 10 bits integer + 11 bits fractional
- - * - if you want to count load-averages more often, you need more
- - * precision, or rounding will get you. With 2-second counting freq,
- - * the EXP_n values would be 1981, 2034 and 2043 if still using only
- - * 11 bit fractions.
- - */
- -extern unsigned long avenrun[]; /* Load averages */
- -extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
- -
- -#define FSHIFT 11 /* nr of bits of precision */
- -#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
- -#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
- -#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
- -#define EXP_5 2014 /* 1/exp(5sec/5min) */
- -#define EXP_15 2037 /* 1/exp(5sec/15min) */
- -
- -#define CALC_LOAD(load,exp,n) \
- - load *= exp; \
- - load += n*(FIXED_1-exp); \
- - load >>= FSHIFT;
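Worked through once with the constants above: with one runnable task (n = FIXED_1 = 2048) and a previous 1-minute average of 0, a single 5-second update gives (0*1884 + 2048*(2048-1884)) >> 11 = 164, i.e. 164/2048, roughly 0.08, which is how the familiar slow climb towards 1.00 begins. Rewritten as a plain function instead of a statement macro:

        /* one LOAD_FREQ update of an 11-bit fixed-point load average */
        static unsigned long calc_load_step(unsigned long load, unsigned long exp,
                                            unsigned long active)
        {
                load *= exp;
                load += active * (FIXED_1 - exp);
                return load >> FSHIFT;
        }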
- -
- -extern unsigned long total_forks;
- -extern int nr_threads;
- -DECLARE_PER_CPU(unsigned long, process_counts);
- -extern int nr_processes(void);
- -extern unsigned long nr_running(void);
- -extern bool single_task_running(void);
- -extern unsigned long nr_iowait(void);
- -extern unsigned long nr_iowait_cpu(int cpu);
- -extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
- -
- -extern void calc_global_load(unsigned long ticks);
- -
- -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
- -extern void update_cpu_load_nohz(void);
- -#else
- -static inline void update_cpu_load_nohz(void) { }
- -#endif
- -
- -extern unsigned long get_parent_ip(unsigned long addr);
- -
- -extern void dump_cpu_task(int cpu);
- -
- -struct seq_file;
- -struct cfs_rq;
- -struct task_group;
- -#ifdef CONFIG_SCHED_DEBUG
- -extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
- -extern void proc_sched_set_task(struct task_struct *p);
- -#endif
- -
- -/*
- - * Task state bitmask. NOTE! These bits are also
- - * encoded in fs/proc/array.c: get_task_state().
- - *
- - * We have two separate sets of flags: task->state
- - * is about runnability, while task->exit_state are
- - * about the task exiting. Confusing, but this way
- - * modifying one set can't modify the other one by
- - * mistake.
- - */
- -#define TASK_RUNNING 0
- -#define TASK_INTERRUPTIBLE 1
- -#define TASK_UNINTERRUPTIBLE 2
- -#define __TASK_STOPPED 4
- -#define __TASK_TRACED 8
- -/* in tsk->exit_state */
- -#define EXIT_DEAD 16
- -#define EXIT_ZOMBIE 32
- -#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
- -/* in tsk->state again */
- -#define TASK_DEAD 64
- -#define TASK_WAKEKILL 128
- -#define TASK_WAKING 256
- -#define TASK_PARKED 512
- -#define TASK_NOLOAD 1024
- -#define TASK_STATE_MAX 2048
- -
- -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
- -
- -extern char ___assert_task_state[1 - 2*!!(
- - sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
- -
- -/* Convenience macros for the sake of set_task_state */
- -#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
- -#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
- -#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
- -
- -#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
- -
- -/* Convenience macros for the sake of wake_up */
- -#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
- -#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
- -
- -/* get_task_state() */
- -#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
- - TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
- - __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
- -
- -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
- -#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
- -#define task_is_stopped_or_traced(task) \
- - ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
- -#define task_contributes_to_load(task) \
- - ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
- - (task->flags & PF_FROZEN) == 0 && \
- - (task->state & TASK_NOLOAD) == 0)
- -
- -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- -
- -#define __set_task_state(tsk, state_value) \
- - do { \
- - (tsk)->task_state_change = _THIS_IP_; \
- - (tsk)->state = (state_value); \
- - } while (0)
- -#define set_task_state(tsk, state_value) \
- - do { \
- - (tsk)->task_state_change = _THIS_IP_; \
- - smp_store_mb((tsk)->state, (state_value)); \
- - } while (0)
- -
- -/*
- - * set_current_state() includes a barrier so that the write of current->state
- - * is correctly serialised wrt the caller's subsequent test of whether to
- - * actually sleep:
- - *
- - * set_current_state(TASK_UNINTERRUPTIBLE);
- - * if (do_i_need_to_sleep())
- - * schedule();
- - *
- - * If the caller does not need such serialisation then use __set_current_state()
- - */
- -#define __set_current_state(state_value) \
- - do { \
- - current->task_state_change = _THIS_IP_; \
- - current->state = (state_value); \
- - } while (0)
- -#define set_current_state(state_value) \
- - do { \
- - current->task_state_change = _THIS_IP_; \
- - smp_store_mb(current->state, (state_value)); \
- - } while (0)
- -
- -#else
- -
- -#define __set_task_state(tsk, state_value) \
- - do { (tsk)->state = (state_value); } while (0)
- -#define set_task_state(tsk, state_value) \
- - smp_store_mb((tsk)->state, (state_value))
- -
- -/*
- - * set_current_state() includes a barrier so that the write of current->state
- - * is correctly serialised wrt the caller's subsequent test of whether to
- - * actually sleep:
- - *
- - * set_current_state(TASK_UNINTERRUPTIBLE);
- - * if (do_i_need_to_sleep())
- - * schedule();
- - *
- - * If the caller does not need such serialisation then use __set_current_state()
- - */
- -#define __set_current_state(state_value) \
- - do { current->state = (state_value); } while (0)
- -#define set_current_state(state_value) \
- - smp_store_mb(current->state, (state_value))
- -
- -#endif
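The canonical sleep loop that both comments above are written for looks like this; the barrier in set_current_state() orders the state write before the condition test, so a wakeup that makes the condition true after that point cannot be lost. The condition itself is a placeholder for whatever the waiter is waiting on:

        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (condition)
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);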
- -
- -/* Task command name length */
- -#define TASK_COMM_LEN 16
- -
- -#include <linux/spinlock.h>
- -
- -/*
- - * This serializes "schedule()" and also protects
- - * the run-queue from deletions/modifications (but
- - * _adding_ to the beginning of the run-queue has
- - * a separate lock).
- - */
- -extern rwlock_t tasklist_lock;
- -extern spinlock_t mmlist_lock;
- -
- -struct task_struct;
- -
- -#ifdef CONFIG_PROVE_RCU
- -extern int lockdep_tasklist_lock_is_held(void);
- -#endif /* #ifdef CONFIG_PROVE_RCU */
- -
- -extern void sched_init(void);
- -extern void sched_init_smp(void);
- -extern asmlinkage void schedule_tail(struct task_struct *prev);
- -extern void init_idle(struct task_struct *idle, int cpu);
- -extern void init_idle_bootup_task(struct task_struct *idle);
- -
- -extern cpumask_var_t cpu_isolated_map;
- -
- -extern int runqueue_is_locked(int cpu);
- -
- -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
- -extern void nohz_balance_enter_idle(int cpu);
- -extern void set_cpu_sd_state_idle(void);
- -extern int get_nohz_timer_target(void);
- -#else
- -static inline void nohz_balance_enter_idle(int cpu) { }
- -static inline void set_cpu_sd_state_idle(void) { }
- -#endif
- -
- -/*
- - * Only dump TASK_* tasks. (0 for all tasks)
- - */
- -extern void show_state_filter(unsigned long state_filter);
- -
- -static inline void show_state(void)
- -{
- - show_state_filter(0);
- -}
- -
- -extern void show_regs(struct pt_regs *);
- -
- -/*
- - * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
- - * task), SP is the stack pointer of the first frame that should be shown in the back
- - * trace (or NULL if the entire call-chain of the task should be shown).
- - */
- -extern void show_stack(struct task_struct *task, unsigned long *sp);
- -
- -extern void cpu_init (void);
- -extern void trap_init(void);
- -extern void update_process_times(int user);
- -extern void scheduler_tick(void);
- -
- -extern void sched_show_task(struct task_struct *p);
- -
- -#ifdef CONFIG_LOCKUP_DETECTOR
- -extern void touch_softlockup_watchdog(void);
- -extern void touch_softlockup_watchdog_sync(void);
- -extern void touch_all_softlockup_watchdogs(void);
- -extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
- - void __user *buffer,
- - size_t *lenp, loff_t *ppos);
- -extern unsigned int softlockup_panic;
- -extern unsigned int hardlockup_panic;
- -void lockup_detector_init(void);
- -#else
- -static inline void touch_softlockup_watchdog(void)
- -{
- -}
- -static inline void touch_softlockup_watchdog_sync(void)
- -{
- -}
- -static inline void touch_all_softlockup_watchdogs(void)
- -{
- -}
- -static inline void lockup_detector_init(void)
- -{
- -}
- -#endif
- -
- -#ifdef CONFIG_DETECT_HUNG_TASK
- -void reset_hung_task_detector(void);
- -#else
- -static inline void reset_hung_task_detector(void)
- -{
- -}
- -#endif
- -
- -/* Attach to any functions which should be ignored in wchan output. */
- -#define __sched __attribute__((__section__(".sched.text")))
- -
- -/* Linker adds these: start and end of __sched functions */
- -extern char __sched_text_start[], __sched_text_end[];
- -
- -/* Is this address in the __sched functions? */
- -extern int in_sched_functions(unsigned long addr);
- -
- -#define MAX_SCHEDULE_TIMEOUT LONG_MAX
- -extern signed long schedule_timeout(signed long timeout);
- -extern signed long schedule_timeout_interruptible(signed long timeout);
- -extern signed long schedule_timeout_killable(signed long timeout);
- -extern signed long schedule_timeout_uninterruptible(signed long timeout);
- -asmlinkage void schedule(void);
- -extern void schedule_preempt_disabled(void);
- -
- -extern long io_schedule_timeout(long timeout);
- -
- -static inline void io_schedule(void)
- -{
- - io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
- -}
- -
- -struct nsproxy;
- -struct user_namespace;
- -
- -#ifdef CONFIG_MMU
- -extern void arch_pick_mmap_layout(struct mm_struct *mm);
- -extern unsigned long
- -arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
- - unsigned long, unsigned long);
- -extern unsigned long
- -arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
- - unsigned long len, unsigned long pgoff,
- - unsigned long flags);
- -#else
- -static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
- -#endif
- -
- -#define SUID_DUMP_DISABLE 0 /* No setuid dumping */
- -#define SUID_DUMP_USER 1 /* Dump as user of process */
- -#define SUID_DUMP_ROOT 2 /* Dump as root */
- -
- -/* mm flags */
- -
- -/* for SUID_DUMP_* above */
- -#define MMF_DUMPABLE_BITS 2
- -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
- -
- -extern void set_dumpable(struct mm_struct *mm, int value);
- -/*
- - * This returns the actual value of the suid_dumpable flag. For things
- - * that are using this for checking for privilege transitions, it must
- - * test against SUID_DUMP_USER rather than treating it as a boolean
- - * value.
- - */
- -static inline int __get_dumpable(unsigned long mm_flags)
- -{
- - return mm_flags & MMF_DUMPABLE_MASK;
- -}
- -
- -static inline int get_dumpable(struct mm_struct *mm)
- -{
- - return __get_dumpable(mm->flags);
- -}
- -
- -/* coredump filter bits */
- -#define MMF_DUMP_ANON_PRIVATE 2
- -#define MMF_DUMP_ANON_SHARED 3
- -#define MMF_DUMP_MAPPED_PRIVATE 4
- -#define MMF_DUMP_MAPPED_SHARED 5
- -#define MMF_DUMP_ELF_HEADERS 6
- -#define MMF_DUMP_HUGETLB_PRIVATE 7
- -#define MMF_DUMP_HUGETLB_SHARED 8
- -#define MMF_DUMP_DAX_PRIVATE 9
- -#define MMF_DUMP_DAX_SHARED 10
- -
- -#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
- -#define MMF_DUMP_FILTER_BITS 9
- -#define MMF_DUMP_FILTER_MASK \
- - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
- -#define MMF_DUMP_FILTER_DEFAULT \
- - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
- - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
- -
- -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
- -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
- -#else
- -# define MMF_DUMP_MASK_DEFAULT_ELF 0
- -#endif
- - /* leave room for more dump flags */
- -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
- -#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
- -#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
- -
- -#define MMF_HAS_UPROBES 19 /* has uprobes */
- -#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
- -
- -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
- -
- -struct sighand_struct {
- - atomic_t count;
- - struct k_sigaction action[_NSIG];
- - spinlock_t siglock;
- - wait_queue_head_t signalfd_wqh;
- -};
- -
- -struct pacct_struct {
- - int ac_flag;
- - long ac_exitcode;
- - unsigned long ac_mem;
- - cputime_t ac_utime, ac_stime;
- - unsigned long ac_minflt, ac_majflt;
- -};
- -
- -struct cpu_itimer {
- - cputime_t expires;
- - cputime_t incr;
- - u32 error;
- - u32 incr_error;
- -};
- -
- -/**
- - * struct prev_cputime - snapshot of system and user cputime
- - * @utime: time spent in user mode
- - * @stime: time spent in system mode
- - * @lock: protects the above two fields
- - *
- - * Stores previous user/system time values such that we can guarantee
- - * monotonicity.
- - */
- -struct prev_cputime {
- -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- - cputime_t utime;
- - cputime_t stime;
- - raw_spinlock_t lock;
- -#endif
- -};
- -
- -static inline void prev_cputime_init(struct prev_cputime *prev)
- -{
- -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- - prev->utime = prev->stime = 0;
- - raw_spin_lock_init(&prev->lock);
- -#endif
- -}
- -
- -/**
- - * struct task_cputime - collected CPU time counts
- - * @utime: time spent in user mode, in &cputime_t units
- - * @stime: time spent in kernel mode, in &cputime_t units
- - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
- - *
- - * This structure groups together three kinds of CPU time that are tracked for
- - * threads and thread groups. Most things considering CPU time want to group
- - * these counts together and treat all three of them in parallel.
- - */
- -struct task_cputime {
- - cputime_t utime;
- - cputime_t stime;
- - unsigned long long sum_exec_runtime;
- -};
- -
- -/* Alternate field names when used to cache expirations. */
- -#define virt_exp utime
- -#define prof_exp stime
- -#define sched_exp sum_exec_runtime
- -
- -#define INIT_CPUTIME \
- - (struct task_cputime) { \
- - .utime = 0, \
- - .stime = 0, \
- - .sum_exec_runtime = 0, \
- - }
- -
- -/*
- - * This is the atomic variant of task_cputime, which can be used for
- - * storing and updating task_cputime statistics without locking.
- - */
- -struct task_cputime_atomic {
- - atomic64_t utime;
- - atomic64_t stime;
- - atomic64_t sum_exec_runtime;
- -};
- -
- -#define INIT_CPUTIME_ATOMIC \
- - (struct task_cputime_atomic) { \
- - .utime = ATOMIC64_INIT(0), \
- - .stime = ATOMIC64_INIT(0), \
- - .sum_exec_runtime = ATOMIC64_INIT(0), \
- - }
- -
- -#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
- -
- -/*
- - * Disable preemption until the scheduler is running -- use an unconditional
- - * value so that it also works on !PREEMPT_COUNT kernels.
- - *
- - * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
- - */
- -#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
- -
- -/*
- - * Initial preempt_count value; reflects the preempt_count schedule invariant
- - * which states that during context switches:
- - *
- - * preempt_count() == 2*PREEMPT_DISABLE_OFFSET
- - *
- - * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
- - * Note: See finish_task_switch().
- - */
- -#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
- -
- -/**
- - * struct thread_group_cputimer - thread group interval timer counts
- - * @cputime_atomic: atomic thread group interval timers.
- - * @running: true when there are timers running and
- - * @cputime_atomic receives updates.
- - * @checking_timer: true when a thread in the group is in the
- - * process of checking for thread group timers.
- - *
- - * This structure contains the version of task_cputime, above, that is
- - * used for thread group CPU timer calculations.
- - */
- -struct thread_group_cputimer {
- - struct task_cputime_atomic cputime_atomic;
- - bool running;
- - bool checking_timer;
- -};
- -
- -#include <linux/rwsem.h>
- -struct autogroup;
- -
- -/*
- - * NOTE! "signal_struct" does not have its own
- - * locking, because a shared signal_struct always
- - * implies a shared sighand_struct, so locking
- - * sighand_struct is always a proper superset of
- - * the locking of signal_struct.
- - */
- -struct signal_struct {
- - atomic_t sigcnt;
- - atomic_t live;
- - int nr_threads;
- - struct list_head thread_head;
- -
- - wait_queue_head_t wait_chldexit; /* for wait4() */
- -
- - /* current thread group signal load-balancing target: */
- - struct task_struct *curr_target;
- -
- - /* shared signal handling: */
- - struct sigpending shared_pending;
- -
- - /* thread group exit support */
- - int group_exit_code;
- - /* overloaded:
- - * - notify group_exit_task when ->count is equal to notify_count
- - * - everyone except group_exit_task is stopped during signal delivery
- - * of fatal signals, group_exit_task processes the signal.
- - */
- - int notify_count;
- - struct task_struct *group_exit_task;
- -
- - /* thread group stop support, overloads group_exit_code too */
- - int group_stop_count;
- - unsigned int flags; /* see SIGNAL_* flags below */
- -
- - /*
- - * PR_SET_CHILD_SUBREAPER marks a process, like a service
- - * manager, to re-parent orphan (double-forking) child processes
- - * to this process instead of 'init'. The service manager is
- - * able to receive SIGCHLD signals and is able to investigate
- - * the process until it calls wait(). All children of this
- - * process will inherit a flag if they should look for a
- - * child_subreaper process at exit.
- - */
- - unsigned int is_child_subreaper:1;
- - unsigned int has_child_subreaper:1;
- -
- - /* POSIX.1b Interval Timers */
- - int posix_timer_id;
- - struct list_head posix_timers;
- -
- - /* ITIMER_REAL timer for the process */
- - struct hrtimer real_timer;
- - struct pid *leader_pid;
- - ktime_t it_real_incr;
- -
- - /*
- - * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
- - * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
- - * values are defined to 0 and 1 respectively
- - */
- - struct cpu_itimer it[2];
- -
- - /*
- - * Thread group totals for process CPU timers.
- - * See thread_group_cputimer(), et al, for details.
- - */
- - struct thread_group_cputimer cputimer;
- -
- - /* Earliest-expiration cache. */
- - struct task_cputime cputime_expires;
- -
- - struct list_head cpu_timers[3];
- -
- - struct pid *tty_old_pgrp;
- -
- - /* boolean value for session group leader */
- - int leader;
- -
- - struct tty_struct *tty; /* NULL if no tty */
- -
- -#ifdef CONFIG_SCHED_AUTOGROUP
- - struct autogroup *autogroup;
- -#endif
- - /*
- - * Cumulative resource counters for dead threads in the group,
- - * and for reaped dead child processes forked by this group.
- - * Live threads maintain their own counters and add to these
- - * in __exit_signal, except for the group leader.
- - */
- - seqlock_t stats_lock;
- - cputime_t utime, stime, cutime, cstime;
- - cputime_t gtime;
- - cputime_t cgtime;
- - struct prev_cputime prev_cputime;
- - unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
- - unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
- - unsigned long inblock, oublock, cinblock, coublock;
- - unsigned long maxrss, cmaxrss;
- - struct task_io_accounting ioac;
- -
- - /*
- - * Cumulative ns of scheduled CPU time for dead threads in the
- - * group, not including a zombie group leader, (This only differs
- - * from jiffies_to_ns(utime + stime) if sched_clock uses something
- - * other than jiffies.)
- - */
- - unsigned long long sum_sched_runtime;
- -
- - /*
- - * We don't bother to synchronize most readers of this at all,
- - * because there is no reader checking a limit that actually needs
- - * to get both rlim_cur and rlim_max atomically, and either one
- - * alone is a single word that can safely be read normally.
- - * getrlimit/setrlimit use task_lock(current->group_leader) to
- - * protect this instead of the siglock, because they really
- - * have no need to disable irqs.
- - */
- - struct rlimit rlim[RLIM_NLIMITS];
- -
- -#ifdef CONFIG_BSD_PROCESS_ACCT
- - struct pacct_struct pacct; /* per-process accounting information */
- -#endif
- -#ifdef CONFIG_TASKSTATS
- - struct taskstats *stats;
- -#endif
- -#ifdef CONFIG_AUDIT
- - unsigned audit_tty;
- - unsigned audit_tty_log_passwd;
- - struct tty_audit_buf *tty_audit_buf;
- -#endif
- -
- - oom_flags_t oom_flags;
- - short oom_score_adj; /* OOM kill score adjustment */
- - short oom_score_adj_min; /* OOM kill score adjustment min value.
- - * Only settable by CAP_SYS_RESOURCE. */
- -
- - struct mutex cred_guard_mutex; /* guard against foreign influences on
- - * credential calculations
- - * (notably. ptrace) */
- -};
- -
- -/*
- - * Bits in flags field of signal_struct.
- - */
- -#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
- -#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
- -#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
- -#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */
- -/*
- - * Pending notifications to parent.
- - */
- -#define SIGNAL_CLD_STOPPED 0x00000010
- -#define SIGNAL_CLD_CONTINUED 0x00000020
- -#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
- -
- -#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
- -
- -/* If true, all threads except ->group_exit_task have pending SIGKILL */
- -static inline int signal_group_exit(const struct signal_struct *sig)
- -{
- - return (sig->flags & SIGNAL_GROUP_EXIT) ||
- - (sig->group_exit_task != NULL);
- -}
- -
- -/*
- - * Some day this will be a full-fledged user tracking system..
- - */
- -struct user_struct {
- - atomic_t __count; /* reference count */
- - atomic_t processes; /* How many processes does this user have? */
- - atomic_t sigpending; /* How many pending signals does this user have? */
- -#ifdef CONFIG_INOTIFY_USER
- - atomic_t inotify_watches; /* How many inotify watches does this user have? */
- - atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
- -#endif
- -#ifdef CONFIG_FANOTIFY
- - atomic_t fanotify_listeners;
- -#endif
- -#ifdef CONFIG_EPOLL
- - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
- -#endif
- -#ifdef CONFIG_POSIX_MQUEUE
- - /* protected by mq_lock */
- - unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
- -#endif
- - unsigned long locked_shm; /* How many pages of mlocked shm ? */
- - unsigned long unix_inflight; /* How many files in flight in unix sockets */
- -
- -#ifdef CONFIG_KEYS
- - struct key *uid_keyring; /* UID specific keyring */
- - struct key *session_keyring; /* UID's default session keyring */
- -#endif
- -
- - /* Hash table maintenance information */
- - struct hlist_node uidhash_node;
- - kuid_t uid;
- -
- -#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
- - atomic_long_t locked_vm;
- -#endif
- -};
- -
- -extern int uids_sysfs_init(void);
- -
- -extern struct user_struct *find_user(kuid_t);
- -
- -extern struct user_struct root_user;
- -#define INIT_USER (&root_user)
- -
- -
- -struct backing_dev_info;
- -struct reclaim_state;
- -
- -#ifdef CONFIG_SCHED_INFO
- -struct sched_info {
- - /* cumulative counters */
- - unsigned long pcount; /* # of times run on this cpu */
- - unsigned long long run_delay; /* time spent waiting on a runqueue */
- -
- - /* timestamps */
- - unsigned long long last_arrival,/* when we last ran on a cpu */
- - last_queued; /* when we were last queued to run */
- -};
- -#endif /* CONFIG_SCHED_INFO */
- -
- -#ifdef CONFIG_TASK_DELAY_ACCT
- -struct task_delay_info {
- - spinlock_t lock;
- - unsigned int flags; /* Private per-task flags */
- -
- - /* For each stat XXX, add following, aligned appropriately
- - *
- - * struct timespec XXX_start, XXX_end;
- - * u64 XXX_delay;
- - * u32 XXX_count;
- - *
- - * Atomicity of updates to XXX_delay, XXX_count protected by
- - * single lock above (split into XXX_lock if contention is an issue).
- - */
- -
- - /*
- - * XXX_count is incremented on every XXX operation, the delay
- - * associated with the operation is added to XXX_delay.
- - * XXX_delay contains the accumulated delay time in nanoseconds.
- - */
- - u64 blkio_start; /* Shared by blkio, swapin */
- - u64 blkio_delay; /* wait for sync block io completion */
- - u64 swapin_delay; /* wait for swapin block io completion */
- - u32 blkio_count; /* total count of the number of sync block */
- - /* io operations performed */
- - u32 swapin_count; /* total count of the number of swapin block */
- - /* io operations performed */
- -
- - u64 freepages_start;
- - u64 freepages_delay; /* wait for memory reclaim */
- - u32 freepages_count; /* total count of memory reclaim */
- -};
- -#endif /* CONFIG_TASK_DELAY_ACCT */
- -
- -static inline int sched_info_on(void)
- -{
- -#ifdef CONFIG_SCHEDSTATS
- - return 1;
- -#elif defined(CONFIG_TASK_DELAY_ACCT)
- - extern int delayacct_on;
- - return delayacct_on;
- -#else
- - return 0;
- -#endif
- -}
- -
- -enum cpu_idle_type {
- - CPU_IDLE,
- - CPU_NOT_IDLE,
- - CPU_NEWLY_IDLE,
- - CPU_MAX_IDLE_TYPES
- -};
- -
- -/*
- - * Increase resolution of cpu_capacity calculations
- - */
- -#define SCHED_CAPACITY_SHIFT 10
- -#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
- -
- -/*
- - * Wake-queues are lists of tasks with a pending wakeup, whose
- - * callers have already marked the task as woken internally,
- - * and can thus carry on. A common use case is being able to
- - * do the wakeups once the corresponding user lock has been
- - * released.
- - *
- - * We hold reference to each task in the list across the wakeup,
- - * thus guaranteeing that the memory is still valid by the time
- - * the actual wakeups are performed in wake_up_q().
- - *
- - * One per task suffices, because there's never a need for a task to be
- - * in two wake queues simultaneously; it is forbidden to abandon a task
- - * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
- - * already in a wake queue, the wakeup will happen soon and the second
- - * waker can just skip it.
- - *
- - * The WAKE_Q macro declares and initializes the list head.
- - * wake_up_q() does NOT reinitialize the list; it's expected to be
- - * called near the end of a function, where the fact that the queue is
- - * not used again will be easy to see by inspection.
- - *
- - * Note that this can cause spurious wakeups. schedule() callers
- - * must ensure the call is done inside a loop, confirming that the
- - * wakeup condition has in fact occurred.
- - */
- -struct wake_q_node {
- - struct wake_q_node *next;
- -};
- -
- -struct wake_q_head {
- - struct wake_q_node *first;
- - struct wake_q_node **lastp;
- -};
- -
- -#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
- -
- -#define WAKE_Q(name) \
- - struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
- -
- -extern void wake_q_add(struct wake_q_head *head,
- - struct task_struct *task);
- -extern void wake_up_q(struct wake_q_head *head);
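The usage pattern described in the comment block above, in sketch form: wakeups are collected while a lock is held and issued only after it is dropped. The lock and the waiter lookup are placeholders:

        WAKE_Q(wake_q);
        struct task_struct *task;

        spin_lock(&some_lock);
        while ((task = pick_next_waiter()) != NULL)
                wake_q_add(&wake_q, task);
        spin_unlock(&some_lock);

        wake_up_q(&wake_q);     /* safe: each task was pinned by wake_q_add() */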
- -
- -/*
- - * sched-domains (multiprocessor balancing) declarations:
- - */
- -#ifdef CONFIG_SMP
- -#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
- -#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
- -#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
- -#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
- -#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
- -#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
- -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */
- -#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
- -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
- -#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
- -#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
- -#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
- -#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
- -#define SD_NUMA 0x4000 /* cross-node balancing */
- -
- -#ifdef CONFIG_SCHED_SMT
- -static inline int cpu_smt_flags(void)
- -{
- - return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
- -}
- -#endif
- -
- -#ifdef CONFIG_SCHED_MC
- -static inline int cpu_core_flags(void)
- -{
- - return SD_SHARE_PKG_RESOURCES;
- -}
- -#endif
- -
- -#ifdef CONFIG_NUMA
- -static inline int cpu_numa_flags(void)
- -{
- - return SD_NUMA;
- -}
- -#endif
- -
- -struct sched_domain_attr {
- - int relax_domain_level;
- -};
- -
- -#define SD_ATTR_INIT (struct sched_domain_attr) { \
- - .relax_domain_level = -1, \
- -}
- -
- -extern int sched_domain_level_max;
- -
- -struct sched_group;
- -
- -struct sched_domain {
- - /* These fields must be setup */
- - struct sched_domain *parent; /* top domain must be null terminated */
- - struct sched_domain *child; /* bottom domain must be null terminated */
- - struct sched_group *groups; /* the balancing groups of the domain */
- - unsigned long min_interval; /* Minimum balance interval ms */
- - unsigned long max_interval; /* Maximum balance interval ms */
- - unsigned int busy_factor; /* less balancing by factor if busy */
- - unsigned int imbalance_pct; /* No balance until over watermark */
- - unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
- - unsigned int busy_idx;
- - unsigned int idle_idx;
- - unsigned int newidle_idx;
- - unsigned int wake_idx;
- - unsigned int forkexec_idx;
- - unsigned int smt_gain;
- -
- - int nohz_idle; /* NOHZ IDLE status */
- - int flags; /* See SD_* */
- - int level;
- -
- - /* Runtime fields. */
- - unsigned long last_balance; /* init to jiffies. units in jiffies */
- - unsigned int balance_interval; /* initialise to 1. units in ms. */
- - unsigned int nr_balance_failed; /* initialise to 0 */
- -
- - /* idle_balance() stats */
- - u64 max_newidle_lb_cost;
- - unsigned long next_decay_max_lb_cost;
- -
- -#ifdef CONFIG_SCHEDSTATS
- - /* load_balance() stats */
- - unsigned int lb_count[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
- -
- - /* Active load balancing */
- - unsigned int alb_count;
- - unsigned int alb_failed;
- - unsigned int alb_pushed;
- -
- - /* SD_BALANCE_EXEC stats */
- - unsigned int sbe_count;
- - unsigned int sbe_balanced;
- - unsigned int sbe_pushed;
- -
- - /* SD_BALANCE_FORK stats */
- - unsigned int sbf_count;
- - unsigned int sbf_balanced;
- - unsigned int sbf_pushed;
- -
- - /* try_to_wake_up() stats */
- - unsigned int ttwu_wake_remote;
- - unsigned int ttwu_move_affine;
- - unsigned int ttwu_move_balance;
- -#endif
- -#ifdef CONFIG_SCHED_DEBUG
- - char *name;
- -#endif
- - union {
- - void *private; /* used during construction */
- - struct rcu_head rcu; /* used during destruction */
- - };
- -
- - unsigned int span_weight;
- - /*
- - * Span of all CPUs in this domain.
- - *
- - * NOTE: this field is variable length. (Allocated dynamically
- - * by attaching extra space to the end of the structure,
- - * depending on how many CPUs the kernel has booted up with)
- - */
- - unsigned long span[0];
- -};
- -
- -static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
- -{
- - return to_cpumask(sd->span);
- -}
- -
- -extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- - struct sched_domain_attr *dattr_new);
- -
- -/* Allocate an array of sched domains, for partition_sched_domains(). */
- -cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
- -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
- -
- -bool cpus_share_cache(int this_cpu, int that_cpu);
- -
- -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
- -typedef int (*sched_domain_flags_f)(void);
- -
- -#define SDTL_OVERLAP 0x01
- -
- -struct sd_data {
- - struct sched_domain **__percpu sd;
- - struct sched_group **__percpu sg;
- - struct sched_group_capacity **__percpu sgc;
- -};
- -
- -struct sched_domain_topology_level {
- - sched_domain_mask_f mask;
- - sched_domain_flags_f sd_flags;
- - int flags;
- - int numa_level;
- - struct sd_data data;
- -#ifdef CONFIG_SCHED_DEBUG
- - char *name;
- -#endif
- -};
- -
- -extern void set_sched_topology(struct sched_domain_topology_level *tl);
- -extern void wake_up_if_idle(int cpu);
- -
- -#ifdef CONFIG_SCHED_DEBUG
- -# define SD_INIT_NAME(type) .name = #type
- -#else
- -# define SD_INIT_NAME(type)
- -#endif
- -
- -#else /* CONFIG_SMP */
- -
- -struct sched_domain_attr;
- -
- -static inline void
- -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- - struct sched_domain_attr *dattr_new)
- -{
- -}
- -
- -static inline bool cpus_share_cache(int this_cpu, int that_cpu)
- -{
- - return true;
- -}
- -
- -#endif /* !CONFIG_SMP */
- -
- -
- -struct io_context; /* See blkdev.h */
- -
- -
- -#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
- -extern void prefetch_stack(struct task_struct *t);
- -#else
- -static inline void prefetch_stack(struct task_struct *t) { }
- -#endif
- -
- -struct audit_context; /* See audit.c */
- -struct mempolicy;
- -struct pipe_inode_info;
- -struct uts_namespace;
- -
- -struct load_weight {
- - unsigned long weight;
- - u32 inv_weight;
- -};
- -
- -/*
- - * The load_avg/util_avg accumulates an infinite geometric series.
- - * 1) load_avg factors frequency scaling into the amount of time that a
- - * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
- - * aggregated such weights of all runnable and blocked sched_entities.
- - * 2) util_avg factors frequency and cpu scaling into the amount of time
- - * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
- - * For cfs_rq, it is the aggregated such times of all runnable and
- - * blocked sched_entities.
- - * The 64 bit load_sum can:
- - * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
- - * the highest weight (=88761) always runnable, we should not overflow
- - * 2) for entity, support any load.weight always runnable
- - */
- -struct sched_avg {
- - u64 last_update_time, load_sum;
- - u32 util_sum, period_contrib;
- - unsigned long load_avg, util_avg;
- -};
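
The block comment above quotes a bound of 4,353,082,796 (= 2^64 / 47742 / 88761) for how many maximally weighted, always-runnable entities a 64-bit load_sum can accumulate without overflowing. A quick standalone check of that arithmetic, taking 47742 (the fully decayed per-entity sum) and 88761 (the nice -20 weight) from the comment rather than re-deriving them:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* values quoted in the comment above (not re-derived here) */
	const uint64_t decayed_sum_max = 47742;  /* max per-entity load_sum   */
	const uint64_t max_weight      = 88761;  /* weight of a nice -20 task */

	/* entities that fit in a 64-bit accumulator without overflow */
	uint64_t n = UINT64_MAX / (decayed_sum_max * max_weight);

	printf("%llu\n", (unsigned long long)n);   /* ~4353082796 */
	return 0;
}
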
- -
- -#ifdef CONFIG_SCHEDSTATS
- -struct sched_statistics {
- - u64 wait_start;
- - u64 wait_max;
- - u64 wait_count;
- - u64 wait_sum;
- - u64 iowait_count;
- - u64 iowait_sum;
- -
- - u64 sleep_start;
- - u64 sleep_max;
- - s64 sum_sleep_runtime;
- -
- - u64 block_start;
- - u64 block_max;
- - u64 exec_max;
- - u64 slice_max;
- -
- - u64 nr_migrations_cold;
- - u64 nr_failed_migrations_affine;
- - u64 nr_failed_migrations_running;
- - u64 nr_failed_migrations_hot;
- - u64 nr_forced_migrations;
- -
- - u64 nr_wakeups;
- - u64 nr_wakeups_sync;
- - u64 nr_wakeups_migrate;
- - u64 nr_wakeups_local;
- - u64 nr_wakeups_remote;
- - u64 nr_wakeups_affine;
- - u64 nr_wakeups_affine_attempts;
- - u64 nr_wakeups_passive;
- - u64 nr_wakeups_idle;
- -};
- -#endif
- -
- -struct sched_entity {
- - struct load_weight load; /* for load-balancing */
- - struct rb_node run_node;
- - struct list_head group_node;
- - unsigned int on_rq;
- -
- - u64 exec_start;
- - u64 sum_exec_runtime;
- - u64 vruntime;
- - u64 prev_sum_exec_runtime;
- -
- - u64 nr_migrations;
- -
- -#ifdef CONFIG_SCHEDSTATS
- - struct sched_statistics statistics;
- -#endif
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- - int depth;
- - struct sched_entity *parent;
- - /* rq on which this entity is (to be) queued: */
- - struct cfs_rq *cfs_rq;
- - /* rq "owned" by this entity/group: */
- - struct cfs_rq *my_q;
- -#endif
- -
- -#ifdef CONFIG_SMP
- - /* Per entity load average tracking */
- - struct sched_avg avg;
- -#endif
- -};
- -
- -struct sched_rt_entity {
- - struct list_head run_list;
- - unsigned long timeout;
- - unsigned long watchdog_stamp;
- - unsigned int time_slice;
- -
- - struct sched_rt_entity *back;
- -#ifdef CONFIG_RT_GROUP_SCHED
- - struct sched_rt_entity *parent;
- - /* rq on which this entity is (to be) queued: */
- - struct rt_rq *rt_rq;
- - /* rq "owned" by this entity/group: */
- - struct rt_rq *my_q;
- -#endif
- -};
- -
- -struct sched_dl_entity {
- - struct rb_node rb_node;
- -
- - /*
- - * Original scheduling parameters. Copied here from sched_attr
- - * during sched_setattr(), they will remain the same until
- - * the next sched_setattr().
- - */
- - u64 dl_runtime; /* maximum runtime for each instance */
- - u64 dl_deadline; /* relative deadline of each instance */
- - u64 dl_period; /* separation of two instances (period) */
- - u64 dl_bw; /* dl_runtime / dl_deadline */
- -
- - /*
- - * Actual scheduling parameters. Initialized with the values above,
- - * they are continuously updated during task execution. Note that
- - * the remaining runtime could be < 0 in case we are in overrun.
- - */
- - s64 runtime; /* remaining runtime for this instance */
- - u64 deadline; /* absolute deadline for this instance */
- - unsigned int flags; /* specifying the scheduler behaviour */
- -
- - /*
- - * Some bool flags:
- - *
- - * @dl_throttled tells if we exhausted the runtime. If so, the
- - * task has to wait for a replenishment to be performed at the
- - * next firing of dl_timer.
- - *
- - * @dl_new tells if a new instance arrived. If so we must
- - * start executing it with full runtime and reset its absolute
- - * deadline;
- - *
- - * @dl_boosted tells if we are boosted due to DI. If so we are
- - * outside bandwidth enforcement mechanism (but only until we
- - * exit the critical section);
- - *
- - * @dl_yielded tells if task gave up the cpu before consuming
- - * all its available runtime during the last job.
- - */
- - int dl_throttled, dl_new, dl_boosted, dl_yielded;
- -
- - /*
- - * Bandwidth enforcement timer. Each -deadline task has its
- - * own bandwidth to be enforced, thus we need one timer per task.
- - */
- - struct hrtimer dl_timer;
- -};
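
The "original" parameters above (dl_runtime, dl_deadline, dl_period) are filled from the sched_attr that userspace passes to sched_setattr(2). A minimal userspace sketch of putting the calling thread into SCHED_DEADLINE; the 10 ms / 30 ms / 30 ms figures are arbitrary illustration values, and SYS_sched_setattr is assumed to be defined by the libc headers (otherwise the raw __NR_ number is needed):

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_DEADLINE 6

/* layout of the uapi structure consumed by sched_setattr(2) */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;   /* -> dl_runtime  */
	uint64_t sched_deadline;  /* -> dl_deadline */
	uint64_t sched_period;    /* -> dl_period   */
};

int main(void)
{
	struct sched_attr attr = {
		.size           = sizeof(attr),
		.sched_policy   = SCHED_DEADLINE,
		.sched_runtime  = 10 * 1000 * 1000,   /* 10 ms of CPU ...      */
		.sched_deadline = 30 * 1000 * 1000,   /* ... due within 30 ms  */
		.sched_period   = 30 * 1000 * 1000,   /* ... every 30 ms       */
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0) < 0) {
		perror("sched_setattr");   /* typically needs root/CAP_SYS_NICE */
		return 1;
	}
	/* this thread is now scheduled by the deadline class */
	return 0;
}
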
- -
- -union rcu_special {
- - struct {
- - u8 blocked;
- - u8 need_qs;
- - u8 exp_need_qs;
- - u8 pad; /* Otherwise the compiler can store garbage here. */
- - } b; /* Bits. */
- - u32 s; /* Set of bits. */
- -};
- -struct rcu_node;
- -
- -enum perf_event_task_context {
- - perf_invalid_context = -1,
- - perf_hw_context = 0,
- - perf_sw_context,
- - perf_nr_task_contexts,
- -};
- -
- -/* Track pages that require TLB flushes */
- -struct tlbflush_unmap_batch {
- - /*
- - * Each bit set is a CPU that potentially has a TLB entry for one of
- - * the PFNs being flushed. See set_tlb_ubc_flush_pending().
- - */
- - struct cpumask cpumask;
- -
- - /* True if any bit in cpumask is set */
- - bool flush_required;
- -
- - /*
- - * If true then the PTE was dirty when unmapped. The entry must be
- - * flushed before IO is initiated or a stale TLB entry potentially
- - * allows an update without redirtying the page.
- - */
- - bool writable;
- -};
- -
- -struct task_struct {
- - volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
- - void *stack;
- - atomic_t usage;
- - unsigned int flags; /* per process flags, defined below */
- - unsigned int ptrace;
- -
- -#ifdef CONFIG_SMP
- - struct llist_node wake_entry;
- - int on_cpu;
- - unsigned int wakee_flips;
- - unsigned long wakee_flip_decay_ts;
- - struct task_struct *last_wakee;
- -
- - int wake_cpu;
- -#endif
- - int on_rq;
- -
- - int prio, static_prio, normal_prio;
- - unsigned int rt_priority;
- - const struct sched_class *sched_class;
- - struct sched_entity se;
- - struct sched_rt_entity rt;
- -#ifdef CONFIG_CGROUP_SCHED
- - struct task_group *sched_task_group;
- -#endif
- - struct sched_dl_entity dl;
- -
- -#ifdef CONFIG_PREEMPT_NOTIFIERS
- - /* list of struct preempt_notifier: */
- - struct hlist_head preempt_notifiers;
- -#endif
- -
- -#ifdef CONFIG_BLK_DEV_IO_TRACE
- - unsigned int btrace_seq;
- -#endif
- -
- - unsigned int policy;
- - int nr_cpus_allowed;
- - cpumask_t cpus_allowed;
- -
- -#ifdef CONFIG_PREEMPT_RCU
- - int rcu_read_lock_nesting;
- - union rcu_special rcu_read_unlock_special;
- - struct list_head rcu_node_entry;
- - struct rcu_node *rcu_blocked_node;
- -#endif /* #ifdef CONFIG_PREEMPT_RCU */
- -#ifdef CONFIG_TASKS_RCU
- - unsigned long rcu_tasks_nvcsw;
- - bool rcu_tasks_holdout;
- - struct list_head rcu_tasks_holdout_list;
- - int rcu_tasks_idle_cpu;
- -#endif /* #ifdef CONFIG_TASKS_RCU */
- -
- -#ifdef CONFIG_SCHED_INFO
- - struct sched_info sched_info;
- -#endif
- -
- - struct list_head tasks;
- -#ifdef CONFIG_SMP
- - struct plist_node pushable_tasks;
- - struct rb_node pushable_dl_tasks;
- -#endif
- -
- - struct mm_struct *mm, *active_mm;
- - /* per-thread vma caching */
- - u32 vmacache_seqnum;
- - struct vm_area_struct *vmacache[VMACACHE_SIZE];
- -#if defined(SPLIT_RSS_COUNTING)
- - struct task_rss_stat rss_stat;
- -#endif
- -/* task state */
- - int exit_state;
- - int exit_code, exit_signal;
- - int pdeath_signal; /* The signal sent when the parent dies */
- - unsigned long jobctl; /* JOBCTL_*, siglock protected */
- -
- - /* Used for emulating ABI behavior of previous Linux versions */
- - unsigned int personality;
- -
- - /* scheduler bits, serialized by scheduler locks */
- - unsigned sched_reset_on_fork:1;
- - unsigned sched_contributes_to_load:1;
- - unsigned sched_migrated:1;
- - unsigned :0; /* force alignment to the next boundary */
- -
- - /* unserialized, strictly 'current' */
- - unsigned in_execve:1; /* bit to tell LSMs we're in execve */
- - unsigned in_iowait:1;
- -#ifdef CONFIG_MEMCG
- - unsigned memcg_may_oom:1;
- -#endif
- -#ifdef CONFIG_MEMCG_KMEM
- - unsigned memcg_kmem_skip_account:1;
- -#endif
- -#ifdef CONFIG_COMPAT_BRK
- - unsigned brk_randomized:1;
- -#endif
- -
- - unsigned long atomic_flags; /* Flags needing atomic access. */
- -
- - struct restart_block restart_block;
- -
- - pid_t pid;
- - pid_t tgid;
- -
- -#ifdef CONFIG_CC_STACKPROTECTOR
- - /* Canary value for the -fstack-protector gcc feature */
- - unsigned long stack_canary;
- -#endif
- - /*
- - * pointers to (original) parent process, youngest child, younger sibling,
- - * older sibling, respectively. (p->father can be replaced with
- - * p->real_parent->pid)
- - */
- - struct task_struct __rcu *real_parent; /* real parent process */
- - struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
- - /*
- - * children/sibling forms the list of my natural children
- - */
- - struct list_head children; /* list of my children */
- - struct list_head sibling; /* linkage in my parent's children list */
- - struct task_struct *group_leader; /* threadgroup leader */
- -
- - /*
- - * ptraced is the list of tasks this task is using ptrace on.
- - * This includes both natural children and PTRACE_ATTACH targets.
- - * p->ptrace_entry is p's link on the p->parent->ptraced list.
- - */
- - struct list_head ptraced;
- - struct list_head ptrace_entry;
- -
- - /* PID/PID hash table linkage. */
- - struct pid_link pids[PIDTYPE_MAX];
- - struct list_head thread_group;
- - struct list_head thread_node;
- -
- - struct completion *vfork_done; /* for vfork() */
- - int __user *set_child_tid; /* CLONE_CHILD_SETTID */
- - int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
- -
- - cputime_t utime, stime, utimescaled, stimescaled;
- - cputime_t gtime;
- - struct prev_cputime prev_cputime;
- -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- - seqlock_t vtime_seqlock;
- - unsigned long long vtime_snap;
- - enum {
- - VTIME_SLEEPING = 0,
- - VTIME_USER,
- - VTIME_SYS,
- - } vtime_snap_whence;
- -#endif
- - unsigned long nvcsw, nivcsw; /* context switch counts */
- - u64 start_time; /* monotonic time in nsec */
- - u64 real_start_time; /* boot based time in nsec */
- -/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
- - unsigned long min_flt, maj_flt;
- -
- - struct task_cputime cputime_expires;
- - struct list_head cpu_timers[3];
- -
- -/* process credentials */
- - const struct cred __rcu *real_cred; /* objective and real subjective task
- - * credentials (COW) */
- - const struct cred __rcu *cred; /* effective (overridable) subjective task
- - * credentials (COW) */
- - char comm[TASK_COMM_LEN]; /* executable name excluding path
- - - access with [gs]et_task_comm (which locks
- - it with task_lock())
- - - initialized normally by setup_new_exec */
- -/* file system info */
- - struct nameidata *nameidata;
- -#ifdef CONFIG_SYSVIPC
- -/* ipc stuff */
- - struct sysv_sem sysvsem;
- - struct sysv_shm sysvshm;
- -#endif
- -#ifdef CONFIG_DETECT_HUNG_TASK
- -/* hung task detection */
- - unsigned long last_switch_count;
- -#endif
- -/* filesystem information */
- - struct fs_struct *fs;
- -/* open file information */
- - struct files_struct *files;
- -/* namespaces */
- - struct nsproxy *nsproxy;
- -/* signal handlers */
- - struct signal_struct *signal;
- - struct sighand_struct *sighand;
- -
- - sigset_t blocked, real_blocked;
- - sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
- - struct sigpending pending;
- -
- - unsigned long sas_ss_sp;
- - size_t sas_ss_size;
- -
- - struct callback_head *task_works;
- -
- - struct audit_context *audit_context;
- -#ifdef CONFIG_AUDITSYSCALL
- - kuid_t loginuid;
- - unsigned int sessionid;
- -#endif
- - struct seccomp seccomp;
- -
- -/* Thread group tracking */
- - u32 parent_exec_id;
- - u32 self_exec_id;
- -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- - * mempolicy */
- - spinlock_t alloc_lock;
- -
- - /* Protection of the PI data structures: */
- - raw_spinlock_t pi_lock;
- -
- - struct wake_q_node wake_q;
- -
- -#ifdef CONFIG_RT_MUTEXES
- - /* PI waiters blocked on a rt_mutex held by this task */
- - struct rb_root pi_waiters;
- - struct rb_node *pi_waiters_leftmost;
- - /* Deadlock detection and priority inheritance handling */
- - struct rt_mutex_waiter *pi_blocked_on;
- -#endif
- -
- -#ifdef CONFIG_DEBUG_MUTEXES
- - /* mutex deadlock detection */
- - struct mutex_waiter *blocked_on;
- -#endif
- -#ifdef CONFIG_TRACE_IRQFLAGS
- - unsigned int irq_events;
- - unsigned long hardirq_enable_ip;
- - unsigned long hardirq_disable_ip;
- - unsigned int hardirq_enable_event;
- - unsigned int hardirq_disable_event;
- - int hardirqs_enabled;
- - int hardirq_context;
- - unsigned long softirq_disable_ip;
- - unsigned long softirq_enable_ip;
- - unsigned int softirq_disable_event;
- - unsigned int softirq_enable_event;
- - int softirqs_enabled;
- - int softirq_context;
- -#endif
- -#ifdef CONFIG_LOCKDEP
- -# define MAX_LOCK_DEPTH 48UL
- - u64 curr_chain_key;
- - int lockdep_depth;
- - unsigned int lockdep_recursion;
- - struct held_lock held_locks[MAX_LOCK_DEPTH];
- - gfp_t lockdep_reclaim_gfp;
- -#endif
- -
- -/* journalling filesystem info */
- - void *journal_info;
- -
- -/* stacked block device info */
- - struct bio_list *bio_list;
- -
- -#ifdef CONFIG_BLOCK
- -/* stack plugging */
- - struct blk_plug *plug;
- -#endif
- -
- -/* VM state */
- - struct reclaim_state *reclaim_state;
- -
- - struct backing_dev_info *backing_dev_info;
- -
- - struct io_context *io_context;
- -
- - unsigned long ptrace_message;
- - siginfo_t *last_siginfo; /* For ptrace use. */
- - struct task_io_accounting ioac;
- -#if defined(CONFIG_TASK_XACCT)
- - u64 acct_rss_mem1; /* accumulated rss usage */
- - u64 acct_vm_mem1; /* accumulated virtual memory usage */
- - cputime_t acct_timexpd; /* stime + utime since last update */
- -#endif
- -#ifdef CONFIG_CPUSETS
- - nodemask_t mems_allowed; /* Protected by alloc_lock */
- - seqcount_t mems_allowed_seq; /* Sequence no to catch updates */
- - int cpuset_mem_spread_rotor;
- - int cpuset_slab_spread_rotor;
- -#endif
- -#ifdef CONFIG_CGROUPS
- - /* Control Group info protected by css_set_lock */
- - struct css_set __rcu *cgroups;
- - /* cg_list protected by css_set_lock and tsk->alloc_lock */
- - struct list_head cg_list;
- -#endif
- -#ifdef CONFIG_FUTEX
- - struct robust_list_head __user *robust_list;
- -#ifdef CONFIG_COMPAT
- - struct compat_robust_list_head __user *compat_robust_list;
- -#endif
- - struct list_head pi_state_list;
- - struct futex_pi_state *pi_state_cache;
- -#endif
- -#ifdef CONFIG_PERF_EVENTS
- - struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
- - struct mutex perf_event_mutex;
- - struct list_head perf_event_list;
- -#endif
- -#ifdef CONFIG_DEBUG_PREEMPT
- - unsigned long preempt_disable_ip;
- -#endif
- -#ifdef CONFIG_NUMA
- - struct mempolicy *mempolicy; /* Protected by alloc_lock */
- - short il_next;
- - short pref_node_fork;
- -#endif
- -#ifdef CONFIG_NUMA_BALANCING
- - int numa_scan_seq;
- - unsigned int numa_scan_period;
- - unsigned int numa_scan_period_max;
- - int numa_preferred_nid;
- - unsigned long numa_migrate_retry;
- - u64 node_stamp; /* migration stamp */
- - u64 last_task_numa_placement;
- - u64 last_sum_exec_runtime;
- - struct callback_head numa_work;
- -
- - struct list_head numa_entry;
- - struct numa_group *numa_group;
- -
- - /*
- - * numa_faults is an array split into four regions:
- - * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
- - * in this precise order.
- - *
- - * faults_memory: Exponential decaying average of faults on a per-node
- - * basis. Scheduling placement decisions are made based on these
- - * counts. The values remain static for the duration of a PTE scan.
- - * faults_cpu: Track the nodes the process was running on when a NUMA
- - * hinting fault was incurred.
- - * faults_memory_buffer and faults_cpu_buffer: Record faults per node
- - * during the current scan window. When the scan completes, the counts
- - * in faults_memory and faults_cpu decay and these values are copied.
- - */
- - unsigned long *numa_faults;
- - unsigned long total_numa_faults;
- -
- - /*
- - * numa_faults_locality tracks if faults recorded during the last
- - * scan window were remote/local or failed to migrate. The task scan
- - * period is adapted based on the locality of the faults with different
- - * weights depending on whether they were shared or private faults
- - */
- - unsigned long numa_faults_locality[3];
- -
- - unsigned long numa_pages_migrated;
- -#endif /* CONFIG_NUMA_BALANCING */
- -
- -#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
- - struct tlbflush_unmap_batch tlb_ubc;
- -#endif
- -
- - struct rcu_head rcu;
- -
- - /*
- - * cache last used pipe for splice
- - */
- - struct pipe_inode_info *splice_pipe;
- -
- - struct page_frag task_frag;
- -
- -#ifdef CONFIG_TASK_DELAY_ACCT
- - struct task_delay_info *delays;
- -#endif
- -#ifdef CONFIG_FAULT_INJECTION
- - int make_it_fail;
- -#endif
- - /*
- - * when (nr_dirtied >= nr_dirtied_pause), it's time to call
- - * balance_dirty_pages() for some dirty throttling pause
- - */
- - int nr_dirtied;
- - int nr_dirtied_pause;
- - unsigned long dirty_paused_when; /* start of a write-and-pause period */
- -
- -#ifdef CONFIG_LATENCYTOP
- - int latency_record_count;
- - struct latency_record latency_record[LT_SAVECOUNT];
- -#endif
- - /*
- - * time slack values; these are used to round up poll() and
- - * select() etc timeout values. These are in nanoseconds.
- - */
- - unsigned long timer_slack_ns;
- - unsigned long default_timer_slack_ns;
- -
- -#ifdef CONFIG_KASAN
- - unsigned int kasan_depth;
- -#endif
- -#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- - /* Index of current stored address in ret_stack */
- - int curr_ret_stack;
- - /* Stack of return addresses for return function tracing */
- - struct ftrace_ret_stack *ret_stack;
- - /* time stamp for last schedule */
- - unsigned long long ftrace_timestamp;
- - /*
- - * Number of functions that haven't been traced
- - * because of depth overrun.
- - */
- - atomic_t trace_overrun;
- - /* Pause for the tracing */
- - atomic_t tracing_graph_pause;
- -#endif
- -#ifdef CONFIG_TRACING
- - /* state flags for use by tracers */
- - unsigned long trace;
- - /* bitmask and counter of trace recursion */
- - unsigned long trace_recursion;
- -#endif /* CONFIG_TRACING */
- -#ifdef CONFIG_MEMCG
- - struct mem_cgroup *memcg_in_oom;
- - gfp_t memcg_oom_gfp_mask;
- - int memcg_oom_order;
- -
- - /* number of pages to reclaim on returning to userland */
- - unsigned int memcg_nr_pages_over_high;
- -#endif
- -#ifdef CONFIG_UPROBES
- - struct uprobe_task *utask;
- -#endif
- -#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
- - unsigned int sequential_io;
- - unsigned int sequential_io_avg;
- -#endif
- -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- - unsigned long task_state_change;
- -#endif
- - int pagefault_disabled;
- -/* CPU-specific state of this task */
- - struct thread_struct thread;
- -/*
- - * WARNING: on x86, 'thread_struct' contains a variable-sized
- - * structure. It *MUST* be at the end of 'task_struct'.
- - *
- - * Do not put anything below here!
- - */
- -};
- -
- -#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
- -extern int arch_task_struct_size __read_mostly;
- -#else
- -# define arch_task_struct_size (sizeof(struct task_struct))
- -#endif
- -
- -/* Future-safe accessor for struct task_struct's cpus_allowed. */
- -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
- -
- -#define TNF_MIGRATED 0x01
- -#define TNF_NO_GROUP 0x02
- -#define TNF_SHARED 0x04
- -#define TNF_FAULT_LOCAL 0x08
- -#define TNF_MIGRATE_FAIL 0x10
- -
- -#ifdef CONFIG_NUMA_BALANCING
- -extern void task_numa_fault(int last_node, int node, int pages, int flags);
- -extern pid_t task_numa_group_id(struct task_struct *p);
- -extern void set_numabalancing_state(bool enabled);
- -extern void task_numa_free(struct task_struct *p);
- -extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
- - int src_nid, int dst_cpu);
- -#else
- -static inline void task_numa_fault(int last_node, int node, int pages,
- - int flags)
- -{
- -}
- -static inline pid_t task_numa_group_id(struct task_struct *p)
- -{
- - return 0;
- -}
- -static inline void set_numabalancing_state(bool enabled)
- -{
- -}
- -static inline void task_numa_free(struct task_struct *p)
- -{
- -}
- -static inline bool should_numa_migrate_memory(struct task_struct *p,
- - struct page *page, int src_nid, int dst_cpu)
- -{
- - return true;
- -}
- -#endif
- -
- -static inline struct pid *task_pid(struct task_struct *task)
- -{
- - return task->pids[PIDTYPE_PID].pid;
- -}
- -
- -static inline struct pid *task_tgid(struct task_struct *task)
- -{
- - return task->group_leader->pids[PIDTYPE_PID].pid;
- -}
- -
- -/*
- - * Without tasklist or rcu lock it is not safe to dereference
- - * the result of task_pgrp/task_session even if task == current,
- - * we can race with another thread doing sys_setsid/sys_setpgid.
- - */
- -static inline struct pid *task_pgrp(struct task_struct *task)
- -{
- - return task->group_leader->pids[PIDTYPE_PGID].pid;
- -}
- -
- -static inline struct pid *task_session(struct task_struct *task)
- -{
- - return task->group_leader->pids[PIDTYPE_SID].pid;
- -}
- -
- -struct pid_namespace;
- -
- -/*
- - * the helpers to get the task's different pids as they are seen
- - * from various namespaces
- - *
- - * task_xid_nr() : global id, i.e. the id seen from the init namespace;
- - * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
- - * current.
- - * task_xid_nr_ns() : id seen from the ns specified;
- - *
- - * set_task_vxid() : assigns a virtual id to a task;
- - *
- - * see also pid_nr() etc in include/linux/pid.h
- - */
- -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
- - struct pid_namespace *ns);
- -
- -static inline pid_t task_pid_nr(struct task_struct *tsk)
- -{
- - return tsk->pid;
- -}
- -
- -static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
- - struct pid_namespace *ns)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
- -}
- -
- -static inline pid_t task_pid_vnr(struct task_struct *tsk)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
- -}
- -
- -
- -static inline pid_t task_tgid_nr(struct task_struct *tsk)
- -{
- - return tsk->tgid;
- -}
- -
- -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
- -
- -static inline pid_t task_tgid_vnr(struct task_struct *tsk)
- -{
- - return pid_vnr(task_tgid(tsk));
- -}
- -
- -
- -static inline int pid_alive(const struct task_struct *p);
- -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
- -{
- - pid_t pid = 0;
- -
- - rcu_read_lock();
- - if (pid_alive(tsk))
- - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
- - rcu_read_unlock();
- -
- - return pid;
- -}
- -
- -static inline pid_t task_ppid_nr(const struct task_struct *tsk)
- -{
- - return task_ppid_nr_ns(tsk, &init_pid_ns);
- -}
- -
- -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
- - struct pid_namespace *ns)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
- -}
- -
- -static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
- -}
- -
- -
- -static inline pid_t task_session_nr_ns(struct task_struct *tsk,
- - struct pid_namespace *ns)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
- -}
- -
- -static inline pid_t task_session_vnr(struct task_struct *tsk)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
- -}
- -
- -/* obsolete, do not use */
- -static inline pid_t task_pgrp_nr(struct task_struct *tsk)
- -{
- - return task_pgrp_nr_ns(tsk, &init_pid_ns);
- -}
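
The comment above distinguishes the global id (task_xid_nr), the id as seen from current's pid namespace (task_xid_vnr), and the id as seen from an explicitly given namespace (task_xid_nr_ns). A small kernel-side sketch of the first two, using a hypothetical report_ids() helper:

/* sketch of kernel code that already holds a valid task pointer */
static void report_ids(struct task_struct *p)
{
	pr_info("%s: pid %d in the init ns, pid %d in current's ns\n",
		p->comm,
		task_pid_nr(p),    /* global id */
		task_pid_vnr(p));  /* id relative to current's pid namespace */
}
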
- -
- -/**
- - * pid_alive - check that a task structure is not stale
- - * @p: Task structure to be checked.
- - *
- - * Test if a process is not yet dead (at most zombie state).
- - * If pid_alive fails, then pointers within the task structure
- - * can be stale and must not be dereferenced.
- - *
- - * Return: 1 if the process is alive. 0 otherwise.
- - */
- -static inline int pid_alive(const struct task_struct *p)
- -{
- - return p->pids[PIDTYPE_PID].pid != NULL;
- -}
- -
- -/**
- - * is_global_init - check if a task structure is init. Since init
- - * is free to have sub-threads we need to check tgid.
- - * @tsk: Task structure to be checked.
- - *
- - * Check if a task structure is the first user space task the kernel created.
- - *
- - * Return: 1 if the task structure is init. 0 otherwise.
- - */
- -static inline int is_global_init(struct task_struct *tsk)
- -{
- - return task_tgid_nr(tsk) == 1;
- -}
- -
- -extern struct pid *cad_pid;
- -
- -extern void free_task(struct task_struct *tsk);
- -#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
- -
- -extern void __put_task_struct(struct task_struct *t);
- -
- -static inline void put_task_struct(struct task_struct *t)
- -{
- - if (atomic_dec_and_test(&t->usage))
- - __put_task_struct(t);
- -}
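
get_task_struct()/put_task_struct() are the reference-count pair for keeping a task pointer alive past the locked region that produced it. A sketch of the usual pattern (keep_task()/drop_task() and the kept pointer are hypothetical names, not part of this header):

static struct task_struct *kept;

static void keep_task(struct task_struct *p)
{
	get_task_struct(p);        /* pin the task before stashing the pointer */
	kept = p;
}

static void drop_task(void)
{
	if (!kept)
		return;
	put_task_struct(kept);     /* may free the task_struct on the last put */
	kept = NULL;
}
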
- -
- -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- -extern void task_cputime(struct task_struct *t,
- - cputime_t *utime, cputime_t *stime);
- -extern void task_cputime_scaled(struct task_struct *t,
- - cputime_t *utimescaled, cputime_t *stimescaled);
- -extern cputime_t task_gtime(struct task_struct *t);
- -#else
- -static inline void task_cputime(struct task_struct *t,
- - cputime_t *utime, cputime_t *stime)
- -{
- - if (utime)
- - *utime = t->utime;
- - if (stime)
- - *stime = t->stime;
- -}
- -
- -static inline void task_cputime_scaled(struct task_struct *t,
- - cputime_t *utimescaled,
- - cputime_t *stimescaled)
- -{
- - if (utimescaled)
- - *utimescaled = t->utimescaled;
- - if (stimescaled)
- - *stimescaled = t->stimescaled;
- -}
- -
- -static inline cputime_t task_gtime(struct task_struct *t)
- -{
- - return t->gtime;
- -}
- -#endif
- -extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
- -extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
- -
- -/*
- - * Per process flags
- - */
- -#define PF_EXITING 0x00000004 /* getting shut down */
- -#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
- -#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
- -#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
- -#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
- -#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
- -#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
- -#define PF_DUMPCORE 0x00000200 /* dumped core */
- -#define PF_SIGNALED 0x00000400 /* killed by a signal */
- -#define PF_MEMALLOC 0x00000800 /* Allocating memory */
- -#define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */
- -#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
- -#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */
- -#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
- -#define PF_FROZEN 0x00010000 /* frozen for system suspend */
- -#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
- -#define PF_KSWAPD 0x00040000 /* I am kswapd */
- -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
- -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
- -#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
- -#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
- -#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
- -#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
- -#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
- -#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
- -#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
- -#define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */
- -
- -/*
- - * Only the _current_ task can read/write to tsk->flags, but other
- - * tasks can access tsk->flags in readonly mode for example
- - * with tsk_used_math (like during threaded core dumping).
- - * There is however an exception to this rule during ptrace
- - * or during fork: the ptracer task is allowed to write to the
- - * child->flags of its traced child (same goes for fork, the parent
- - * can write to the child->flags), because we're guaranteed the
- - * child is not running and in turn not changing child->flags
- - * at the same time the parent does it.
- - */
- -#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
- -#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
- -#define clear_used_math() clear_stopped_child_used_math(current)
- -#define set_used_math() set_stopped_child_used_math(current)
- -#define conditional_stopped_child_used_math(condition, child) \
- - do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
- -#define conditional_used_math(condition) \
- - conditional_stopped_child_used_math(condition, current)
- -#define copy_to_stopped_child_used_math(child) \
- - do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
- -/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
- -#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
- -#define used_math() tsk_used_math(current)
- -
- -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
- - * __GFP_FS is also cleared as it implies __GFP_IO.
- - */
- -static inline gfp_t memalloc_noio_flags(gfp_t flags)
- -{
- - if (unlikely(current->flags & PF_MEMALLOC_NOIO))
- - flags &= ~(__GFP_IO | __GFP_FS);
- - return flags;
- -}
- -
- -static inline unsigned int memalloc_noio_save(void)
- -{
- - unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
- - current->flags |= PF_MEMALLOC_NOIO;
- - return flags;
- -}
- -
- -static inline void memalloc_noio_restore(unsigned int flags)
- -{
- - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
- -}
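
memalloc_noio_save()/memalloc_noio_restore() bracket a region in which every allocation is filtered through memalloc_noio_flags() and therefore cannot start new I/O; this is how reclaim-sensitive paths (e.g. some block drivers) avoid recursing into themselves. A sketch under that assumption (the function name is hypothetical):

static void reclaim_sensitive_work(void)
{
	unsigned int noio_flags;
	void *buf;

	noio_flags = memalloc_noio_save();

	/* __GFP_IO/__GFP_FS are implicitly stripped from this allocation */
	buf = kmalloc(4096, GFP_KERNEL);
	kfree(buf);

	memalloc_noio_restore(noio_flags);
}
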
- -
- -/* Per-process atomic flags. */
- -#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */
- -#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */
- -#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */
- -
- -
- -#define TASK_PFA_TEST(name, func) \
- - static inline bool task_##func(struct task_struct *p) \
- - { return test_bit(PFA_##name, &p->atomic_flags); }
- -#define TASK_PFA_SET(name, func) \
- - static inline void task_set_##func(struct task_struct *p) \
- - { set_bit(PFA_##name, &p->atomic_flags); }
- -#define TASK_PFA_CLEAR(name, func) \
- - static inline void task_clear_##func(struct task_struct *p) \
- - { clear_bit(PFA_##name, &p->atomic_flags); }
- -
- -TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
- -TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)
- -
- -TASK_PFA_TEST(SPREAD_PAGE, spread_page)
- -TASK_PFA_SET(SPREAD_PAGE, spread_page)
- -TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)
- -
- -TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
- -TASK_PFA_SET(SPREAD_SLAB, spread_slab)
- -TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
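
The three TASK_PFA_* macros above stamp out one tiny accessor per atomic flag. For example, TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) expands (modulo whitespace) to:

static inline bool task_no_new_privs(struct task_struct *p)
{
	return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags);
}

Note that NO_NEW_PRIVS deliberately gets no CLEAR accessor above, so the flag can only ever be set, never cleared.
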
- -
- -/*
- - * task->jobctl flags
- - */
- -#define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */
- -
- -#define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */
- -#define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */
- -#define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */
- -#define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */
- -#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
- -#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
- -#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
- -
- -#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
- -#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
- -#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
- -#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT)
- -#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT)
- -#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
- -#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
- -
- -#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
- -#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
- -
- -extern bool task_set_jobctl_pending(struct task_struct *task,
- - unsigned long mask);
- -extern void task_clear_jobctl_trapping(struct task_struct *task);
- -extern void task_clear_jobctl_pending(struct task_struct *task,
- - unsigned long mask);
- -
- -static inline void rcu_copy_process(struct task_struct *p)
- -{
- -#ifdef CONFIG_PREEMPT_RCU
- - p->rcu_read_lock_nesting = 0;
- - p->rcu_read_unlock_special.s = 0;
- - p->rcu_blocked_node = NULL;
- - INIT_LIST_HEAD(&p->rcu_node_entry);
- -#endif /* #ifdef CONFIG_PREEMPT_RCU */
- -#ifdef CONFIG_TASKS_RCU
- - p->rcu_tasks_holdout = false;
- - INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
- - p->rcu_tasks_idle_cpu = -1;
- -#endif /* #ifdef CONFIG_TASKS_RCU */
- -}
- -
- -static inline void tsk_restore_flags(struct task_struct *task,
- - unsigned long orig_flags, unsigned long flags)
- -{
- - task->flags &= ~flags;
- - task->flags |= orig_flags & flags;
- -}
- -
- -extern int cpuset_cpumask_can_shrink(const struct cpumask *cur,
- - const struct cpumask *trial);
- -extern int task_can_attach(struct task_struct *p,
- - const struct cpumask *cs_cpus_allowed);
- -#ifdef CONFIG_SMP
- -extern void do_set_cpus_allowed(struct task_struct *p,
- - const struct cpumask *new_mask);
- -
- -extern int set_cpus_allowed_ptr(struct task_struct *p,
- - const struct cpumask *new_mask);
- -#else
- -static inline void do_set_cpus_allowed(struct task_struct *p,
- - const struct cpumask *new_mask)
- -{
- -}
- -static inline int set_cpus_allowed_ptr(struct task_struct *p,
- - const struct cpumask *new_mask)
- -{
- - if (!cpumask_test_cpu(0, new_mask))
- - return -EINVAL;
- - return 0;
- -}
- -#endif
- -
- -#ifdef CONFIG_NO_HZ_COMMON
- -void calc_load_enter_idle(void);
- -void calc_load_exit_idle(void);
- -#else
- -static inline void calc_load_enter_idle(void) { }
- -static inline void calc_load_exit_idle(void) { }
- -#endif /* CONFIG_NO_HZ_COMMON */
- -
- -/*
- - * Do not use outside of architecture code which knows its limitations.
- - *
- - * sched_clock() has no promise of monotonicity or bounded drift between
- - * CPUs; using it (which you should not) requires disabling IRQs.
- - *
- - * Please use one of the three interfaces below.
- - */
- -extern unsigned long long notrace sched_clock(void);
- -/*
- - * See the comment in kernel/sched/clock.c
- - */
- -extern u64 cpu_clock(int cpu);
- -extern u64 local_clock(void);
- -extern u64 running_clock(void);
- -extern u64 sched_clock_cpu(int cpu);
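
Per the comment above, raw sched_clock() is reserved for architecture code; generic code should use local_clock()/cpu_clock(), which return nanoseconds that are safe to difference on the same CPU. A kernel-side sketch of timing a section with local_clock() (do_the_work() is a hypothetical workload):

static void time_the_work(void)
{
	u64 t0, t1;

	t0 = local_clock();
	do_the_work();                    /* hypothetical workload */
	t1 = local_clock();

	pr_info("work took %llu ns\n", (unsigned long long)(t1 - t0));
}
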
- -
- -
- -extern void sched_clock_init(void);
- -
- -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- -static inline void sched_clock_tick(void)
- -{
- -}
- -
- -static inline void sched_clock_idle_sleep_event(void)
- -{
- -}
- -
- -static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
- -{
- -}
- -#else
- -/*
- - * Architectures can set this to 1 if they have specified
- - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- - * but then during bootup it turns out that sched_clock()
- - * is reliable after all:
- - */
- -extern int sched_clock_stable(void);
- -extern void set_sched_clock_stable(void);
- -extern void clear_sched_clock_stable(void);
- -
- -extern void sched_clock_tick(void);
- -extern void sched_clock_idle_sleep_event(void);
- -extern void sched_clock_idle_wakeup_event(u64 delta_ns);
- -#endif
- -
- -#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- -/*
- - * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
- - * The reason for this explicit opt-in is not to have perf penalty with
- - * slow sched_clocks.
- - */
- -extern void enable_sched_clock_irqtime(void);
- -extern void disable_sched_clock_irqtime(void);
- -#else
- -static inline void enable_sched_clock_irqtime(void) {}
- -static inline void disable_sched_clock_irqtime(void) {}
- -#endif
- -
- -extern unsigned long long
- -task_sched_runtime(struct task_struct *task);
- -
- -/* sched_exec is called by processes performing an exec */
- -#ifdef CONFIG_SMP
- -extern void sched_exec(void);
- -#else
- -#define sched_exec() {}
- -#endif
- -
- -extern void sched_clock_idle_sleep_event(void);
- -extern void sched_clock_idle_wakeup_event(u64 delta_ns);
- -
- -#ifdef CONFIG_HOTPLUG_CPU
- -extern void idle_task_exit(void);
- -#else
- -static inline void idle_task_exit(void) {}
- -#endif
- -
- -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
- -extern void wake_up_nohz_cpu(int cpu);
- -#else
- -static inline void wake_up_nohz_cpu(int cpu) { }
- -#endif
- -
- -#ifdef CONFIG_NO_HZ_FULL
- -extern bool sched_can_stop_tick(void);
- -extern u64 scheduler_tick_max_deferment(void);
- -#else
- -static inline bool sched_can_stop_tick(void) { return false; }
- -#endif
- -
- -#ifdef CONFIG_SCHED_AUTOGROUP
- -extern void sched_autogroup_create_attach(struct task_struct *p);
- -extern void sched_autogroup_detach(struct task_struct *p);
- -extern void sched_autogroup_fork(struct signal_struct *sig);
- -extern void sched_autogroup_exit(struct signal_struct *sig);
- -#ifdef CONFIG_PROC_FS
- -extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
- -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
- -#endif
- -#else
- -static inline void sched_autogroup_create_attach(struct task_struct *p) { }
- -static inline void sched_autogroup_detach(struct task_struct *p) { }
- -static inline void sched_autogroup_fork(struct signal_struct *sig) { }
- -static inline void sched_autogroup_exit(struct signal_struct *sig) { }
- -#endif
- -
- -extern int yield_to(struct task_struct *p, bool preempt);
- -extern void set_user_nice(struct task_struct *p, long nice);
- -extern int task_prio(const struct task_struct *p);
- -/**
- - * task_nice - return the nice value of a given task.
- - * @p: the task in question.
- - *
- - * Return: The nice value [ -20 ... 0 ... 19 ].
- - */
- -static inline int task_nice(const struct task_struct *p)
- -{
- - return PRIO_TO_NICE((p)->static_prio);
- -}
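
task_nice() is just the inverse of the nice-to-priority mapping: with the stock MAX_RT_PRIO of 100, nice n maps to static_prio = 100 + 20 + n, so nice 0 is priority 120 and nice -20 is 100. A standalone round-trip check under those assumed constants:

#include <stdio.h>

#define MAX_RT_PRIO     100                        /* assumed stock value */
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
#define PRIO_TO_NICE(p) ((p) - MAX_RT_PRIO - 20)

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice++)
		if (PRIO_TO_NICE(NICE_TO_PRIO(nice)) != nice)
			printf("round-trip mismatch at nice %d\n", nice);

	printf("nice 0 -> static_prio %d\n", NICE_TO_PRIO(0));   /* 120 */
	return 0;
}
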
- -extern int can_nice(const struct task_struct *p, const int nice);
- -extern int task_curr(const struct task_struct *p);
- -extern int idle_cpu(int cpu);
- -extern int sched_setscheduler(struct task_struct *, int,
- - const struct sched_param *);
- -extern int sched_setscheduler_nocheck(struct task_struct *, int,
- - const struct sched_param *);
- -extern int sched_setattr(struct task_struct *,
- - const struct sched_attr *);
- -extern struct task_struct *idle_task(int cpu);
- -/**
- - * is_idle_task - is the specified task an idle task?
- - * @p: the task in question.
- - *
- - * Return: 1 if @p is an idle task. 0 otherwise.
- - */
- -static inline bool is_idle_task(const struct task_struct *p)
- -{
- - return p->pid == 0;
- -}
- -extern struct task_struct *curr_task(int cpu);
- -extern void set_curr_task(int cpu, struct task_struct *p);
- -
- -void yield(void);
- -
- -union thread_union {
- - struct thread_info thread_info;
- - unsigned long stack[THREAD_SIZE/sizeof(long)];
- -};
- -
- -#ifndef __HAVE_ARCH_KSTACK_END
- -static inline int kstack_end(void *addr)
- -{
- - /* Reliable end of stack detection:
- - * Some APM bios versions misalign the stack
- - */
- - return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
- -}
- -#endif
- -
- -extern union thread_union init_thread_union;
- -extern struct task_struct init_task;
- -
- -extern struct mm_struct init_mm;
- -
- -extern struct pid_namespace init_pid_ns;
- -
- -/*
- - * find a task by one of its numerical ids
- - *
- - * find_task_by_pid_ns():
- - * finds a task by its pid in the specified namespace
- - * find_task_by_vpid():
- - * finds a task by its virtual pid
- - *
- - * see also find_vpid() etc in include/linux/pid.h
- - */
- -
- -extern struct task_struct *find_task_by_vpid(pid_t nr);
- -extern struct task_struct *find_task_by_pid_ns(pid_t nr,
- - struct pid_namespace *ns);
- -
- -/* per-UID process charging. */
- -extern struct user_struct * alloc_uid(kuid_t);
- -static inline struct user_struct *get_uid(struct user_struct *u)
- -{
- - atomic_inc(&u->__count);
- - return u;
- -}
- -extern void free_uid(struct user_struct *);
- -
- -#include <asm/current.h>
- -
- -extern void xtime_update(unsigned long ticks);
- -
- -extern int wake_up_state(struct task_struct *tsk, unsigned int state);
- -extern int wake_up_process(struct task_struct *tsk);
- -extern void wake_up_new_task(struct task_struct *tsk);
- -#ifdef CONFIG_SMP
- - extern void kick_process(struct task_struct *tsk);
- -#else
- - static inline void kick_process(struct task_struct *tsk) { }
- -#endif
- -extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
- -extern void sched_dead(struct task_struct *p);
- -
- -extern void proc_caches_init(void);
- -extern void flush_signals(struct task_struct *);
- -extern void ignore_signals(struct task_struct *);
- -extern void flush_signal_handlers(struct task_struct *, int force_default);
- -extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
- -
- -static inline int kernel_dequeue_signal(siginfo_t *info)
- -{
- - struct task_struct *tsk = current;
- - siginfo_t __info;
- - int ret;
- -
- - spin_lock_irq(&tsk->sighand->siglock);
- - ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
- - spin_unlock_irq(&tsk->sighand->siglock);
- -
- - return ret;
- -}
- -
- -static inline void kernel_signal_stop(void)
- -{
- - spin_lock_irq(&current->sighand->siglock);
- - if (current->jobctl & JOBCTL_STOP_DEQUEUED)
- - __set_current_state(TASK_STOPPED);
- - spin_unlock_irq(&current->sighand->siglock);
- -
- - schedule();
- -}
- -
- -extern void release_task(struct task_struct * p);
- -extern int send_sig_info(int, struct siginfo *, struct task_struct *);
- -extern int force_sigsegv(int, struct task_struct *);
- -extern int force_sig_info(int, struct siginfo *, struct task_struct *);
- -extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
- -extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
- -extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
- - const struct cred *, u32);
- -extern int kill_pgrp(struct pid *pid, int sig, int priv);
- -extern int kill_pid(struct pid *pid, int sig, int priv);
- -extern int kill_proc_info(int, struct siginfo *, pid_t);
- -extern __must_check bool do_notify_parent(struct task_struct *, int);
- -extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
- -extern void force_sig(int, struct task_struct *);
- -extern int send_sig(int, struct task_struct *, int);
- -extern int zap_other_threads(struct task_struct *p);
- -extern struct sigqueue *sigqueue_alloc(void);
- -extern void sigqueue_free(struct sigqueue *);
- -extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
- -extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
- -
- -static inline void restore_saved_sigmask(void)
- -{
- - if (test_and_clear_restore_sigmask())
- - __set_current_blocked(&current->saved_sigmask);
- -}
- -
- -static inline sigset_t *sigmask_to_save(void)
- -{
- - sigset_t *res = &current->blocked;
- - if (unlikely(test_restore_sigmask()))
- - res = &current->saved_sigmask;
- - return res;
- -}
- -
- -static inline int kill_cad_pid(int sig, int priv)
- -{
- - return kill_pid(cad_pid, sig, priv);
- -}
- -
- -/* These can be the second arg to send_sig_info/send_group_sig_info. */
- -#define SEND_SIG_NOINFO ((struct siginfo *) 0)
- -#define SEND_SIG_PRIV ((struct siginfo *) 1)
- -#define SEND_SIG_FORCED ((struct siginfo *) 2)
- -
- -/*
- - * True if we are on the alternate signal stack.
- - */
- -static inline int on_sig_stack(unsigned long sp)
- -{
- -#ifdef CONFIG_STACK_GROWSUP
- - return sp >= current->sas_ss_sp &&
- - sp - current->sas_ss_sp < current->sas_ss_size;
- -#else
- - return sp > current->sas_ss_sp &&
- - sp - current->sas_ss_sp <= current->sas_ss_size;
- -#endif
- -}
- -
- -static inline int sas_ss_flags(unsigned long sp)
- -{
- - if (!current->sas_ss_size)
- - return SS_DISABLE;
- -
- - return on_sig_stack(sp) ? SS_ONSTACK : 0;
- -}
- -
- -static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
- -{
- - if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
- -#ifdef CONFIG_STACK_GROWSUP
- - return current->sas_ss_sp;
- -#else
- - return current->sas_ss_sp + current->sas_ss_size;
- -#endif
- - return sp;
- -}
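
sigsp() only picks the alternate stack when the handler was registered with SA_ONSTACK and the task is not already running on it; the alternate stack itself comes from userspace via sigaltstack(2). A minimal userspace sketch of that setup:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void handler(int sig)
{
	/* runs on the alternate stack because of SA_ONSTACK */
	(void)sig;
}

int main(void)
{
	stack_t ss = {
		.ss_sp    = malloc(SIGSTKSZ),
		.ss_size  = SIGSTKSZ,
		.ss_flags = 0,
	};
	struct sigaction sa = {
		.sa_handler = handler,
		.sa_flags   = SA_ONSTACK,
	};

	if (!ss.ss_sp || sigaltstack(&ss, NULL) || sigaction(SIGUSR1, &sa, NULL)) {
		perror("setup");
		return 1;
	}
	raise(SIGUSR1);     /* handler entered with sp on the alternate stack */
	return 0;
}
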
- -
- -/*
- - * Routines for handling mm_structs
- - */
- -extern struct mm_struct * mm_alloc(void);
- -
- -/* mmdrop drops the mm and the page tables */
- -extern void __mmdrop(struct mm_struct *);
- -static inline void mmdrop(struct mm_struct * mm)
- -{
- - if (unlikely(atomic_dec_and_test(&mm->mm_count)))
- - __mmdrop(mm);
- -}
- -
- -/* mmput gets rid of the mappings and all user-space */
- -extern void mmput(struct mm_struct *);
- -/* Grab a reference to a task's mm, if it is not already going away */
- -extern struct mm_struct *get_task_mm(struct task_struct *task);
- -/*
- - * Grab a reference to a task's mm, if it is not already going away
- - * and ptrace_may_access with the mode parameter passed to it
- - * succeeds.
- - */
- -extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
- -/* Remove the current tasks stale references to the old mm_struct */
- -extern void mm_release(struct task_struct *, struct mm_struct *);
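
get_task_mm() takes a reference on the task's mm only if the task still has one (kernel threads get NULL), and the caller must drop it with mmput(). The usual pattern, as a sketch (inspect_mm() is hypothetical, and reading map_count without mmap_sem is only acceptable for a debug print):

static void inspect_mm(struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);

	if (!mm)
		return;                 /* kernel thread, or mm already gone */

	pr_info("%s: %d VMAs\n", task->comm, mm->map_count);
	mmput(mm);                      /* drop the reference taken above */
}
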
- -
- -#ifdef CONFIG_HAVE_COPY_THREAD_TLS
- -extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
- - struct task_struct *, unsigned long);
- -#else
- -extern int copy_thread(unsigned long, unsigned long, unsigned long,
- - struct task_struct *);
- -
- -/* Architectures that haven't opted into copy_thread_tls get the tls argument
- - * via pt_regs, so ignore the tls argument passed via C. */
- -static inline int copy_thread_tls(
- - unsigned long clone_flags, unsigned long sp, unsigned long arg,
- - struct task_struct *p, unsigned long tls)
- -{
- - return copy_thread(clone_flags, sp, arg, p);
- -}
- -#endif
- -extern void flush_thread(void);
- -extern void exit_thread(void);
- -
- -extern void exit_files(struct task_struct *);
- -extern void __cleanup_sighand(struct sighand_struct *);
- -
- -extern void exit_itimers(struct signal_struct *);
- -extern void flush_itimer_signals(void);
- -
- -extern void do_group_exit(int);
- -
- -extern int do_execve(struct filename *,
- - const char __user * const __user *,
- - const char __user * const __user *);
- -extern int do_execveat(int, struct filename *,
- - const char __user * const __user *,
- - const char __user * const __user *,
- - int);
- -extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
- -extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
- -struct task_struct *fork_idle(int);
- -extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
- -
- -extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
- -static inline void set_task_comm(struct task_struct *tsk, const char *from)
- -{
- - __set_task_comm(tsk, from, false);
- -}
- -extern char *get_task_comm(char *to, struct task_struct *tsk);
- -
- -#ifdef CONFIG_SMP
- -void scheduler_ipi(void);
- -extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
- -#else
- -static inline void scheduler_ipi(void) { }
- -static inline unsigned long wait_task_inactive(struct task_struct *p,
- - long match_state)
- -{
- - return 1;
- -}
- -#endif
- -
- -#define tasklist_empty() \
- - list_empty(&init_task.tasks)
- -
- -#define next_task(p) \
- - list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
- -
- -#define for_each_process(p) \
- - for (p = &init_task ; (p = next_task(p)) != &init_task ; )
- -
- -extern bool current_is_single_threaded(void);
- -
- -/*
- - * Careful: do_each_thread/while_each_thread is a double loop so
- - * 'break' will not work as expected - use goto instead.
- - */
- -#define do_each_thread(g, t) \
- - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
- -
- -#define while_each_thread(g, t) \
- - while ((t = next_thread(t)) != g)
- -
- -#define __for_each_thread(signal, t) \
- - list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
- -
- -#define for_each_thread(p, t) \
- - __for_each_thread((p)->signal, t)
- -
- -/* Careful: this is a double loop, 'break' won't work as expected. */
- -#define for_each_process_thread(p, t) \
- - for_each_process(p) for_each_thread(p, t)
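
These iterators walk RCU-protected lists, so a reader must hold rcu_read_lock() (or tasklist_lock) for the duration of the walk; and as the comments warn, the do_each_thread/while_each_thread form is a double loop in which break does not exit cleanly. A sketch of walking every thread (the counting function is hypothetical):

static int count_running_threads(void)
{
	struct task_struct *p, *t;
	int running = 0;

	rcu_read_lock();
	for_each_process_thread(p, t)
		if (t->state == TASK_RUNNING)
			running++;
	rcu_read_unlock();

	return running;
}
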
- -
- -static inline int get_nr_threads(struct task_struct *tsk)
- -{
- - return tsk->signal->nr_threads;
- -}
- -
- -static inline bool thread_group_leader(struct task_struct *p)
- -{
- - return p->exit_signal >= 0;
- -}
- -
- -/* Due to the insanities of de_thread it is possible for a process
- - * to have the pid of the thread group leader without actually being
- - * the thread group leader. For iteration through the pids in proc
- - * all we care about is that we have a task with the appropriate
- - * pid, we don't actually care if we have the right task.
- - */
- -static inline bool has_group_leader_pid(struct task_struct *p)
- -{
- - return task_pid(p) == p->signal->leader_pid;
- -}
- -
- -static inline
- -bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
- -{
- - return p1->signal == p2->signal;
- -}
- -
- -static inline struct task_struct *next_thread(const struct task_struct *p)
- -{
- - return list_entry_rcu(p->thread_group.next,
- - struct task_struct, thread_group);
- -}
- -
- -static inline int thread_group_empty(struct task_struct *p)
- -{
- - return list_empty(&p->thread_group);
- -}
- -
- -#define delay_group_leader(p) \
- - (thread_group_leader(p) && !thread_group_empty(p))
- -
- -/*
- - * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
- - * subscriptions and synchronises with wait4(). Also used in procfs. Also
- - * pins the final release of task.io_context. Also protects ->cpuset and
- - * ->cgroup.subsys[]. And ->vfork_done.
- - *
- - * Nests both inside and outside of read_lock(&tasklist_lock).
- - * It must not be nested with write_lock_irq(&tasklist_lock),
- - * neither inside nor outside.
- - */
- -static inline void task_lock(struct task_struct *p)
- -{
- - spin_lock(&p->alloc_lock);
- -}
- -
- -static inline void task_unlock(struct task_struct *p)
- -{
- - spin_unlock(&p->alloc_lock);
- -}
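
task_lock() takes the alloc_lock documented above; the classic use is stabilizing ->comm (exactly what get_task_comm() does) or ->fs/->files while they are read. A sketch under that assumption (print_comm() is a hypothetical helper):

static void print_comm(struct task_struct *p)
{
	char name[TASK_COMM_LEN];

	task_lock(p);                   /* keeps p->comm from changing under us */
	strncpy(name, p->comm, sizeof(name));
	name[sizeof(name) - 1] = '\0';
	task_unlock(p);

	pr_info("task name: %s\n", name);
}
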
- -
- -extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
- - unsigned long *flags);
- -
- -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
- - unsigned long *flags)
- -{
- - struct sighand_struct *ret;
- -
- - ret = __lock_task_sighand(tsk, flags);
- - (void)__cond_lock(&tsk->sighand->siglock, ret);
- - return ret;
- -}
- -
- -static inline void unlock_task_sighand(struct task_struct *tsk,
- - unsigned long *flags)
- -{
- - spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
- -}
- -
- -/**
- - * threadgroup_change_begin - mark the beginning of changes to a threadgroup
- - * @tsk: task causing the changes
- - *
- - * All operations which modify a threadgroup - a new thread joining the
- - * group, death of a member thread (the assertion of PF_EXITING) and
- - * exec(2) dethreading the process and replacing the leader - are wrapped
- - * by threadgroup_change_{begin|end}(). This is to provide a place which
- - * subsystems needing threadgroup stability can hook into for
- - * synchronization.
- - */
- -static inline void threadgroup_change_begin(struct task_struct *tsk)
- -{
- - might_sleep();
- - cgroup_threadgroup_change_begin(tsk);
- -}
- -
- -/**
- - * threadgroup_change_end - mark the end of changes to a threadgroup
- - * @tsk: task causing the changes
- - *
- - * See threadgroup_change_begin().
- - */
- -static inline void threadgroup_change_end(struct task_struct *tsk)
- -{
- - cgroup_threadgroup_change_end(tsk);
- -}
- -
- -#ifndef __HAVE_THREAD_FUNCTIONS
- -
- -#define task_thread_info(task) ((struct thread_info *)(task)->stack)
- -#define task_stack_page(task) ((task)->stack)
- -
- -static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
- -{
- - *task_thread_info(p) = *task_thread_info(org);
- - task_thread_info(p)->task = p;
- -}
- -
- -/*
- - * Return the address of the last usable long on the stack.
- - *
- - * When the stack grows down, this is just above the thread
- - * info struct. Going any lower will corrupt the threadinfo.
- - *
- - * When the stack grows up, this is the highest address.
- - * Beyond that position, we corrupt data on the next page.
- - */
- -static inline unsigned long *end_of_stack(struct task_struct *p)
- -{
- -#ifdef CONFIG_STACK_GROWSUP
- - return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
- -#else
- - return (unsigned long *)(task_thread_info(p) + 1);
- -#endif
- -}
- -
- -#endif
- -#define task_stack_end_corrupted(task) \
- - (*(end_of_stack(task)) != STACK_END_MAGIC)
- -
- -static inline int object_is_on_stack(void *obj)
- -{
- - void *stack = task_stack_page(current);
- -
- - return (obj >= stack) && (obj < (stack + THREAD_SIZE));
- -}
- -
- -extern void thread_info_cache_init(void);
- -
- -#ifdef CONFIG_DEBUG_STACK_USAGE
- -static inline unsigned long stack_not_used(struct task_struct *p)
- -{
- - unsigned long *n = end_of_stack(p);
- -
- - do { /* Skip over canary */
- - n++;
- - } while (!*n);
- -
- - return (unsigned long)n - (unsigned long)end_of_stack(p);
- -}
- -#endif
- -extern void set_task_stack_end_magic(struct task_struct *tsk);
- -
- -/* set thread flags in other task's structures
- - * - see asm/thread_info.h for TIF_xxxx flags available
- - */
- -static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - set_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - clear_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - return test_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline void set_tsk_need_resched(struct task_struct *tsk)
- -{
- - set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
- -}
- -
- -static inline void clear_tsk_need_resched(struct task_struct *tsk)
- -{
- - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
- -}
- -
- -static inline int test_tsk_need_resched(struct task_struct *tsk)
- -{
- - return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
- -}
- -
- -static inline int restart_syscall(void)
- -{
- - set_tsk_thread_flag(current, TIF_SIGPENDING);
- - return -ERESTARTNOINTR;
- -}
- -
- -static inline int signal_pending(struct task_struct *p)
- -{
- - return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
- -}
- -
- -static inline int __fatal_signal_pending(struct task_struct *p)
- -{
- - return unlikely(sigismember(&p->pending.signal, SIGKILL));
- -}
- -
- -static inline int fatal_signal_pending(struct task_struct *p)
- -{
- - return signal_pending(p) && __fatal_signal_pending(p);
- -}
- -
- -static inline int signal_pending_state(long state, struct task_struct *p)
- -{
- - if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
- - return 0;
- - if (!signal_pending(p))
- - return 0;
- -
- - return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
- -}
- -
- -/*
- - * cond_resched() and cond_resched_lock(): latency reduction via
- - * explicit rescheduling in places that are safe. The return
- - * value indicates whether a reschedule was done in fact.
- - * cond_resched_lock() will drop the spinlock before scheduling,
- - * cond_resched_softirq() will enable bhs before scheduling.
- - */
- -extern int _cond_resched(void);
- -
- -#define cond_resched() ({ \
- - ___might_sleep(__FILE__, __LINE__, 0); \
- - _cond_resched(); \
- -})
- -
- -extern int __cond_resched_lock(spinlock_t *lock);
- -
- -#define cond_resched_lock(lock) ({ \
- - ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
- - __cond_resched_lock(lock); \
- -})
- -
- -extern int __cond_resched_softirq(void);
- -
- -#define cond_resched_softirq() ({ \
- - ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
- - __cond_resched_softirq(); \
- -})
- -
- -static inline void cond_resched_rcu(void)
- -{
- -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
- - rcu_read_unlock();
- - cond_resched();
- - rcu_read_lock();
- -#endif
- -}
- -
- -/*
- - * Does a critical section need to be broken due to another
- - * task waiting?: (technically does not depend on CONFIG_PREEMPT,
- - * but a general need for low latency)
- - */
- -static inline int spin_needbreak(spinlock_t *lock)
- -{
- -#ifdef CONFIG_PREEMPT
- - return spin_is_contended(lock);
- -#else
- - return 0;
- -#endif
- -}
- -
- -/*
- - * Idle thread specific functions to determine the need_resched
- - * polling state.
- - */
- -#ifdef TIF_POLLING_NRFLAG
- -static inline int tsk_is_polling(struct task_struct *p)
- -{
- - return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
- -}
- -
- -static inline void __current_set_polling(void)
- -{
- - set_thread_flag(TIF_POLLING_NRFLAG);
- -}
- -
- -static inline bool __must_check current_set_polling_and_test(void)
- -{
- - __current_set_polling();
- -
- - /*
- - * Polling state must be visible before we test NEED_RESCHED,
- - * paired by resched_curr()
- - */
- - smp_mb__after_atomic();
- -
- - return unlikely(tif_need_resched());
- -}
- -
- -static inline void __current_clr_polling(void)
- -{
- - clear_thread_flag(TIF_POLLING_NRFLAG);
- -}
- -
- -static inline bool __must_check current_clr_polling_and_test(void)
- -{
- - __current_clr_polling();
- -
- - /*
- - * Polling state must be visible before we test NEED_RESCHED,
- - * paired by resched_curr()
- - */
- - smp_mb__after_atomic();
- -
- - return unlikely(tif_need_resched());
- -}
- -
- -#else
- -static inline int tsk_is_polling(struct task_struct *p) { return 0; }
- -static inline void __current_set_polling(void) { }
- -static inline void __current_clr_polling(void) { }
- -
- -static inline bool __must_check current_set_polling_and_test(void)
- -{
- - return unlikely(tif_need_resched());
- -}
- -static inline bool __must_check current_clr_polling_and_test(void)
- -{
- - return unlikely(tif_need_resched());
- -}
- -#endif
- -
- -static inline void current_clr_polling(void)
- -{
- - __current_clr_polling();
- -
- - /*
- - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
- - * Once the bit is cleared, we'll get IPIs with every new
- - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
- - * fold.
- - */
- - smp_mb(); /* paired with resched_curr() */
- -
- - preempt_fold_need_resched();
- -}
- -
- -static __always_inline bool need_resched(void)
- -{
- - return unlikely(tif_need_resched());
- -}
- -
- -/*
- - * Thread group CPU time accounting.
- - */
- -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
- -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
- -
- -/*
- - * Reevaluate whether the task has signals pending delivery.
- - * Wake the task if so.
- - * This is required every time the blocked sigset_t changes.
- - * callers must hold sighand->siglock.
- - */
- -extern void recalc_sigpending_and_wake(struct task_struct *t);
- -extern void recalc_sigpending(void);
- -
- -extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
- -
- -static inline void signal_wake_up(struct task_struct *t, bool resume)
- -{
- - signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
- -}
- -static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
- -{
- - signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
- -}
- -
- -/*
- - * Wrappers for p->thread_info->cpu access. No-op on UP.
- - */
- -#ifdef CONFIG_SMP
- -
- -static inline unsigned int task_cpu(const struct task_struct *p)
- -{
- - return task_thread_info(p)->cpu;
- -}
- -
- -static inline int task_node(const struct task_struct *p)
- -{
- - return cpu_to_node(task_cpu(p));
- -}
- -
- -extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
- -
- -#else
- -
- -static inline unsigned int task_cpu(const struct task_struct *p)
- -{
- - return 0;
- -}
- -
- -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
- -{
- -}
- -
- -#endif /* CONFIG_SMP */
- -
- -extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
- -extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
- -
- -#ifdef CONFIG_CGROUP_SCHED
- -extern struct task_group root_task_group;
- -#endif /* CONFIG_CGROUP_SCHED */
- -
- -extern int task_can_switch_user(struct user_struct *up,
- - struct task_struct *tsk);
- -
- -#ifdef CONFIG_TASK_XACCT
- -static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
- -{
- - tsk->ioac.rchar += amt;
- -}
- -
- -static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
- -{
- - tsk->ioac.wchar += amt;
- -}
- -
- -static inline void inc_syscr(struct task_struct *tsk)
- -{
- - tsk->ioac.syscr++;
- -}
- -
- -static inline void inc_syscw(struct task_struct *tsk)
- -{
- - tsk->ioac.syscw++;
- -}
- -#else
- -static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
- -{
- -}
- -
- -static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
- -{
- -}
- -
- -static inline void inc_syscr(struct task_struct *tsk)
- -{
- -}
- -
- -static inline void inc_syscw(struct task_struct *tsk)
- -{
- -}
- -#endif
- -
- -#ifndef TASK_SIZE_OF
- -#define TASK_SIZE_OF(tsk) TASK_SIZE
- -#endif
- -
- -#ifdef CONFIG_MEMCG
- -extern void mm_update_next_owner(struct mm_struct *mm);
- -#else
- -static inline void mm_update_next_owner(struct mm_struct *mm)
- -{
- -}
- -#endif /* CONFIG_MEMCG */
- -
- -static inline unsigned long task_rlimit(const struct task_struct *tsk,
- - unsigned int limit)
- -{
- - return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
- -}
- -
- -static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
- - unsigned int limit)
- -{
- - return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
- -}
- -
- -static inline unsigned long rlimit(unsigned int limit)
- -{
- - return task_rlimit(current, limit);
- -}
- -
- -static inline unsigned long rlimit_max(unsigned int limit)
- -{
- - return task_rlimit_max(current, limit);
- -}
- -
- -#endif
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/wbt.h linux-4.4.6-gentoo-patched/include/linux/wbt.h
- --- linux-4.4.6-gentoo-orig/include/linux/wbt.h 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/wbt.h 2016-05-04 11:03:27.411730745 +0300
- @@ -0,0 +1,95 @@
- +#ifndef WB_THROTTLE_H
- +#define WB_THROTTLE_H
- +
- +#include <linux/atomic.h>
- +#include <linux/wait.h>
- +#include <linux/timer.h>
- +#include <linux/ktime.h>
- +
- +#define ISSUE_STAT_MASK (1ULL << 63)
- +#define ISSUE_STAT_TIME_MASK ~ISSUE_STAT_MASK
- +
- +struct wb_issue_stat {
- + u64 time;
- +};
- +
- +static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
- +{
- + stat->time = (stat->time & ISSUE_STAT_MASK) |
- + (ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
- +}
- +
- +static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
- +{
- + return stat->time & ISSUE_STAT_TIME_MASK;
- +}
- +
- +static inline void wbt_mark_tracked(struct wb_issue_stat *stat)
- +{
- + stat->time |= ISSUE_STAT_MASK;
- +}
- +
- +static inline void wbt_clear_tracked(struct wb_issue_stat *stat)
- +{
- + stat->time &= ~ISSUE_STAT_MASK;
- +}
- +
- +static inline bool wbt_tracked(struct wb_issue_stat *stat)
- +{
- + return (stat->time & ISSUE_STAT_MASK) != 0;
- +}
- +
- +struct wb_stat_ops {
- + void (*get)(void *, struct blk_rq_stat *);
- + void (*clear)(void *);
- +};
- +
- +struct rq_wb {
- + /*
- + * Settings that govern how we throttle
- + */
- + unsigned int wb_background; /* background writeback */
- + unsigned int wb_normal; /* normal writeback */
- + unsigned int wb_max; /* max throughput writeback */
- + unsigned int scale_step;
- +
- + u64 win_nsec; /* default window size */
- + u64 cur_win_nsec; /* current window size */
- +
- + unsigned int unknown_cnt;
- +
- + struct timer_list window_timer;
- +
- + s64 sync_issue;
- + void *sync_cookie;
- +
- + unsigned int wc;
- + unsigned int queue_depth;
- +
- + unsigned long last_issue; /* last non-throttled issue */
- + unsigned long last_comp; /* last non-throttled comp */
- + unsigned long min_lat_nsec;
- + struct backing_dev_info *bdi;
- + struct request_queue *q;
- + wait_queue_head_t wait;
- + atomic_t inflight;
- +
- + struct wb_stat_ops *stat_ops;
- + void *ops_data;
- +};
- +
- +struct backing_dev_info;
- +
- +void __wbt_done(struct rq_wb *);
- +void wbt_done(struct rq_wb *, struct wb_issue_stat *);
- +bool wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
- +struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
- +void wbt_exit(struct rq_wb *);
- +void wbt_update_limits(struct rq_wb *);
- +void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
- +void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
- +
- +void wbt_set_queue_depth(struct rq_wb *, unsigned int);
- +void wbt_set_write_cache(struct rq_wb *, bool);
- +
- +#endif
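The wb_issue_stat helpers in the new wbt.h above pack two things into a single u64: bit 63 flags a request tracked (accounted) by the throttler, and the low 63 bits hold the issue time in nanoseconds. A minimal user-space sketch of that packing, illustrative only and not part of the patch:

#include <stdint.h>
#include <stdio.h>

#define STAT_MASK      (1ULL << 63)             /* ISSUE_STAT_MASK      */
#define STAT_TIME_MASK (~STAT_MASK)              /* ISSUE_STAT_TIME_MASK */

int main(void)
{
	uint64_t stat = 0;
	uint64_t now_ns = 123456789ULL;          /* stand-in for ktime_get() */

	/* wbt_issue_stat_set_time(): keep the flag bit, update the time */
	stat = (stat & STAT_MASK) | (now_ns & STAT_TIME_MASK);
	/* wbt_mark_tracked(): request counts against the throttling depth */
	stat |= STAT_MASK;

	printf("tracked=%d time=%llu\n", (stat & STAT_MASK) != 0,
	       (unsigned long long)(stat & STAT_TIME_MASK));
	return 0;
}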
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/writeback.h linux-4.4.6-gentoo-patched/include/linux/writeback.h
- --- linux-4.4.6-gentoo-orig/include/linux/writeback.h 2016-05-04 11:19:37.618649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/writeback.h 2016-05-04 11:03:27.411730745 +0300
- @@ -106,6 +106,16 @@
- #endif
- };
- +static inline int wbc_to_write_cmd(struct writeback_control *wbc)
- +{
- + if (wbc->sync_mode == WB_SYNC_ALL)
- + return WRITE_SYNC;
- + else if (wbc->for_kupdate || wbc->for_background)
- + return WRITE_BG;
- +
- + return WRITE;
- +}
- +
- /*
- * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
- * and are measured against each other in. There always is one global
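The wbc_to_write_cmd() helper added above is how writeback callers tell the block layer which throttling class a buffered write belongs to: WB_SYNC_ALL (integrity) writeback becomes WRITE_SYNC, periodic/background writeback becomes the new WRITE_BG, and everything else stays plain WRITE. A small user-space model of that precedence, with made-up names, just to show that sync wins over background:

#include <stdio.h>

enum cmd { CMD_WRITE, CMD_WRITE_BG, CMD_WRITE_SYNC };

struct wbc_model {
	int sync_all;		/* WB_SYNC_ALL          */
	int for_kupdate;	/* periodic writeback   */
	int for_background;	/* background writeback */
};

static enum cmd to_write_cmd(const struct wbc_model *wbc)
{
	if (wbc->sync_all)
		return CMD_WRITE_SYNC;		/* sync always wins          */
	if (wbc->for_kupdate || wbc->for_background)
		return CMD_WRITE_BG;		/* throttled hardest by wbt  */
	return CMD_WRITE;
}

int main(void)
{
	struct wbc_model bg = { .for_background = 1 };
	struct wbc_model sync_bg = { .sync_all = 1, .for_background = 1 };

	printf("%d %d\n", to_write_cmd(&bg), to_write_cmd(&sync_bg)); /* 1 2 */
	return 0;
}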
- diff -Naur linux-4.4.6-gentoo-orig/include/trace/events/wbt.h linux-4.4.6-gentoo-patched/include/trace/events/wbt.h
- --- linux-4.4.6-gentoo-orig/include/trace/events/wbt.h 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/include/trace/events/wbt.h 2016-05-04 11:03:27.411730745 +0300
- @@ -0,0 +1,122 @@
- +#undef TRACE_SYSTEM
- +#define TRACE_SYSTEM wbt
- +
- +#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
- +#define _TRACE_WBT_H
- +
- +#include <linux/tracepoint.h>
- +#include <linux/wbt.h>
- +
- +/**
- + * wbt_stat - trace stats for blk_wb
- + * @stat: array of read/write stats
- + */
- +TRACE_EVENT(wbt_stat,
- +
- + TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
- +
- + TP_ARGS(bdi, stat),
- +
- + TP_STRUCT__entry(
- + __array(char, name, 32)
- + __field(s64, rmean)
- + __field(u64, rmin)
- + __field(u64, rmax)
- + __field(s64, rnr_samples)
- + __field(s64, rtime)
- + __field(s64, wmean)
- + __field(u64, wmin)
- + __field(u64, wmax)
- + __field(s64, wnr_samples)
- + __field(s64, wtime)
- + ),
- +
- + TP_fast_assign(
- + strncpy(__entry->name, dev_name(bdi->dev), 32);
- + __entry->rmean = stat[0].mean;
- + __entry->rmin = stat[0].min;
- + __entry->rmax = stat[0].max;
- + __entry->rnr_samples = stat[0].nr_samples;
- + __entry->wmean = stat[1].mean;
- + __entry->wmin = stat[1].min;
- + __entry->wmax = stat[1].max;
- + __entry->wnr_samples = stat[1].nr_samples;
- + ),
- +
- + TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
- + "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
- + __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
- + __entry->rnr_samples, __entry->wmean, __entry->wmin,
- + __entry->wmax, __entry->wnr_samples)
- +);
- +
- +/**
- + * wbt_lat - trace latency event
- + * @lat: latency trigger
- + */
- +TRACE_EVENT(wbt_lat,
- +
- + TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
- +
- + TP_ARGS(bdi, lat),
- +
- + TP_STRUCT__entry(
- + __array(char, name, 32)
- + __field(unsigned long, lat)
- + ),
- +
- + TP_fast_assign(
- + strncpy(__entry->name, dev_name(bdi->dev), 32);
- + __entry->lat = lat;
- + ),
- +
- + TP_printk("%s: latency %llu\n", __entry->name,
- + (unsigned long long) __entry->lat)
- +);
- +
- +/**
- + * wbt_step - trace wb event step
- + * @msg: context message
- + * @step: the current scale step count
- + * @window: the current monitoring window
- + * @bg: the current background queue limit
- + * @normal: the current normal writeback limit
- + * @max: the current max throughput writeback limit
- + */
- +TRACE_EVENT(wbt_step,
- +
- + TP_PROTO(struct backing_dev_info *bdi, const char *msg,
- + unsigned int step, unsigned long window, unsigned int bg,
- + unsigned int normal, unsigned int max),
- +
- + TP_ARGS(bdi, msg, step, window, bg, normal, max),
- +
- + TP_STRUCT__entry(
- + __array(char, name, 32)
- + __field(const char *, msg)
- + __field(unsigned int, step)
- + __field(unsigned long, window)
- + __field(unsigned int, bg)
- + __field(unsigned int, normal)
- + __field(unsigned int, max)
- + ),
- +
- + TP_fast_assign(
- + strncpy(__entry->name, dev_name(bdi->dev), 32);
- + __entry->msg = msg;
- + __entry->step = step;
- + __entry->window = window;
- + __entry->bg = bg;
- + __entry->normal = normal;
- + __entry->max = max;
- + ),
- +
- + TP_printk("%s: %s: step=%u, window=%lu, background=%u, normal=%u, max=%u\n",
- + __entry->name, __entry->msg, __entry->step, __entry->window,
- + __entry->bg, __entry->normal, __entry->max)
- +);
- +
- +#endif /* _TRACE_WBT_H */
- +
- +/* This part must be outside protection */
- +#include <trace/define_trace.h>
- diff -Naur linux-4.4.6-gentoo-orig/lib/Kconfig linux-4.4.6-gentoo-patched/lib/Kconfig
- --- linux-4.4.6-gentoo-orig/lib/Kconfig 2016-05-04 11:19:37.619649827 +0300
- +++ linux-4.4.6-gentoo-patched/lib/Kconfig 2016-05-04 11:03:27.411730745 +0300
- @@ -531,4 +531,7 @@
- config ARCH_HAS_MMIO_FLUSH
- bool
- +config WBT
- + bool
- +
- endmenu
- diff -Naur linux-4.4.6-gentoo-orig/lib/Makefile linux-4.4.6-gentoo-patched/lib/Makefile
- --- linux-4.4.6-gentoo-orig/lib/Makefile 2016-05-04 11:19:37.619649827 +0300
- +++ linux-4.4.6-gentoo-patched/lib/Makefile 2016-05-04 11:08:23.874706019 +0300
- @@ -164,6 +164,7 @@
- obj-$(CONFIG_SG_SPLIT) += sg_split.o
- obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
- +obj-$(CONFIG_WBT) += wbt.o
- libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
- fdt_empty_tree.o
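lib/wbt.c (next hunk) never reads request statistics directly; it goes through the wb_stat_ops get/clear callbacks declared in wbt.h, so whoever instantiates it can hand over per-window read/write latency numbers. A user-space model of that indirection follows; all names below are illustrative, not from the patch:

#include <stdio.h>

struct stat_model { long long min, max, mean, nr_samples; };

struct stat_ops_model {
	void (*get)(void *data, struct stat_model *stat); /* [0]=reads, [1]=writes */
	void (*clear)(void *data);
};

struct provider { struct stat_model window[2]; };

static void provider_get(void *data, struct stat_model *stat)
{
	struct provider *p = data;

	stat[0] = p->window[0];
	stat[1] = p->window[1];
}

static void provider_clear(void *data)
{
	struct provider *p = data;

	p->window[0] = (struct stat_model){ 0 };
	p->window[1] = (struct stat_model){ 0 };
}

int main(void)
{
	struct provider p = {
		.window = {
			{ .min = 120000, .nr_samples = 8 },	/* reads  */
			{ .min = 250000, .nr_samples = 40 },	/* writes */
		},
	};
	struct stat_ops_model ops = { .get = provider_get, .clear = provider_clear };
	struct stat_model stat[2];

	ops.get(&p, stat);	/* what latency_exceeded() does once per window */
	printf("read min %lld ns, %lld samples\n", stat[0].min, stat[0].nr_samples);
	ops.clear(&p);		/* what scale_up()/scale_down() do after a step  */
	return 0;
}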
- diff -Naur linux-4.4.6-gentoo-orig/lib/wbt.c linux-4.4.6-gentoo-patched/lib/wbt.c
- --- linux-4.4.6-gentoo-orig/lib/wbt.c 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/lib/wbt.c 2016-05-04 11:03:27.412730745 +0300
- @@ -0,0 +1,524 @@
- +/*
- + * buffered writeback throttling. loosely based on CoDel. We can't drop
- + * packets for IO scheduling, so the logic is something like this:
- + *
- + * - Monitor latencies in a defined window of time.
- + * - If the minimum latency in the above window exceeds some target, increment
- + * scaling step and scale down queue depth by a factor of 2x. The monitoring
- + * window is then shrunk to 100 / sqrt(scaling step + 1).
- + * - For any window where we don't have solid data on what the latencies
- + * look like, retain status quo.
- + * - If latencies look good, decrement scaling step.
- + *
- + * Copyright (C) 2016 Jens Axboe
- + *
- + * Things that (may) need changing:
- + *
- + * - Different scaling of background/normal/high priority writeback.
- + * We may have to violate guarantees for max.
- + * - We can have mismatches between the stat window and our window.
- + *
- + */
- +#include <linux/kernel.h>
- +#include <linux/blk_types.h>
- +#include <linux/slab.h>
- +#include <linux/backing-dev.h>
- +#include <linux/wbt.h>
- +
- +#define CREATE_TRACE_POINTS
- +#include <trace/events/wbt.h>
- +
- +enum {
- + /*
- + * Might need to be higher
- + */
- + RWB_MAX_DEPTH = 64,
- +
- + /*
- + * 100msec window
- + */
- + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
- +
- + /*
- + * Disregard stats, if we don't meet these minimums
- + */
- + RWB_MIN_WRITE_SAMPLES = 3,
- + RWB_MIN_READ_SAMPLES = 1,
- +
- + RWB_UNKNOWN_BUMP = 5,
- +};
- +
- +static inline bool rwb_enabled(struct rq_wb *rwb)
- +{
- + return rwb && rwb->wb_normal != 0;
- +}
- +
- +/*
- + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
- + * false if 'v' + 1 would be bigger than 'below'.
- + */
- +static bool atomic_inc_below(atomic_t *v, int below)
- +{
- + int cur = atomic_read(v);
- +
- + for (;;) {
- + int old;
- +
- + if (cur >= below)
- + return false;
- + old = atomic_cmpxchg(v, cur, cur + 1);
- + if (old == cur)
- + break;
- + cur = old;
- + }
- +
- + return true;
- +}
- +
- +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
- +{
- + if (rwb_enabled(rwb)) {
- + const unsigned long cur = jiffies;
- +
- + if (cur != *var)
- + *var = cur;
- + }
- +}
- +
- +void __wbt_done(struct rq_wb *rwb)
- +{
- + int inflight, limit = rwb->wb_normal;
- +
- + /*
- + * If the device does write back caching, drop further down
- + * before we wake people up.
- + */
- + if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
- + limit = 0;
- + else
- + limit = rwb->wb_normal;
- +
- + /*
- + * Don't wake anyone up if we are above the normal limit. If
- + * throttling got disabled (limit == 0) with waiters, ensure
- + * that we wake them up.
- + */
- + inflight = atomic_dec_return(&rwb->inflight);
- + if (limit && inflight >= limit) {
- + if (!rwb->wb_max)
- + wake_up_all(&rwb->wait);
- + return;
- + }
- +
- + if (waitqueue_active(&rwb->wait)) {
- + int diff = limit - inflight;
- +
- + if (!inflight || diff >= rwb->wb_background / 2)
- + wake_up_nr(&rwb->wait, 1);
- + }
- +}
- +
- +/*
- + * Called on completion of a request. Note that it's also called when
- + * a request is merged, when the request gets freed.
- + */
- +void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
- +{
- + if (!rwb)
- + return;
- +
- + if (!wbt_tracked(stat)) {
- + if (rwb->sync_cookie == stat) {
- + rwb->sync_issue = 0;
- + rwb->sync_cookie = NULL;
- + }
- +
- + wb_timestamp(rwb, &rwb->last_comp);
- + } else {
- + WARN_ON_ONCE(stat == rwb->sync_cookie);
- + __wbt_done(rwb);
- + wbt_clear_tracked(stat);
- + }
- +}
- +
- +static void calc_wb_limits(struct rq_wb *rwb)
- +{
- + unsigned int depth;
- +
- + if (!rwb->min_lat_nsec) {
- + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
- + return;
- + }
- +
- + depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
- +
- + /*
- + * Reduce max depth by 50%, and re-calculate normal/bg based on that
- + */
- + rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
- + rwb->wb_normal = (rwb->wb_max + 1) / 2;
- + rwb->wb_background = (rwb->wb_max + 3) / 4;
- +}
- +
- +static inline bool stat_sample_valid(struct blk_rq_stat *stat)
- +{
- + /*
- + * We need at least one read sample, and a minimum of
- + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
- + * that it's writes impacting us, and not just some sole read on
- + * a device that is in a lower power state.
- + */
- + return stat[0].nr_samples >= 1 &&
- + stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
- +}
- +
- +static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
- +{
- + u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
- +
- + if (!issue || !rwb->sync_cookie)
- + return 0;
- +
- + now = ktime_to_ns(ktime_get());
- + return now - issue;
- +}
- +
- +enum {
- + LAT_OK,
- + LAT_UNKNOWN,
- + LAT_EXCEEDED,
- +};
- +
- +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
- +{
- + u64 thislat;
- +
- + /*
- + * If our stored sync issue exceeds the window size, or it
- + * exceeds our min target AND we haven't logged any entries,
- + * flag the latency as exceeded.
- + */
- + thislat = rwb_sync_issue_lat(rwb);
- + if (thislat > rwb->cur_win_nsec ||
- + (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
- + trace_wbt_lat(rwb->bdi, thislat);
- + return LAT_EXCEEDED;
- + }
- +
- + if (!stat_sample_valid(stat))
- + return LAT_UNKNOWN;
- +
- + /*
- + * If the 'min' latency exceeds our target, step down.
- + */
- + if (stat[0].min > rwb->min_lat_nsec) {
- + trace_wbt_lat(rwb->bdi, stat[0].min);
- + trace_wbt_stat(rwb->bdi, stat);
- + return LAT_EXCEEDED;
- + }
- +
- + if (rwb->scale_step)
- + trace_wbt_stat(rwb->bdi, stat);
- +
- + return LAT_OK;
- +}
- +
- +static int latency_exceeded(struct rq_wb *rwb)
- +{
- + struct blk_rq_stat stat[2];
- +
- + rwb->stat_ops->get(rwb->ops_data, stat);
- + return __latency_exceeded(rwb, stat);
- +}
- +
- +static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
- +{
- + trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
- + rwb->wb_background, rwb->wb_normal, rwb->wb_max);
- +}
- +
- +static void scale_up(struct rq_wb *rwb)
- +{
- + /*
- + * If we're at 0, we can't go lower.
- + */
- + if (!rwb->scale_step)
- + return;
- +
- + rwb->scale_step--;
- + rwb->unknown_cnt = 0;
- + rwb->stat_ops->clear(rwb->ops_data);
- + calc_wb_limits(rwb);
- +
- + if (waitqueue_active(&rwb->wait))
- + wake_up_all(&rwb->wait);
- +
- + rwb_trace_step(rwb, "step up");
- +}
- +
- +static void scale_down(struct rq_wb *rwb)
- +{
- + /*
- + * Stop scaling down when we've hit the limit. This also prevents
- + * ->scale_step from going to crazy values, if the device can't
- + * keep up.
- + */
- + if (rwb->wb_max == 1)
- + return;
- +
- + rwb->scale_step++;
- + rwb->unknown_cnt = 0;
- + rwb->stat_ops->clear(rwb->ops_data);
- + calc_wb_limits(rwb);
- + rwb_trace_step(rwb, "step down");
- +}
- +
- +static void rwb_arm_timer(struct rq_wb *rwb)
- +{
- + unsigned long expires;
- +
- + /*
- + * We should speed this up, using some variant of a fast integer
- + * inverse square root calculation. Since we only do this for
- + * every window expiration, it's not a huge deal, though.
- + */
- + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
- + int_sqrt((rwb->scale_step + 1) << 8));
- + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
- + mod_timer(&rwb->window_timer, expires);
- +}
- +
- +static void wb_timer_fn(unsigned long data)
- +{
- + struct rq_wb *rwb = (struct rq_wb *) data;
- + int status;
- +
- + /*
- + * If we exceeded the latency target, step down. If we did not,
- + * step one level up. If we don't know enough to say either exceeded
- + * or ok, then don't do anything.
- + */
- + status = latency_exceeded(rwb);
- + switch (status) {
- + case LAT_EXCEEDED:
- + scale_down(rwb);
- + break;
- + case LAT_OK:
- + scale_up(rwb);
- + break;
- + case LAT_UNKNOWN:
- + /*
- + * We had no read samples, start bumping up the write
- + * depth slowly
- + */
- + if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP)
- + scale_up(rwb);
- + break;
- + default:
- + break;
- + }
- +
- + /*
- + * Re-arm timer, if we have IO in flight
- + */
- + if (rwb->scale_step || atomic_read(&rwb->inflight))
- + rwb_arm_timer(rwb);
- +}
- +
- +void wbt_update_limits(struct rq_wb *rwb)
- +{
- + rwb->scale_step = 0;
- + calc_wb_limits(rwb);
- +
- + if (waitqueue_active(&rwb->wait))
- + wake_up_all(&rwb->wait);
- +}
- +
- +static bool close_io(struct rq_wb *rwb)
- +{
- + const unsigned long now = jiffies;
- +
- + return time_before(now, rwb->last_issue + HZ / 10) ||
- + time_before(now, rwb->last_comp + HZ / 10);
- +}
- +
- +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
- +
- +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
- +{
- + unsigned int limit;
- +
- + /*
- + * At this point we know it's a buffered write. If REQ_SYNC is
- + * set, then it's WB_SYNC_ALL writeback, and we'll use the max
- + * limit for that. If the write is marked as a background write,
- + * then use the idle limit, or go to normal if we haven't had
- + * competing IO for a bit.
- + */
- + if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping))
- + limit = rwb->wb_max;
- + else if ((rw & REQ_BG) || close_io(rwb)) {
- + /*
- + * If less than 100ms since we completed unrelated IO,
- + * limit us to half the depth for background writeback.
- + */
- + limit = rwb->wb_background;
- + } else
- + limit = rwb->wb_normal;
- +
- + return limit;
- +}
- +
- +static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
- +{
- + /*
- + * inc it here even if disabled, since we'll dec it at completion.
- + * this only happens if the task was sleeping in __wbt_wait(),
- + * and someone turned it off at the same time.
- + */
- + if (!rwb_enabled(rwb)) {
- + atomic_inc(&rwb->inflight);
- + return true;
- + }
- +
- + return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw));
- +}
- +
- +/*
- + * Block if we will exceed our limit, or if we are currently waiting for
- + * the timer to kick off queuing again.
- + */
- +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
- +{
- + DEFINE_WAIT(wait);
- +
- + if (may_queue(rwb, rw))
- + return;
- +
- + do {
- + prepare_to_wait_exclusive(&rwb->wait, &wait,
- + TASK_UNINTERRUPTIBLE);
- +
- + if (may_queue(rwb, rw))
- + break;
- +
- + if (lock)
- + spin_unlock_irq(lock);
- +
- + io_schedule();
- +
- + if (lock)
- + spin_lock_irq(lock);
- + } while (1);
- +
- + finish_wait(&rwb->wait, &wait);
- +}
- +
- +static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
- +{
- + /*
- + * If not a WRITE (or a discard), do nothing
- + */
- + if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD))
- + return false;
- +
- + /*
- + * Don't throttle WRITE_ODIRECT
- + */
- + if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
- + return false;
- +
- + return true;
- +}
- +
- +/*
- + * Returns true if the IO request should be accounted, false if not.
- + * May sleep, if we have exceeded the writeback limits. Caller can pass
- + * in an irq held spinlock, if it holds one when calling this function.
- + * If we do sleep, we'll release and re-grab it.
- + */
- +bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
- +{
- + if (!rwb_enabled(rwb))
- + return false;
- +
- + if (!wbt_should_throttle(rwb, rw)) {
- + wb_timestamp(rwb, &rwb->last_issue);
- + return false;
- + }
- +
- + __wbt_wait(rwb, rw, lock);
- +
- + if (!timer_pending(&rwb->window_timer))
- + rwb_arm_timer(rwb);
- +
- + return true;
- +}
- +
- +void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
- +{
- + if (!rwb_enabled(rwb))
- + return;
- +
- + wbt_issue_stat_set_time(stat);
- +
- + if (!wbt_tracked(stat) && !rwb->sync_issue) {
- + rwb->sync_cookie = stat;
- + rwb->sync_issue = wbt_issue_stat_get_time(stat);
- + }
- +}
- +
- +void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
- +{
- + if (!rwb_enabled(rwb))
- + return;
- + if (stat == rwb->sync_cookie) {
- + rwb->sync_issue = 0;
- + rwb->sync_cookie = NULL;
- + }
- +}
- +
- +void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
- +{
- + if (rwb) {
- + rwb->queue_depth = depth;
- + wbt_update_limits(rwb);
- + }
- +}
- +
- +void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
- +{
- + if (rwb)
- + rwb->wc = write_cache_on;
- +}
- +
- +struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
- + void *ops_data)
- +{
- + struct rq_wb *rwb;
- +
- + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
- + if (!rwb)
- + return ERR_PTR(-ENOMEM);
- +
- + atomic_set(&rwb->inflight, 0);
- + init_waitqueue_head(&rwb->wait);
- + setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
- + rwb->wc = 1;
- + rwb->queue_depth = RWB_MAX_DEPTH;
- + rwb->last_comp = rwb->last_issue = jiffies;
- + rwb->bdi = bdi;
- + rwb->win_nsec = RWB_WINDOW_NSEC;
- + rwb->stat_ops = ops;
- + rwb->ops_data = ops_data;
- + wbt_update_limits(rwb);
- + return rwb;
- +}
- +
- +void wbt_exit(struct rq_wb *rwb)
- +{
- + if (rwb) {
- + del_timer_sync(&rwb->window_timer);
- + kfree(rwb);
- + }
- +}
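The scaling behaviour in lib/wbt.c above is easiest to see with numbers: calc_wb_limits() roughly halves the maximum write depth per scale step (wb_max = 1 + ((depth - 1) >> step)) and derives the normal and background limits from it, while rwb_arm_timer() shrinks the monitoring window to win_nsec / sqrt(step + 1). A user-space sketch of those two formulas, using a naive integer square root as a stand-in for the kernel's int_sqrt():

#include <stdio.h>

static unsigned long isqrt(unsigned long x)	/* stand-in for int_sqrt() */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	const unsigned int depth = 64;				/* RWB_MAX_DEPTH   */
	const unsigned long long win_nsec = 100000000ULL;	/* RWB_WINDOW_NSEC */
	unsigned int step;

	for (step = 0; step < 4; step++) {
		unsigned int max = 1 + ((depth - 1) >> (step < 31 ? step : 31));
		unsigned int normal = (max + 1) / 2;
		unsigned int bg = (max + 3) / 4;
		unsigned long long win = (win_nsec << 4) / isqrt((step + 1) << 8);

		printf("step=%u max=%u normal=%u background=%u window=%llums\n",
		       step, max, normal, bg, win / 1000000);
	}
	return 0;
}

For the defaults in the patch this prints a depth sequence of 64/32/16/8 and a window sequence of roughly 100/72/59/50 ms as the scale step climbs.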
- diff -Naur linux-4.4.6-gentoo-orig/mm/backing-dev.c linux-4.4.6-gentoo-patched/mm/backing-dev.c
- --- linux-4.4.6-gentoo-orig/mm/backing-dev.c 2016-05-04 11:19:37.620649827 +0300
- +++ linux-4.4.6-gentoo-patched/mm/backing-dev.c 2016-05-04 11:03:27.412730745 +0300
- @@ -310,6 +310,7 @@
- spin_lock_init(&wb->work_lock);
- INIT_LIST_HEAD(&wb->work_list);
- INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
- + atomic_set(&wb->dirty_sleeping, 0);
- wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested)
- diff -Naur linux-4.4.6-gentoo-orig/mm/page-writeback.c linux-4.4.6-gentoo-patched/mm/page-writeback.c
- --- linux-4.4.6-gentoo-orig/mm/page-writeback.c 2016-05-04 11:19:37.621649827 +0300
- +++ linux-4.4.6-gentoo-patched/mm/page-writeback.c 2016-05-04 11:03:27.412730745 +0300
- @@ -1735,7 +1735,9 @@
- pause,
- start_time);
- __set_current_state(TASK_KILLABLE);
- + atomic_inc(&wb->dirty_sleeping);
- io_schedule_timeout(pause);
- + atomic_dec(&wb->dirty_sleeping);
- current->dirty_paused_when = now + pause;
- current->nr_dirtied = 0;
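The two mm/ hunks close the loop: balance_dirty_pages() bumps wb->dirty_sleeping around its io_schedule_timeout() nap, and lib/wbt.c reads that counter in get_limit() and __wbt_done(), so that while a dirtier is actually blocked waiting for writeback, writes are allowed the full wb_max depth instead of being throttled down. A small user-space model of that limit selection (illustrative only, ignoring the close_io() case):

#include <stdio.h>

struct rwb_model {
	unsigned int wb_background, wb_normal, wb_max;
};

/* Mirrors get_limit(): pick the inflight limit for a buffered write. */
static unsigned int pick_limit(const struct rwb_model *rwb,
			       int hiprio, int background, int dirty_sleeping)
{
	if (hiprio || dirty_sleeping)
		return rwb->wb_max;	/* someone is stuck in balance_dirty_pages() */
	if (background)
		return rwb->wb_background;
	return rwb->wb_normal;
}

int main(void)
{
	struct rwb_model rwb = { .wb_background = 16, .wb_normal = 32,
				 .wb_max = 64 };

	printf("background write, nobody sleeping:  %u\n",
	       pick_limit(&rwb, 0, 1, 0));		/* 16 */
	printf("background write, dirtier sleeping: %u\n",
	       pick_limit(&rwb, 0, 1, 1));		/* 64 */
	return 0;
}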