- diff -Naur linux-4.4.6-gentoo-orig/arch/um/drivers/ubd_kern.c linux-4.4.6-gentoo-patched/arch/um/drivers/ubd_kern.c
- --- linux-4.4.6-gentoo-orig/arch/um/drivers/ubd_kern.c 2016-05-04 11:19:37.591649829 +0300
- +++ linux-4.4.6-gentoo-patched/arch/um/drivers/ubd_kern.c 2016-05-04 11:02:48.599733982 +0300
- @@ -866,7 +866,7 @@
- goto out;
- }
- ubd_dev->queue->queuedata = ubd_dev;
- - blk_queue_flush(ubd_dev->queue, REQ_FLUSH);
- + blk_queue_write_cache(ubd_dev->queue, true, false);
- blk_queue_max_segments(ubd_dev->queue, MAX_SG);
- err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
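The hunk above is the pattern the rest of this series follows: a driver that used to call blk_queue_flush() with REQ_FLUSH (and optionally REQ_FUA) now calls blk_queue_write_cache() with two booleans, "has a writeback cache" and "supports FUA". A minimal sketch of that mapping, not part of the patch and assuming the blk_queue_write_cache() prototype this series adds to linux/blkdev.h:

#include <linux/blkdev.h>

/* Hypothetical helper, for illustration only: translate the old
 * blk_queue_flush() flag word into blk_queue_write_cache() arguments. */
static inline void compat_blk_queue_flush(struct request_queue *q,
                                          unsigned int flush)
{
        /* REQ_FLUSH means the device has a volatile writeback cache,
         * REQ_FUA means it honours forced-unit-access writes. */
        blk_queue_write_cache(q, flush & REQ_FLUSH, flush & REQ_FUA);
}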
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-core.c linux-4.4.6-gentoo-patched/block/blk-core.c
- --- linux-4.4.6-gentoo-orig/block/blk-core.c 2016-05-04 11:19:37.593649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-core.c 2016-05-04 11:02:48.599733982 +0300
- @@ -33,6 +33,7 @@
- #include <linux/ratelimit.h>
- #include <linux/pm_runtime.h>
- #include <linux/blk-cgroup.h>
- +#include <linux/wbt.h>
- #define CREATE_TRACE_POINTS
- #include <trace/events/block.h>
- @@ -872,6 +873,8 @@
- fail:
- blk_free_flush_queue(q->fq);
- + wbt_exit(q->rq_wb);
- + q->rq_wb = NULL;
- return NULL;
- }
- EXPORT_SYMBOL(blk_init_allocated_queue);
- @@ -1385,6 +1388,7 @@
- blk_delete_timer(rq);
- blk_clear_rq_complete(rq);
- trace_block_rq_requeue(q, rq);
- + wbt_requeue(q->rq_wb, &rq->wb_stat);
- if (rq->cmd_flags & REQ_QUEUED)
- blk_queue_end_tag(q, rq);
- @@ -1475,6 +1479,8 @@
- /* this is a bio leak */
- WARN_ON(req->bio != NULL);
- + wbt_done(q->rq_wb, &req->wb_stat);
- +
- /*
- * Request may not have originated from ll_rw_blk. if not,
- * it didn't come out of our reserved rq pools
- @@ -1704,6 +1710,7 @@
- int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
- struct request *req;
- unsigned int request_count = 0;
- + bool wb_acct;
- /*
- * low level driver can indicate that it wants pages above a
- @@ -1756,6 +1763,8 @@
- }
- get_rq:
- + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, q->queue_lock);
- +
- /*
- * This sync check and mask will be re-done in init_request_from_bio(),
- * but we need to set it earlier to expose the sync flag to the
- @@ -1771,11 +1780,16 @@
- */
- req = get_request(q, rw_flags, bio, GFP_NOIO);
- if (IS_ERR(req)) {
- + if (wb_acct)
- + __wbt_done(q->rq_wb);
- bio->bi_error = PTR_ERR(req);
- bio_endio(bio);
- goto out_unlock;
- }
- + if (wb_acct)
- + wbt_mark_tracked(&req->wb_stat);
- +
- /*
- * After dropping the lock and possibly sleeping here, our request
- * may now be mergeable after it had proven unmergeable (above).
- @@ -1953,7 +1967,8 @@
- * drivers without flush support don't have to worry
- * about them.
- */
- - if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
- + if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
- + !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
- bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
- if (!nr_sectors) {
- err = 0;
- @@ -2502,6 +2517,8 @@
- {
- blk_dequeue_request(req);
- + wbt_issue(req->q->rq_wb, &req->wb_stat);
- +
- /*
- * We are now handing the request to the hardware, initialize
- * resid_len to full count and add the timeout handler.
- @@ -2569,6 +2586,8 @@
- trace_block_rq_complete(req->q, req, nr_bytes);
- + blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
- +
- if (!req->bio)
- return false;
- @@ -2736,9 +2755,10 @@
- blk_account_io_done(req);
- - if (req->end_io)
- + if (req->end_io) {
- + wbt_done(req->q->rq_wb, &req->wb_stat);
- req->end_io(req, error);
- - else {
- + } else {
- if (blk_bidi_rq(req))
- __blk_put_request(req->next_rq->q, req->next_rq);
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-flush.c linux-4.4.6-gentoo-patched/block/blk-flush.c
- --- linux-4.4.6-gentoo-orig/block/blk-flush.c 2016-05-04 11:19:37.593649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-flush.c 2016-05-04 11:02:48.599733982 +0300
- @@ -95,17 +95,18 @@
- static bool blk_kick_flush(struct request_queue *q,
- struct blk_flush_queue *fq);
- -static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
- +static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
- {
- unsigned int policy = 0;
- if (blk_rq_sectors(rq))
- policy |= REQ_FSEQ_DATA;
- - if (fflags & REQ_FLUSH) {
- + if (fflags & (1UL << QUEUE_FLAG_WC)) {
- if (rq->cmd_flags & REQ_FLUSH)
- policy |= REQ_FSEQ_PREFLUSH;
- - if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
- + if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
- + (rq->cmd_flags & REQ_FUA))
- policy |= REQ_FSEQ_POSTFLUSH;
- }
- return policy;
- @@ -384,7 +385,7 @@
- void blk_insert_flush(struct request *rq)
- {
- struct request_queue *q = rq->q;
- - unsigned int fflags = q->flush_flags; /* may change, cache */
- + unsigned long fflags = q->queue_flags; /* may change, cache */
- unsigned int policy = blk_flush_policy(fflags, rq);
- struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
- @@ -393,7 +394,7 @@
- * REQ_FLUSH and FUA for the driver.
- */
- rq->cmd_flags &= ~REQ_FLUSH;
- - if (!(fflags & REQ_FUA))
- + if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
- rq->cmd_flags &= ~REQ_FUA;
- /*
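With q->flush_flags gone, blk_flush_policy() and blk_insert_flush() above read the cache capabilities straight from q->queue_flags, and the driver hunks later in this patch do the same. A minimal sketch of that test, not part of the patch and assuming the QUEUE_FLAG_WC/QUEUE_FLAG_FUA definitions this series adds to linux/blkdev.h:

#include <linux/blkdev.h>

/* Illustration only: how consumers now ask about the write cache. */
static bool example_has_writeback_cache(struct request_queue *q)
{
        return test_bit(QUEUE_FLAG_WC, &q->queue_flags);
}

static bool example_supports_fua(struct request_queue *q)
{
        return test_bit(QUEUE_FLAG_FUA, &q->queue_flags);
}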
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq.c linux-4.4.6-gentoo-patched/block/blk-mq.c
- --- linux-4.4.6-gentoo-orig/block/blk-mq.c 2016-05-04 11:19:37.594649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-mq.c 2016-05-04 11:02:48.600733982 +0300
- @@ -22,6 +22,7 @@
- #include <linux/sched/sysctl.h>
- #include <linux/delay.h>
- #include <linux/crash_dump.h>
- +#include <linux/wbt.h>
- #include <trace/events/block.h>
- @@ -29,6 +30,7 @@
- #include "blk.h"
- #include "blk-mq.h"
- #include "blk-mq-tag.h"
- +#include "blk-stat.h"
- static DEFINE_MUTEX(all_q_mutex);
- static LIST_HEAD(all_q_list);
- @@ -276,6 +278,8 @@
- if (rq->cmd_flags & REQ_MQ_INFLIGHT)
- atomic_dec(&hctx->nr_active);
- +
- + wbt_done(q->rq_wb, &rq->wb_stat);
- rq->cmd_flags = 0;
- clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- @@ -308,6 +312,7 @@
- blk_account_io_done(rq);
- if (rq->end_io) {
- + wbt_done(rq->q->rq_wb, &rq->wb_stat);
- rq->end_io(rq, error);
- } else {
- if (unlikely(blk_bidi_rq(rq)))
- @@ -358,10 +363,19 @@
- put_cpu();
- }
- +static void blk_mq_stat_add(struct request *rq)
- +{
- + struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
- +
- + blk_stat_add(stat, rq);
- +}
- +
- static void __blk_mq_complete_request(struct request *rq)
- {
- struct request_queue *q = rq->q;
- + blk_mq_stat_add(rq);
- +
- if (!q->softirq_done_fn)
- blk_mq_end_request(rq, rq->errors);
- else
- @@ -405,6 +419,8 @@
- if (unlikely(blk_bidi_rq(rq)))
- rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
- + wbt_issue(q->rq_wb, &rq->wb_stat);
- +
- blk_add_timer(rq);
- /*
- @@ -440,6 +456,7 @@
- struct request_queue *q = rq->q;
- trace_block_rq_requeue(q, rq);
- + wbt_requeue(q->rq_wb, &rq->wb_stat);
- if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
- if (q->dma_drain_size && blk_rq_bytes(rq))
- @@ -1249,6 +1266,7 @@
- struct blk_plug *plug;
- struct request *same_queue_rq = NULL;
- blk_qc_t cookie;
- + bool wb_acct;
- blk_queue_bounce(q, &bio);
- @@ -1266,9 +1284,17 @@
- } else
- request_count = blk_plug_queued_count(q);
- + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL);
- +
- rq = blk_mq_map_request(q, bio, &data);
- - if (unlikely(!rq))
- + if (unlikely(!rq)) {
- + if (wb_acct)
- + __wbt_done(q->rq_wb);
- return BLK_QC_T_NONE;
- + }
- +
- + if (wb_acct)
- + wbt_mark_tracked(&rq->wb_stat);
- cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
- @@ -1345,6 +1371,7 @@
- struct blk_map_ctx data;
- struct request *rq;
- blk_qc_t cookie;
- + bool wb_acct;
- blk_queue_bounce(q, &bio);
- @@ -1359,9 +1386,17 @@
- blk_attempt_plug_merge(q, bio, &request_count, NULL))
- return BLK_QC_T_NONE;
- + wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL);
- +
- rq = blk_mq_map_request(q, bio, &data);
- - if (unlikely(!rq))
- + if (unlikely(!rq)) {
- + if (wb_acct)
- + __wbt_done(q->rq_wb);
- return BLK_QC_T_NONE;
- + }
- +
- + if (wb_acct)
- + wbt_mark_tracked(&rq->wb_stat);
- cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
- @@ -1782,6 +1817,8 @@
- spin_lock_init(&__ctx->lock);
- INIT_LIST_HEAD(&__ctx->rq_list);
- __ctx->queue = q;
- + blk_stat_init(&__ctx->stat[0]);
- + blk_stat_init(&__ctx->stat[1]);
- /* If the cpu isn't online, the cpu is mapped to first hctx */
- if (!cpu_online(i))
- @@ -2095,6 +2132,9 @@
- list_del_init(&q->all_q_node);
- mutex_unlock(&all_q_mutex);
- + wbt_exit(q->rq_wb);
- + q->rq_wb = NULL;
- +
- blk_mq_del_queue_tag_set(q);
- blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq.h linux-4.4.6-gentoo-patched/block/blk-mq.h
- --- linux-4.4.6-gentoo-orig/block/blk-mq.h 2016-05-04 11:19:37.594649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-mq.h 2016-05-04 11:02:48.600733982 +0300
- @@ -1,6 +1,8 @@
- #ifndef INT_BLK_MQ_H
- #define INT_BLK_MQ_H
- +#include "blk-stat.h"
- +
- struct blk_mq_tag_set;
- struct blk_mq_ctx {
- @@ -20,6 +22,7 @@
- /* incremented at completion time */
- unsigned long ____cacheline_aligned_in_smp rq_completed[2];
- + struct blk_rq_stat stat[2];
- struct request_queue *queue;
- struct kobject kobj;
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq-sysfs.c linux-4.4.6-gentoo-patched/block/blk-mq-sysfs.c
- --- linux-4.4.6-gentoo-orig/block/blk-mq-sysfs.c 2016-05-04 11:19:37.595649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-mq-sysfs.c 2016-05-04 11:02:48.599733982 +0300
- @@ -247,6 +247,47 @@
- return ret;
- }
- +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
- +{
- + struct blk_mq_ctx *ctx;
- + unsigned int i;
- +
- + hctx_for_each_ctx(hctx, ctx, i) {
- + blk_stat_init(&ctx->stat[0]);
- + blk_stat_init(&ctx->stat[1]);
- + }
- +}
- +
- +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
- + const char *page, size_t count)
- +{
- + blk_mq_stat_clear(hctx);
- + return count;
- +}
- +
- +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
- +{
- + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
- + pre, (long long) stat->nr_samples,
- + (long long) stat->mean, (long long) stat->min,
- + (long long) stat->max);
- +}
- +
- +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
- +{
- + struct blk_rq_stat stat[2];
- + ssize_t ret;
- +
- + blk_stat_init(&stat[0]);
- + blk_stat_init(&stat[1]);
- +
- + blk_hctx_stat_get(hctx, stat);
- +
- + ret = print_stat(page, &stat[0], "read :");
- + ret += print_stat(page + ret, &stat[1], "write:");
- + return ret;
- +}
- +
- static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
- .attr = {.name = "dispatched", .mode = S_IRUGO },
- .show = blk_mq_sysfs_dispatched_show,
- @@ -304,6 +345,11 @@
- .attr = {.name = "io_poll", .mode = S_IRUGO },
- .show = blk_mq_hw_sysfs_poll_show,
- };
- +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
- + .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
- + .show = blk_mq_hw_sysfs_stat_show,
- + .store = blk_mq_hw_sysfs_stat_store,
- +};
- static struct attribute *default_hw_ctx_attrs[] = {
- &blk_mq_hw_sysfs_queued.attr,
- @@ -314,6 +360,7 @@
- &blk_mq_hw_sysfs_cpus.attr,
- &blk_mq_hw_sysfs_active.attr,
- &blk_mq_hw_sysfs_poll.attr,
- + &blk_mq_hw_sysfs_stat.attr,
- NULL,
- };
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-settings.c linux-4.4.6-gentoo-patched/block/blk-settings.c
- --- linux-4.4.6-gentoo-orig/block/blk-settings.c 2016-05-04 11:19:37.595649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-settings.c 2016-05-04 11:02:48.600733982 +0300
- @@ -820,31 +820,54 @@
- }
- EXPORT_SYMBOL(blk_queue_update_dma_alignment);
- +void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
- +{
- + spin_lock_irq(q->queue_lock);
- + if (queueable)
- + clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
- + else
- + set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
- + spin_unlock_irq(q->queue_lock);
- +}
- +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
- +
- /**
- - * blk_queue_flush - configure queue's cache flush capability
- + * blk_set_queue_depth - tell the block layer about the device queue depth
- * @q: the request queue for the device
- - * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
- + * @depth: queue depth
- *
- - * Tell block layer cache flush capability of @q. If it supports
- - * flushing, REQ_FLUSH should be set. If it supports bypassing
- - * write cache for individual writes, REQ_FUA should be set.
- */
- -void blk_queue_flush(struct request_queue *q, unsigned int flush)
- +void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
- {
- - WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
- -
- - if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
- - flush &= ~REQ_FUA;
- -
- - q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
- + q->queue_depth = depth;
- + wbt_set_queue_depth(q->rq_wb, depth);
- }
- -EXPORT_SYMBOL_GPL(blk_queue_flush);
- +EXPORT_SYMBOL(blk_set_queue_depth);
- -void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
- +/**
- + * blk_queue_write_cache - configure queue's write cache
- + * @q: the request queue for the device
- + * @wc: write back cache on or off
- + * @fua: device supports FUA writes, if true
- + *
- + * Tell the block layer about the write cache of @q.
- + */
- +void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
- {
- - q->flush_not_queueable = !queueable;
- + spin_lock_irq(q->queue_lock);
- + if (wc)
- + queue_flag_set(QUEUE_FLAG_WC, q);
- + else
- + queue_flag_clear(QUEUE_FLAG_WC, q);
- + if (fua)
- + queue_flag_set(QUEUE_FLAG_FUA, q);
- + else
- + queue_flag_clear(QUEUE_FLAG_FUA, q);
- + spin_unlock_irq(q->queue_lock);
- +
- + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
- }
- -EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
- +EXPORT_SYMBOL_GPL(blk_queue_write_cache);
- static int __init blk_settings_init(void)
- {
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-stat.c linux-4.4.6-gentoo-patched/block/blk-stat.c
- --- linux-4.4.6-gentoo-orig/block/blk-stat.c 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-stat.c 2016-05-04 11:02:48.600733982 +0300
- @@ -0,0 +1,185 @@
- +/*
- + * Block stat tracking code
- + *
- + * Copyright (C) 2016 Jens Axboe
- + */
- +#include <linux/kernel.h>
- +#include <linux/blk-mq.h>
- +
- +#include "blk-stat.h"
- +#include "blk-mq.h"
- +
- +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
- +{
- + if (!src->nr_samples)
- + return;
- +
- + dst->min = min(dst->min, src->min);
- + dst->max = max(dst->max, src->max);
- +
- + if (!dst->nr_samples)
- + dst->mean = src->mean;
- + else {
- + dst->mean = div64_s64((src->mean * src->nr_samples) +
- + (dst->mean * dst->nr_samples),
- + dst->nr_samples + src->nr_samples);
- + }
- + dst->nr_samples += src->nr_samples;
- +}
- +
- +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
- +{
- + struct blk_mq_hw_ctx *hctx;
- + struct blk_mq_ctx *ctx;
- + int i, j, nr;
- +
- + blk_stat_init(&dst[0]);
- + blk_stat_init(&dst[1]);
- +
- + nr = 0;
- + do {
- + uint64_t newest = 0;
- +
- + queue_for_each_hw_ctx(q, hctx, i) {
- + hctx_for_each_ctx(hctx, ctx, j) {
- + if (!ctx->stat[0].nr_samples &&
- + !ctx->stat[1].nr_samples)
- + continue;
- + if (ctx->stat[0].time > newest)
- + newest = ctx->stat[0].time;
- + if (ctx->stat[1].time > newest)
- + newest = ctx->stat[1].time;
- + }
- + }
- +
- + /*
- + * No samples
- + */
- + if (!newest)
- + break;
- +
- + queue_for_each_hw_ctx(q, hctx, i) {
- + hctx_for_each_ctx(hctx, ctx, j) {
- + if (ctx->stat[0].time == newest) {
- + blk_stat_sum(&dst[0], &ctx->stat[0]);
- + nr++;
- + }
- + if (ctx->stat[1].time == newest) {
- + blk_stat_sum(&dst[1], &ctx->stat[1]);
- + nr++;
- + }
- + }
- + }
- + /*
- + * If we race on finding an entry, just loop back again.
- + * Should be very rare.
- + */
- + } while (!nr);
- +}
- +
- +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
- +{
- + if (q->mq_ops)
- + blk_mq_stat_get(q, dst);
- + else {
- + memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
- + memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
- + }
- +}
- +
- +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
- +{
- + struct blk_mq_ctx *ctx;
- + unsigned int i, nr;
- +
- + nr = 0;
- + do {
- + uint64_t newest = 0;
- +
- + hctx_for_each_ctx(hctx, ctx, i) {
- + if (!ctx->stat[0].nr_samples &&
- + !ctx->stat[1].nr_samples)
- + continue;
- +
- + if (ctx->stat[0].time > newest)
- + newest = ctx->stat[0].time;
- + if (ctx->stat[1].time > newest)
- + newest = ctx->stat[1].time;
- + }
- +
- + if (!newest)
- + break;
- +
- + hctx_for_each_ctx(hctx, ctx, i) {
- + if (ctx->stat[0].time == newest) {
- + blk_stat_sum(&dst[0], &ctx->stat[0]);
- + nr++;
- + }
- + if (ctx->stat[1].time == newest) {
- + blk_stat_sum(&dst[1], &ctx->stat[1]);
- + nr++;
- + }
- + }
- + /*
- + * If we race on finding an entry, just loop back again.
- + * Should be very rare, as the window is only updated
- + * occasionally
- + */
- + } while (!nr);
- +}
- +
- +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
- +{
- + stat->min = -1ULL;
- + stat->max = stat->nr_samples = stat->mean = 0;
- + stat->time = time_now & BLK_STAT_MASK;
- +}
- +
- +void blk_stat_init(struct blk_rq_stat *stat)
- +{
- + __blk_stat_init(stat, ktime_to_ns(ktime_get()));
- +}
- +
- +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
- +{
- + s64 delta, now, value;
- + u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat);
- +
- + now = ktime_to_ns(ktime_get());
- + if (now < rq_time)
- + return;
- +
- + if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK))
- + __blk_stat_init(stat, now);
- +
- + value = now - rq_time;
- + if (value > stat->max)
- + stat->max = value;
- + if (value < stat->min)
- + stat->min = value;
- +
- + delta = value - stat->mean;
- + if (delta)
- + stat->mean += div64_s64(delta, stat->nr_samples + 1);
- +
- + stat->nr_samples++;
- +}
- +
- +void blk_stat_clear(struct request_queue *q)
- +{
- + if (q->mq_ops) {
- + struct blk_mq_hw_ctx *hctx;
- + struct blk_mq_ctx *ctx;
- + int i, j;
- +
- + queue_for_each_hw_ctx(q, hctx, i) {
- + hctx_for_each_ctx(hctx, ctx, j) {
- + blk_stat_init(&ctx->stat[0]);
- + blk_stat_init(&ctx->stat[1]);
- + }
- + }
- + } else {
- + blk_stat_init(&q->rq_stats[0]);
- + blk_stat_init(&q->rq_stats[1]);
- + }
- +}
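blk_stat_add() above keeps a running arithmetic mean without storing samples: for each new value it does mean += (value - mean) / (n + 1). A small self-contained userspace sketch of the same update, not part of the patch, showing that the incremental form reproduces the plain average (integer division truncates, just like div64_s64()):

#include <stdio.h>

int main(void)
{
        long long samples[] = { 100, 300, 200, 400 };
        long long mean = 0;
        long long n = 0;

        for (int i = 0; i < 4; i++) {
                long long delta = samples[i] - mean;
                /* Same update blk_stat_add() performs per completed request. */
                mean += delta / (n + 1);
                n++;
        }
        printf("running mean of %lld samples: %lld\n", n, mean); /* prints 250 */
        return 0;
}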
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-stat.h linux-4.4.6-gentoo-patched/block/blk-stat.h
- --- linux-4.4.6-gentoo-orig/block/blk-stat.h 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-stat.h 2016-05-04 11:02:48.600733982 +0300
- @@ -0,0 +1,17 @@
- +#ifndef BLK_STAT_H
- +#define BLK_STAT_H
- +
- +/*
- + * ~0.13s window as a power-of-2 (2^27 nsecs)
- + */
- +#define BLK_STAT_NSEC 134217728ULL
- +#define BLK_STAT_MASK ~(BLK_STAT_NSEC - 1)
- +
- +void blk_stat_add(struct blk_rq_stat *, struct request *);
- +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
- +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
- +void blk_stat_clear(struct request_queue *q);
- +void blk_stat_init(struct blk_rq_stat *);
- +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
- +
- +#endif
- diff -Naur linux-4.4.6-gentoo-orig/block/blk-sysfs.c linux-4.4.6-gentoo-patched/block/blk-sysfs.c
- --- linux-4.4.6-gentoo-orig/block/blk-sysfs.c 2016-05-04 11:19:37.596649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/blk-sysfs.c 2016-05-04 11:02:48.600733982 +0300
- @@ -10,6 +10,7 @@
- #include <linux/blktrace_api.h>
- #include <linux/blk-mq.h>
- #include <linux/blk-cgroup.h>
- +#include <linux/wbt.h>
- #include "blk.h"
- #include "blk-mq.h"
- @@ -41,6 +42,19 @@
- return count;
- }
- +static ssize_t queue_var_store64(u64 *var, const char *page)
- +{
- + int err;
- + u64 v;
- +
- + err = kstrtou64(page, 10, &v);
- + if (err < 0)
- + return err;
- +
- + *var = v;
- + return 0;
- +}
- +
- static ssize_t queue_requests_show(struct request_queue *q, char *page)
- {
- return queue_var_show(q->nr_requests, (page));
- @@ -348,6 +362,110 @@
- return ret;
- }
- +static ssize_t queue_wb_win_show(struct request_queue *q, char *page)
- +{
- + if (!q->rq_wb)
- + return -EINVAL;
- +
- + return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000));
- +}
- +
- +static ssize_t queue_wb_win_store(struct request_queue *q, const char *page,
- + size_t count)
- +{
- + ssize_t ret;
- + u64 val;
- +
- + if (!q->rq_wb)
- + return -EINVAL;
- +
- + ret = queue_var_store64(&val, page);
- + if (ret < 0)
- + return ret;
- +
- + q->rq_wb->win_nsec = val * 1000ULL;
- + wbt_update_limits(q->rq_wb);
- + return count;
- +}
- +
- +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
- +{
- + if (!q->rq_wb)
- + return -EINVAL;
- +
- + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
- +}
- +
- +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
- + size_t count)
- +{
- + ssize_t ret;
- + u64 val;
- +
- + if (!q->rq_wb)
- + return -EINVAL;
- +
- + ret = queue_var_store64(&val, page);
- + if (ret < 0)
- + return ret;
- +
- + q->rq_wb->min_lat_nsec = val * 1000ULL;
- + wbt_update_limits(q->rq_wb);
- + return count;
- +}
- +
- +static ssize_t queue_wc_show(struct request_queue *q, char *page)
- +{
- + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- + return sprintf(page, "write back\n");
- +
- + return sprintf(page, "write through\n");
- +}
- +
- +static ssize_t queue_wc_store(struct request_queue *q, const char *page,
- + size_t count)
- +{
- + int set = -1;
- +
- + if (!strncmp(page, "write back", 10))
- + set = 1;
- + else if (!strncmp(page, "write through", 13) ||
- + !strncmp(page, "none", 4))
- + set = 0;
- +
- + if (set == -1)
- + return -EINVAL;
- +
- + spin_lock_irq(q->queue_lock);
- + if (set)
- + queue_flag_set(QUEUE_FLAG_WC, q);
- + else
- + queue_flag_clear(QUEUE_FLAG_WC, q);
- + spin_unlock_irq(q->queue_lock);
- +
- + return count;
- +}
- +
- +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
- +{
- + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
- + pre, (long long) stat->nr_samples,
- + (long long) stat->mean, (long long) stat->min,
- + (long long) stat->max);
- +}
- +
- +static ssize_t queue_stats_show(struct request_queue *q, char *page)
- +{
- + struct blk_rq_stat stat[2];
- + ssize_t ret;
- +
- + blk_queue_stat_get(q, stat);
- +
- + ret = print_stat(page, &stat[0], "read :");
- + ret += print_stat(page + ret, &stat[1], "write:");
- + return ret;
- +}
- +
- static struct queue_sysfs_entry queue_requests_entry = {
- .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
- .show = queue_requests_show,
- @@ -479,6 +597,29 @@
- .store = queue_poll_store,
- };
- +static struct queue_sysfs_entry queue_wc_entry = {
- + .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
- + .show = queue_wc_show,
- + .store = queue_wc_store,
- +};
- +
- +static struct queue_sysfs_entry queue_stats_entry = {
- + .attr = {.name = "stats", .mode = S_IRUGO },
- + .show = queue_stats_show,
- +};
- +
- +static struct queue_sysfs_entry queue_wb_lat_entry = {
- + .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
- + .show = queue_wb_lat_show,
- + .store = queue_wb_lat_store,
- +};
- +
- +static struct queue_sysfs_entry queue_wb_win_entry = {
- + .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR },
- + .show = queue_wb_win_show,
- + .store = queue_wb_win_store,
- +};
- +
- static struct attribute *default_attrs[] = {
- &queue_requests_entry.attr,
- &queue_ra_entry.attr,
- @@ -504,6 +645,10 @@
- &queue_iostats_entry.attr,
- &queue_random_entry.attr,
- &queue_poll_entry.attr,
- + &queue_wc_entry.attr,
- + &queue_stats_entry.attr,
- + &queue_wb_lat_entry.attr,
- + &queue_wb_win_entry.attr,
- NULL,
- };
- @@ -618,6 +763,43 @@
- .release = blk_release_queue,
- };
- +static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
- +{
- + blk_queue_stat_get(data, stat);
- +}
- +
- +static void blk_wb_stat_clear(void *data)
- +{
- + blk_stat_clear(data);
- +}
- +
- +static struct wb_stat_ops wb_stat_ops = {
- + .get = blk_wb_stat_get,
- + .clear = blk_wb_stat_clear,
- +};
- +
- +static void blk_wb_init(struct request_queue *q)
- +{
- + struct rq_wb *rwb;
- +
- + rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q);
- +
- + /*
- + * If this fails, we don't get throttling
- + */
- + if (IS_ERR(rwb))
- + return;
- +
- + if (blk_queue_nonrot(q))
- + rwb->min_lat_nsec = 2000000ULL;
- + else
- + rwb->min_lat_nsec = 75000000ULL;
- +
- + wbt_set_queue_depth(rwb, blk_queue_depth(q));
- + wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
- + q->rq_wb = rwb;
- +}
- +
- int blk_register_queue(struct gendisk *disk)
- {
- int ret;
- @@ -657,6 +839,8 @@
- if (q->mq_ops)
- blk_mq_register_disk(disk);
- + blk_wb_init(q);
- +
- if (!q->request_fn)
- return 0;
- diff -Naur linux-4.4.6-gentoo-orig/block/Kconfig linux-4.4.6-gentoo-patched/block/Kconfig
- --- linux-4.4.6-gentoo-orig/block/Kconfig 2016-05-04 11:19:37.596649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/Kconfig 2016-05-04 11:02:48.599733982 +0300
- @@ -4,6 +4,7 @@
- menuconfig BLOCK
- bool "Enable the block layer" if EXPERT
- default y
- + select WBT
- help
- Provide block layer support for the kernel.
- diff -Naur linux-4.4.6-gentoo-orig/block/Makefile linux-4.4.6-gentoo-patched/block/Makefile
- --- linux-4.4.6-gentoo-orig/block/Makefile 2016-05-04 11:19:37.596649829 +0300
- +++ linux-4.4.6-gentoo-patched/block/Makefile 2016-05-04 11:10:18.790696435 +0300
- @@ -5,7 +5,7 @@
- obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
- blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- - blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
- + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
- blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
- genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
- partitions/
- diff -Naur linux-4.4.6-gentoo-orig/Documentation/block/queue-sysfs.txt linux-4.4.6-gentoo-patched/Documentation/block/queue-sysfs.txt
- --- linux-4.4.6-gentoo-orig/Documentation/block/queue-sysfs.txt 2016-05-04 11:19:37.597649829 +0300
- +++ linux-4.4.6-gentoo-patched/Documentation/block/queue-sysfs.txt 2016-05-04 11:02:48.598733982 +0300
- @@ -141,6 +141,28 @@
- an IO scheduler name to this file will attempt to load that IO scheduler
- module, if it isn't already present in the system.
- +write_cache (RW)
- +----------------
- +When read, this file will display whether the device has write back
- +caching enabled or not. It will return "write back" for the former
- +case, and "write through" for the latter. Writing to this file can
- +change the kernel's view of the device, but it doesn't alter the
- +device state. This means that it might not be safe to toggle the
- +setting from "write back" to "write through", since that will also
- +eliminate cache flushes issued by the kernel.
- +
- +wbt_lat_usec (RW)
- +-----------------
- +If the device is registered for writeback throttling, then this file shows
- +the target minimum read latency. If this latency is exceeded in a given
- +window of time (see wbt_window_usec), then the writeback throttling will start
- +scaling back writes.
- +
- +wbt_window_usec (RW)
- +--------------------
- +If the device is registered for writeback throttling, then this file shows
- +the value of the monitoring window in which we'll look at the target
- +latency. See wbt_lat_usec.
- Jens Axboe <jens.axboe@oracle.com>, February 2009
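The new attributes live under /sys/block/<dev>/queue/. A minimal userspace sketch, not part of the patch, that reads the writeback-throttling latency target and tightens it (the device name is just an example; the store path added in blk-sysfs.c earlier converts the microsecond value to nanoseconds and calls wbt_update_limits()):

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/block/sda/queue/wbt_lat_usec";
        unsigned long long lat_usec;
        FILE *f = fopen(path, "r");

        if (!f || fscanf(f, "%llu", &lat_usec) != 1)
                return 1;
        fclose(f);

        f = fopen(path, "w");
        if (!f)
                return 1;
        /* Halve the target minimum read latency. */
        fprintf(f, "%llu\n", lat_usec / 2);
        fclose(f);
        return 0;
}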
- diff -Naur linux-4.4.6-gentoo-orig/Documentation/block/writeback_cache_control.txt linux-4.4.6-gentoo-patched/Documentation/block/writeback_cache_control.txt
- --- linux-4.4.6-gentoo-orig/Documentation/block/writeback_cache_control.txt 2016-05-04 11:19:37.597649829 +0300
- +++ linux-4.4.6-gentoo-patched/Documentation/block/writeback_cache_control.txt 2016-05-04 11:02:48.598733982 +0300
- @@ -71,7 +71,7 @@
- driver needs to tell the block layer that it supports flushing caches by
- doing:
- - blk_queue_flush(sdkp->disk->queue, REQ_FLUSH);
- + blk_queue_write_cache(sdkp->disk->queue, true, false);
- and handle empty REQ_FLUSH requests in its prep_fn/request_fn. Note that
- REQ_FLUSH requests with a payload are automatically turned into a sequence
- @@ -79,7 +79,7 @@
- layer. For devices that also support the FUA bit the block layer needs
- to be told to pass through the REQ_FUA bit using:
- - blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(sdkp->disk->queue, true, true);
- and the driver must handle write requests that have the REQ_FUA bit set
- in prep_fn/request_fn. If the FUA bit is not natively supported the block
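Driver-side, the conversion described above boils down to one call at queue setup time. A minimal sketch, not part of the patch; the two capability flags are hypothetical stand-ins for whatever the real driver learns from its hardware:

#include <linux/blkdev.h>

static void example_setup_write_cache(struct request_queue *q,
                                      bool volatile_cache, bool fua)
{
        /* Advertise FUA only together with a writeback cache, matching
         * the REQ_FLUSH | REQ_FUA pairing the old interface enforced. */
        blk_queue_write_cache(q, volatile_cache, volatile_cache && fua);
}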
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/drbd/drbd_main.c linux-4.4.6-gentoo-patched/drivers/block/drbd/drbd_main.c
- --- linux-4.4.6-gentoo-orig/drivers/block/drbd/drbd_main.c 2016-05-04 11:19:37.598649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/drbd/drbd_main.c 2016-05-04 11:02:48.601733981 +0300
- @@ -2769,7 +2769,7 @@
- q->backing_dev_info.congested_data = device;
- blk_queue_make_request(q, drbd_make_request);
- - blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(q, true, true);
- /* Setting the max_hw_sectors to an odd value of 8kibyte here
- This triggers a max_bio_size message upon first attach or connect */
- blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/loop.c linux-4.4.6-gentoo-patched/drivers/block/loop.c
- --- linux-4.4.6-gentoo-orig/drivers/block/loop.c 2016-05-04 11:19:37.598649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/loop.c 2016-05-04 11:02:48.601733981 +0300
- @@ -937,7 +937,7 @@
- mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
- if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
- - blk_queue_flush(lo->lo_queue, REQ_FLUSH);
- + blk_queue_write_cache(lo->lo_queue, true, false);
- loop_update_dio(lo);
- set_capacity(lo->lo_disk, size);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/mtip32xx/mtip32xx.c linux-4.4.6-gentoo-patched/drivers/block/mtip32xx/mtip32xx.c
- --- linux-4.4.6-gentoo-orig/drivers/block/mtip32xx/mtip32xx.c 2016-05-04 11:19:37.599649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/mtip32xx/mtip32xx.c 2016-05-04 11:02:48.602733981 +0300
- @@ -3913,12 +3913,6 @@
- blk_queue_io_min(dd->queue, 4096);
- blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask);
- - /*
- - * write back cache is not supported in the device. FUA depends on
- - * write back cache support, hence setting flush support to zero.
- - */
- - blk_queue_flush(dd->queue, 0);
- -
- /* Signal trim support */
- if (dd->trim_supp == true) {
- set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/nbd.c linux-4.4.6-gentoo-patched/drivers/block/nbd.c
- --- linux-4.4.6-gentoo-orig/drivers/block/nbd.c 2016-05-04 11:19:37.600649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/nbd.c 2016-05-04 11:02:48.602733981 +0300
- @@ -750,9 +750,9 @@
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
- nbd->disk->queue);
- if (nbd->flags & NBD_FLAG_SEND_FLUSH)
- - blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
- + blk_queue_write_cache(nbd->disk->queue, true, false);
- else
- - blk_queue_flush(nbd->disk->queue, 0);
- + blk_queue_write_cache(nbd->disk->queue, false, false);
- thread = kthread_run(nbd_thread_send, nbd, "%s",
- nbd_name(nbd));
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/osdblk.c linux-4.4.6-gentoo-patched/drivers/block/osdblk.c
- --- linux-4.4.6-gentoo-orig/drivers/block/osdblk.c 2016-05-04 11:19:37.600649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/osdblk.c 2016-05-04 11:02:48.602733981 +0300
- @@ -437,7 +437,7 @@
- blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
- blk_queue_prep_rq(q, blk_queue_start_tag);
- - blk_queue_flush(q, REQ_FLUSH);
- + blk_queue_write_cache(q, true, false);
- disk->queue = q;
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/ps3disk.c linux-4.4.6-gentoo-patched/drivers/block/ps3disk.c
- --- linux-4.4.6-gentoo-orig/drivers/block/ps3disk.c 2016-05-04 11:19:37.601649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/ps3disk.c 2016-05-04 11:02:48.602733981 +0300
- @@ -468,7 +468,7 @@
- blk_queue_dma_alignment(queue, dev->blk_size-1);
- blk_queue_logical_block_size(queue, dev->blk_size);
- - blk_queue_flush(queue, REQ_FLUSH);
- + blk_queue_write_cache(queue, true, false);
- blk_queue_max_segments(queue, -1);
- blk_queue_max_segment_size(queue, dev->bounce_size);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/skd_main.c linux-4.4.6-gentoo-patched/drivers/block/skd_main.c
- --- linux-4.4.6-gentoo-orig/drivers/block/skd_main.c 2016-05-04 11:19:37.601649829 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/skd_main.c 2016-05-04 11:02:48.603733981 +0300
- @@ -4412,7 +4412,7 @@
- disk->queue = q;
- q->queuedata = skdev;
- - blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(q, true, true);
- blk_queue_max_segments(q, skdev->sgs_per_request);
- blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/virtio_blk.c linux-4.4.6-gentoo-patched/drivers/block/virtio_blk.c
- --- linux-4.4.6-gentoo-orig/drivers/block/virtio_blk.c 2016-05-04 11:19:37.602649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/virtio_blk.c 2016-05-04 11:02:48.603733981 +0300
- @@ -488,11 +488,7 @@
- u8 writeback = virtblk_get_cache_mode(vdev);
- struct virtio_blk *vblk = vdev->priv;
- - if (writeback)
- - blk_queue_flush(vblk->disk->queue, REQ_FLUSH);
- - else
- - blk_queue_flush(vblk->disk->queue, 0);
- -
- + blk_queue_write_cache(vblk->disk->queue, writeback, false);
- revalidate_disk(vblk->disk);
- }
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/xen-blkback/xenbus.c linux-4.4.6-gentoo-patched/drivers/block/xen-blkback/xenbus.c
- --- linux-4.4.6-gentoo-orig/drivers/block/xen-blkback/xenbus.c 2016-05-04 11:19:37.603649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/xen-blkback/xenbus.c 2016-05-04 11:02:48.603733981 +0300
- @@ -413,7 +413,7 @@
- vbd->type |= VDISK_REMOVABLE;
- q = bdev_get_queue(bdev);
- - if (q && q->flush_flags)
- + if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- vbd->flush_support = true;
- if (q && blk_queue_secdiscard(q))
- diff -Naur linux-4.4.6-gentoo-orig/drivers/block/xen-blkfront.c linux-4.4.6-gentoo-patched/drivers/block/xen-blkfront.c
- --- linux-4.4.6-gentoo-orig/drivers/block/xen-blkfront.c 2016-05-04 11:19:37.603649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/block/xen-blkfront.c 2016-05-04 11:02:48.603733981 +0300
- @@ -851,7 +851,8 @@
- static void xlvbd_flush(struct blkfront_info *info)
- {
- - blk_queue_flush(info->rq, info->feature_flush);
- + blk_queue_write_cache(info->rq, info->feature_flush & REQ_FLUSH,
- + info->feature_flush & REQ_FUA);
- pr_info("blkfront: %s: %s %s %s %s %s\n",
- info->gd->disk_name, flush_info(info->feature_flush),
- "persistent grants:", info->feature_persistent ?
- diff -Naur linux-4.4.6-gentoo-orig/drivers/ide/ide-disk.c linux-4.4.6-gentoo-patched/drivers/ide/ide-disk.c
- --- linux-4.4.6-gentoo-orig/drivers/ide/ide-disk.c 2016-05-04 11:19:37.604649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/ide/ide-disk.c 2016-05-04 11:02:48.603733981 +0300
- @@ -522,7 +522,7 @@
- static void update_flush(ide_drive_t *drive)
- {
- u16 *id = drive->id;
- - unsigned flush = 0;
- + bool wc = false;
- if (drive->dev_flags & IDE_DFLAG_WCACHE) {
- unsigned long long capacity;
- @@ -546,12 +546,12 @@
- drive->name, barrier ? "" : "not ");
- if (barrier) {
- - flush = REQ_FLUSH;
- + wc = true;
- blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
- }
- }
- - blk_queue_flush(drive->queue, flush);
- + blk_queue_write_cache(drive->queue, wc, false);
- }
- ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/md/bcache/super.c linux-4.4.6-gentoo-patched/drivers/md/bcache/super.c
- --- linux-4.4.6-gentoo-orig/drivers/md/bcache/super.c 2016-05-04 11:19:37.604649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/md/bcache/super.c 2016-05-04 11:02:48.604733981 +0300
- @@ -816,7 +816,7 @@
- clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
- set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
- - blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
- + blk_queue_write_cache(q, true, true);
- return 0;
- }
- diff -Naur linux-4.4.6-gentoo-orig/drivers/md/dm-table.c linux-4.4.6-gentoo-patched/drivers/md/dm-table.c
- --- linux-4.4.6-gentoo-orig/drivers/md/dm-table.c 2016-05-04 11:19:37.605649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/md/dm-table.c 2016-05-04 11:02:48.604733981 +0300
- @@ -1312,13 +1312,13 @@
- static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
- {
- - unsigned flush = (*(unsigned *)data);
- + unsigned long flush = (unsigned long) data;
- struct request_queue *q = bdev_get_queue(dev->bdev);
- - return q && (q->flush_flags & flush);
- + return q && (q->queue_flags & flush);
- }
- -static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
- +static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
- {
- struct dm_target *ti;
- unsigned i = 0;
- @@ -1339,7 +1339,7 @@
- return true;
- if (ti->type->iterate_devices &&
- - ti->type->iterate_devices(ti, device_flush_capable, &flush))
- + ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
- return true;
- }
- @@ -1470,7 +1470,7 @@
- void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
- struct queue_limits *limits)
- {
- - unsigned flush = 0;
- + bool wc = false, fua = false;
- /*
- * Copy table's limits to the DM device's request_queue
- @@ -1482,12 +1482,12 @@
- else
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
- - if (dm_table_supports_flush(t, REQ_FLUSH)) {
- - flush |= REQ_FLUSH;
- - if (dm_table_supports_flush(t, REQ_FUA))
- - flush |= REQ_FUA;
- + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
- + wc = true;
- + if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
- + fua = true;
- }
- - blk_queue_flush(q, flush);
- + blk_queue_write_cache(q, wc, fua);
- if (!dm_table_discard_zeroes_data(t))
- q->limits.discard_zeroes_data = 0;
- diff -Naur linux-4.4.6-gentoo-orig/drivers/md/md.c linux-4.4.6-gentoo-patched/drivers/md/md.c
- --- linux-4.4.6-gentoo-orig/drivers/md/md.c 2016-05-04 11:19:37.606649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/md/md.c 2016-05-04 11:02:48.605733981 +0300
- @@ -5037,7 +5037,7 @@
- disk->fops = &md_fops;
- disk->private_data = mddev;
- disk->queue = mddev->queue;
- - blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(mddev->queue, true, true);
- /* Allow extended partitions. This makes the
- * 'mdp' device redundant, but we can't really
- * remove it now.
- diff -Naur linux-4.4.6-gentoo-orig/drivers/md/raid5-cache.c linux-4.4.6-gentoo-patched/drivers/md/raid5-cache.c
- --- linux-4.4.6-gentoo-orig/drivers/md/raid5-cache.c 2016-05-04 11:19:37.607649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/md/raid5-cache.c 2016-05-04 11:02:48.605733981 +0300
- @@ -1133,6 +1133,7 @@
- int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
- {
- + struct request_queue *q = bdev_get_queue(rdev->bdev);
- struct r5l_log *log;
- if (PAGE_SIZE != 4096)
- @@ -1142,7 +1143,7 @@
- return -ENOMEM;
- log->rdev = rdev;
- - log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
- + log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
- log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
- sizeof(rdev->mddev->uuid));
- diff -Naur linux-4.4.6-gentoo-orig/drivers/mmc/card/block.c linux-4.4.6-gentoo-patched/drivers/mmc/card/block.c
- --- linux-4.4.6-gentoo-orig/drivers/mmc/card/block.c 2016-05-04 11:19:37.608649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/mmc/card/block.c 2016-05-04 11:02:48.605733981 +0300
- @@ -2282,7 +2282,7 @@
- ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) ||
- card->ext_csd.rel_sectors)) {
- md->flags |= MMC_BLK_REL_WR;
- - blk_queue_flush(md->queue.queue, REQ_FLUSH | REQ_FUA);
- + blk_queue_write_cache(md->queue.queue, true, true);
- }
- if (mmc_card_mmc(card) &&
- diff -Naur linux-4.4.6-gentoo-orig/drivers/mtd/mtd_blkdevs.c linux-4.4.6-gentoo-patched/drivers/mtd/mtd_blkdevs.c
- --- linux-4.4.6-gentoo-orig/drivers/mtd/mtd_blkdevs.c 2016-05-04 11:19:37.608649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/mtd/mtd_blkdevs.c 2016-05-04 11:02:48.605733981 +0300
- @@ -409,7 +409,7 @@
- goto error3;
- if (tr->flush)
- - blk_queue_flush(new->rq, REQ_FLUSH);
- + blk_queue_write_cache(new->rq, true, false);
- new->rq->queuedata = new;
- blk_queue_logical_block_size(new->rq, tr->blksize);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/nvme/host/pci.c linux-4.4.6-gentoo-patched/drivers/nvme/host/pci.c
- --- linux-4.4.6-gentoo-orig/drivers/nvme/host/pci.c 2016-01-11 02:01:32.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/nvme/host/pci.c 2016-05-04 11:48:03.179507579 +0300
- @@ -2272,6 +2272,7 @@
- list_add_tail(&ns->list, &dev->namespaces);
- blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
- + bool vwc = false;
- if (dev->max_hw_sectors) {
- blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
- blk_queue_max_segments(ns->queue,
- @@ -2279,8 +2280,10 @@
- }
- if (dev->stripe_size)
- blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
- - if (dev->vwc & NVME_CTRL_VWC_PRESENT)
- - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
- + if (dev->vwc & NVME_CTRL_VWC_PRESENT)
- + vwc = true;
- + blk_queue_write_cache(ns->queue, vwc, vwc);
- +
- blk_queue_virt_boundary(ns->queue, dev->page_size - 1);
- disk->major = nvme_major;
- diff -Naur linux-4.4.6-gentoo-orig/drivers/scsi/scsi.c linux-4.4.6-gentoo-patched/drivers/scsi/scsi.c
- --- linux-4.4.6-gentoo-orig/drivers/scsi/scsi.c 2016-05-04 11:19:37.609649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/scsi/scsi.c 2016-05-04 11:03:27.408730745 +0300
- @@ -621,6 +621,9 @@
- wmb();
- }
- + if (sdev->request_queue)
- + blk_set_queue_depth(sdev->request_queue, depth);
- +
- return sdev->queue_depth;
- }
- EXPORT_SYMBOL(scsi_change_queue_depth);
- diff -Naur linux-4.4.6-gentoo-orig/drivers/scsi/sd.c linux-4.4.6-gentoo-patched/drivers/scsi/sd.c
- --- linux-4.4.6-gentoo-orig/drivers/scsi/sd.c 2016-05-04 11:19:37.609649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/scsi/sd.c 2016-05-04 11:03:27.408730745 +0300
- @@ -137,15 +137,15 @@
- static void sd_set_flush_flag(struct scsi_disk *sdkp)
- {
- - unsigned flush = 0;
- + bool wc = false, fua = false;
- if (sdkp->WCE) {
- - flush |= REQ_FLUSH;
- + wc = true;
- if (sdkp->DPOFUA)
- - flush |= REQ_FUA;
- + fua = true;
- }
- - blk_queue_flush(sdkp->disk->queue, flush);
- + blk_queue_write_cache(sdkp->disk->queue, wc, fua);
- }
- static ssize_t
- diff -Naur linux-4.4.6-gentoo-orig/drivers/target/target_core_iblock.c linux-4.4.6-gentoo-patched/drivers/target/target_core_iblock.c
- --- linux-4.4.6-gentoo-orig/drivers/target/target_core_iblock.c 2016-05-04 11:19:37.610649828 +0300
- +++ linux-4.4.6-gentoo-patched/drivers/target/target_core_iblock.c 2016-05-04 11:03:27.409730745 +0300
- @@ -653,10 +653,10 @@
- * Force writethrough using WRITE_FUA if a volatile write cache
- * is not enabled, or if initiator set the Force Unit Access bit.
- */
- - if (q->flush_flags & REQ_FUA) {
- + if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) {
- if (cmd->se_cmd_flags & SCF_FUA)
- rw = WRITE_FUA;
- - else if (!(q->flush_flags & REQ_FLUSH))
- + else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- rw = WRITE_FUA;
- else
- rw = WRITE;
- @@ -802,7 +802,7 @@
- struct block_device *bd = ib_dev->ibd_bd;
- struct request_queue *q = bdev_get_queue(bd);
- - return q->flush_flags & REQ_FLUSH;
- + return test_bit(QUEUE_FLAG_WC, &q->queue_flags);
- }
- static const struct target_backend_ops iblock_ops = {
- diff -Naur linux-4.4.6-gentoo-orig/fs/block_dev.c linux-4.4.6-gentoo-patched/fs/block_dev.c
- --- linux-4.4.6-gentoo-orig/fs/block_dev.c 2016-05-04 11:19:37.610649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/block_dev.c 2016-05-04 11:03:27.409730745 +0300
- @@ -427,7 +427,7 @@
- struct page *page, struct writeback_control *wbc)
- {
- int result;
- - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
- + int rw = wbc_to_write_cmd(wbc);
- const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page || bdev_get_integrity(bdev))
- diff -Naur linux-4.4.6-gentoo-orig/fs/buffer.c linux-4.4.6-gentoo-patched/fs/buffer.c
- --- linux-4.4.6-gentoo-orig/fs/buffer.c 2016-05-04 11:19:37.611649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/buffer.c 2016-05-04 11:03:27.409730745 +0300
- @@ -1708,7 +1708,7 @@
- struct buffer_head *bh, *head;
- unsigned int blocksize, bbits;
- int nr_underway = 0;
- - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
- + int write_op = wbc_to_write_cmd(wbc);
- head = create_page_buffers(page, inode,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- diff -Naur linux-4.4.6-gentoo-orig/fs/f2fs/data.c linux-4.4.6-gentoo-patched/fs/f2fs/data.c
- --- linux-4.4.6-gentoo-orig/fs/f2fs/data.c 2016-05-04 11:19:37.612649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/f2fs/data.c 2016-05-04 11:03:27.409730745 +0300
- @@ -1115,7 +1115,7 @@
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .type = DATA,
- - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
- + .rw = wbc_to_write_cmd(wbc),
- .page = page,
- .encrypted_page = NULL,
- };
- diff -Naur linux-4.4.6-gentoo-orig/fs/f2fs/node.c linux-4.4.6-gentoo-patched/fs/f2fs/node.c
- --- linux-4.4.6-gentoo-orig/fs/f2fs/node.c 2016-05-04 11:19:37.612649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/f2fs/node.c 2016-05-04 11:03:27.409730745 +0300
- @@ -1305,7 +1305,7 @@
- struct f2fs_io_info fio = {
- .sbi = sbi,
- .type = NODE,
- - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
- + .rw = wbc_to_write_cmd(wbc),
- .page = page,
- .encrypted_page = NULL,
- };
- diff -Naur linux-4.4.6-gentoo-orig/fs/fs-writeback.c.orig linux-4.4.6-gentoo-patched/fs/fs-writeback.c.orig
- --- linux-4.4.6-gentoo-orig/fs/fs-writeback.c.orig 2016-05-04 11:19:37.613649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/fs-writeback.c.orig 1970-01-01 03:00:00.000000000 +0300
- @@ -1,2394 +0,0 @@
- -/*
- - * fs/fs-writeback.c
- - *
- - * Copyright (C) 2002, Linus Torvalds.
- - *
- - * Contains all the functions related to writing back and waiting
- - * upon dirty inodes against superblocks, and writing back dirty
- - * pages against inodes. ie: data writeback. Writeout of the
- - * inode itself is not handled here.
- - *
- - * 10Apr2002 Andrew Morton
- - * Split out of fs/inode.c
- - * Additions for address_space-based writeback
- - */
- -
- -#include <linux/kernel.h>
- -#include <linux/export.h>
- -#include <linux/spinlock.h>
- -#include <linux/slab.h>
- -#include <linux/sched.h>
- -#include <linux/fs.h>
- -#include <linux/mm.h>
- -#include <linux/pagemap.h>
- -#include <linux/kthread.h>
- -#include <linux/writeback.h>
- -#include <linux/blkdev.h>
- -#include <linux/backing-dev.h>
- -#include <linux/tracepoint.h>
- -#include <linux/device.h>
- -#include <linux/memcontrol.h>
- -#include "internal.h"
- -
- -/*
- - * 4MB minimal write chunk size
- - */
- -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
- -
- -struct wb_completion {
- - atomic_t cnt;
- -};
- -
- -/*
- - * Passed into wb_writeback(), essentially a subset of writeback_control
- - */
- -struct wb_writeback_work {
- - long nr_pages;
- - struct super_block *sb;
- - unsigned long *older_than_this;
- - enum writeback_sync_modes sync_mode;
- - unsigned int tagged_writepages:1;
- - unsigned int for_kupdate:1;
- - unsigned int range_cyclic:1;
- - unsigned int for_background:1;
- - unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
- - unsigned int auto_free:1; /* free on completion */
- - enum wb_reason reason; /* why was writeback initiated? */
- -
- - struct list_head list; /* pending work list */
- - struct wb_completion *done; /* set if the caller waits */
- -};
- -
- -/*
- - * If one wants to wait for one or more wb_writeback_works, each work's
- - * ->done should be set to a wb_completion defined using the following
- - * macro. Once all work items are issued with wb_queue_work(), the caller
- - * can wait for the completion of all using wb_wait_for_completion(). Work
- - * items which are waited upon aren't freed automatically on completion.
- - */
- -#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \
- - struct wb_completion cmpl = { \
- - .cnt = ATOMIC_INIT(1), \
- - }
- -
- -
- -/*
- - * If an inode is constantly having its pages dirtied, but then the
- - * updates stop dirtytime_expire_interval seconds in the past, it's
- - * possible for the worst case time between when an inode has its
- - * timestamps updated and when they finally get written out to be two
- - * dirtytime_expire_intervals. We set the default to 12 hours (in
- - * seconds), which means most of the time inodes will have their
- - * timestamps written to disk after 12 hours, but in the worst case a
- - * few inodes might not their timestamps updated for 24 hours.
- - */
- -unsigned int dirtytime_expire_interval = 12 * 60 * 60;
- -
- -static inline struct inode *wb_inode(struct list_head *head)
- -{
- - return list_entry(head, struct inode, i_io_list);
- -}
- -
- -/*
- - * Include the creation of the trace points after defining the
- - * wb_writeback_work structure and inline functions so that the definition
- - * remains local to this file.
- - */
- -#define CREATE_TRACE_POINTS
- -#include <trace/events/writeback.h>
- -
- -EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
- -
- -static bool wb_io_lists_populated(struct bdi_writeback *wb)
- -{
- - if (wb_has_dirty_io(wb)) {
- - return false;
- - } else {
- - set_bit(WB_has_dirty_io, &wb->state);
- - WARN_ON_ONCE(!wb->avg_write_bandwidth);
- - atomic_long_add(wb->avg_write_bandwidth,
- - &wb->bdi->tot_write_bandwidth);
- - return true;
- - }
- -}
- -
- -static void wb_io_lists_depopulated(struct bdi_writeback *wb)
- -{
- - if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
- - list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
- - clear_bit(WB_has_dirty_io, &wb->state);
- - WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
- - &wb->bdi->tot_write_bandwidth) < 0);
- - }
- -}
- -
- -/**
- - * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
- - * @inode: inode to be moved
- - * @wb: target bdi_writeback
- - * @head: one of @wb->b_{dirty|io|more_io}
- - *
- - * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
- - * Returns %true if @inode is the first occupant of the !dirty_time IO
- - * lists; otherwise, %false.
- - */
- -static bool inode_io_list_move_locked(struct inode *inode,
- - struct bdi_writeback *wb,
- - struct list_head *head)
- -{
- - assert_spin_locked(&wb->list_lock);
- -
- - list_move(&inode->i_io_list, head);
- -
- - /* dirty_time doesn't count as dirty_io until expiration */
- - if (head != &wb->b_dirty_time)
- - return wb_io_lists_populated(wb);
- -
- - wb_io_lists_depopulated(wb);
- - return false;
- -}
- -
- -/**
- - * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
- - * @inode: inode to be removed
- - * @wb: bdi_writeback @inode is being removed from
- - *
- - * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
- - * clear %WB_has_dirty_io if all are empty afterwards.
- - */
- -static void inode_io_list_del_locked(struct inode *inode,
- - struct bdi_writeback *wb)
- -{
- - assert_spin_locked(&wb->list_lock);
- -
- - list_del_init(&inode->i_io_list);
- - wb_io_lists_depopulated(wb);
- -}
- -
- -static void wb_wakeup(struct bdi_writeback *wb)
- -{
- - spin_lock_bh(&wb->work_lock);
- - if (test_bit(WB_registered, &wb->state))
- - mod_delayed_work(bdi_wq, &wb->dwork, 0);
- - spin_unlock_bh(&wb->work_lock);
- -}
- -
- -static void wb_queue_work(struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - trace_writeback_queue(wb, work);
- -
- - spin_lock_bh(&wb->work_lock);
- - if (!test_bit(WB_registered, &wb->state))
- - goto out_unlock;
- - if (work->done)
- - atomic_inc(&work->done->cnt);
- - list_add_tail(&work->list, &wb->work_list);
- - mod_delayed_work(bdi_wq, &wb->dwork, 0);
- -out_unlock:
- - spin_unlock_bh(&wb->work_lock);
- -}
- -
- -/**
- - * wb_wait_for_completion - wait for completion of bdi_writeback_works
- - * @bdi: bdi work items were issued to
- - * @done: target wb_completion
- - *
- - * Wait for one or more work items issued to @bdi with their ->done field
- - * set to @done, which should have been defined with
- - * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
- - * work items are completed. Work items which are waited upon aren't freed
- - * automatically on completion.
- - */
- -static void wb_wait_for_completion(struct backing_dev_info *bdi,
- - struct wb_completion *done)
- -{
- - atomic_dec(&done->cnt); /* put down the initial count */
- - wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
- -}
- -
- -#ifdef CONFIG_CGROUP_WRITEBACK
- -
- -/* parameters for foreign inode detection, see wb_detach_inode() */
- -#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */
- -#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */
- -#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */
- -#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */
- -
- -#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */
- -#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
- - /* each slot's duration is 2s / 16 */
- -#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
- - /* if foreign slots >= 8, switch */
- -#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
- - /* one round can affect upto 5 slots */
- -
- -static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
- -static struct workqueue_struct *isw_wq;
- -
- -void __inode_attach_wb(struct inode *inode, struct page *page)
- -{
- - struct backing_dev_info *bdi = inode_to_bdi(inode);
- - struct bdi_writeback *wb = NULL;
- -
- - if (inode_cgwb_enabled(inode)) {
- - struct cgroup_subsys_state *memcg_css;
- -
- - if (page) {
- - memcg_css = mem_cgroup_css_from_page(page);
- - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
- - } else {
- - /* must pin memcg_css, see wb_get_create() */
- - memcg_css = task_get_css(current, memory_cgrp_id);
- - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
- - css_put(memcg_css);
- - }
- - }
- -
- - if (!wb)
- - wb = &bdi->wb;
- -
- - /*
- - * There may be multiple instances of this function racing to
- - * update the same inode. Use cmpxchg() to tell the winner.
- - */
- - if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
- - wb_put(wb);
- -}
- -
- -/**
- - * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
- - * @inode: inode of interest with i_lock held
- - *
- - * Returns @inode's wb with its list_lock held. @inode->i_lock must be
- - * held on entry and is released on return. The returned wb is guaranteed
- - * to stay @inode's associated wb until its list_lock is released.
- - */
- -static struct bdi_writeback *
- -locked_inode_to_wb_and_lock_list(struct inode *inode)
- - __releases(&inode->i_lock)
- - __acquires(&wb->list_lock)
- -{
- - while (true) {
- - struct bdi_writeback *wb = inode_to_wb(inode);
- -
- - /*
- - * inode_to_wb() association is protected by both
- - * @inode->i_lock and @wb->list_lock but list_lock nests
- - * outside i_lock. Drop i_lock and verify that the
- - * association hasn't changed after acquiring list_lock.
- - */
- - wb_get(wb);
- - spin_unlock(&inode->i_lock);
- - spin_lock(&wb->list_lock);
- - wb_put(wb); /* not gonna deref it anymore */
- -
- - /* i_wb may have changed inbetween, can't use inode_to_wb() */
- - if (likely(wb == inode->i_wb))
- - return wb; /* @inode already has ref */
- -
- - spin_unlock(&wb->list_lock);
- - cpu_relax();
- - spin_lock(&inode->i_lock);
- - }
- -}
- -
- -/**
- - * inode_to_wb_and_lock_list - determine an inode's wb and lock it
- - * @inode: inode of interest
- - *
- - * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
- - * on entry.
- - */
- -static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
- - __acquires(&wb->list_lock)
- -{
- - spin_lock(&inode->i_lock);
- - return locked_inode_to_wb_and_lock_list(inode);
- -}
- -
- -struct inode_switch_wbs_context {
- - struct inode *inode;
- - struct bdi_writeback *new_wb;
- -
- - struct rcu_head rcu_head;
- - struct work_struct work;
- -};
- -
- -static void inode_switch_wbs_work_fn(struct work_struct *work)
- -{
- - struct inode_switch_wbs_context *isw =
- - container_of(work, struct inode_switch_wbs_context, work);
- - struct inode *inode = isw->inode;
- - struct address_space *mapping = inode->i_mapping;
- - struct bdi_writeback *old_wb = inode->i_wb;
- - struct bdi_writeback *new_wb = isw->new_wb;
- - struct radix_tree_iter iter;
- - bool switched = false;
- - void **slot;
- -
- - /*
- - * By the time control reaches here, RCU grace period has passed
- - * since I_WB_SWITCH assertion and all wb stat update transactions
- - * between unlocked_inode_to_wb_begin/end() are guaranteed to be
- - * synchronizing against mapping->tree_lock.
- - *
- - * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
- - * gives us exclusion against all wb related operations on @inode
- - * including IO list manipulations and stat updates.
- - */
- - if (old_wb < new_wb) {
- - spin_lock(&old_wb->list_lock);
- - spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
- - } else {
- - spin_lock(&new_wb->list_lock);
- - spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
- - }
- - spin_lock(&inode->i_lock);
- - spin_lock_irq(&mapping->tree_lock);
- -
- - /*
- - * Once I_FREEING is visible under i_lock, the eviction path owns
- - * the inode and we shouldn't modify ->i_io_list.
- - */
- - if (unlikely(inode->i_state & I_FREEING))
- - goto skip_switch;
- -
- - /*
- - * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
- - * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
- - * pages actually under writeback.
- - */
- - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
- - PAGECACHE_TAG_DIRTY) {
- - struct page *page = radix_tree_deref_slot_protected(slot,
- - &mapping->tree_lock);
- - if (likely(page) && PageDirty(page)) {
- - __dec_wb_stat(old_wb, WB_RECLAIMABLE);
- - __inc_wb_stat(new_wb, WB_RECLAIMABLE);
- - }
- - }
- -
- - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
- - PAGECACHE_TAG_WRITEBACK) {
- - struct page *page = radix_tree_deref_slot_protected(slot,
- - &mapping->tree_lock);
- - if (likely(page)) {
- - WARN_ON_ONCE(!PageWriteback(page));
- - __dec_wb_stat(old_wb, WB_WRITEBACK);
- - __inc_wb_stat(new_wb, WB_WRITEBACK);
- - }
- - }
- -
- - wb_get(new_wb);
- -
- - /*
- - * Transfer to @new_wb's IO list if necessary. The specific list
- - * @inode was on is ignored and the inode is put on ->b_dirty which
- - * is always correct including from ->b_dirty_time. The transfer
- - * preserves @inode->dirtied_when ordering.
- - */
- - if (!list_empty(&inode->i_io_list)) {
- - struct inode *pos;
- -
- - inode_io_list_del_locked(inode, old_wb);
- - inode->i_wb = new_wb;
- - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
- - if (time_after_eq(inode->dirtied_when,
- - pos->dirtied_when))
- - break;
- - inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
- - } else {
- - inode->i_wb = new_wb;
- - }
- -
- - /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
- - inode->i_wb_frn_winner = 0;
- - inode->i_wb_frn_avg_time = 0;
- - inode->i_wb_frn_history = 0;
- - switched = true;
- -skip_switch:
- - /*
- - * Paired with load_acquire in unlocked_inode_to_wb_begin() and
- - * ensures that the new wb is visible if they see !I_WB_SWITCH.
- - */
- - smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
- -
- - spin_unlock_irq(&mapping->tree_lock);
- - spin_unlock(&inode->i_lock);
- - spin_unlock(&new_wb->list_lock);
- - spin_unlock(&old_wb->list_lock);
- -
- - if (switched) {
- - wb_wakeup(new_wb);
- - wb_put(old_wb);
- - }
- - wb_put(new_wb);
- -
- - iput(inode);
- - kfree(isw);
- -
- - atomic_dec(&isw_nr_in_flight);
- -}
- -
- -static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
- -{
- - struct inode_switch_wbs_context *isw = container_of(rcu_head,
- - struct inode_switch_wbs_context, rcu_head);
- -
- - /* needs to grab bh-unsafe locks, bounce to work item */
- - INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- - queue_work(isw_wq, &isw->work);
- -}
- -
- -/**
- - * inode_switch_wbs - change the wb association of an inode
- - * @inode: target inode
- - * @new_wb_id: ID of the new wb
- - *
- - * Switch @inode's wb association to the wb identified by @new_wb_id. The
- - * switching is performed asynchronously and may fail silently.
- - */
- -static void inode_switch_wbs(struct inode *inode, int new_wb_id)
- -{
- - struct backing_dev_info *bdi = inode_to_bdi(inode);
- - struct cgroup_subsys_state *memcg_css;
- - struct inode_switch_wbs_context *isw;
- -
- - /* noop if seems to be already in progress */
- - if (inode->i_state & I_WB_SWITCH)
- - return;
- -
- - isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
- - if (!isw)
- - return;
- -
- - /* find and pin the new wb */
- - rcu_read_lock();
- - memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
- - if (memcg_css)
- - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
- - rcu_read_unlock();
- - if (!isw->new_wb)
- - goto out_free;
- -
- - /* while holding I_WB_SWITCH, no one else can update the association */
- - spin_lock(&inode->i_lock);
- - if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
- - inode->i_state & (I_WB_SWITCH | I_FREEING) ||
- - inode_to_wb(inode) == isw->new_wb) {
- - spin_unlock(&inode->i_lock);
- - goto out_free;
- - }
- - inode->i_state |= I_WB_SWITCH;
- - spin_unlock(&inode->i_lock);
- -
- - ihold(inode);
- - isw->inode = inode;
- -
- - atomic_inc(&isw_nr_in_flight);
- -
- - /*
- - * In addition to synchronizing among switchers, I_WB_SWITCH tells
- - * the RCU protected stat update paths to grab the mapping's
- - * tree_lock so that stat transfer can synchronize against them.
- - * Let's continue after I_WB_SWITCH is guaranteed to be visible.
- - */
- - call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
- - return;
- -
- -out_free:
- - if (isw->new_wb)
- - wb_put(isw->new_wb);
- - kfree(isw);
- -}
- -
- -/**
- - * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
- - * @wbc: writeback_control of interest
- - * @inode: target inode
- - *
- - * @inode is locked and about to be written back under the control of @wbc.
- - * Record @inode's writeback context into @wbc and unlock the i_lock. On
- - * writeback completion, wbc_detach_inode() should be called. This is used
- - * to track the cgroup writeback context.
- - */
- -void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- - struct inode *inode)
- -{
- - if (!inode_cgwb_enabled(inode)) {
- - spin_unlock(&inode->i_lock);
- - return;
- - }
- -
- - wbc->wb = inode_to_wb(inode);
- - wbc->inode = inode;
- -
- - wbc->wb_id = wbc->wb->memcg_css->id;
- - wbc->wb_lcand_id = inode->i_wb_frn_winner;
- - wbc->wb_tcand_id = 0;
- - wbc->wb_bytes = 0;
- - wbc->wb_lcand_bytes = 0;
- - wbc->wb_tcand_bytes = 0;
- -
- - wb_get(wbc->wb);
- - spin_unlock(&inode->i_lock);
- -
- - /*
- - * A dying wb indicates that the memcg-blkcg mapping has changed
- - * and a new wb is already serving the memcg. Switch immediately.
- - */
- - if (unlikely(wb_dying(wbc->wb)))
- - inode_switch_wbs(inode, wbc->wb_id);
- -}
- -
- -/**
- - * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
- - * @wbc: writeback_control of the just finished writeback
- - *
- - * To be called after a writeback attempt of an inode finishes and undoes
- - * wbc_attach_and_unlock_inode(). Can be called under any context.
- - *
- - * As concurrent write sharing of an inode is expected to be very rare and
- - * memcg only tracks page ownership on first-use basis severely confining
- - * the usefulness of such sharing, cgroup writeback tracks ownership
- - * per-inode. While the support for concurrent write sharing of an inode
- - * is deemed unnecessary, an inode being written to by different cgroups at
- - * different points in time is a lot more common, and, more importantly,
- - * charging only by first-use can too readily lead to grossly incorrect
- - * behaviors (single foreign page can lead to gigabytes of writeback to be
- - * incorrectly attributed).
- - *
- - * To resolve this issue, cgroup writeback detects the majority dirtier of
- - * an inode and transfers the ownership to it. To avoid unnecessary
- - * oscillation, the detection mechanism keeps track of history and gives
- - * out the switch verdict only if the foreign usage pattern is stable over
- - * a certain amount of time and/or writeback attempts.
- - *
- - * On each writeback attempt, @wbc tries to detect the majority writer
- - * using Boyer-Moore majority vote algorithm. In addition to the byte
- - * count from the majority voting, it also counts the bytes written for the
- - * current wb and the last round's winner wb (max of last round's current
- - * wb, the winner from two rounds ago, and the last round's majority
- - * candidate). Keeping track of the historical winner helps the algorithm
- - * to semi-reliably detect the most active writer even when it's not the
- - * absolute majority.
- - *
- - * Once the winner of the round is determined, whether the winner is
- - * foreign or not and how much IO time the round consumed is recorded in
- - * inode->i_wb_frn_history. If the amount of recorded foreign IO time is
- - * over a certain threshold, the switch verdict is given.
- - */
- -void wbc_detach_inode(struct writeback_control *wbc)
- -{
- - struct bdi_writeback *wb = wbc->wb;
- - struct inode *inode = wbc->inode;
- - unsigned long avg_time, max_bytes, max_time;
- - u16 history;
- - int max_id;
- -
- - if (!wb)
- - return;
- -
- - history = inode->i_wb_frn_history;
- - avg_time = inode->i_wb_frn_avg_time;
- -
- - /* pick the winner of this round */
- - if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
- - wbc->wb_bytes >= wbc->wb_tcand_bytes) {
- - max_id = wbc->wb_id;
- - max_bytes = wbc->wb_bytes;
- - } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
- - max_id = wbc->wb_lcand_id;
- - max_bytes = wbc->wb_lcand_bytes;
- - } else {
- - max_id = wbc->wb_tcand_id;
- - max_bytes = wbc->wb_tcand_bytes;
- - }
- -
- - /*
- - * Calculate the amount of IO time the winner consumed and fold it
- - * into the running average kept per inode. If the consumed IO
- - * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
- - * deciding whether to switch or not. This is to prevent one-off
- - * small dirtiers from skewing the verdict.
- - */
- - max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
- - wb->avg_write_bandwidth);
- - if (avg_time)
- - avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
- - (avg_time >> WB_FRN_TIME_AVG_SHIFT);
- - else
- - avg_time = max_time; /* immediate catch up on first run */
- -
- - if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
- - int slots;
- -
- - /*
- - * The switch verdict is reached if foreign wb's consume
- - * more than a certain proportion of IO time in a
- - * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot
- - * history mask where each bit represents one sixteenth of
- - * the period. Determine the number of slots to shift into
- - * history from @max_time.
- - */
- - slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
- - (unsigned long)WB_FRN_HIST_MAX_SLOTS);
- - history <<= slots;
- - if (wbc->wb_id != max_id)
- - history |= (1U << slots) - 1;
- -
- - /*
- - * Switch if the current wb isn't the consistent winner.
- - * If there are multiple closely competing dirtiers, the
- - * inode may switch across them repeatedly over time, which
- - * is okay. The main goal is avoiding keeping an inode on
- - * the wrong wb for an extended period of time.
- - */
- - if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
- - inode_switch_wbs(inode, max_id);
- - }
- -
- - /*
- - * Multiple instances of this function may race to update the
- - * following fields but we don't mind occasional inaccuracies.
- - */
- - inode->i_wb_frn_winner = max_id;
- - inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
- - inode->i_wb_frn_history = history;
- -
- - wb_put(wbc->wb);
- - wbc->wb = NULL;
- -}
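- To make the verdict arithmetic above concrete: a small stand-alone sketch of the running average and 16-slot history update (constants mirror the WB_FRN_* defines, helper names are made up, and this only approximates the kernel logic):
- #include <stdbool.h>
- #include <stdint.h>
- #include <stdio.h>
-
- #define FRN_TIME_AVG_SHIFT      3       /* avg = avg * 7/8 + new * 1/8 */
- #define FRN_TIME_CUT_DIV        2       /* ignore rounds shorter than avg / 2 */
- #define FRN_HIST_THR_SLOTS      8       /* switch once > 8 of 16 slots are foreign */
- #define FRN_HIST_MAX_SLOTS      5       /* one round can affect at most 5 slots */
-
- /* fold one writeback round into the average and history; true means "switch" */
- static bool fold_round(uint16_t *history, unsigned long *avg,
-                        unsigned long round_time, int slots, bool foreign)
- {
-         if (*avg)
-                 *avg += (round_time >> FRN_TIME_AVG_SHIFT) -
-                         (*avg >> FRN_TIME_AVG_SHIFT);
-         else
-                 *avg = round_time;              /* catch up on the first round */
-
-         if (round_time < *avg / FRN_TIME_CUT_DIV)
-                 return false;                   /* one-off small dirtier, ignore */
-
-         if (slots > FRN_HIST_MAX_SLOTS)
-                 slots = FRN_HIST_MAX_SLOTS;
-         *history <<= slots;
-         if (foreign)
-                 *history |= (1U << slots) - 1;
-
-         return __builtin_popcount(*history) > FRN_HIST_THR_SLOTS;
- }
-
- int main(void)
- {
-         uint16_t history = 0;
-         unsigned long avg = 0;
-         int round;
-
-         /* a foreign cgroup keeps claiming 3 of the 16 slots every round */
-         for (round = 0; round < 5; round++) {
-                 bool switch_now = fold_round(&history, &avg, 1000, 3, true);
-
-                 printf("round %d: switch=%d history=%#x\n",
-                        round, switch_now, (unsigned int)history);
-         }
-         return 0;
- }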
- -
- -/**
- - * wbc_account_io - account IO issued during writeback
- - * @wbc: writeback_control of the writeback in progress
- - * @page: page being written out
- - * @bytes: number of bytes being written out
- - *
- - * @bytes from @page are about to be written out during the writeback
- - * controlled by @wbc. Keep the book for foreign inode detection. See
- - * wbc_detach_inode().
- - */
- -void wbc_account_io(struct writeback_control *wbc, struct page *page,
- - size_t bytes)
- -{
- - int id;
- -
- - /*
- - * pageout() path doesn't attach @wbc to the inode being written
- - * out. This is intentional as we don't want the function to block
- - * behind a slow cgroup. Ultimately, we want pageout() to kick off
- - * regular writeback instead of writing things out itself.
- - */
- - if (!wbc->wb)
- - return;
- -
- - rcu_read_lock();
- - id = mem_cgroup_css_from_page(page)->id;
- - rcu_read_unlock();
- -
- - if (id == wbc->wb_id) {
- - wbc->wb_bytes += bytes;
- - return;
- - }
- -
- - if (id == wbc->wb_lcand_id)
- - wbc->wb_lcand_bytes += bytes;
- -
- - /* Boyer-Moore majority vote algorithm */
- - if (!wbc->wb_tcand_bytes)
- - wbc->wb_tcand_id = id;
- - if (id == wbc->wb_tcand_id)
- - wbc->wb_tcand_bytes += bytes;
- - else
- - wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
- -}
- -EXPORT_SYMBOL_GPL(wbc_account_io);
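- The byte-weighted Boyer-Moore vote above is easy to exercise in isolation; a minimal user-space sketch (hypothetical struct and field names, not part of the patch):
- #include <stddef.h>
- #include <stdio.h>
-
- struct vote {
-         int     owner_id;       /* wb currently owning the inode */
-         size_t  owner_bytes;
-         int     cand_id;        /* running majority candidate */
-         size_t  cand_bytes;
- };
-
- static void account(struct vote *v, int id, size_t bytes)
- {
-         if (id == v->owner_id) {
-                 v->owner_bytes += bytes;
-                 return;
-         }
-         /* Boyer-Moore: a drained counter lets a new candidate take over */
-         if (!v->cand_bytes)
-                 v->cand_id = id;
-         if (id == v->cand_id)
-                 v->cand_bytes += bytes;
-         else
-                 v->cand_bytes -= bytes < v->cand_bytes ? bytes : v->cand_bytes;
- }
-
- int main(void)
- {
-         struct vote v = { .owner_id = 1 };
-
-         account(&v, 2, 4096);   /* foreign cgroup 2 writes one page */
-         account(&v, 3, 4096);   /* cgroup 3 cancels it out */
-         account(&v, 2, 8192);   /* cgroup 2 takes the candidacy back */
-         printf("candidate %d with %zu bytes\n", v.cand_id, v.cand_bytes);
-         return 0;
- }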
- -
- -/**
- - * inode_congested - test whether an inode is congested
- - * @inode: inode to test for congestion (may be NULL)
- - * @cong_bits: mask of WB_[a]sync_congested bits to test
- - *
- - * Tests whether @inode is congested. @cong_bits is the mask of congestion
- - * bits to test and the return value is the mask of set bits.
- - *
- - * If cgroup writeback is enabled for @inode, the congestion state is
- - * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
- - * associated with @inode is congested; otherwise, the root wb's congestion
- - * state is used.
- - *
- - * @inode is allowed to be NULL as this function is often called on
- - * mapping->host which is NULL for the swapper space.
- - */
- -int inode_congested(struct inode *inode, int cong_bits)
- -{
- - /*
- - * Once set, ->i_wb never becomes NULL while the inode is alive.
- - * Start transaction iff ->i_wb is visible.
- - */
- - if (inode && inode_to_wb_is_valid(inode)) {
- - struct bdi_writeback *wb;
- - bool locked, congested;
- -
- - wb = unlocked_inode_to_wb_begin(inode, &locked);
- - congested = wb_congested(wb, cong_bits);
- - unlocked_inode_to_wb_end(inode, locked);
- - return congested;
- - }
- -
- - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
- -}
- -EXPORT_SYMBOL_GPL(inode_congested);
- -
- -/**
- - * wb_split_bdi_pages - split nr_pages to write according to bandwidth
- - * @wb: target bdi_writeback to split @nr_pages to
- - * @nr_pages: number of pages to write for the whole bdi
- - *
- - * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
- - * relation to the total write bandwidth of all wb's w/ dirty inodes on
- - * @wb->bdi.
- - */
- -static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
- -{
- - unsigned long this_bw = wb->avg_write_bandwidth;
- - unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
- -
- - if (nr_pages == LONG_MAX)
- - return LONG_MAX;
- -
- - /*
- - * This may be called on clean wb's and proportional distribution
- - * may not make sense, just use the original @nr_pages in those
- - * cases. In general, we wanna err on the side of writing more.
- - */
- - if (!tot_bw || this_bw >= tot_bw)
- - return nr_pages;
- - else
- - return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
- -}
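- A quick stand-alone check of the proportional split above, with DIV_ROUND_UP_ULL written out as plain round-up division (the bandwidth numbers are made up):
- #include <stdio.h>
-
- static long split_pages(long nr_pages, unsigned long this_bw,
-                         unsigned long tot_bw)
- {
-         if (!tot_bw || this_bw >= tot_bw)
-                 return nr_pages;        /* err on the side of writing more */
-         return (long)(((unsigned long long)nr_pages * this_bw + tot_bw - 1) /
-                       tot_bw);
- }
-
- int main(void)
- {
-         /* a wb with a quarter of the bdi's bandwidth gets a quarter of the pages */
-         printf("%ld\n", split_pages(1024, 25, 100));    /* prints 256 */
-         return 0;
- }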
- -
- -/**
- - * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
- - * @bdi: target backing_dev_info
- - * @base_work: wb_writeback_work to issue
- - * @skip_if_busy: skip wb's which already have writeback in progress
- - *
- - * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
- - * have dirty inodes. If @base_work->nr_pages isn't %LONG_MAX, it's
- - * distributed to the busy wbs according to each wb's proportion in the
- - * total active write bandwidth of @bdi.
- - */
- -static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
- - struct wb_writeback_work *base_work,
- - bool skip_if_busy)
- -{
- - struct bdi_writeback *last_wb = NULL;
- - struct bdi_writeback *wb = list_entry(&bdi->wb_list,
- - struct bdi_writeback, bdi_node);
- -
- - might_sleep();
- -restart:
- - rcu_read_lock();
- - list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
- - DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
- - struct wb_writeback_work fallback_work;
- - struct wb_writeback_work *work;
- - long nr_pages;
- -
- - if (last_wb) {
- - wb_put(last_wb);
- - last_wb = NULL;
- - }
- -
- - /* SYNC_ALL writes out I_DIRTY_TIME too */
- - if (!wb_has_dirty_io(wb) &&
- - (base_work->sync_mode == WB_SYNC_NONE ||
- - list_empty(&wb->b_dirty_time)))
- - continue;
- - if (skip_if_busy && writeback_in_progress(wb))
- - continue;
- -
- - nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
- -
- - work = kmalloc(sizeof(*work), GFP_ATOMIC);
- - if (work) {
- - *work = *base_work;
- - work->nr_pages = nr_pages;
- - work->auto_free = 1;
- - wb_queue_work(wb, work);
- - continue;
- - }
- -
- - /* alloc failed, execute synchronously using on-stack fallback */
- - work = &fallback_work;
- - *work = *base_work;
- - work->nr_pages = nr_pages;
- - work->auto_free = 0;
- - work->done = &fallback_work_done;
- -
- - wb_queue_work(wb, work);
- -
- - /*
- - * Pin @wb so that it stays on @bdi->wb_list. This allows
- - * continuing iteration from @wb after dropping and
- - * regrabbing rcu read lock.
- - */
- - wb_get(wb);
- - last_wb = wb;
- -
- - rcu_read_unlock();
- - wb_wait_for_completion(bdi, &fallback_work_done);
- - goto restart;
- - }
- - rcu_read_unlock();
- -
- - if (last_wb)
- - wb_put(last_wb);
- -}
- -
- -/**
- - * cgroup_writeback_umount - flush inode wb switches for umount
- - *
- - * This function is called when a super_block is about to be destroyed and
- - * flushes in-flight inode wb switches. An inode wb switch goes through
- - * RCU and then workqueue, so the two need to be flushed in order to ensure
- - * that all previously scheduled switches are finished. As wb switches are
- - * rare occurrences and synchronize_rcu() can take a while, perform
- - * flushing iff wb switches are in flight.
- - */
- -void cgroup_writeback_umount(void)
- -{
- - if (atomic_read(&isw_nr_in_flight)) {
- - synchronize_rcu();
- - flush_workqueue(isw_wq);
- - }
- -}
- -
- -static int __init cgroup_writeback_init(void)
- -{
- - isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
- - if (!isw_wq)
- - return -ENOMEM;
- - return 0;
- -}
- -fs_initcall(cgroup_writeback_init);
- -
- -#else /* CONFIG_CGROUP_WRITEBACK */
- -
- -static struct bdi_writeback *
- -locked_inode_to_wb_and_lock_list(struct inode *inode)
- - __releases(&inode->i_lock)
- - __acquires(&wb->list_lock)
- -{
- - struct bdi_writeback *wb = inode_to_wb(inode);
- -
- - spin_unlock(&inode->i_lock);
- - spin_lock(&wb->list_lock);
- - return wb;
- -}
- -
- -static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
- - __acquires(&wb->list_lock)
- -{
- - struct bdi_writeback *wb = inode_to_wb(inode);
- -
- - spin_lock(&wb->list_lock);
- - return wb;
- -}
- -
- -static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
- -{
- - return nr_pages;
- -}
- -
- -static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
- - struct wb_writeback_work *base_work,
- - bool skip_if_busy)
- -{
- - might_sleep();
- -
- - if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
- - base_work->auto_free = 0;
- - wb_queue_work(&bdi->wb, base_work);
- - }
- -}
- -
- -#endif /* CONFIG_CGROUP_WRITEBACK */
- -
- -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
- - bool range_cyclic, enum wb_reason reason)
- -{
- - struct wb_writeback_work *work;
- -
- - if (!wb_has_dirty_io(wb))
- - return;
- -
- - /*
- - * This is WB_SYNC_NONE writeback, so if allocation fails just
- - * wakeup the thread for old dirty data writeback
- - */
- - work = kzalloc(sizeof(*work), GFP_ATOMIC);
- - if (!work) {
- - trace_writeback_nowork(wb);
- - wb_wakeup(wb);
- - return;
- - }
- -
- - work->sync_mode = WB_SYNC_NONE;
- - work->nr_pages = nr_pages;
- - work->range_cyclic = range_cyclic;
- - work->reason = reason;
- - work->auto_free = 1;
- -
- - wb_queue_work(wb, work);
- -}
- -
- -/**
- - * wb_start_background_writeback - start background writeback
- - * @wb: bdi_writback to write from
- - *
- - * Description:
- - * This makes sure WB_SYNC_NONE background writeback happens. When
- - * this function returns, it is only guaranteed that for given wb
- - * some IO is happening if we are over background dirty threshold.
- - * Caller need not hold sb s_umount semaphore.
- - */
- -void wb_start_background_writeback(struct bdi_writeback *wb)
- -{
- - /*
- - * We just wake up the flusher thread. It will perform background
- - * writeback as soon as there is no other work to do.
- - */
- - trace_writeback_wake_background(wb);
- - wb_wakeup(wb);
- -}
- -
- -/*
- - * Remove the inode from the writeback list it is on.
- - */
- -void inode_io_list_del(struct inode *inode)
- -{
- - struct bdi_writeback *wb;
- -
- - wb = inode_to_wb_and_lock_list(inode);
- - inode_io_list_del_locked(inode, wb);
- - spin_unlock(&wb->list_lock);
- -}
- -
- -/*
- - * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
- - * furthest end of its superblock's dirty-inode list.
- - *
- - * Before stamping the inode's ->dirtied_when, we check to see whether it is
- - * already the most-recently-dirtied inode on the b_dirty list. If that is
- - * the case then the inode must have been redirtied while it was being written
- - * out and we don't reset its dirtied_when.
- - */
- -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
- -{
- - if (!list_empty(&wb->b_dirty)) {
- - struct inode *tail;
- -
- - tail = wb_inode(wb->b_dirty.next);
- - if (time_before(inode->dirtied_when, tail->dirtied_when))
- - inode->dirtied_when = jiffies;
- - }
- - inode_io_list_move_locked(inode, wb, &wb->b_dirty);
- -}
- -
- -/*
- - * requeue inode for re-scanning after bdi->b_io list is exhausted.
- - */
- -static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
- -{
- - inode_io_list_move_locked(inode, wb, &wb->b_more_io);
- -}
- -
- -static void inode_sync_complete(struct inode *inode)
- -{
- - inode->i_state &= ~I_SYNC;
- - /* If inode is clean and unused, put it into LRU now... */
- - inode_add_lru(inode);
- - /* Waiters must see I_SYNC cleared before being woken up */
- - smp_mb();
- - wake_up_bit(&inode->i_state, __I_SYNC);
- -}
- -
- -static bool inode_dirtied_after(struct inode *inode, unsigned long t)
- -{
- - bool ret = time_after(inode->dirtied_when, t);
- -#ifndef CONFIG_64BIT
- - /*
- - * For inodes being constantly redirtied, dirtied_when can get stuck.
- - * It _appears_ to be in the future, but is actually in distant past.
- - * This test is necessary to prevent such wrapped-around relative times
- - * from permanently stopping the whole bdi writeback.
- - */
- - ret = ret && time_before_eq(inode->dirtied_when, jiffies);
- -#endif
- - return ret;
- -}
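- The wraparound the #ifndef CONFIG_64BIT clause above guards against is easy to reproduce in user space with the usual time_after()/time_before_eq() definitions (re-implemented here for 32-bit jiffies; illustrative only):
- #include <stdint.h>
- #include <stdio.h>
-
- #define time_after(a, b)        ((int32_t)((b) - (a)) < 0)
- #define time_before_eq(a, b)    ((int32_t)((a) - (b)) <= 0)
-
- int main(void)
- {
-         uint32_t jiffies = 0x10000000;
-         uint32_t dirtied = jiffies - 0x90000000;        /* dirtied ages ago, wrapped */
-
-         /* the naive check mistakes the ancient timestamp for the future */
-         printf("naive:   %d\n", time_after(dirtied, jiffies));
-         /* the extra clause filters the wrapped value out */
-         printf("guarded: %d\n", time_after(dirtied, jiffies) &&
-                                 time_before_eq(dirtied, jiffies));
-         return 0;
- }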
- -
- -#define EXPIRE_DIRTY_ATIME 0x0001
- -
- -/*
- - * Move expired (dirtied before work->older_than_this) dirty inodes from
- - * @delaying_queue to @dispatch_queue.
- - */
- -static int move_expired_inodes(struct list_head *delaying_queue,
- - struct list_head *dispatch_queue,
- - int flags,
- - struct wb_writeback_work *work)
- -{
- - unsigned long *older_than_this = NULL;
- - unsigned long expire_time;
- - LIST_HEAD(tmp);
- - struct list_head *pos, *node;
- - struct super_block *sb = NULL;
- - struct inode *inode;
- - int do_sb_sort = 0;
- - int moved = 0;
- -
- - if ((flags & EXPIRE_DIRTY_ATIME) == 0)
- - older_than_this = work->older_than_this;
- - else if (!work->for_sync) {
- - expire_time = jiffies - (dirtytime_expire_interval * HZ);
- - older_than_this = &expire_time;
- - }
- - while (!list_empty(delaying_queue)) {
- - inode = wb_inode(delaying_queue->prev);
- - if (older_than_this &&
- - inode_dirtied_after(inode, *older_than_this))
- - break;
- - list_move(&inode->i_io_list, &tmp);
- - moved++;
- - if (flags & EXPIRE_DIRTY_ATIME)
- - set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
- - if (sb_is_blkdev_sb(inode->i_sb))
- - continue;
- - if (sb && sb != inode->i_sb)
- - do_sb_sort = 1;
- - sb = inode->i_sb;
- - }
- -
- - /* just one sb in list, splice to dispatch_queue and we're done */
- - if (!do_sb_sort) {
- - list_splice(&tmp, dispatch_queue);
- - goto out;
- - }
- -
- - /* Move inodes from one superblock together */
- - while (!list_empty(&tmp)) {
- - sb = wb_inode(tmp.prev)->i_sb;
- - list_for_each_prev_safe(pos, node, &tmp) {
- - inode = wb_inode(pos);
- - if (inode->i_sb == sb)
- - list_move(&inode->i_io_list, dispatch_queue);
- - }
- - }
- -out:
- - return moved;
- -}
- -
- -/*
- - * Queue all expired dirty inodes for io, eldest first.
- - * Before
- - *         newly dirtied     b_dirty    b_io    b_more_io
- - *         =============>    gf         edc     BA
- - * After
- - *         newly dirtied     b_dirty    b_io    b_more_io
- - *         =============>    g          fBAedc
- - *                                           |
- - *                                           +--> dequeue for IO
- - */
- -static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
- -{
- - int moved;
- -
- - assert_spin_locked(&wb->list_lock);
- - list_splice_init(&wb->b_more_io, &wb->b_io);
- - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
- - moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
- - EXPIRE_DIRTY_ATIME, work);
- - if (moved)
- - wb_io_lists_populated(wb);
- - trace_writeback_queue_io(wb, work, moved);
- -}
- -
- -static int write_inode(struct inode *inode, struct writeback_control *wbc)
- -{
- - int ret;
- -
- - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
- - trace_writeback_write_inode_start(inode, wbc);
- - ret = inode->i_sb->s_op->write_inode(inode, wbc);
- - trace_writeback_write_inode(inode, wbc);
- - return ret;
- - }
- - return 0;
- -}
- -
- -/*
- - * Wait for writeback on an inode to complete. Called with i_lock held.
- - * Caller must make sure inode cannot go away when we drop i_lock.
- - */
- -static void __inode_wait_for_writeback(struct inode *inode)
- - __releases(inode->i_lock)
- - __acquires(inode->i_lock)
- -{
- - DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
- - wait_queue_head_t *wqh;
- -
- - wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
- - while (inode->i_state & I_SYNC) {
- - spin_unlock(&inode->i_lock);
- - __wait_on_bit(wqh, &wq, bit_wait,
- - TASK_UNINTERRUPTIBLE);
- - spin_lock(&inode->i_lock);
- - }
- -}
- -
- -/*
- - * Wait for writeback on an inode to complete. Caller must have inode pinned.
- - */
- -void inode_wait_for_writeback(struct inode *inode)
- -{
- - spin_lock(&inode->i_lock);
- - __inode_wait_for_writeback(inode);
- - spin_unlock(&inode->i_lock);
- -}
- -
- -/*
- - * Sleep until I_SYNC is cleared. This function must be called with i_lock
- - * held and drops it. It is aimed for callers not holding any inode reference
- - * so once i_lock is dropped, inode can go away.
- - */
- -static void inode_sleep_on_writeback(struct inode *inode)
- - __releases(inode->i_lock)
- -{
- - DEFINE_WAIT(wait);
- - wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
- - int sleep;
- -
- - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- - sleep = inode->i_state & I_SYNC;
- - spin_unlock(&inode->i_lock);
- - if (sleep)
- - schedule();
- - finish_wait(wqh, &wait);
- -}
- -
- -/*
- - * Find proper writeback list for the inode depending on its current state and
- - * possibly also change of its state while we were doing writeback. Here we
- - * handle things such as livelock prevention or fairness of writeback among
- - * inodes. This function can be called only by the flusher thread - no one else
- - * processes all inodes in writeback lists and requeueing inodes behind flusher
- - * thread's back can have unexpected consequences.
- - */
- -static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
- - struct writeback_control *wbc)
- -{
- - if (inode->i_state & I_FREEING)
- - return;
- -
- - /*
- - * Sync livelock prevention. Each inode is tagged and synced in one
- - * shot. If still dirty, it will be redirty_tail()'ed below. Update
- - * the dirty time to prevent enqueue and sync it again.
- - */
- - if ((inode->i_state & I_DIRTY) &&
- - (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
- - inode->dirtied_when = jiffies;
- -
- - if (wbc->pages_skipped) {
- - /*
- - * writeback is not making progress due to locked
- - * buffers. Skip this inode for now.
- - */
- - redirty_tail(inode, wb);
- - return;
- - }
- -
- - if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
- - /*
- - * We didn't write back all the pages. nfs_writepages()
- - * sometimes bails out without doing anything.
- - */
- - if (wbc->nr_to_write <= 0) {
- - /* Slice used up. Queue for next turn. */
- - requeue_io(inode, wb);
- - } else {
- - /*
- - * Writeback blocked by something other than
- - * congestion. Delay the inode for some time to
- - * avoid spinning on the CPU (100% iowait)
- - * retrying writeback of the dirty page/inode
- - * that cannot be performed immediately.
- - */
- - redirty_tail(inode, wb);
- - }
- - } else if (inode->i_state & I_DIRTY) {
- - /*
- - * Filesystems can dirty the inode during writeback operations,
- - * such as delayed allocation during submission or metadata
- - * updates after data IO completion.
- - */
- - redirty_tail(inode, wb);
- - } else if (inode->i_state & I_DIRTY_TIME) {
- - inode->dirtied_when = jiffies;
- - inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
- - } else {
- - /* The inode is clean. Remove from writeback lists. */
- - inode_io_list_del_locked(inode, wb);
- - }
- -}
- -
- -/*
- - * Write out an inode and its dirty pages. Do not update the writeback list
- - * linkage. That is left to the caller. The caller is also responsible for
- - * setting I_SYNC flag and calling inode_sync_complete() to clear it.
- - */
- -static int
- -__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
- -{
- - struct address_space *mapping = inode->i_mapping;
- - long nr_to_write = wbc->nr_to_write;
- - unsigned dirty;
- - int ret;
- -
- - WARN_ON(!(inode->i_state & I_SYNC));
- -
- - trace_writeback_single_inode_start(inode, wbc, nr_to_write);
- -
- - ret = do_writepages(mapping, wbc);
- -
- - /*
- - * Make sure to wait on the data before writing out the metadata.
- - * This is important for filesystems that modify metadata on data
- - * I/O completion. We don't do it for sync(2) writeback because it has a
- - * separate, external IO completion path and ->sync_fs for guaranteeing
- - * inode metadata is written back correctly.
- - */
- - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
- - int err = filemap_fdatawait(mapping);
- - if (ret == 0)
- - ret = err;
- - }
- -
- - /*
- - * Some filesystems may redirty the inode during the writeback
- - * due to delalloc, clear dirty metadata flags right before
- - * write_inode()
- - */
- - spin_lock(&inode->i_lock);
- -
- - dirty = inode->i_state & I_DIRTY;
- - if (inode->i_state & I_DIRTY_TIME) {
- - if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
- - unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
- - unlikely(time_after(jiffies,
- - (inode->dirtied_time_when +
- - dirtytime_expire_interval * HZ)))) {
- - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
- - trace_writeback_lazytime(inode);
- - }
- - } else
- - inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
- - inode->i_state &= ~dirty;
- -
- - /*
- - * Paired with smp_mb() in __mark_inode_dirty(). This allows
- - * __mark_inode_dirty() to test i_state without grabbing i_lock -
- - * either they see the I_DIRTY bits cleared or we see the dirtied
- - * inode.
- - *
- - * I_DIRTY_PAGES is always cleared together above even if @mapping
- - * still has dirty pages. The flag is reinstated after smp_mb() if
- - * necessary. This guarantees that either __mark_inode_dirty()
- - * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
- - */
- - smp_mb();
- -
- - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- - inode->i_state |= I_DIRTY_PAGES;
- -
- - spin_unlock(&inode->i_lock);
- -
- - if (dirty & I_DIRTY_TIME)
- - mark_inode_dirty_sync(inode);
- - /* Don't write the inode if only I_DIRTY_PAGES was set */
- - if (dirty & ~I_DIRTY_PAGES) {
- - int err = write_inode(inode, wbc);
- - if (ret == 0)
- - ret = err;
- - }
- - trace_writeback_single_inode(inode, wbc, nr_to_write);
- - return ret;
- -}
- -
- -/*
- - * Write out an inode's dirty pages. Either the caller has an active reference
- - * on the inode or the inode has I_WILL_FREE set.
- - *
- - * This function is designed to be called for writing back one inode which
- - * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
- - * and does more profound writeback list handling in writeback_sb_inodes().
- - */
- -static int
- -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
- - struct writeback_control *wbc)
- -{
- - int ret = 0;
- -
- - spin_lock(&inode->i_lock);
- - if (!atomic_read(&inode->i_count))
- - WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
- - else
- - WARN_ON(inode->i_state & I_WILL_FREE);
- -
- - if (inode->i_state & I_SYNC) {
- - if (wbc->sync_mode != WB_SYNC_ALL)
- - goto out;
- - /*
- - * It's a data-integrity sync. We must wait. Since callers hold
- - * inode reference or inode has I_WILL_FREE set, it cannot go
- - * away under us.
- - */
- - __inode_wait_for_writeback(inode);
- - }
- - WARN_ON(inode->i_state & I_SYNC);
- - /*
- - * Skip inode if it is clean and we have no outstanding writeback in
- - * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
- - * function since flusher thread may be doing for example sync in
- - * parallel and if we move the inode, it could get skipped. So here we
- - * make sure inode is on some writeback list and leave it there unless
- - * we have completely cleaned the inode.
- - */
- - if (!(inode->i_state & I_DIRTY_ALL) &&
- - (wbc->sync_mode != WB_SYNC_ALL ||
- - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
- - goto out;
- - inode->i_state |= I_SYNC;
- - wbc_attach_and_unlock_inode(wbc, inode);
- -
- - ret = __writeback_single_inode(inode, wbc);
- -
- - wbc_detach_inode(wbc);
- - spin_lock(&wb->list_lock);
- - spin_lock(&inode->i_lock);
- - /*
- - * If inode is clean, remove it from writeback lists. Otherwise don't
- - * touch it. See comment above for explanation.
- - */
- - if (!(inode->i_state & I_DIRTY_ALL))
- - inode_io_list_del_locked(inode, wb);
- - spin_unlock(&wb->list_lock);
- - inode_sync_complete(inode);
- -out:
- - spin_unlock(&inode->i_lock);
- - return ret;
- -}
- -
- -static long writeback_chunk_size(struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - long pages;
- -
- - /*
- - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
- - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
- - * here avoids calling into writeback_inodes_wb() more than once.
- - *
- - * The intended call sequence for WB_SYNC_ALL writeback is:
- - *
- - * wb_writeback()
- - * writeback_sb_inodes() <== called only once
- - * write_cache_pages() <== called once for each inode
- - * (quickly) tag currently dirty pages
- - * (maybe slowly) sync all tagged pages
- - */
- - if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
- - pages = LONG_MAX;
- - else {
- - pages = min(wb->avg_write_bandwidth / 2,
- - global_wb_domain.dirty_limit / DIRTY_SCOPE);
- - pages = min(pages, work->nr_pages);
- - pages = round_down(pages + MIN_WRITEBACK_PAGES,
- - MIN_WRITEBACK_PAGES);
- - }
- -
- - return pages;
- -}
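- As a rough illustration of the chunk sizing above: a user-space sketch with made-up stand-ins for MIN_WRITEBACK_PAGES, the bandwidth, and the dirty limit (DIRTY_SCOPE assumed to be 8 here):
- #include <stdio.h>
-
- #define MIN_WRITEBACK_PAGES     2048L   /* placeholder, not the kernel value */
-
- static long min_l(long a, long b) { return a < b ? a : b; }
-
- static long chunk_size(long avg_write_bw, long dirty_limit, long work_pages)
- {
-         long pages = min_l(avg_write_bw / 2, dirty_limit / 8 /* DIRTY_SCOPE */);
-
-         pages = min_l(pages, work_pages);
-         /* round down to a multiple of MIN_WRITEBACK_PAGES, but never below it */
-         return (pages + MIN_WRITEBACK_PAGES) / MIN_WRITEBACK_PAGES *
-                MIN_WRITEBACK_PAGES;
- }
-
- int main(void)
- {
-         printf("%ld\n", chunk_size(20000, 100000, 1000000));   /* prints 10240 */
-         return 0;
- }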
- -
- -/*
- - * Write a portion of b_io inodes which belong to @sb.
- - *
- - * Return the number of pages and/or inodes written.
- - *
- - * NOTE! This is called with wb->list_lock held, and will
- - * unlock and relock that for each inode it ends up doing
- - * IO for.
- - */
- -static long writeback_sb_inodes(struct super_block *sb,
- - struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - struct writeback_control wbc = {
- - .sync_mode = work->sync_mode,
- - .tagged_writepages = work->tagged_writepages,
- - .for_kupdate = work->for_kupdate,
- - .for_background = work->for_background,
- - .for_sync = work->for_sync,
- - .range_cyclic = work->range_cyclic,
- - .range_start = 0,
- - .range_end = LLONG_MAX,
- - };
- - unsigned long start_time = jiffies;
- - long write_chunk;
- - long wrote = 0; /* count both pages and inodes */
- -
- - while (!list_empty(&wb->b_io)) {
- - struct inode *inode = wb_inode(wb->b_io.prev);
- -
- - if (inode->i_sb != sb) {
- - if (work->sb) {
- - /*
- - * We only want to write back data for this
- - * superblock, move all inodes not belonging
- - * to it back onto the dirty list.
- - */
- - redirty_tail(inode, wb);
- - continue;
- - }
- -
- - /*
- - * The inode belongs to a different superblock.
- - * Bounce back to the caller to unpin this and
- - * pin the next superblock.
- - */
- - break;
- - }
- -
- - /*
- - * Don't bother with new inodes or inodes being freed, first
- - * kind does not need periodic writeout yet, and for the latter
- - * kind writeout is handled by the freer.
- - */
- - spin_lock(&inode->i_lock);
- - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
- - spin_unlock(&inode->i_lock);
- - redirty_tail(inode, wb);
- - continue;
- - }
- - if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
- - /*
- - * If this inode is locked for writeback and we are not
- - * doing writeback-for-data-integrity, move it to
- - * b_more_io so that writeback can proceed with the
- - * other inodes on s_io.
- - *
- - * We'll have another go at writing back this inode
- - * when we completed a full scan of b_io.
- - */
- - spin_unlock(&inode->i_lock);
- - requeue_io(inode, wb);
- - trace_writeback_sb_inodes_requeue(inode);
- - continue;
- - }
- - spin_unlock(&wb->list_lock);
- -
- - /*
- - * We already requeued the inode if it had I_SYNC set and we
- - * are doing WB_SYNC_NONE writeback. So this catches only the
- - * WB_SYNC_ALL case.
- - */
- - if (inode->i_state & I_SYNC) {
- - /* Wait for I_SYNC. This function drops i_lock... */
- - inode_sleep_on_writeback(inode);
- - /* Inode may be gone, start again */
- - spin_lock(&wb->list_lock);
- - continue;
- - }
- - inode->i_state |= I_SYNC;
- - wbc_attach_and_unlock_inode(&wbc, inode);
- -
- - write_chunk = writeback_chunk_size(wb, work);
- - wbc.nr_to_write = write_chunk;
- - wbc.pages_skipped = 0;
- -
- - /*
- - * We use I_SYNC to pin the inode in memory. While it is set
- - * evict_inode() will wait so the inode cannot be freed.
- - */
- - __writeback_single_inode(inode, &wbc);
- -
- - wbc_detach_inode(&wbc);
- - work->nr_pages -= write_chunk - wbc.nr_to_write;
- - wrote += write_chunk - wbc.nr_to_write;
- -
- - if (need_resched()) {
- - /*
- - * We're trying to balance between building up a nice
- - * long list of IOs to improve our merge rate, and
- - * getting those IOs out quickly for anyone throttling
- - * in balance_dirty_pages(). cond_resched() doesn't
- - * unplug, so get our IOs out the door before we
- - * give up the CPU.
- - */
- - blk_flush_plug(current);
- - cond_resched();
- - }
- -
- -
- - spin_lock(&wb->list_lock);
- - spin_lock(&inode->i_lock);
- - if (!(inode->i_state & I_DIRTY_ALL))
- - wrote++;
- - requeue_inode(inode, wb, &wbc);
- - inode_sync_complete(inode);
- - spin_unlock(&inode->i_lock);
- -
- - /*
- - * bail out to wb_writeback() often enough to check
- - * background threshold and other termination conditions.
- - */
- - if (wrote) {
- - if (time_is_before_jiffies(start_time + HZ / 10UL))
- - break;
- - if (work->nr_pages <= 0)
- - break;
- - }
- - }
- - return wrote;
- -}
- -
- -static long __writeback_inodes_wb(struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - unsigned long start_time = jiffies;
- - long wrote = 0;
- -
- - while (!list_empty(&wb->b_io)) {
- - struct inode *inode = wb_inode(wb->b_io.prev);
- - struct super_block *sb = inode->i_sb;
- -
- - if (!trylock_super(sb)) {
- - /*
- - * trylock_super() may fail consistently due to
- - * s_umount being grabbed by someone else. Don't use
- - * requeue_io() to avoid busy retrying the inode/sb.
- - */
- - redirty_tail(inode, wb);
- - continue;
- - }
- - wrote += writeback_sb_inodes(sb, wb, work);
- - up_read(&sb->s_umount);
- -
- - /* refer to the same tests at the end of writeback_sb_inodes */
- - if (wrote) {
- - if (time_is_before_jiffies(start_time + HZ / 10UL))
- - break;
- - if (work->nr_pages <= 0)
- - break;
- - }
- - }
- - /* Leave any unwritten inodes on b_io */
- - return wrote;
- -}
- -
- -static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
- - enum wb_reason reason)
- -{
- - struct wb_writeback_work work = {
- - .nr_pages = nr_pages,
- - .sync_mode = WB_SYNC_NONE,
- - .range_cyclic = 1,
- - .reason = reason,
- - };
- - struct blk_plug plug;
- -
- - blk_start_plug(&plug);
- - spin_lock(&wb->list_lock);
- - if (list_empty(&wb->b_io))
- - queue_io(wb, &work);
- - __writeback_inodes_wb(wb, &work);
- - spin_unlock(&wb->list_lock);
- - blk_finish_plug(&plug);
- -
- - return nr_pages - work.nr_pages;
- -}
- -
- -/*
- - * Explicit flushing or periodic writeback of "old" data.
- - *
- - * Define "old": the first time one of an inode's pages is dirtied, we mark the
- - * dirtying-time in the inode's address_space. So this periodic writeback code
- - * just walks the superblock inode list, writing back any inodes which are
- - * older than a specific point in time.
- - *
- - * Try to run once per dirty_writeback_interval. But if a writeback event
- - * takes longer than a dirty_writeback_interval interval, then leave a
- - * one-second gap.
- - *
- - * older_than_this takes precedence over nr_to_write. So we'll only write back
- - * all dirty pages if they are all attached to "old" mappings.
- - */
- -static long wb_writeback(struct bdi_writeback *wb,
- - struct wb_writeback_work *work)
- -{
- - unsigned long wb_start = jiffies;
- - long nr_pages = work->nr_pages;
- - unsigned long oldest_jif;
- - struct inode *inode;
- - long progress;
- - struct blk_plug plug;
- -
- - oldest_jif = jiffies;
- - work->older_than_this = &oldest_jif;
- -
- - blk_start_plug(&plug);
- - spin_lock(&wb->list_lock);
- - for (;;) {
- - /*
- - * Stop writeback when nr_pages has been consumed
- - */
- - if (work->nr_pages <= 0)
- - break;
- -
- - /*
- - * Background writeout and kupdate-style writeback may
- - * run forever. Stop them if there is other work to do
- - * so that e.g. sync can proceed. They'll be restarted
- - * after the other works are all done.
- - */
- - if ((work->for_background || work->for_kupdate) &&
- - !list_empty(&wb->work_list))
- - break;
- -
- - /*
- - * For background writeout, stop when we are below the
- - * background dirty threshold
- - */
- - if (work->for_background && !wb_over_bg_thresh(wb))
- - break;
- -
- - /*
- - * Kupdate and background works are special and we want to
- - * include all inodes that need writing. Livelock avoidance is
- - * handled by these works yielding to any other work so we are
- - * safe.
- - */
- - if (work->for_kupdate) {
- - oldest_jif = jiffies -
- - msecs_to_jiffies(dirty_expire_interval * 10);
- - } else if (work->for_background)
- - oldest_jif = jiffies;
- -
- - trace_writeback_start(wb, work);
- - if (list_empty(&wb->b_io))
- - queue_io(wb, work);
- - if (work->sb)
- - progress = writeback_sb_inodes(work->sb, wb, work);
- - else
- - progress = __writeback_inodes_wb(wb, work);
- - trace_writeback_written(wb, work);
- -
- - wb_update_bandwidth(wb, wb_start);
- -
- - /*
- - * Did we write something? Try for more
- - *
- - * Dirty inodes are moved to b_io for writeback in batches.
- - * The completion of the current batch does not necessarily
- - * mean the overall work is done. So we keep looping as long
- - * as made some progress on cleaning pages or inodes.
- - */
- - if (progress)
- - continue;
- - /*
- - * No more inodes for IO, bail
- - */
- - if (list_empty(&wb->b_more_io))
- - break;
- - /*
- - * Nothing written. Wait for some inode to
- - * become available for writeback. Otherwise
- - * we'll just busyloop.
- - */
- - if (!list_empty(&wb->b_more_io)) {
- - trace_writeback_wait(wb, work);
- - inode = wb_inode(wb->b_more_io.prev);
- - spin_lock(&inode->i_lock);
- - spin_unlock(&wb->list_lock);
- - /* This function drops i_lock... */
- - inode_sleep_on_writeback(inode);
- - spin_lock(&wb->list_lock);
- - }
- - }
- - spin_unlock(&wb->list_lock);
- - blk_finish_plug(&plug);
- -
- - return nr_pages - work->nr_pages;
- -}
- -
- -/*
- - * Return the next wb_writeback_work struct that hasn't been processed yet.
- - */
- -static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
- -{
- - struct wb_writeback_work *work = NULL;
- -
- - spin_lock_bh(&wb->work_lock);
- - if (!list_empty(&wb->work_list)) {
- - work = list_entry(wb->work_list.next,
- - struct wb_writeback_work, list);
- - list_del_init(&work->list);
- - }
- - spin_unlock_bh(&wb->work_lock);
- - return work;
- -}
- -
- -/*
- - * Add in the number of potentially dirty inodes, because each inode
- - * write can dirty pagecache in the underlying blockdev.
- - */
- -static unsigned long get_nr_dirty_pages(void)
- -{
- - return global_page_state(NR_FILE_DIRTY) +
- - global_page_state(NR_UNSTABLE_NFS) +
- - get_nr_dirty_inodes();
- -}
- -
- -static long wb_check_background_flush(struct bdi_writeback *wb)
- -{
- - if (wb_over_bg_thresh(wb)) {
- -
- - struct wb_writeback_work work = {
- - .nr_pages = LONG_MAX,
- - .sync_mode = WB_SYNC_NONE,
- - .for_background = 1,
- - .range_cyclic = 1,
- - .reason = WB_REASON_BACKGROUND,
- - };
- -
- - return wb_writeback(wb, &work);
- - }
- -
- - return 0;
- -}
- -
- -static long wb_check_old_data_flush(struct bdi_writeback *wb)
- -{
- - unsigned long expired;
- - long nr_pages;
- -
- - /*
- - * When set to zero, disable periodic writeback
- - */
- - if (!dirty_writeback_interval)
- - return 0;
- -
- - expired = wb->last_old_flush +
- - msecs_to_jiffies(dirty_writeback_interval * 10);
- - if (time_before(jiffies, expired))
- - return 0;
- -
- - wb->last_old_flush = jiffies;
- - nr_pages = get_nr_dirty_pages();
- -
- - if (nr_pages) {
- - struct wb_writeback_work work = {
- - .nr_pages = nr_pages,
- - .sync_mode = WB_SYNC_NONE,
- - .for_kupdate = 1,
- - .range_cyclic = 1,
- - .reason = WB_REASON_PERIODIC,
- - };
- -
- - return wb_writeback(wb, &work);
- - }
- -
- - return 0;
- -}
- -
- -/*
- - * Retrieve work items and do the writeback they describe
- - */
- -static long wb_do_writeback(struct bdi_writeback *wb)
- -{
- - struct wb_writeback_work *work;
- - long wrote = 0;
- -
- - set_bit(WB_writeback_running, &wb->state);
- - while ((work = get_next_work_item(wb)) != NULL) {
- - struct wb_completion *done = work->done;
- -
- - trace_writeback_exec(wb, work);
- -
- - wrote += wb_writeback(wb, work);
- -
- - if (work->auto_free)
- - kfree(work);
- - if (done && atomic_dec_and_test(&done->cnt))
- - wake_up_all(&wb->bdi->wb_waitq);
- - }
- -
- - /*
- - * Check for periodic writeback, kupdated() style
- - */
- - wrote += wb_check_old_data_flush(wb);
- - wrote += wb_check_background_flush(wb);
- - clear_bit(WB_writeback_running, &wb->state);
- -
- - return wrote;
- -}
- -
- -/*
- - * Handle writeback of dirty data for the device backed by this bdi. Also
- - * reschedules periodically and does kupdated style flushing.
- - */
- -void wb_workfn(struct work_struct *work)
- -{
- - struct bdi_writeback *wb = container_of(to_delayed_work(work),
- - struct bdi_writeback, dwork);
- - long pages_written;
- -
- - set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
- - current->flags |= PF_SWAPWRITE;
- -
- - if (likely(!current_is_workqueue_rescuer() ||
- - !test_bit(WB_registered, &wb->state))) {
- - /*
- - * The normal path. Keep writing back @wb until its
- - * work_list is empty. Note that this path is also taken
- - * if @wb is shutting down even when we're running off the
- - * rescuer as work_list needs to be drained.
- - */
- - do {
- - pages_written = wb_do_writeback(wb);
- - trace_writeback_pages_written(pages_written);
- - } while (!list_empty(&wb->work_list));
- - } else {
- - /*
- - * bdi_wq can't get enough workers and we're running off
- - * the emergency worker. Don't hog it. Hopefully, 1024 is
- - * enough for efficient IO.
- - */
- - pages_written = writeback_inodes_wb(wb, 1024,
- - WB_REASON_FORKER_THREAD);
- - trace_writeback_pages_written(pages_written);
- - }
- -
- - if (!list_empty(&wb->work_list))
- - mod_delayed_work(bdi_wq, &wb->dwork, 0);
- - else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
- - wb_wakeup_delayed(wb);
- -
- - current->flags &= ~PF_SWAPWRITE;
- -}
- -
- -/*
- - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- - * the whole world.
- - */
- -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
- -{
- - struct backing_dev_info *bdi;
- -
- - if (!nr_pages)
- - nr_pages = get_nr_dirty_pages();
- -
- - rcu_read_lock();
- - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
- - struct bdi_writeback *wb;
- -
- - if (!bdi_has_dirty_io(bdi))
- - continue;
- -
- - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
- - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
- - false, reason);
- - }
- - rcu_read_unlock();
- -}
- -
- -/*
- - * Wake up bdi's periodically to make sure dirtytime inodes get
- - * written back periodically. We deliberately do *not* check the
- - * b_dirtytime list in wb_has_dirty_io(), since this would cause the
- - * kernel to be constantly waking up once there are any dirtytime
- - * inodes on the system. So instead we define a separate delayed work
- - * function which gets called much more rarely. (By default, only
- - * once every 12 hours.)
- - *
- - * If there is any other write activity going on in the file system,
- - * this function won't be necessary. But if the only thing that has
- - * happened on the file system is a dirtytime inode caused by an atime
- - * update, we need this infrastructure below to make sure that inode
- - * eventually gets pushed out to disk.
- - */
- -static void wakeup_dirtytime_writeback(struct work_struct *w);
- -static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
- -
- -static void wakeup_dirtytime_writeback(struct work_struct *w)
- -{
- - struct backing_dev_info *bdi;
- -
- - rcu_read_lock();
- - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
- - struct bdi_writeback *wb;
- -
- - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
- - if (!list_empty(&wb->b_dirty_time))
- - wb_wakeup(wb);
- - }
- - rcu_read_unlock();
- - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
- -}
- -
- -static int __init start_dirtytime_writeback(void)
- -{
- - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
- - return 0;
- -}
- -__initcall(start_dirtytime_writeback);
- -
- -int dirtytime_interval_handler(struct ctl_table *table, int write,
- - void __user *buffer, size_t *lenp, loff_t *ppos)
- -{
- - int ret;
- -
- - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- - if (ret == 0 && write)
- - mod_delayed_work(system_wq, &dirtytime_work, 0);
- - return ret;
- -}
- -
- -static noinline void block_dump___mark_inode_dirty(struct inode *inode)
- -{
- - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
- - struct dentry *dentry;
- - const char *name = "?";
- -
- - dentry = d_find_alias(inode);
- - if (dentry) {
- - spin_lock(&dentry->d_lock);
- - name = (const char *) dentry->d_name.name;
- - }
- - printk(KERN_DEBUG
- - "%s(%d): dirtied inode %lu (%s) on %s\n",
- - current->comm, task_pid_nr(current), inode->i_ino,
- - name, inode->i_sb->s_id);
- - if (dentry) {
- - spin_unlock(&dentry->d_lock);
- - dput(dentry);
- - }
- - }
- -}
- -
- -/**
- - * __mark_inode_dirty - internal function
- - * @inode: inode to mark
- - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- - * Mark an inode as dirty. Callers should use mark_inode_dirty or
- - * mark_inode_dirty_sync.
- - *
- - * Put the inode on the super block's dirty list.
- - *
- - * CAREFUL! We mark it dirty unconditionally, but move it onto the
- - * dirty list only if it is hashed or if it refers to a blockdev.
- - * If it was not hashed, it will never be added to the dirty list
- - * even if it is later hashed, as it will have been marked dirty already.
- - *
- - * In short, make sure you hash any inodes _before_ you start marking
- - * them dirty.
- - *
- - * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- - * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
- - * the kernel-internal blockdev inode represents the dirtying time of the
- - * blockdev's pages. This is why for I_DIRTY_PAGES we always use
- - * page->mapping->host, so the page-dirtying time is recorded in the internal
- - * blockdev inode.
- - */
- -void __mark_inode_dirty(struct inode *inode, int flags)
- -{
- -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
- - struct super_block *sb = inode->i_sb;
- - int dirtytime;
- -
- - trace_writeback_mark_inode_dirty(inode, flags);
- -
- - /*
- - * Don't do this for I_DIRTY_PAGES - that doesn't actually
- - * dirty the inode itself
- - */
- - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
- - trace_writeback_dirty_inode_start(inode, flags);
- -
- - if (sb->s_op->dirty_inode)
- - sb->s_op->dirty_inode(inode, flags);
- -
- - trace_writeback_dirty_inode(inode, flags);
- - }
- - if (flags & I_DIRTY_INODE)
- - flags &= ~I_DIRTY_TIME;
- - dirtytime = flags & I_DIRTY_TIME;
- -
- - /*
- - * Paired with smp_mb() in __writeback_single_inode() for the
- - * following lockless i_state test. See there for details.
- - */
- - smp_mb();
- -
- - if (((inode->i_state & flags) == flags) ||
- - (dirtytime && (inode->i_state & I_DIRTY_INODE)))
- - return;
- -
- - if (unlikely(block_dump))
- - block_dump___mark_inode_dirty(inode);
- -
- - spin_lock(&inode->i_lock);
- - if (dirtytime && (inode->i_state & I_DIRTY_INODE))
- - goto out_unlock_inode;
- - if ((inode->i_state & flags) != flags) {
- - const int was_dirty = inode->i_state & I_DIRTY;
- -
- - inode_attach_wb(inode, NULL);
- -
- - if (flags & I_DIRTY_INODE)
- - inode->i_state &= ~I_DIRTY_TIME;
- - inode->i_state |= flags;
- -
- - /*
- - * If the inode is being synced, just update its dirty state.
- - * The unlocker will place the inode on the appropriate
- - * superblock list, based upon its state.
- - */
- - if (inode->i_state & I_SYNC)
- - goto out_unlock_inode;
- -
- - /*
- - * Only add valid (hashed) inodes to the superblock's
- - * dirty list. Add blockdev inodes as well.
- - */
- - if (!S_ISBLK(inode->i_mode)) {
- - if (inode_unhashed(inode))
- - goto out_unlock_inode;
- - }
- - if (inode->i_state & I_FREEING)
- - goto out_unlock_inode;
- -
- - /*
- - * If the inode was already on b_dirty/b_io/b_more_io, don't
- - * reposition it (that would break b_dirty time-ordering).
- - */
- - if (!was_dirty) {
- - struct bdi_writeback *wb;
- - struct list_head *dirty_list;
- - bool wakeup_bdi = false;
- -
- - wb = locked_inode_to_wb_and_lock_list(inode);
- -
- - WARN(bdi_cap_writeback_dirty(wb->bdi) &&
- - !test_bit(WB_registered, &wb->state),
- - "bdi-%s not registered\n", wb->bdi->name);
- -
- - inode->dirtied_when = jiffies;
- - if (dirtytime)
- - inode->dirtied_time_when = jiffies;
- -
- - if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
- - dirty_list = &wb->b_dirty;
- - else
- - dirty_list = &wb->b_dirty_time;
- -
- - wakeup_bdi = inode_io_list_move_locked(inode, wb,
- - dirty_list);
- -
- - spin_unlock(&wb->list_lock);
- - trace_writeback_dirty_inode_enqueue(inode);
- -
- - /*
- - * If this is the first dirty inode for this bdi,
- - * we have to wake-up the corresponding bdi thread
- - * to make sure background write-back happens
- - * later.
- - */
- - if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
- - wb_wakeup_delayed(wb);
- - return;
- - }
- - }
- -out_unlock_inode:
- - spin_unlock(&inode->i_lock);
- -
- -#undef I_DIRTY_INODE
- -}
- -EXPORT_SYMBOL(__mark_inode_dirty);
- -
- -/*
- - * The @s_sync_lock is used to serialise concurrent sync operations
- - * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
- - * Concurrent callers will block on the s_sync_lock rather than doing contending
- - * walks. The queueing maintains sync(2) required behaviour as all the IO that
- - * has been issued up to the time this function is entered is guaranteed to be
- - * completed by the time we have gained the lock and waited for all IO that is
- - * in progress regardless of the order callers are granted the lock.
- - */
- -static void wait_sb_inodes(struct super_block *sb)
- -{
- - struct inode *inode, *old_inode = NULL;
- -
- - /*
- - * We need to be protected against the filesystem going from
- - * r/o to r/w or vice versa.
- - */
- - WARN_ON(!rwsem_is_locked(&sb->s_umount));
- -
- - mutex_lock(&sb->s_sync_lock);
- - spin_lock(&sb->s_inode_list_lock);
- -
- - /*
- - * Data integrity sync. Must wait for all pages under writeback,
- - * because there may have been pages dirtied before our sync
- - * call, but which had writeout started before we write it out.
- - * In which case, the inode may not be on the dirty list, but
- - * we still have to wait for that writeout.
- - */
- - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- - struct address_space *mapping = inode->i_mapping;
- -
- - spin_lock(&inode->i_lock);
- - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- - (mapping->nrpages == 0)) {
- - spin_unlock(&inode->i_lock);
- - continue;
- - }
- - __iget(inode);
- - spin_unlock(&inode->i_lock);
- - spin_unlock(&sb->s_inode_list_lock);
- -
- - /*
- - * We hold a reference to 'inode' so it couldn't have been
- - * removed from s_inodes list while we dropped the
- - * s_inode_list_lock. We cannot iput the inode now as we can
- - * be holding the last reference and we cannot iput it under
- - * s_inode_list_lock. So we keep the reference and iput it
- - * later.
- - */
- - iput(old_inode);
- - old_inode = inode;
- -
- - /*
- - * We keep the error status of individual mapping so that
- - * applications can catch the writeback error using fsync(2).
- - * See filemap_fdatawait_keep_errors() for details.
- - */
- - filemap_fdatawait_keep_errors(mapping);
- -
- - cond_resched();
- -
- - spin_lock(&sb->s_inode_list_lock);
- - }
- - spin_unlock(&sb->s_inode_list_lock);
- - iput(old_inode);
- - mutex_unlock(&sb->s_sync_lock);
- -}
- -
- -static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
- - enum wb_reason reason, bool skip_if_busy)
- -{
- - DEFINE_WB_COMPLETION_ONSTACK(done);
- - struct wb_writeback_work work = {
- - .sb = sb,
- - .sync_mode = WB_SYNC_NONE,
- - .tagged_writepages = 1,
- - .done = &done,
- - .nr_pages = nr,
- - .reason = reason,
- - };
- - struct backing_dev_info *bdi = sb->s_bdi;
- -
- - if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
- - return;
- - WARN_ON(!rwsem_is_locked(&sb->s_umount));
- -
- - bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
- - wb_wait_for_completion(bdi, &done);
- -}
- -
- -/**
- - * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
- - * @sb: the superblock
- - * @nr: the number of pages to write
- - * @reason: reason why some writeback work initiated
- - *
- - * Start writeback on some inodes on this super_block. No guarantees are made
- - * on how many (if any) will be written, and this function does not wait
- - * for IO completion of submitted IO.
- - */
- -void writeback_inodes_sb_nr(struct super_block *sb,
- - unsigned long nr,
- - enum wb_reason reason)
- -{
- - __writeback_inodes_sb_nr(sb, nr, reason, false);
- -}
- -EXPORT_SYMBOL(writeback_inodes_sb_nr);
- -
- -/**
- - * writeback_inodes_sb - writeback dirty inodes from given super_block
- - * @sb: the superblock
- - * @reason: reason why some writeback work was initiated
- - *
- - * Start writeback on some inodes on this super_block. No guarantees are made
- - * on how many (if any) will be written, and this function does not wait
- - * for IO completion of submitted IO.
- - */
- -void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
- -{
- - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
- -}
- -EXPORT_SYMBOL(writeback_inodes_sb);
- -
- -/**
- - * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
- - * @sb: the superblock
- - * @nr: the number of pages to write
- - * @reason: the reason of writeback
- - *
- - * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
- - * Returns 1 if writeback was started, 0 if not.
- - */
- -bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
- - enum wb_reason reason)
- -{
- - if (!down_read_trylock(&sb->s_umount))
- - return false;
- -
- - __writeback_inodes_sb_nr(sb, nr, reason, true);
- - up_read(&sb->s_umount);
- - return true;
- -}
- -EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
- -
- -/**
- - * try_to_writeback_inodes_sb - try to start writeback if none underway
- - * @sb: the superblock
- - * @reason: reason why some writeback work was initiated
- - *
- - * Implemented by try_to_writeback_inodes_sb_nr()
- - * Returns 1 if writeback was started, 0 if not.
- - */
- -bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
- -{
- - return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
- -}
- -EXPORT_SYMBOL(try_to_writeback_inodes_sb);
- -
- -/**
- - * sync_inodes_sb - sync sb inode pages
- - * @sb: the superblock
- - *
- - * This function writes and waits on any dirty inode belonging to this
- - * super_block.
- - */
- -void sync_inodes_sb(struct super_block *sb)
- -{
- - DEFINE_WB_COMPLETION_ONSTACK(done);
- - struct wb_writeback_work work = {
- - .sb = sb,
- - .sync_mode = WB_SYNC_ALL,
- - .nr_pages = LONG_MAX,
- - .range_cyclic = 0,
- - .done = &done,
- - .reason = WB_REASON_SYNC,
- - .for_sync = 1,
- - };
- - struct backing_dev_info *bdi = sb->s_bdi;
- -
- - /*
- - * Can't skip on !bdi_has_dirty() because we should wait for !dirty
- - * inodes under writeback and I_DIRTY_TIME inodes ignored by
- - * bdi_has_dirty() need to be written out too.
- - */
- - if (bdi == &noop_backing_dev_info)
- - return;
- - WARN_ON(!rwsem_is_locked(&sb->s_umount));
- -
- - bdi_split_work_to_wbs(bdi, &work, false);
- - wb_wait_for_completion(bdi, &done);
- -
- - wait_sb_inodes(sb);
- -}
- -EXPORT_SYMBOL(sync_inodes_sb);
- -
- -/**
- - * write_inode_now - write an inode to disk
- - * @inode: inode to write to disk
- - * @sync: whether the write should be synchronous or not
- - *
- - * This function commits an inode to disk immediately if it is dirty. This is
- - * primarily needed by knfsd.
- - *
- - * The caller must either have a ref on the inode or must have set I_WILL_FREE.
- - */
- -int write_inode_now(struct inode *inode, int sync)
- -{
- - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- - struct writeback_control wbc = {
- - .nr_to_write = LONG_MAX,
- - .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
- - .range_start = 0,
- - .range_end = LLONG_MAX,
- - };
- -
- - if (!mapping_cap_writeback_dirty(inode->i_mapping))
- - wbc.nr_to_write = 0;
- -
- - might_sleep();
- - return writeback_single_inode(inode, wb, &wbc);
- -}
- -EXPORT_SYMBOL(write_inode_now);
- -
- -/**
- - * sync_inode - write an inode and its pages to disk.
- - * @inode: the inode to sync
- - * @wbc: controls the writeback mode
- - *
- - * sync_inode() will write an inode and its pages to disk. It will also
- - * correctly update the inode on its superblock's dirty inode lists and will
- - * update inode->i_state.
- - *
- - * The caller must have a ref on the inode.
- - */
- -int sync_inode(struct inode *inode, struct writeback_control *wbc)
- -{
- - return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
- -}
- -EXPORT_SYMBOL(sync_inode);
- -
- -/**
- - * sync_inode_metadata - write an inode to disk
- - * @inode: the inode to sync
- - * @wait: wait for I/O to complete.
- - *
- - * Write an inode to disk and adjust its dirty state after completion.
- - *
- - * Note: only writes the actual inode, no associated data or other metadata.
- - */
- -int sync_inode_metadata(struct inode *inode, int wait)
- -{
- - struct writeback_control wbc = {
- - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
- - .nr_to_write = 0, /* metadata-only */
- - };
- -
- - return sync_inode(inode, &wbc);
- -}
- -EXPORT_SYMBOL(sync_inode_metadata);
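sync_inode_metadata() above is the usual entry point when a filesystem wants only the inode written, not its data pages. A rough usage sketch, together with the writeback_control it boils down to:

        /* write just the inode, waiting for the I/O */
        int err = sync_inode_metadata(inode, 1);

        /* equivalent expansion */
        struct writeback_control wbc = {
                .sync_mode   = WB_SYNC_ALL,
                .nr_to_write = 0,           /* metadata only, no data pages */
        };
        err = sync_inode(inode, &wbc);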
- diff -Naur linux-4.4.6-gentoo-orig/fs/gfs2/meta_io.c linux-4.4.6-gentoo-patched/fs/gfs2/meta_io.c
- --- linux-4.4.6-gentoo-orig/fs/gfs2/meta_io.c 2016-05-04 11:19:37.613649828 +0300
- +++ linux-4.4.6-gentoo-patched/fs/gfs2/meta_io.c 2016-05-04 11:03:27.410730745 +0300
- @@ -37,8 +37,7 @@
- {
- struct buffer_head *bh, *head;
- int nr_underway = 0;
- - int write_op = REQ_META | REQ_PRIO |
- - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
- + int write_op = REQ_META | REQ_PRIO | wbc_to_write_cmd(wbc);
- BUG_ON(!PageLocked(page));
- BUG_ON(!page_has_buffers(page));
- diff -Naur linux-4.4.6-gentoo-orig/fs/mpage.c linux-4.4.6-gentoo-patched/fs/mpage.c
- --- linux-4.4.6-gentoo-orig/fs/mpage.c 2016-05-04 11:19:37.614649827 +0300
- +++ linux-4.4.6-gentoo-patched/fs/mpage.c 2016-05-04 11:03:27.410730745 +0300
- @@ -485,7 +485,6 @@
- struct buffer_head map_bh;
- loff_t i_size = i_size_read(inode);
- int ret = 0;
- - int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
- @@ -594,7 +593,7 @@
- * This page will go to BIO. Do we need to send this BIO off first?
- */
- if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- - bio = mpage_bio_submit(wr, bio);
- + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
- alloc_new:
- if (bio == NULL) {
- @@ -621,7 +620,7 @@
- wbc_account_io(wbc, page, PAGE_SIZE);
- length = first_unmapped << blkbits;
- if (bio_add_page(bio, page, length, 0) < length) {
- - bio = mpage_bio_submit(wr, bio);
- + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
- goto alloc_new;
- }
- @@ -631,7 +630,7 @@
- set_page_writeback(page);
- unlock_page(page);
- if (boundary || (first_unmapped != blocks_per_page)) {
- - bio = mpage_bio_submit(wr, bio);
- + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
- if (boundary_block) {
- write_boundary_block(boundary_bdev,
- boundary_block, 1 << blkbits);
- @@ -643,7 +642,7 @@
- confused:
- if (bio)
- - bio = mpage_bio_submit(wr, bio);
- + bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
- if (mpd->use_writepage) {
- ret = mapping->a_ops->writepage(page, wbc);
- diff -Naur linux-4.4.6-gentoo-orig/fs/xfs/xfs_aops.c linux-4.4.6-gentoo-patched/fs/xfs/xfs_aops.c
- --- linux-4.4.6-gentoo-orig/fs/xfs/xfs_aops.c 2016-05-04 11:19:37.614649827 +0300
- +++ linux-4.4.6-gentoo-patched/fs/xfs/xfs_aops.c 2016-05-04 11:03:27.410730745 +0300
- @@ -382,7 +382,7 @@
- atomic_inc(&ioend->io_remaining);
- bio->bi_private = ioend;
- bio->bi_end_io = xfs_end_bio;
- - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
- + submit_bio(wbc_to_write_cmd(wbc), bio);
- }
- STATIC struct bio *
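The gfs2, mpage and xfs hunks above all replace the open-coded "wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE" choice with wbc_to_write_cmd(). The helper itself is introduced elsewhere in the patch (presumably in include/linux/writeback.h); a sketch that is consistent with these call sites and with the WRITE_BG command added further down, but only an assumed shape, would be:

        static inline int wbc_to_write_cmd(struct writeback_control *wbc)
        {
                if (wbc->sync_mode == WB_SYNC_ALL)
                        return WRITE_SYNC;
                if (wbc->for_kupdate || wbc->for_background)
                        return WRITE_BG;    /* background writeback */
                return WRITE;
        }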
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/backing-dev-defs.h linux-4.4.6-gentoo-patched/include/linux/backing-dev-defs.h
- --- linux-4.4.6-gentoo-orig/include/linux/backing-dev-defs.h 2016-05-04 11:19:37.615649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/backing-dev-defs.h 2016-05-04 11:03:27.410730745 +0300
- @@ -116,6 +116,8 @@
- struct list_head work_list;
- struct delayed_work dwork; /* work item used for writeback */
- + atomic_t dirty_sleeping; /* waiting on dirty limit exceeded */
- +
- struct list_head bdi_node; /* anchored at bdi->wb_list */
- #ifdef CONFIG_CGROUP_WRITEBACK
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/blkdev.h linux-4.4.6-gentoo-patched/include/linux/blkdev.h
- --- linux-4.4.6-gentoo-orig/include/linux/blkdev.h 2016-05-04 11:19:37.615649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/blkdev.h 2016-05-04 11:03:27.410730745 +0300
- @@ -23,6 +23,7 @@
- #include <linux/rcupdate.h>
- #include <linux/percpu-refcount.h>
- #include <linux/scatterlist.h>
- +#include <linux/wbt.h>
- struct module;
- struct scsi_ioctl_command;
- @@ -36,6 +37,7 @@
- struct blkcg_gq;
- struct blk_flush_queue;
- struct pr_ops;
- +struct rq_wb;
- #define BLKDEV_MIN_RQ 4
- #define BLKDEV_MAX_RQ 128 /* Default maximum */
- @@ -152,6 +154,7 @@
- struct gendisk *rq_disk;
- struct hd_struct *part;
- unsigned long start_time;
- + struct wb_issue_stat wb_stat;
- #ifdef CONFIG_BLK_CGROUP
- struct request_list *rl; /* rl this rq is alloced from */
- unsigned long long start_time_ns;
- @@ -289,6 +292,8 @@
- int nr_rqs[2]; /* # allocated [a]sync rqs */
- int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
- + struct rq_wb *rq_wb;
- +
- /*
- * If blkcg is not used, @q->root_rl serves all requests. If blkcg
- * is used, root blkg allocates from @q->root_rl and all other
- @@ -314,6 +319,8 @@
- struct blk_mq_ctx __percpu *queue_ctx;
- unsigned int nr_queues;
- + unsigned int queue_depth;
- +
- /* hw dispatch queues */
- struct blk_mq_hw_ctx **queue_hw_ctx;
- unsigned int nr_hw_queues;
- @@ -399,6 +406,9 @@
- unsigned int nr_sorted;
- unsigned int in_flight[2];
- +
- + struct blk_rq_stat rq_stats[2];
- +
- /*
- * Number of active block driver functions for which blk_drain_queue()
- * must wait. Must be incremented around functions that unlock the
- @@ -431,8 +441,6 @@
- /*
- * for flush operations
- */
- - unsigned int flush_flags;
- - unsigned int flush_not_queueable:1;
- struct blk_flush_queue *fq;
- struct list_head requeue_list;
- @@ -489,6 +497,9 @@
- #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
- #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/
- #define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */
- +#define QUEUE_FLAG_WC 23 /* Write back caching */
- +#define QUEUE_FLAG_FUA 24 /* device supports FUA writes */
- +#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */
- #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
- (1 << QUEUE_FLAG_STACKABLE) | \
- @@ -677,6 +688,14 @@
- return false;
- }
- +static inline unsigned int blk_queue_depth(struct request_queue *q)
- +{
- + if (q->queue_depth)
- + return q->queue_depth;
- +
- + return q->nr_requests;
- +}
- +
- /*
- * q->prep_rq_fn return values
- */
- @@ -977,6 +996,7 @@
- extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
- extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
- extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
- +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
- extern void blk_set_default_limits(struct queue_limits *lim);
- extern void blk_set_stacking_limits(struct queue_limits *lim);
- extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
- @@ -1001,8 +1021,8 @@
- extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
- extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
- extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
- -extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
- extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
- +extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
- extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
- extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
- @@ -1355,7 +1375,7 @@
- static inline bool queue_flush_queueable(struct request_queue *q)
- {
- - return !q->flush_not_queueable;
- + return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
- }
- typedef struct {struct page *v;} Sector;
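With flush_flags and flush_not_queueable removed from struct request_queue, a driver now signals its cache behaviour through the queue flags, and can additionally report its real dispatch depth, which blk_queue_depth() otherwise approximates with q->nr_requests. A minimal driver-side sketch using the two setters declared in this hunk (the depth value is made up):

        /* in a driver's queue setup path */
        blk_queue_write_cache(q, true, true);   /* volatile write cache, FUA supported */
        blk_set_queue_depth(q, 64);             /* real device depth; otherwise
                                                 * blk_queue_depth() returns nr_requests */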
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/blk_types.h linux-4.4.6-gentoo-patched/include/linux/blk_types.h
- --- linux-4.4.6-gentoo-orig/include/linux/blk_types.h 2016-05-04 11:19:37.616649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/blk_types.h 2016-05-04 11:03:27.410730745 +0300
- @@ -161,6 +161,7 @@
- __REQ_INTEGRITY, /* I/O includes block integrity payload */
- __REQ_FUA, /* forced unit access */
- __REQ_FLUSH, /* request for cache flush */
- + __REQ_BG, /* background activity */
- /* bio only flags */
- __REQ_RAHEAD, /* read ahead, can fail anytime */
- @@ -209,7 +210,7 @@
- #define REQ_COMMON_MASK \
- (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
- REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
- - REQ_SECURE | REQ_INTEGRITY)
- + REQ_SECURE | REQ_INTEGRITY | REQ_BG)
- #define REQ_CLONE_MASK REQ_COMMON_MASK
- #define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME)
- @@ -236,6 +237,7 @@
- #define REQ_COPY_USER (1ULL << __REQ_COPY_USER)
- #define REQ_FLUSH (1ULL << __REQ_FLUSH)
- #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ)
- +#define REQ_BG (1ULL << __REQ_BG)
- #define REQ_IO_STAT (1ULL << __REQ_IO_STAT)
- #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE)
- #define REQ_SECURE (1ULL << __REQ_SECURE)
- @@ -268,4 +270,12 @@
- return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
- }
- +struct blk_rq_stat {
- + s64 mean;
- + u64 min;
- + u64 max;
- + s64 nr_samples;
- + s64 time;
- +};
- +
- #endif /* __LINUX_BLK_TYPES_H */
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/fs.h linux-4.4.6-gentoo-patched/include/linux/fs.h
- --- linux-4.4.6-gentoo-orig/include/linux/fs.h 2016-05-04 11:19:37.616649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/fs.h 2016-05-04 11:03:27.411730745 +0300
- @@ -189,6 +189,9 @@
- * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
- * by a cache flush and data is guaranteed to be on
- * non-volatile media on completion.
- + * WRITE_BG Background write. This is for background activity like
- + * the periodic flush and background threshold writeback
- + *
- *
- */
- #define RW_MASK REQ_WRITE
- @@ -204,6 +207,7 @@
- #define WRITE_FLUSH (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
- #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
- #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
- +#define WRITE_BG (WRITE | REQ_NOIDLE | REQ_BG)
- /*
- * Attribute flags. These should be or-ed together to figure out what
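WRITE_BG exists so that background writeback can still be told apart from application-initiated writes after the bio leaves the filesystem. A small illustrative check, where the helper name is a placeholder and not something the patch adds:

        static bool bio_is_background_write(struct bio *bio)
        {
                return (bio->bi_rw & (REQ_WRITE | REQ_BG)) == (REQ_WRITE | REQ_BG);
        }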
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/sched.h.orig linux-4.4.6-gentoo-patched/include/linux/sched.h.orig
- --- linux-4.4.6-gentoo-orig/include/linux/sched.h.orig 2016-05-04 11:19:37.618649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/sched.h.orig 1970-01-01 03:00:00.000000000 +0300
- @@ -1,3194 +0,0 @@
- -#ifndef _LINUX_SCHED_H
- -#define _LINUX_SCHED_H
- -
- -#include <uapi/linux/sched.h>
- -
- -#include <linux/sched/prio.h>
- -
- -
- -struct sched_param {
- - int sched_priority;
- -};
- -
- -#include <asm/param.h> /* for HZ */
- -
- -#include <linux/capability.h>
- -#include <linux/threads.h>
- -#include <linux/kernel.h>
- -#include <linux/types.h>
- -#include <linux/timex.h>
- -#include <linux/jiffies.h>
- -#include <linux/plist.h>
- -#include <linux/rbtree.h>
- -#include <linux/thread_info.h>
- -#include <linux/cpumask.h>
- -#include <linux/errno.h>
- -#include <linux/nodemask.h>
- -#include <linux/mm_types.h>
- -#include <linux/preempt.h>
- -
- -#include <asm/page.h>
- -#include <asm/ptrace.h>
- -#include <linux/cputime.h>
- -
- -#include <linux/smp.h>
- -#include <linux/sem.h>
- -#include <linux/shm.h>
- -#include <linux/signal.h>
- -#include <linux/compiler.h>
- -#include <linux/completion.h>
- -#include <linux/pid.h>
- -#include <linux/percpu.h>
- -#include <linux/topology.h>
- -#include <linux/proportions.h>
- -#include <linux/seccomp.h>
- -#include <linux/rcupdate.h>
- -#include <linux/rculist.h>
- -#include <linux/rtmutex.h>
- -
- -#include <linux/time.h>
- -#include <linux/param.h>
- -#include <linux/resource.h>
- -#include <linux/timer.h>
- -#include <linux/hrtimer.h>
- -#include <linux/task_io_accounting.h>
- -#include <linux/latencytop.h>
- -#include <linux/cred.h>
- -#include <linux/llist.h>
- -#include <linux/uidgid.h>
- -#include <linux/gfp.h>
- -#include <linux/magic.h>
- -#include <linux/cgroup-defs.h>
- -
- -#include <asm/processor.h>
- -
- -#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
- -
- -/*
- - * Extended scheduling parameters data structure.
- - *
- - * This is needed because the original struct sched_param can not be
- - * altered without introducing ABI issues with legacy applications
- - * (e.g., in sched_getparam()).
- - *
- - * However, the possibility of specifying more than just a priority for
- - * the tasks may be useful for a wide variety of application fields, e.g.,
- - * multimedia, streaming, automation and control, and many others.
- - *
- - * This variant (sched_attr) is meant at describing a so-called
- - * sporadic time-constrained task. In such model a task is specified by:
- - * - the activation period or minimum instance inter-arrival time;
- - * - the maximum (or average, depending on the actual scheduling
- - * discipline) computation time of all instances, a.k.a. runtime;
- - * - the deadline (relative to the actual activation time) of each
- - * instance.
- - * Very briefly, a periodic (sporadic) task asks for the execution of
- - * some specific computation --which is typically called an instance--
- - * (at most) every period. Moreover, each instance typically lasts no more
- - * than the runtime and must be completed by time instant t equal to
- - * the instance activation time + the deadline.
- - *
- - * This is reflected by the actual fields of the sched_attr structure:
- - *
- - * @size size of the structure, for fwd/bwd compat.
- - *
- - * @sched_policy task's scheduling policy
- - * @sched_flags for customizing the scheduler behaviour
- - * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
- - * @sched_priority task's static priority (SCHED_FIFO/RR)
- - * @sched_deadline representative of the task's deadline
- - * @sched_runtime representative of the task's runtime
- - * @sched_period representative of the task's period
- - *
- - * Given this task model, there are a multiplicity of scheduling algorithms
- - * and policies, that can be used to ensure all the tasks will make their
- - * timing constraints.
- - *
- - * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
- - * only user of this new interface. More information about the algorithm
- - * available in the scheduling class file or in Documentation/.
- - */
- -struct sched_attr {
- - u32 size;
- -
- - u32 sched_policy;
- - u64 sched_flags;
- -
- - /* SCHED_NORMAL, SCHED_BATCH */
- - s32 sched_nice;
- -
- - /* SCHED_FIFO, SCHED_RR */
- - u32 sched_priority;
- -
- - /* SCHED_DEADLINE */
- - u64 sched_runtime;
- - u64 sched_deadline;
- - u64 sched_period;
- -};
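From user space the structure above is consumed by the sched_setattr() system call, which has no glibc wrapper and is reached via syscall(2). A rough sketch of configuring a SCHED_DEADLINE task along the field documentation above; the policy value 6 is SCHED_DEADLINE from uapi/linux/sched.h, and the struct here is only a userspace mirror:

        #define _GNU_SOURCE
        #include <stdint.h>
        #include <string.h>
        #include <unistd.h>
        #include <sys/syscall.h>        /* needs a libc exposing __NR_sched_setattr */

        struct sched_attr_u {
                uint32_t size;
                uint32_t sched_policy;
                uint64_t sched_flags;
                int32_t  sched_nice;
                uint32_t sched_priority;
                uint64_t sched_runtime;
                uint64_t sched_deadline;
                uint64_t sched_period;
        };

        static int become_deadline_task(void)
        {
                struct sched_attr_u attr;

                memset(&attr, 0, sizeof(attr));
                attr.size           = sizeof(attr);
                attr.sched_policy   = 6;                   /* SCHED_DEADLINE */
                attr.sched_runtime  = 10 * 1000 * 1000;    /* 10 ms of runtime ... */
                attr.sched_deadline = 100 * 1000 * 1000;   /* ... due within 100 ms ... */
                attr.sched_period   = 100 * 1000 * 1000;   /* ... every 100 ms */

                return syscall(SYS_sched_setattr, 0 /* this task */, &attr, 0 /* flags */);
        }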
- -
- -struct futex_pi_state;
- -struct robust_list_head;
- -struct bio_list;
- -struct fs_struct;
- -struct perf_event_context;
- -struct blk_plug;
- -struct filename;
- -struct nameidata;
- -
- -#define VMACACHE_BITS 2
- -#define VMACACHE_SIZE (1U << VMACACHE_BITS)
- -#define VMACACHE_MASK (VMACACHE_SIZE - 1)
- -
- -/*
- - * These are the constant used to fake the fixed-point load-average
- - * counting. Some notes:
- - * - 11 bit fractions expand to 22 bits by the multiplies: this gives
- - * a load-average precision of 10 bits integer + 11 bits fractional
- - * - if you want to count load-averages more often, you need more
- - * precision, or rounding will get you. With 2-second counting freq,
- - * the EXP_n values would be 1981, 2034 and 2043 if still using only
- - * 11 bit fractions.
- - */
- -extern unsigned long avenrun[]; /* Load averages */
- -extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
- -
- -#define FSHIFT 11 /* nr of bits of precision */
- -#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
- -#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
- -#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
- -#define EXP_5 2014 /* 1/exp(5sec/5min) */
- -#define EXP_15 2037 /* 1/exp(5sec/15min) */
- -
- -#define CALC_LOAD(load,exp,n) \
- - load *= exp; \
- - load += n*(FIXED_1-exp); \
- - load >>= FSHIFT;
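Worked through once with the constants above: with one runnable task (n = FIXED_1 = 2048) and a previous 1-minute average of 0, a single 5-second update gives (0*1884 + 2048*(2048-1884)) >> 11 = 164, i.e. 164/2048, roughly 0.08, which is how the familiar slow climb towards 1.00 begins. Rewritten as a plain function instead of a statement macro:

        /* one LOAD_FREQ update of an 11-bit fixed-point load average */
        static unsigned long calc_load_step(unsigned long load, unsigned long exp,
                                            unsigned long active)
        {
                load *= exp;
                load += active * (FIXED_1 - exp);
                return load >> FSHIFT;
        }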
- -
- -extern unsigned long total_forks;
- -extern int nr_threads;
- -DECLARE_PER_CPU(unsigned long, process_counts);
- -extern int nr_processes(void);
- -extern unsigned long nr_running(void);
- -extern bool single_task_running(void);
- -extern unsigned long nr_iowait(void);
- -extern unsigned long nr_iowait_cpu(int cpu);
- -extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
- -
- -extern void calc_global_load(unsigned long ticks);
- -
- -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
- -extern void update_cpu_load_nohz(void);
- -#else
- -static inline void update_cpu_load_nohz(void) { }
- -#endif
- -
- -extern unsigned long get_parent_ip(unsigned long addr);
- -
- -extern void dump_cpu_task(int cpu);
- -
- -struct seq_file;
- -struct cfs_rq;
- -struct task_group;
- -#ifdef CONFIG_SCHED_DEBUG
- -extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
- -extern void proc_sched_set_task(struct task_struct *p);
- -#endif
- -
- -/*
- - * Task state bitmask. NOTE! These bits are also
- - * encoded in fs/proc/array.c: get_task_state().
- - *
- - * We have two separate sets of flags: task->state
- - * is about runnability, while task->exit_state are
- - * about the task exiting. Confusing, but this way
- - * modifying one set can't modify the other one by
- - * mistake.
- - */
- -#define TASK_RUNNING 0
- -#define TASK_INTERRUPTIBLE 1
- -#define TASK_UNINTERRUPTIBLE 2
- -#define __TASK_STOPPED 4
- -#define __TASK_TRACED 8
- -/* in tsk->exit_state */
- -#define EXIT_DEAD 16
- -#define EXIT_ZOMBIE 32
- -#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
- -/* in tsk->state again */
- -#define TASK_DEAD 64
- -#define TASK_WAKEKILL 128
- -#define TASK_WAKING 256
- -#define TASK_PARKED 512
- -#define TASK_NOLOAD 1024
- -#define TASK_STATE_MAX 2048
- -
- -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
- -
- -extern char ___assert_task_state[1 - 2*!!(
- - sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
- -
- -/* Convenience macros for the sake of set_task_state */
- -#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
- -#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
- -#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
- -
- -#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
- -
- -/* Convenience macros for the sake of wake_up */
- -#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
- -#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
- -
- -/* get_task_state() */
- -#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
- - TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
- - __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
- -
- -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
- -#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
- -#define task_is_stopped_or_traced(task) \
- - ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
- -#define task_contributes_to_load(task) \
- - ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
- - (task->flags & PF_FROZEN) == 0 && \
- - (task->state & TASK_NOLOAD) == 0)
- -
- -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- -
- -#define __set_task_state(tsk, state_value) \
- - do { \
- - (tsk)->task_state_change = _THIS_IP_; \
- - (tsk)->state = (state_value); \
- - } while (0)
- -#define set_task_state(tsk, state_value) \
- - do { \
- - (tsk)->task_state_change = _THIS_IP_; \
- - smp_store_mb((tsk)->state, (state_value)); \
- - } while (0)
- -
- -/*
- - * set_current_state() includes a barrier so that the write of current->state
- - * is correctly serialised wrt the caller's subsequent test of whether to
- - * actually sleep:
- - *
- - * set_current_state(TASK_UNINTERRUPTIBLE);
- - * if (do_i_need_to_sleep())
- - * schedule();
- - *
- - * If the caller does not need such serialisation then use __set_current_state()
- - */
- -#define __set_current_state(state_value) \
- - do { \
- - current->task_state_change = _THIS_IP_; \
- - current->state = (state_value); \
- - } while (0)
- -#define set_current_state(state_value) \
- - do { \
- - current->task_state_change = _THIS_IP_; \
- - smp_store_mb(current->state, (state_value)); \
- - } while (0)
- -
- -#else
- -
- -#define __set_task_state(tsk, state_value) \
- - do { (tsk)->state = (state_value); } while (0)
- -#define set_task_state(tsk, state_value) \
- - smp_store_mb((tsk)->state, (state_value))
- -
- -/*
- - * set_current_state() includes a barrier so that the write of current->state
- - * is correctly serialised wrt the caller's subsequent test of whether to
- - * actually sleep:
- - *
- - * set_current_state(TASK_UNINTERRUPTIBLE);
- - * if (do_i_need_to_sleep())
- - * schedule();
- - *
- - * If the caller does not need such serialisation then use __set_current_state()
- - */
- -#define __set_current_state(state_value) \
- - do { current->state = (state_value); } while (0)
- -#define set_current_state(state_value) \
- - smp_store_mb(current->state, (state_value))
- -
- -#endif
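The canonical sleep loop that both comments above are written for looks like this; the barrier in set_current_state() orders the state write before the condition test, so a wakeup that makes the condition true after that point cannot be lost. The condition itself is a placeholder for whatever the waiter is waiting on:

        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (condition)
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);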
- -
- -/* Task command name length */
- -#define TASK_COMM_LEN 16
- -
- -#include <linux/spinlock.h>
- -
- -/*
- - * This serializes "schedule()" and also protects
- - * the run-queue from deletions/modifications (but
- - * _adding_ to the beginning of the run-queue has
- - * a separate lock).
- - */
- -extern rwlock_t tasklist_lock;
- -extern spinlock_t mmlist_lock;
- -
- -struct task_struct;
- -
- -#ifdef CONFIG_PROVE_RCU
- -extern int lockdep_tasklist_lock_is_held(void);
- -#endif /* #ifdef CONFIG_PROVE_RCU */
- -
- -extern void sched_init(void);
- -extern void sched_init_smp(void);
- -extern asmlinkage void schedule_tail(struct task_struct *prev);
- -extern void init_idle(struct task_struct *idle, int cpu);
- -extern void init_idle_bootup_task(struct task_struct *idle);
- -
- -extern cpumask_var_t cpu_isolated_map;
- -
- -extern int runqueue_is_locked(int cpu);
- -
- -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
- -extern void nohz_balance_enter_idle(int cpu);
- -extern void set_cpu_sd_state_idle(void);
- -extern int get_nohz_timer_target(void);
- -#else
- -static inline void nohz_balance_enter_idle(int cpu) { }
- -static inline void set_cpu_sd_state_idle(void) { }
- -#endif
- -
- -/*
- - * Only dump TASK_* tasks. (0 for all tasks)
- - */
- -extern void show_state_filter(unsigned long state_filter);
- -
- -static inline void show_state(void)
- -{
- - show_state_filter(0);
- -}
- -
- -extern void show_regs(struct pt_regs *);
- -
- -/*
- - * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
- - * task), SP is the stack pointer of the first frame that should be shown in the back
- - * trace (or NULL if the entire call-chain of the task should be shown).
- - */
- -extern void show_stack(struct task_struct *task, unsigned long *sp);
- -
- -extern void cpu_init (void);
- -extern void trap_init(void);
- -extern void update_process_times(int user);
- -extern void scheduler_tick(void);
- -
- -extern void sched_show_task(struct task_struct *p);
- -
- -#ifdef CONFIG_LOCKUP_DETECTOR
- -extern void touch_softlockup_watchdog(void);
- -extern void touch_softlockup_watchdog_sync(void);
- -extern void touch_all_softlockup_watchdogs(void);
- -extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
- - void __user *buffer,
- - size_t *lenp, loff_t *ppos);
- -extern unsigned int softlockup_panic;
- -extern unsigned int hardlockup_panic;
- -void lockup_detector_init(void);
- -#else
- -static inline void touch_softlockup_watchdog(void)
- -{
- -}
- -static inline void touch_softlockup_watchdog_sync(void)
- -{
- -}
- -static inline void touch_all_softlockup_watchdogs(void)
- -{
- -}
- -static inline void lockup_detector_init(void)
- -{
- -}
- -#endif
- -
- -#ifdef CONFIG_DETECT_HUNG_TASK
- -void reset_hung_task_detector(void);
- -#else
- -static inline void reset_hung_task_detector(void)
- -{
- -}
- -#endif
- -
- -/* Attach to any functions which should be ignored in wchan output. */
- -#define __sched __attribute__((__section__(".sched.text")))
- -
- -/* Linker adds these: start and end of __sched functions */
- -extern char __sched_text_start[], __sched_text_end[];
- -
- -/* Is this address in the __sched functions? */
- -extern int in_sched_functions(unsigned long addr);
- -
- -#define MAX_SCHEDULE_TIMEOUT LONG_MAX
- -extern signed long schedule_timeout(signed long timeout);
- -extern signed long schedule_timeout_interruptible(signed long timeout);
- -extern signed long schedule_timeout_killable(signed long timeout);
- -extern signed long schedule_timeout_uninterruptible(signed long timeout);
- -asmlinkage void schedule(void);
- -extern void schedule_preempt_disabled(void);
- -
- -extern long io_schedule_timeout(long timeout);
- -
- -static inline void io_schedule(void)
- -{
- - io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
- -}
- -
- -struct nsproxy;
- -struct user_namespace;
- -
- -#ifdef CONFIG_MMU
- -extern void arch_pick_mmap_layout(struct mm_struct *mm);
- -extern unsigned long
- -arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
- - unsigned long, unsigned long);
- -extern unsigned long
- -arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
- - unsigned long len, unsigned long pgoff,
- - unsigned long flags);
- -#else
- -static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
- -#endif
- -
- -#define SUID_DUMP_DISABLE 0 /* No setuid dumping */
- -#define SUID_DUMP_USER 1 /* Dump as user of process */
- -#define SUID_DUMP_ROOT 2 /* Dump as root */
- -
- -/* mm flags */
- -
- -/* for SUID_DUMP_* above */
- -#define MMF_DUMPABLE_BITS 2
- -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
- -
- -extern void set_dumpable(struct mm_struct *mm, int value);
- -/*
- - * This returns the actual value of the suid_dumpable flag. For things
- - * that are using this for checking for privilege transitions, it must
- - * test against SUID_DUMP_USER rather than treating it as a boolean
- - * value.
- - */
- -static inline int __get_dumpable(unsigned long mm_flags)
- -{
- - return mm_flags & MMF_DUMPABLE_MASK;
- -}
- -
- -static inline int get_dumpable(struct mm_struct *mm)
- -{
- - return __get_dumpable(mm->flags);
- -}
- -
- -/* coredump filter bits */
- -#define MMF_DUMP_ANON_PRIVATE 2
- -#define MMF_DUMP_ANON_SHARED 3
- -#define MMF_DUMP_MAPPED_PRIVATE 4
- -#define MMF_DUMP_MAPPED_SHARED 5
- -#define MMF_DUMP_ELF_HEADERS 6
- -#define MMF_DUMP_HUGETLB_PRIVATE 7
- -#define MMF_DUMP_HUGETLB_SHARED 8
- -#define MMF_DUMP_DAX_PRIVATE 9
- -#define MMF_DUMP_DAX_SHARED 10
- -
- -#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
- -#define MMF_DUMP_FILTER_BITS 9
- -#define MMF_DUMP_FILTER_MASK \
- - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
- -#define MMF_DUMP_FILTER_DEFAULT \
- - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
- - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
- -
- -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
- -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
- -#else
- -# define MMF_DUMP_MASK_DEFAULT_ELF 0
- -#endif
- - /* leave room for more dump flags */
- -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
- -#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
- -#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
- -
- -#define MMF_HAS_UPROBES 19 /* has uprobes */
- -#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
- -
- -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
- -
- -struct sighand_struct {
- - atomic_t count;
- - struct k_sigaction action[_NSIG];
- - spinlock_t siglock;
- - wait_queue_head_t signalfd_wqh;
- -};
- -
- -struct pacct_struct {
- - int ac_flag;
- - long ac_exitcode;
- - unsigned long ac_mem;
- - cputime_t ac_utime, ac_stime;
- - unsigned long ac_minflt, ac_majflt;
- -};
- -
- -struct cpu_itimer {
- - cputime_t expires;
- - cputime_t incr;
- - u32 error;
- - u32 incr_error;
- -};
- -
- -/**
- - * struct prev_cputime - snapshot of system and user cputime
- - * @utime: time spent in user mode
- - * @stime: time spent in system mode
- - * @lock: protects the above two fields
- - *
- - * Stores previous user/system time values such that we can guarantee
- - * monotonicity.
- - */
- -struct prev_cputime {
- -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- - cputime_t utime;
- - cputime_t stime;
- - raw_spinlock_t lock;
- -#endif
- -};
- -
- -static inline void prev_cputime_init(struct prev_cputime *prev)
- -{
- -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- - prev->utime = prev->stime = 0;
- - raw_spin_lock_init(&prev->lock);
- -#endif
- -}
- -
- -/**
- - * struct task_cputime - collected CPU time counts
- - * @utime: time spent in user mode, in &cputime_t units
- - * @stime: time spent in kernel mode, in &cputime_t units
- - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
- - *
- - * This structure groups together three kinds of CPU time that are tracked for
- - * threads and thread groups. Most things considering CPU time want to group
- - * these counts together and treat all three of them in parallel.
- - */
- -struct task_cputime {
- - cputime_t utime;
- - cputime_t stime;
- - unsigned long long sum_exec_runtime;
- -};
- -
- -/* Alternate field names when used to cache expirations. */
- -#define virt_exp utime
- -#define prof_exp stime
- -#define sched_exp sum_exec_runtime
- -
- -#define INIT_CPUTIME \
- - (struct task_cputime) { \
- - .utime = 0, \
- - .stime = 0, \
- - .sum_exec_runtime = 0, \
- - }
- -
- -/*
- - * This is the atomic variant of task_cputime, which can be used for
- - * storing and updating task_cputime statistics without locking.
- - */
- -struct task_cputime_atomic {
- - atomic64_t utime;
- - atomic64_t stime;
- - atomic64_t sum_exec_runtime;
- -};
- -
- -#define INIT_CPUTIME_ATOMIC \
- - (struct task_cputime_atomic) { \
- - .utime = ATOMIC64_INIT(0), \
- - .stime = ATOMIC64_INIT(0), \
- - .sum_exec_runtime = ATOMIC64_INIT(0), \
- - }
- -
- -#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
- -
- -/*
- - * Disable preemption until the scheduler is running -- use an unconditional
- - * value so that it also works on !PREEMPT_COUNT kernels.
- - *
- - * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
- - */
- -#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
- -
- -/*
- - * Initial preempt_count value; reflects the preempt_count schedule invariant
- - * which states that during context switches:
- - *
- - * preempt_count() == 2*PREEMPT_DISABLE_OFFSET
- - *
- - * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
- - * Note: See finish_task_switch().
- - */
- -#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
- -
- -/**
- - * struct thread_group_cputimer - thread group interval timer counts
- - * @cputime_atomic: atomic thread group interval timers.
- - * @running: true when there are timers running and
- - * @cputime_atomic receives updates.
- - * @checking_timer: true when a thread in the group is in the
- - * process of checking for thread group timers.
- - *
- - * This structure contains the version of task_cputime, above, that is
- - * used for thread group CPU timer calculations.
- - */
- -struct thread_group_cputimer {
- - struct task_cputime_atomic cputime_atomic;
- - bool running;
- - bool checking_timer;
- -};
- -
- -#include <linux/rwsem.h>
- -struct autogroup;
- -
- -/*
- - * NOTE! "signal_struct" does not have its own
- - * locking, because a shared signal_struct always
- - * implies a shared sighand_struct, so locking
- - * sighand_struct is always a proper superset of
- - * the locking of signal_struct.
- - */
- -struct signal_struct {
- - atomic_t sigcnt;
- - atomic_t live;
- - int nr_threads;
- - struct list_head thread_head;
- -
- - wait_queue_head_t wait_chldexit; /* for wait4() */
- -
- - /* current thread group signal load-balancing target: */
- - struct task_struct *curr_target;
- -
- - /* shared signal handling: */
- - struct sigpending shared_pending;
- -
- - /* thread group exit support */
- - int group_exit_code;
- - /* overloaded:
- - * - notify group_exit_task when ->count is equal to notify_count
- - * - everyone except group_exit_task is stopped during signal delivery
- - * of fatal signals, group_exit_task processes the signal.
- - */
- - int notify_count;
- - struct task_struct *group_exit_task;
- -
- - /* thread group stop support, overloads group_exit_code too */
- - int group_stop_count;
- - unsigned int flags; /* see SIGNAL_* flags below */
- -
- - /*
- - * PR_SET_CHILD_SUBREAPER marks a process, like a service
- - * manager, to re-parent orphan (double-forking) child processes
- - * to this process instead of 'init'. The service manager is
- - * able to receive SIGCHLD signals and is able to investigate
- - * the process until it calls wait(). All children of this
- - * process will inherit a flag if they should look for a
- - * child_subreaper process at exit.
- - */
- - unsigned int is_child_subreaper:1;
- - unsigned int has_child_subreaper:1;
- -
- - /* POSIX.1b Interval Timers */
- - int posix_timer_id;
- - struct list_head posix_timers;
- -
- - /* ITIMER_REAL timer for the process */
- - struct hrtimer real_timer;
- - struct pid *leader_pid;
- - ktime_t it_real_incr;
- -
- - /*
- - * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
- - * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
- - * values are defined to 0 and 1 respectively
- - */
- - struct cpu_itimer it[2];
- -
- - /*
- - * Thread group totals for process CPU timers.
- - * See thread_group_cputimer(), et al, for details.
- - */
- - struct thread_group_cputimer cputimer;
- -
- - /* Earliest-expiration cache. */
- - struct task_cputime cputime_expires;
- -
- - struct list_head cpu_timers[3];
- -
- - struct pid *tty_old_pgrp;
- -
- - /* boolean value for session group leader */
- - int leader;
- -
- - struct tty_struct *tty; /* NULL if no tty */
- -
- -#ifdef CONFIG_SCHED_AUTOGROUP
- - struct autogroup *autogroup;
- -#endif
- - /*
- - * Cumulative resource counters for dead threads in the group,
- - * and for reaped dead child processes forked by this group.
- - * Live threads maintain their own counters and add to these
- - * in __exit_signal, except for the group leader.
- - */
- - seqlock_t stats_lock;
- - cputime_t utime, stime, cutime, cstime;
- - cputime_t gtime;
- - cputime_t cgtime;
- - struct prev_cputime prev_cputime;
- - unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
- - unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
- - unsigned long inblock, oublock, cinblock, coublock;
- - unsigned long maxrss, cmaxrss;
- - struct task_io_accounting ioac;
- -
- - /*
- - * Cumulative ns of scheduled CPU time for dead threads in the
- - * group, not including a zombie group leader, (This only differs
- - * from jiffies_to_ns(utime + stime) if sched_clock uses something
- - * other than jiffies.)
- - */
- - unsigned long long sum_sched_runtime;
- -
- - /*
- - * We don't bother to synchronize most readers of this at all,
- - * because there is no reader checking a limit that actually needs
- - * to get both rlim_cur and rlim_max atomically, and either one
- - * alone is a single word that can safely be read normally.
- - * getrlimit/setrlimit use task_lock(current->group_leader) to
- - * protect this instead of the siglock, because they really
- - * have no need to disable irqs.
- - */
- - struct rlimit rlim[RLIM_NLIMITS];
- -
- -#ifdef CONFIG_BSD_PROCESS_ACCT
- - struct pacct_struct pacct; /* per-process accounting information */
- -#endif
- -#ifdef CONFIG_TASKSTATS
- - struct taskstats *stats;
- -#endif
- -#ifdef CONFIG_AUDIT
- - unsigned audit_tty;
- - unsigned audit_tty_log_passwd;
- - struct tty_audit_buf *tty_audit_buf;
- -#endif
- -
- - oom_flags_t oom_flags;
- - short oom_score_adj; /* OOM kill score adjustment */
- - short oom_score_adj_min; /* OOM kill score adjustment min value.
- - * Only settable by CAP_SYS_RESOURCE. */
- -
- - struct mutex cred_guard_mutex; /* guard against foreign influences on
- - * credential calculations
- - * (notably. ptrace) */
- -};
- -
- -/*
- - * Bits in flags field of signal_struct.
- - */
- -#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
- -#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
- -#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
- -#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */
- -/*
- - * Pending notifications to parent.
- - */
- -#define SIGNAL_CLD_STOPPED 0x00000010
- -#define SIGNAL_CLD_CONTINUED 0x00000020
- -#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
- -
- -#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
- -
- -/* If true, all threads except ->group_exit_task have pending SIGKILL */
- -static inline int signal_group_exit(const struct signal_struct *sig)
- -{
- - return (sig->flags & SIGNAL_GROUP_EXIT) ||
- - (sig->group_exit_task != NULL);
- -}
- -
- -/*
- - * Some day this will be a full-fledged user tracking system..
- - */
- -struct user_struct {
- - atomic_t __count; /* reference count */
- - atomic_t processes; /* How many processes does this user have? */
- - atomic_t sigpending; /* How many pending signals does this user have? */
- -#ifdef CONFIG_INOTIFY_USER
- - atomic_t inotify_watches; /* How many inotify watches does this user have? */
- - atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
- -#endif
- -#ifdef CONFIG_FANOTIFY
- - atomic_t fanotify_listeners;
- -#endif
- -#ifdef CONFIG_EPOLL
- - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
- -#endif
- -#ifdef CONFIG_POSIX_MQUEUE
- - /* protected by mq_lock */
- - unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
- -#endif
- - unsigned long locked_shm; /* How many pages of mlocked shm ? */
- - unsigned long unix_inflight; /* How many files in flight in unix sockets */
- -
- -#ifdef CONFIG_KEYS
- - struct key *uid_keyring; /* UID specific keyring */
- - struct key *session_keyring; /* UID's default session keyring */
- -#endif
- -
- - /* Hash table maintenance information */
- - struct hlist_node uidhash_node;
- - kuid_t uid;
- -
- -#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
- - atomic_long_t locked_vm;
- -#endif
- -};
- -
- -extern int uids_sysfs_init(void);
- -
- -extern struct user_struct *find_user(kuid_t);
- -
- -extern struct user_struct root_user;
- -#define INIT_USER (&root_user)
- -
- -
- -struct backing_dev_info;
- -struct reclaim_state;
- -
- -#ifdef CONFIG_SCHED_INFO
- -struct sched_info {
- - /* cumulative counters */
- - unsigned long pcount; /* # of times run on this cpu */
- - unsigned long long run_delay; /* time spent waiting on a runqueue */
- -
- - /* timestamps */
- - unsigned long long last_arrival,/* when we last ran on a cpu */
- - last_queued; /* when we were last queued to run */
- -};
- -#endif /* CONFIG_SCHED_INFO */
- -
- -#ifdef CONFIG_TASK_DELAY_ACCT
- -struct task_delay_info {
- - spinlock_t lock;
- - unsigned int flags; /* Private per-task flags */
- -
- - /* For each stat XXX, add following, aligned appropriately
- - *
- - * struct timespec XXX_start, XXX_end;
- - * u64 XXX_delay;
- - * u32 XXX_count;
- - *
- - * Atomicity of updates to XXX_delay, XXX_count protected by
- - * single lock above (split into XXX_lock if contention is an issue).
- - */
- -
- - /*
- - * XXX_count is incremented on every XXX operation, the delay
- - * associated with the operation is added to XXX_delay.
- - * XXX_delay contains the accumulated delay time in nanoseconds.
- - */
- - u64 blkio_start; /* Shared by blkio, swapin */
- - u64 blkio_delay; /* wait for sync block io completion */
- - u64 swapin_delay; /* wait for swapin block io completion */
- - u32 blkio_count; /* total count of the number of sync block */
- - /* io operations performed */
- - u32 swapin_count; /* total count of the number of swapin block */
- - /* io operations performed */
- -
- - u64 freepages_start;
- - u64 freepages_delay; /* wait for memory reclaim */
- - u32 freepages_count; /* total count of memory reclaim */
- -};
- -#endif /* CONFIG_TASK_DELAY_ACCT */
- -
- -static inline int sched_info_on(void)
- -{
- -#ifdef CONFIG_SCHEDSTATS
- - return 1;
- -#elif defined(CONFIG_TASK_DELAY_ACCT)
- - extern int delayacct_on;
- - return delayacct_on;
- -#else
- - return 0;
- -#endif
- -}
- -
- -enum cpu_idle_type {
- - CPU_IDLE,
- - CPU_NOT_IDLE,
- - CPU_NEWLY_IDLE,
- - CPU_MAX_IDLE_TYPES
- -};
- -
- -/*
- - * Increase resolution of cpu_capacity calculations
- - */
- -#define SCHED_CAPACITY_SHIFT 10
- -#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
- -
- -/*
- - * Wake-queues are lists of tasks with a pending wakeup, whose
- - * callers have already marked the task as woken internally,
- - * and can thus carry on. A common use case is being able to
- - * do the wakeups once the corresponding user lock has been
- - * released.
- - *
- - * We hold reference to each task in the list across the wakeup,
- - * thus guaranteeing that the memory is still valid by the time
- - * the actual wakeups are performed in wake_up_q().
- - *
- - * One per task suffices, because there's never a need for a task to be
- - * in two wake queues simultaneously; it is forbidden to abandon a task
- - * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
- - * already in a wake queue, the wakeup will happen soon and the second
- - * waker can just skip it.
- - *
- - * The WAKE_Q macro declares and initializes the list head.
- - * wake_up_q() does NOT reinitialize the list; it's expected to be
- - * called near the end of a function, where the fact that the queue is
- - * not used again will be easy to see by inspection.
- - *
- - * Note that this can cause spurious wakeups. schedule() callers
- - * must ensure the call is done inside a loop, confirming that the
- - * wakeup condition has in fact occurred.
- - */
- -struct wake_q_node {
- - struct wake_q_node *next;
- -};
- -
- -struct wake_q_head {
- - struct wake_q_node *first;
- - struct wake_q_node **lastp;
- -};
- -
- -#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
- -
- -#define WAKE_Q(name) \
- - struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
- -
- -extern void wake_q_add(struct wake_q_head *head,
- - struct task_struct *task);
- -extern void wake_up_q(struct wake_q_head *head);
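The usage pattern described in the comment block above, in sketch form: wakeups are collected while a lock is held and issued only after it is dropped. The lock and the waiter lookup are placeholders:

        WAKE_Q(wake_q);
        struct task_struct *task;

        spin_lock(&some_lock);
        while ((task = pick_next_waiter()) != NULL)
                wake_q_add(&wake_q, task);
        spin_unlock(&some_lock);

        wake_up_q(&wake_q);     /* safe: each task was pinned by wake_q_add() */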
- -
- -/*
- - * sched-domains (multiprocessor balancing) declarations:
- - */
- -#ifdef CONFIG_SMP
- -#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
- -#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
- -#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
- -#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
- -#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
- -#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
- -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */
- -#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
- -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
- -#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
- -#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
- -#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
- -#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
- -#define SD_NUMA 0x4000 /* cross-node balancing */
- -
- -#ifdef CONFIG_SCHED_SMT
- -static inline int cpu_smt_flags(void)
- -{
- - return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
- -}
- -#endif
- -
- -#ifdef CONFIG_SCHED_MC
- -static inline int cpu_core_flags(void)
- -{
- - return SD_SHARE_PKG_RESOURCES;
- -}
- -#endif
- -
- -#ifdef CONFIG_NUMA
- -static inline int cpu_numa_flags(void)
- -{
- - return SD_NUMA;
- -}
- -#endif
- -
- -struct sched_domain_attr {
- - int relax_domain_level;
- -};
- -
- -#define SD_ATTR_INIT (struct sched_domain_attr) { \
- - .relax_domain_level = -1, \
- -}
- -
- -extern int sched_domain_level_max;
- -
- -struct sched_group;
- -
- -struct sched_domain {
- - /* These fields must be setup */
- - struct sched_domain *parent; /* top domain must be null terminated */
- - struct sched_domain *child; /* bottom domain must be null terminated */
- - struct sched_group *groups; /* the balancing groups of the domain */
- - unsigned long min_interval; /* Minimum balance interval ms */
- - unsigned long max_interval; /* Maximum balance interval ms */
- - unsigned int busy_factor; /* less balancing by factor if busy */
- - unsigned int imbalance_pct; /* No balance until over watermark */
- - unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
- - unsigned int busy_idx;
- - unsigned int idle_idx;
- - unsigned int newidle_idx;
- - unsigned int wake_idx;
- - unsigned int forkexec_idx;
- - unsigned int smt_gain;
- -
- - int nohz_idle; /* NOHZ IDLE status */
- - int flags; /* See SD_* */
- - int level;
- -
- - /* Runtime fields. */
- - unsigned long last_balance; /* init to jiffies. units in jiffies */
- - unsigned int balance_interval; /* initialise to 1. units in ms. */
- - unsigned int nr_balance_failed; /* initialise to 0 */
- -
- - /* idle_balance() stats */
- - u64 max_newidle_lb_cost;
- - unsigned long next_decay_max_lb_cost;
- -
- -#ifdef CONFIG_SCHEDSTATS
- - /* load_balance() stats */
- - unsigned int lb_count[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
- - unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
- -
- - /* Active load balancing */
- - unsigned int alb_count;
- - unsigned int alb_failed;
- - unsigned int alb_pushed;
- -
- - /* SD_BALANCE_EXEC stats */
- - unsigned int sbe_count;
- - unsigned int sbe_balanced;
- - unsigned int sbe_pushed;
- -
- - /* SD_BALANCE_FORK stats */
- - unsigned int sbf_count;
- - unsigned int sbf_balanced;
- - unsigned int sbf_pushed;
- -
- - /* try_to_wake_up() stats */
- - unsigned int ttwu_wake_remote;
- - unsigned int ttwu_move_affine;
- - unsigned int ttwu_move_balance;
- -#endif
- -#ifdef CONFIG_SCHED_DEBUG
- - char *name;
- -#endif
- - union {
- - void *private; /* used during construction */
- - struct rcu_head rcu; /* used during destruction */
- - };
- -
- - unsigned int span_weight;
- - /*
- - * Span of all CPUs in this domain.
- - *
- - * NOTE: this field is variable length. (Allocated dynamically
- - * by attaching extra space to the end of the structure,
- - * depending on how many CPUs the kernel has booted up with)
- - */
- - unsigned long span[0];
- -};
- -
- -static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
- -{
- - return to_cpumask(sd->span);
- -}
- -
- -extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- - struct sched_domain_attr *dattr_new);
- -
- -/* Allocate an array of sched domains, for partition_sched_domains(). */
- -cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
- -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
- -
- -bool cpus_share_cache(int this_cpu, int that_cpu);
- -
- -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
- -typedef int (*sched_domain_flags_f)(void);
- -
- -#define SDTL_OVERLAP 0x01
- -
- -struct sd_data {
- - struct sched_domain **__percpu sd;
- - struct sched_group **__percpu sg;
- - struct sched_group_capacity **__percpu sgc;
- -};
- -
- -struct sched_domain_topology_level {
- - sched_domain_mask_f mask;
- - sched_domain_flags_f sd_flags;
- - int flags;
- - int numa_level;
- - struct sd_data data;
- -#ifdef CONFIG_SCHED_DEBUG
- - char *name;
- -#endif
- -};
- -
- -extern void set_sched_topology(struct sched_domain_topology_level *tl);
- -extern void wake_up_if_idle(int cpu);
- -
- -#ifdef CONFIG_SCHED_DEBUG
- -# define SD_INIT_NAME(type) .name = #type
- -#else
- -# define SD_INIT_NAME(type)
- -#endif
- -
- -#else /* CONFIG_SMP */
- -
- -struct sched_domain_attr;
- -
- -static inline void
- -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- - struct sched_domain_attr *dattr_new)
- -{
- -}
- -
- -static inline bool cpus_share_cache(int this_cpu, int that_cpu)
- -{
- - return true;
- -}
- -
- -#endif /* !CONFIG_SMP */
- -
- -
- -struct io_context; /* See blkdev.h */
- -
- -
- -#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
- -extern void prefetch_stack(struct task_struct *t);
- -#else
- -static inline void prefetch_stack(struct task_struct *t) { }
- -#endif
- -
- -struct audit_context; /* See audit.c */
- -struct mempolicy;
- -struct pipe_inode_info;
- -struct uts_namespace;
- -
- -struct load_weight {
- - unsigned long weight;
- - u32 inv_weight;
- -};
- -
- -/*
- - * The load_avg/util_avg accumulates an infinite geometric series.
- - * 1) load_avg factors frequency scaling into the amount of time that a
- - * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
- - * aggregated such weights of all runnable and blocked sched_entities.
- - * 2) util_avg factors frequency and cpu scaling into the amount of time
- - * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
- - * For cfs_rq, it is the aggregated such times of all runnable and
- - * blocked sched_entities.
- - * The 64 bit load_sum can:
- - * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
- - * the highest weight (=88761) always runnable, we should not overflow
- - * 2) for entity, support any load.weight always runnable
- - */
- -struct sched_avg {
- - u64 last_update_time, load_sum;
- - u32 util_sum, period_contrib;
- - unsigned long load_avg, util_avg;
- -};
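
The block comment above quotes a bound of 4,353,082,796 (= 2^64 / 47742 / 88761) for how many maximally weighted, always-runnable entities a 64-bit load_sum can accumulate without overflowing. A quick standalone check of that arithmetic, taking 47742 (the fully decayed per-entity sum) and 88761 (the nice -20 weight) from the comment rather than re-deriving them:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* values quoted in the comment above (not re-derived here) */
	const uint64_t decayed_sum_max = 47742;  /* max per-entity load_sum   */
	const uint64_t max_weight      = 88761;  /* weight of a nice -20 task */

	/* entities that fit in a 64-bit accumulator without overflow */
	uint64_t n = UINT64_MAX / (decayed_sum_max * max_weight);

	printf("%llu\n", (unsigned long long)n);   /* ~4353082796 */
	return 0;
}
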
- -
- -#ifdef CONFIG_SCHEDSTATS
- -struct sched_statistics {
- - u64 wait_start;
- - u64 wait_max;
- - u64 wait_count;
- - u64 wait_sum;
- - u64 iowait_count;
- - u64 iowait_sum;
- -
- - u64 sleep_start;
- - u64 sleep_max;
- - s64 sum_sleep_runtime;
- -
- - u64 block_start;
- - u64 block_max;
- - u64 exec_max;
- - u64 slice_max;
- -
- - u64 nr_migrations_cold;
- - u64 nr_failed_migrations_affine;
- - u64 nr_failed_migrations_running;
- - u64 nr_failed_migrations_hot;
- - u64 nr_forced_migrations;
- -
- - u64 nr_wakeups;
- - u64 nr_wakeups_sync;
- - u64 nr_wakeups_migrate;
- - u64 nr_wakeups_local;
- - u64 nr_wakeups_remote;
- - u64 nr_wakeups_affine;
- - u64 nr_wakeups_affine_attempts;
- - u64 nr_wakeups_passive;
- - u64 nr_wakeups_idle;
- -};
- -#endif
- -
- -struct sched_entity {
- - struct load_weight load; /* for load-balancing */
- - struct rb_node run_node;
- - struct list_head group_node;
- - unsigned int on_rq;
- -
- - u64 exec_start;
- - u64 sum_exec_runtime;
- - u64 vruntime;
- - u64 prev_sum_exec_runtime;
- -
- - u64 nr_migrations;
- -
- -#ifdef CONFIG_SCHEDSTATS
- - struct sched_statistics statistics;
- -#endif
- -
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- - int depth;
- - struct sched_entity *parent;
- - /* rq on which this entity is (to be) queued: */
- - struct cfs_rq *cfs_rq;
- - /* rq "owned" by this entity/group: */
- - struct cfs_rq *my_q;
- -#endif
- -
- -#ifdef CONFIG_SMP
- - /* Per entity load average tracking */
- - struct sched_avg avg;
- -#endif
- -};
- -
- -struct sched_rt_entity {
- - struct list_head run_list;
- - unsigned long timeout;
- - unsigned long watchdog_stamp;
- - unsigned int time_slice;
- -
- - struct sched_rt_entity *back;
- -#ifdef CONFIG_RT_GROUP_SCHED
- - struct sched_rt_entity *parent;
- - /* rq on which this entity is (to be) queued: */
- - struct rt_rq *rt_rq;
- - /* rq "owned" by this entity/group: */
- - struct rt_rq *my_q;
- -#endif
- -};
- -
- -struct sched_dl_entity {
- - struct rb_node rb_node;
- -
- - /*
- - * Original scheduling parameters. Copied here from sched_attr
- - * during sched_setattr(), they will remain the same until
- - * the next sched_setattr().
- - */
- - u64 dl_runtime; /* maximum runtime for each instance */
- - u64 dl_deadline; /* relative deadline of each instance */
- - u64 dl_period; /* separation of two instances (period) */
- - u64 dl_bw; /* dl_runtime / dl_deadline */
- -
- - /*
- - * Actual scheduling parameters. Initialized with the values above,
- - * they are continuously updated during task execution. Note that
- - * the remaining runtime could be < 0 in case we are in overrun.
- - */
- - s64 runtime; /* remaining runtime for this instance */
- - u64 deadline; /* absolute deadline for this instance */
- - unsigned int flags; /* specifying the scheduler behaviour */
- -
- - /*
- - * Some bool flags:
- - *
- - * @dl_throttled tells if we exhausted the runtime. If so, the
- - * task has to wait for a replenishment to be performed at the
- - * next firing of dl_timer.
- - *
- - * @dl_new tells if a new instance arrived. If so we must
- - * start executing it with full runtime and reset its absolute
- - * deadline;
- - *
- - * @dl_boosted tells if we are boosted due to DI. If so we are
- - * outside bandwidth enforcement mechanism (but only until we
- - * exit the critical section);
- - *
- - * @dl_yielded tells if task gave up the cpu before consuming
- - * all its available runtime during the last job.
- - */
- - int dl_throttled, dl_new, dl_boosted, dl_yielded;
- -
- - /*
- - * Bandwidth enforcement timer. Each -deadline task has its
- - * own bandwidth to be enforced, thus we need one timer per task.
- - */
- - struct hrtimer dl_timer;
- -};
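
The "original" parameters above (dl_runtime, dl_deadline, dl_period) are filled from the sched_attr that userspace passes to sched_setattr(2). A minimal userspace sketch of putting the calling thread into SCHED_DEADLINE; the 10 ms / 30 ms / 30 ms figures are arbitrary illustration values, and SYS_sched_setattr is assumed to be defined by the libc headers (otherwise the raw __NR_ number is needed):

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_DEADLINE 6

/* layout of the uapi structure consumed by sched_setattr(2) */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;   /* -> dl_runtime  */
	uint64_t sched_deadline;  /* -> dl_deadline */
	uint64_t sched_period;    /* -> dl_period   */
};

int main(void)
{
	struct sched_attr attr = {
		.size           = sizeof(attr),
		.sched_policy   = SCHED_DEADLINE,
		.sched_runtime  = 10 * 1000 * 1000,   /* 10 ms of CPU ...      */
		.sched_deadline = 30 * 1000 * 1000,   /* ... due within 30 ms  */
		.sched_period   = 30 * 1000 * 1000,   /* ... every 30 ms       */
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0) < 0) {
		perror("sched_setattr");   /* typically needs root/CAP_SYS_NICE */
		return 1;
	}
	/* this thread is now scheduled by the deadline class */
	return 0;
}
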
- -
- -union rcu_special {
- - struct {
- - u8 blocked;
- - u8 need_qs;
- - u8 exp_need_qs;
- - u8 pad; /* Otherwise the compiler can store garbage here. */
- - } b; /* Bits. */
- - u32 s; /* Set of bits. */
- -};
- -struct rcu_node;
- -
- -enum perf_event_task_context {
- - perf_invalid_context = -1,
- - perf_hw_context = 0,
- - perf_sw_context,
- - perf_nr_task_contexts,
- -};
- -
- -/* Track pages that require TLB flushes */
- -struct tlbflush_unmap_batch {
- - /*
- - * Each bit set is a CPU that potentially has a TLB entry for one of
- - * the PFNs being flushed. See set_tlb_ubc_flush_pending().
- - */
- - struct cpumask cpumask;
- -
- - /* True if any bit in cpumask is set */
- - bool flush_required;
- -
- - /*
- - * If true then the PTE was dirty when unmapped. The entry must be
- - * flushed before IO is initiated or a stale TLB entry potentially
- - * allows an update without redirtying the page.
- - */
- - bool writable;
- -};
- -
- -struct task_struct {
- - volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
- - void *stack;
- - atomic_t usage;
- - unsigned int flags; /* per process flags, defined below */
- - unsigned int ptrace;
- -
- -#ifdef CONFIG_SMP
- - struct llist_node wake_entry;
- - int on_cpu;
- - unsigned int wakee_flips;
- - unsigned long wakee_flip_decay_ts;
- - struct task_struct *last_wakee;
- -
- - int wake_cpu;
- -#endif
- - int on_rq;
- -
- - int prio, static_prio, normal_prio;
- - unsigned int rt_priority;
- - const struct sched_class *sched_class;
- - struct sched_entity se;
- - struct sched_rt_entity rt;
- -#ifdef CONFIG_CGROUP_SCHED
- - struct task_group *sched_task_group;
- -#endif
- - struct sched_dl_entity dl;
- -
- -#ifdef CONFIG_PREEMPT_NOTIFIERS
- - /* list of struct preempt_notifier: */
- - struct hlist_head preempt_notifiers;
- -#endif
- -
- -#ifdef CONFIG_BLK_DEV_IO_TRACE
- - unsigned int btrace_seq;
- -#endif
- -
- - unsigned int policy;
- - int nr_cpus_allowed;
- - cpumask_t cpus_allowed;
- -
- -#ifdef CONFIG_PREEMPT_RCU
- - int rcu_read_lock_nesting;
- - union rcu_special rcu_read_unlock_special;
- - struct list_head rcu_node_entry;
- - struct rcu_node *rcu_blocked_node;
- -#endif /* #ifdef CONFIG_PREEMPT_RCU */
- -#ifdef CONFIG_TASKS_RCU
- - unsigned long rcu_tasks_nvcsw;
- - bool rcu_tasks_holdout;
- - struct list_head rcu_tasks_holdout_list;
- - int rcu_tasks_idle_cpu;
- -#endif /* #ifdef CONFIG_TASKS_RCU */
- -
- -#ifdef CONFIG_SCHED_INFO
- - struct sched_info sched_info;
- -#endif
- -
- - struct list_head tasks;
- -#ifdef CONFIG_SMP
- - struct plist_node pushable_tasks;
- - struct rb_node pushable_dl_tasks;
- -#endif
- -
- - struct mm_struct *mm, *active_mm;
- - /* per-thread vma caching */
- - u32 vmacache_seqnum;
- - struct vm_area_struct *vmacache[VMACACHE_SIZE];
- -#if defined(SPLIT_RSS_COUNTING)
- - struct task_rss_stat rss_stat;
- -#endif
- -/* task state */
- - int exit_state;
- - int exit_code, exit_signal;
- - int pdeath_signal; /* The signal sent when the parent dies */
- - unsigned long jobctl; /* JOBCTL_*, siglock protected */
- -
- - /* Used for emulating ABI behavior of previous Linux versions */
- - unsigned int personality;
- -
- - /* scheduler bits, serialized by scheduler locks */
- - unsigned sched_reset_on_fork:1;
- - unsigned sched_contributes_to_load:1;
- - unsigned sched_migrated:1;
- - unsigned :0; /* force alignment to the next boundary */
- -
- - /* unserialized, strictly 'current' */
- - unsigned in_execve:1; /* bit to tell LSMs we're in execve */
- - unsigned in_iowait:1;
- -#ifdef CONFIG_MEMCG
- - unsigned memcg_may_oom:1;
- -#endif
- -#ifdef CONFIG_MEMCG_KMEM
- - unsigned memcg_kmem_skip_account:1;
- -#endif
- -#ifdef CONFIG_COMPAT_BRK
- - unsigned brk_randomized:1;
- -#endif
- -
- - unsigned long atomic_flags; /* Flags needing atomic access. */
- -
- - struct restart_block restart_block;
- -
- - pid_t pid;
- - pid_t tgid;
- -
- -#ifdef CONFIG_CC_STACKPROTECTOR
- - /* Canary value for the -fstack-protector gcc feature */
- - unsigned long stack_canary;
- -#endif
- - /*
- - * pointers to (original) parent process, youngest child, younger sibling,
- - * older sibling, respectively. (p->father can be replaced with
- - * p->real_parent->pid)
- - */
- - struct task_struct __rcu *real_parent; /* real parent process */
- - struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
- - /*
- - * children/sibling forms the list of my natural children
- - */
- - struct list_head children; /* list of my children */
- - struct list_head sibling; /* linkage in my parent's children list */
- - struct task_struct *group_leader; /* threadgroup leader */
- -
- - /*
- - * ptraced is the list of tasks this task is using ptrace on.
- - * This includes both natural children and PTRACE_ATTACH targets.
- - * p->ptrace_entry is p's link on the p->parent->ptraced list.
- - */
- - struct list_head ptraced;
- - struct list_head ptrace_entry;
- -
- - /* PID/PID hash table linkage. */
- - struct pid_link pids[PIDTYPE_MAX];
- - struct list_head thread_group;
- - struct list_head thread_node;
- -
- - struct completion *vfork_done; /* for vfork() */
- - int __user *set_child_tid; /* CLONE_CHILD_SETTID */
- - int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
- -
- - cputime_t utime, stime, utimescaled, stimescaled;
- - cputime_t gtime;
- - struct prev_cputime prev_cputime;
- -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- - seqlock_t vtime_seqlock;
- - unsigned long long vtime_snap;
- - enum {
- - VTIME_SLEEPING = 0,
- - VTIME_USER,
- - VTIME_SYS,
- - } vtime_snap_whence;
- -#endif
- - unsigned long nvcsw, nivcsw; /* context switch counts */
- - u64 start_time; /* monotonic time in nsec */
- - u64 real_start_time; /* boot based time in nsec */
- -/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
- - unsigned long min_flt, maj_flt;
- -
- - struct task_cputime cputime_expires;
- - struct list_head cpu_timers[3];
- -
- -/* process credentials */
- - const struct cred __rcu *real_cred; /* objective and real subjective task
- - * credentials (COW) */
- - const struct cred __rcu *cred; /* effective (overridable) subjective task
- - * credentials (COW) */
- - char comm[TASK_COMM_LEN]; /* executable name excluding path
- - - access with [gs]et_task_comm (which locks
- - it with task_lock())
- - - initialized normally by setup_new_exec */
- -/* file system info */
- - struct nameidata *nameidata;
- -#ifdef CONFIG_SYSVIPC
- -/* ipc stuff */
- - struct sysv_sem sysvsem;
- - struct sysv_shm sysvshm;
- -#endif
- -#ifdef CONFIG_DETECT_HUNG_TASK
- -/* hung task detection */
- - unsigned long last_switch_count;
- -#endif
- -/* filesystem information */
- - struct fs_struct *fs;
- -/* open file information */
- - struct files_struct *files;
- -/* namespaces */
- - struct nsproxy *nsproxy;
- -/* signal handlers */
- - struct signal_struct *signal;
- - struct sighand_struct *sighand;
- -
- - sigset_t blocked, real_blocked;
- - sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
- - struct sigpending pending;
- -
- - unsigned long sas_ss_sp;
- - size_t sas_ss_size;
- -
- - struct callback_head *task_works;
- -
- - struct audit_context *audit_context;
- -#ifdef CONFIG_AUDITSYSCALL
- - kuid_t loginuid;
- - unsigned int sessionid;
- -#endif
- - struct seccomp seccomp;
- -
- -/* Thread group tracking */
- - u32 parent_exec_id;
- - u32 self_exec_id;
- -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- - * mempolicy */
- - spinlock_t alloc_lock;
- -
- - /* Protection of the PI data structures: */
- - raw_spinlock_t pi_lock;
- -
- - struct wake_q_node wake_q;
- -
- -#ifdef CONFIG_RT_MUTEXES
- - /* PI waiters blocked on a rt_mutex held by this task */
- - struct rb_root pi_waiters;
- - struct rb_node *pi_waiters_leftmost;
- - /* Deadlock detection and priority inheritance handling */
- - struct rt_mutex_waiter *pi_blocked_on;
- -#endif
- -
- -#ifdef CONFIG_DEBUG_MUTEXES
- - /* mutex deadlock detection */
- - struct mutex_waiter *blocked_on;
- -#endif
- -#ifdef CONFIG_TRACE_IRQFLAGS
- - unsigned int irq_events;
- - unsigned long hardirq_enable_ip;
- - unsigned long hardirq_disable_ip;
- - unsigned int hardirq_enable_event;
- - unsigned int hardirq_disable_event;
- - int hardirqs_enabled;
- - int hardirq_context;
- - unsigned long softirq_disable_ip;
- - unsigned long softirq_enable_ip;
- - unsigned int softirq_disable_event;
- - unsigned int softirq_enable_event;
- - int softirqs_enabled;
- - int softirq_context;
- -#endif
- -#ifdef CONFIG_LOCKDEP
- -# define MAX_LOCK_DEPTH 48UL
- - u64 curr_chain_key;
- - int lockdep_depth;
- - unsigned int lockdep_recursion;
- - struct held_lock held_locks[MAX_LOCK_DEPTH];
- - gfp_t lockdep_reclaim_gfp;
- -#endif
- -
- -/* journalling filesystem info */
- - void *journal_info;
- -
- -/* stacked block device info */
- - struct bio_list *bio_list;
- -
- -#ifdef CONFIG_BLOCK
- -/* stack plugging */
- - struct blk_plug *plug;
- -#endif
- -
- -/* VM state */
- - struct reclaim_state *reclaim_state;
- -
- - struct backing_dev_info *backing_dev_info;
- -
- - struct io_context *io_context;
- -
- - unsigned long ptrace_message;
- - siginfo_t *last_siginfo; /* For ptrace use. */
- - struct task_io_accounting ioac;
- -#if defined(CONFIG_TASK_XACCT)
- - u64 acct_rss_mem1; /* accumulated rss usage */
- - u64 acct_vm_mem1; /* accumulated virtual memory usage */
- - cputime_t acct_timexpd; /* stime + utime since last update */
- -#endif
- -#ifdef CONFIG_CPUSETS
- - nodemask_t mems_allowed; /* Protected by alloc_lock */
- - seqcount_t mems_allowed_seq; /* Sequence no to catch updates */
- - int cpuset_mem_spread_rotor;
- - int cpuset_slab_spread_rotor;
- -#endif
- -#ifdef CONFIG_CGROUPS
- - /* Control Group info protected by css_set_lock */
- - struct css_set __rcu *cgroups;
- - /* cg_list protected by css_set_lock and tsk->alloc_lock */
- - struct list_head cg_list;
- -#endif
- -#ifdef CONFIG_FUTEX
- - struct robust_list_head __user *robust_list;
- -#ifdef CONFIG_COMPAT
- - struct compat_robust_list_head __user *compat_robust_list;
- -#endif
- - struct list_head pi_state_list;
- - struct futex_pi_state *pi_state_cache;
- -#endif
- -#ifdef CONFIG_PERF_EVENTS
- - struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
- - struct mutex perf_event_mutex;
- - struct list_head perf_event_list;
- -#endif
- -#ifdef CONFIG_DEBUG_PREEMPT
- - unsigned long preempt_disable_ip;
- -#endif
- -#ifdef CONFIG_NUMA
- - struct mempolicy *mempolicy; /* Protected by alloc_lock */
- - short il_next;
- - short pref_node_fork;
- -#endif
- -#ifdef CONFIG_NUMA_BALANCING
- - int numa_scan_seq;
- - unsigned int numa_scan_period;
- - unsigned int numa_scan_period_max;
- - int numa_preferred_nid;
- - unsigned long numa_migrate_retry;
- - u64 node_stamp; /* migration stamp */
- - u64 last_task_numa_placement;
- - u64 last_sum_exec_runtime;
- - struct callback_head numa_work;
- -
- - struct list_head numa_entry;
- - struct numa_group *numa_group;
- -
- - /*
- - * numa_faults is an array split into four regions:
- - * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
- - * in this precise order.
- - *
- - * faults_memory: Exponential decaying average of faults on a per-node
- - * basis. Scheduling placement decisions are made based on these
- - * counts. The values remain static for the duration of a PTE scan.
- - * faults_cpu: Track the nodes the process was running on when a NUMA
- - * hinting fault was incurred.
- - * faults_memory_buffer and faults_cpu_buffer: Record faults per node
- - * during the current scan window. When the scan completes, the counts
- - * in faults_memory and faults_cpu decay and these values are copied.
- - */
- - unsigned long *numa_faults;
- - unsigned long total_numa_faults;
- -
- - /*
- - * numa_faults_locality tracks if faults recorded during the last
- - * scan window were remote/local or failed to migrate. The task scan
- - * period is adapted based on the locality of the faults with different
- - * weights depending on whether they were shared or private faults
- - */
- - unsigned long numa_faults_locality[3];
- -
- - unsigned long numa_pages_migrated;
- -#endif /* CONFIG_NUMA_BALANCING */
- -
- -#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
- - struct tlbflush_unmap_batch tlb_ubc;
- -#endif
- -
- - struct rcu_head rcu;
- -
- - /*
- - * cache last used pipe for splice
- - */
- - struct pipe_inode_info *splice_pipe;
- -
- - struct page_frag task_frag;
- -
- -#ifdef CONFIG_TASK_DELAY_ACCT
- - struct task_delay_info *delays;
- -#endif
- -#ifdef CONFIG_FAULT_INJECTION
- - int make_it_fail;
- -#endif
- - /*
- - * when (nr_dirtied >= nr_dirtied_pause), it's time to call
- - * balance_dirty_pages() for some dirty throttling pause
- - */
- - int nr_dirtied;
- - int nr_dirtied_pause;
- - unsigned long dirty_paused_when; /* start of a write-and-pause period */
- -
- -#ifdef CONFIG_LATENCYTOP
- - int latency_record_count;
- - struct latency_record latency_record[LT_SAVECOUNT];
- -#endif
- - /*
- - * time slack values; these are used to round up poll() and
- - * select() etc timeout values. These are in nanoseconds.
- - */
- - unsigned long timer_slack_ns;
- - unsigned long default_timer_slack_ns;
- -
- -#ifdef CONFIG_KASAN
- - unsigned int kasan_depth;
- -#endif
- -#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- - /* Index of current stored address in ret_stack */
- - int curr_ret_stack;
- - /* Stack of return addresses for return function tracing */
- - struct ftrace_ret_stack *ret_stack;
- - /* time stamp for last schedule */
- - unsigned long long ftrace_timestamp;
- - /*
- - * Number of functions that haven't been traced
- - * because of depth overrun.
- - */
- - atomic_t trace_overrun;
- - /* Pause for the tracing */
- - atomic_t tracing_graph_pause;
- -#endif
- -#ifdef CONFIG_TRACING
- - /* state flags for use by tracers */
- - unsigned long trace;
- - /* bitmask and counter of trace recursion */
- - unsigned long trace_recursion;
- -#endif /* CONFIG_TRACING */
- -#ifdef CONFIG_MEMCG
- - struct mem_cgroup *memcg_in_oom;
- - gfp_t memcg_oom_gfp_mask;
- - int memcg_oom_order;
- -
- - /* number of pages to reclaim on returning to userland */
- - unsigned int memcg_nr_pages_over_high;
- -#endif
- -#ifdef CONFIG_UPROBES
- - struct uprobe_task *utask;
- -#endif
- -#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
- - unsigned int sequential_io;
- - unsigned int sequential_io_avg;
- -#endif
- -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- - unsigned long task_state_change;
- -#endif
- - int pagefault_disabled;
- -/* CPU-specific state of this task */
- - struct thread_struct thread;
- -/*
- - * WARNING: on x86, 'thread_struct' contains a variable-sized
- - * structure. It *MUST* be at the end of 'task_struct'.
- - *
- - * Do not put anything below here!
- - */
- -};
- -
- -#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
- -extern int arch_task_struct_size __read_mostly;
- -#else
- -# define arch_task_struct_size (sizeof(struct task_struct))
- -#endif
- -
- -/* Future-safe accessor for struct task_struct's cpus_allowed. */
- -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
- -
- -#define TNF_MIGRATED 0x01
- -#define TNF_NO_GROUP 0x02
- -#define TNF_SHARED 0x04
- -#define TNF_FAULT_LOCAL 0x08
- -#define TNF_MIGRATE_FAIL 0x10
- -
- -#ifdef CONFIG_NUMA_BALANCING
- -extern void task_numa_fault(int last_node, int node, int pages, int flags);
- -extern pid_t task_numa_group_id(struct task_struct *p);
- -extern void set_numabalancing_state(bool enabled);
- -extern void task_numa_free(struct task_struct *p);
- -extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
- - int src_nid, int dst_cpu);
- -#else
- -static inline void task_numa_fault(int last_node, int node, int pages,
- - int flags)
- -{
- -}
- -static inline pid_t task_numa_group_id(struct task_struct *p)
- -{
- - return 0;
- -}
- -static inline void set_numabalancing_state(bool enabled)
- -{
- -}
- -static inline void task_numa_free(struct task_struct *p)
- -{
- -}
- -static inline bool should_numa_migrate_memory(struct task_struct *p,
- - struct page *page, int src_nid, int dst_cpu)
- -{
- - return true;
- -}
- -#endif
- -
- -static inline struct pid *task_pid(struct task_struct *task)
- -{
- - return task->pids[PIDTYPE_PID].pid;
- -}
- -
- -static inline struct pid *task_tgid(struct task_struct *task)
- -{
- - return task->group_leader->pids[PIDTYPE_PID].pid;
- -}
- -
- -/*
- - * Without tasklist or rcu lock it is not safe to dereference
- - * the result of task_pgrp/task_session even if task == current,
- - * we can race with another thread doing sys_setsid/sys_setpgid.
- - */
- -static inline struct pid *task_pgrp(struct task_struct *task)
- -{
- - return task->group_leader->pids[PIDTYPE_PGID].pid;
- -}
- -
- -static inline struct pid *task_session(struct task_struct *task)
- -{
- - return task->group_leader->pids[PIDTYPE_SID].pid;
- -}
- -
- -struct pid_namespace;
- -
- -/*
- - * the helpers to get the task's different pids as they are seen
- - * from various namespaces
- - *
- - * task_xid_nr() : global id, i.e. the id seen from the init namespace;
- - * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
- - * current.
- - * task_xid_nr_ns() : id seen from the ns specified;
- - *
- - * set_task_vxid() : assigns a virtual id to a task;
- - *
- - * see also pid_nr() etc in include/linux/pid.h
- - */
- -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
- - struct pid_namespace *ns);
- -
- -static inline pid_t task_pid_nr(struct task_struct *tsk)
- -{
- - return tsk->pid;
- -}
- -
- -static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
- - struct pid_namespace *ns)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
- -}
- -
- -static inline pid_t task_pid_vnr(struct task_struct *tsk)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
- -}
- -
- -
- -static inline pid_t task_tgid_nr(struct task_struct *tsk)
- -{
- - return tsk->tgid;
- -}
- -
- -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
- -
- -static inline pid_t task_tgid_vnr(struct task_struct *tsk)
- -{
- - return pid_vnr(task_tgid(tsk));
- -}
- -
- -
- -static inline int pid_alive(const struct task_struct *p);
- -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
- -{
- - pid_t pid = 0;
- -
- - rcu_read_lock();
- - if (pid_alive(tsk))
- - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
- - rcu_read_unlock();
- -
- - return pid;
- -}
- -
- -static inline pid_t task_ppid_nr(const struct task_struct *tsk)
- -{
- - return task_ppid_nr_ns(tsk, &init_pid_ns);
- -}
- -
- -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
- - struct pid_namespace *ns)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
- -}
- -
- -static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
- -}
- -
- -
- -static inline pid_t task_session_nr_ns(struct task_struct *tsk,
- - struct pid_namespace *ns)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
- -}
- -
- -static inline pid_t task_session_vnr(struct task_struct *tsk)
- -{
- - return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
- -}
- -
- -/* obsolete, do not use */
- -static inline pid_t task_pgrp_nr(struct task_struct *tsk)
- -{
- - return task_pgrp_nr_ns(tsk, &init_pid_ns);
- -}
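
The comment above distinguishes the global id (task_xid_nr), the id as seen from current's pid namespace (task_xid_vnr), and the id as seen from an explicitly given namespace (task_xid_nr_ns). A small kernel-side sketch of the first two, using a hypothetical report_ids() helper:

/* sketch of kernel code that already holds a valid task pointer */
static void report_ids(struct task_struct *p)
{
	pr_info("%s: pid %d in the init ns, pid %d in current's ns\n",
		p->comm,
		task_pid_nr(p),    /* global id */
		task_pid_vnr(p));  /* id relative to current's pid namespace */
}
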
- -
- -/**
- - * pid_alive - check that a task structure is not stale
- - * @p: Task structure to be checked.
- - *
- - * Test if a process is not yet dead (at most zombie state).
- - * If pid_alive fails, then pointers within the task structure
- - * can be stale and must not be dereferenced.
- - *
- - * Return: 1 if the process is alive. 0 otherwise.
- - */
- -static inline int pid_alive(const struct task_struct *p)
- -{
- - return p->pids[PIDTYPE_PID].pid != NULL;
- -}
- -
- -/**
- - * is_global_init - check if a task structure is init. Since init
- - * is free to have sub-threads we need to check tgid.
- - * @tsk: Task structure to be checked.
- - *
- - * Check if a task structure is the first user space task the kernel created.
- - *
- - * Return: 1 if the task structure is init. 0 otherwise.
- - */
- -static inline int is_global_init(struct task_struct *tsk)
- -{
- - return task_tgid_nr(tsk) == 1;
- -}
- -
- -extern struct pid *cad_pid;
- -
- -extern void free_task(struct task_struct *tsk);
- -#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
- -
- -extern void __put_task_struct(struct task_struct *t);
- -
- -static inline void put_task_struct(struct task_struct *t)
- -{
- - if (atomic_dec_and_test(&t->usage))
- - __put_task_struct(t);
- -}
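
get_task_struct()/put_task_struct() are the reference-count pair for keeping a task pointer alive past the locked region that produced it. A sketch of the usual pattern (keep_task()/drop_task() and the kept pointer are hypothetical names, not part of this header):

static struct task_struct *kept;

static void keep_task(struct task_struct *p)
{
	get_task_struct(p);        /* pin the task before stashing the pointer */
	kept = p;
}

static void drop_task(void)
{
	if (!kept)
		return;
	put_task_struct(kept);     /* may free the task_struct on the last put */
	kept = NULL;
}
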
- -
- -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- -extern void task_cputime(struct task_struct *t,
- - cputime_t *utime, cputime_t *stime);
- -extern void task_cputime_scaled(struct task_struct *t,
- - cputime_t *utimescaled, cputime_t *stimescaled);
- -extern cputime_t task_gtime(struct task_struct *t);
- -#else
- -static inline void task_cputime(struct task_struct *t,
- - cputime_t *utime, cputime_t *stime)
- -{
- - if (utime)
- - *utime = t->utime;
- - if (stime)
- - *stime = t->stime;
- -}
- -
- -static inline void task_cputime_scaled(struct task_struct *t,
- - cputime_t *utimescaled,
- - cputime_t *stimescaled)
- -{
- - if (utimescaled)
- - *utimescaled = t->utimescaled;
- - if (stimescaled)
- - *stimescaled = t->stimescaled;
- -}
- -
- -static inline cputime_t task_gtime(struct task_struct *t)
- -{
- - return t->gtime;
- -}
- -#endif
- -extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
- -extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
- -
- -/*
- - * Per process flags
- - */
- -#define PF_EXITING 0x00000004 /* getting shut down */
- -#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
- -#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
- -#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
- -#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
- -#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
- -#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
- -#define PF_DUMPCORE 0x00000200 /* dumped core */
- -#define PF_SIGNALED 0x00000400 /* killed by a signal */
- -#define PF_MEMALLOC 0x00000800 /* Allocating memory */
- -#define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */
- -#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
- -#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */
- -#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
- -#define PF_FROZEN 0x00010000 /* frozen for system suspend */
- -#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
- -#define PF_KSWAPD 0x00040000 /* I am kswapd */
- -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
- -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
- -#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
- -#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
- -#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
- -#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
- -#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
- -#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
- -#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
- -#define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */
- -
- -/*
- - * Only the _current_ task can read/write to tsk->flags, but other
- - * tasks can access tsk->flags in readonly mode for example
- - * with tsk_used_math (like during threaded core dumping).
- - * There is however an exception to this rule during ptrace
- - * or during fork: the ptracer task is allowed to write to the
- - * child->flags of its traced child (same goes for fork, the parent
- - * can write to the child->flags), because we're guaranteed the
- - * child is not running and in turn not changing child->flags
- - * at the same time the parent does it.
- - */
- -#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
- -#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
- -#define clear_used_math() clear_stopped_child_used_math(current)
- -#define set_used_math() set_stopped_child_used_math(current)
- -#define conditional_stopped_child_used_math(condition, child) \
- - do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
- -#define conditional_used_math(condition) \
- - conditional_stopped_child_used_math(condition, current)
- -#define copy_to_stopped_child_used_math(child) \
- - do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
- -/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
- -#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
- -#define used_math() tsk_used_math(current)
- -
- -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
- - * __GFP_FS is also cleared as it implies __GFP_IO.
- - */
- -static inline gfp_t memalloc_noio_flags(gfp_t flags)
- -{
- - if (unlikely(current->flags & PF_MEMALLOC_NOIO))
- - flags &= ~(__GFP_IO | __GFP_FS);
- - return flags;
- -}
- -
- -static inline unsigned int memalloc_noio_save(void)
- -{
- - unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
- - current->flags |= PF_MEMALLOC_NOIO;
- - return flags;
- -}
- -
- -static inline void memalloc_noio_restore(unsigned int flags)
- -{
- - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
- -}
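
memalloc_noio_save()/memalloc_noio_restore() bracket a region in which every allocation is filtered through memalloc_noio_flags() and therefore cannot start new I/O; this is how reclaim-sensitive paths (e.g. some block drivers) avoid recursing into themselves. A sketch under that assumption (the function name is hypothetical):

static void reclaim_sensitive_work(void)
{
	unsigned int noio_flags;
	void *buf;

	noio_flags = memalloc_noio_save();

	/* __GFP_IO/__GFP_FS are implicitly stripped from this allocation */
	buf = kmalloc(4096, GFP_KERNEL);
	kfree(buf);

	memalloc_noio_restore(noio_flags);
}
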
- -
- -/* Per-process atomic flags. */
- -#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */
- -#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */
- -#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */
- -
- -
- -#define TASK_PFA_TEST(name, func) \
- - static inline bool task_##func(struct task_struct *p) \
- - { return test_bit(PFA_##name, &p->atomic_flags); }
- -#define TASK_PFA_SET(name, func) \
- - static inline void task_set_##func(struct task_struct *p) \
- - { set_bit(PFA_##name, &p->atomic_flags); }
- -#define TASK_PFA_CLEAR(name, func) \
- - static inline void task_clear_##func(struct task_struct *p) \
- - { clear_bit(PFA_##name, &p->atomic_flags); }
- -
- -TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
- -TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)
- -
- -TASK_PFA_TEST(SPREAD_PAGE, spread_page)
- -TASK_PFA_SET(SPREAD_PAGE, spread_page)
- -TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)
- -
- -TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
- -TASK_PFA_SET(SPREAD_SLAB, spread_slab)
- -TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
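
The three TASK_PFA_* macros above stamp out one tiny accessor per atomic flag. For example, TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) expands (modulo whitespace) to:

static inline bool task_no_new_privs(struct task_struct *p)
{
	return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags);
}

Note that NO_NEW_PRIVS deliberately gets no CLEAR accessor above, so the flag can only ever be set, never cleared.
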
- -
- -/*
- - * task->jobctl flags
- - */
- -#define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */
- -
- -#define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */
- -#define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */
- -#define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */
- -#define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */
- -#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
- -#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
- -#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
- -
- -#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
- -#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
- -#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
- -#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT)
- -#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT)
- -#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
- -#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
- -
- -#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
- -#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
- -
- -extern bool task_set_jobctl_pending(struct task_struct *task,
- - unsigned long mask);
- -extern void task_clear_jobctl_trapping(struct task_struct *task);
- -extern void task_clear_jobctl_pending(struct task_struct *task,
- - unsigned long mask);
- -
- -static inline void rcu_copy_process(struct task_struct *p)
- -{
- -#ifdef CONFIG_PREEMPT_RCU
- - p->rcu_read_lock_nesting = 0;
- - p->rcu_read_unlock_special.s = 0;
- - p->rcu_blocked_node = NULL;
- - INIT_LIST_HEAD(&p->rcu_node_entry);
- -#endif /* #ifdef CONFIG_PREEMPT_RCU */
- -#ifdef CONFIG_TASKS_RCU
- - p->rcu_tasks_holdout = false;
- - INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
- - p->rcu_tasks_idle_cpu = -1;
- -#endif /* #ifdef CONFIG_TASKS_RCU */
- -}
- -
- -static inline void tsk_restore_flags(struct task_struct *task,
- - unsigned long orig_flags, unsigned long flags)
- -{
- - task->flags &= ~flags;
- - task->flags |= orig_flags & flags;
- -}
- -
- -extern int cpuset_cpumask_can_shrink(const struct cpumask *cur,
- - const struct cpumask *trial);
- -extern int task_can_attach(struct task_struct *p,
- - const struct cpumask *cs_cpus_allowed);
- -#ifdef CONFIG_SMP
- -extern void do_set_cpus_allowed(struct task_struct *p,
- - const struct cpumask *new_mask);
- -
- -extern int set_cpus_allowed_ptr(struct task_struct *p,
- - const struct cpumask *new_mask);
- -#else
- -static inline void do_set_cpus_allowed(struct task_struct *p,
- - const struct cpumask *new_mask)
- -{
- -}
- -static inline int set_cpus_allowed_ptr(struct task_struct *p,
- - const struct cpumask *new_mask)
- -{
- - if (!cpumask_test_cpu(0, new_mask))
- - return -EINVAL;
- - return 0;
- -}
- -#endif
- -
- -#ifdef CONFIG_NO_HZ_COMMON
- -void calc_load_enter_idle(void);
- -void calc_load_exit_idle(void);
- -#else
- -static inline void calc_load_enter_idle(void) { }
- -static inline void calc_load_exit_idle(void) { }
- -#endif /* CONFIG_NO_HZ_COMMON */
- -
- -/*
- - * Do not use outside of architecture code which knows its limitations.
- - *
- - * sched_clock() has no promise of monotonicity or bounded drift between
- - * CPUs; using it (which you should not) requires disabling IRQs.
- - *
- - * Please use one of the three interfaces below.
- - */
- -extern unsigned long long notrace sched_clock(void);
- -/*
- - * See the comment in kernel/sched/clock.c
- - */
- -extern u64 cpu_clock(int cpu);
- -extern u64 local_clock(void);
- -extern u64 running_clock(void);
- -extern u64 sched_clock_cpu(int cpu);
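
Per the comment above, raw sched_clock() is reserved for architecture code; generic code should use local_clock()/cpu_clock(), which return nanoseconds that are safe to difference on the same CPU. A kernel-side sketch of timing a section with local_clock() (do_the_work() is a hypothetical workload):

static void time_the_work(void)
{
	u64 t0, t1;

	t0 = local_clock();
	do_the_work();                    /* hypothetical workload */
	t1 = local_clock();

	pr_info("work took %llu ns\n", (unsigned long long)(t1 - t0));
}
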
- -
- -
- -extern void sched_clock_init(void);
- -
- -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- -static inline void sched_clock_tick(void)
- -{
- -}
- -
- -static inline void sched_clock_idle_sleep_event(void)
- -{
- -}
- -
- -static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
- -{
- -}
- -#else
- -/*
- - * Architectures can set this to 1 if they have specified
- - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- - * but then during bootup it turns out that sched_clock()
- - * is reliable after all:
- - */
- -extern int sched_clock_stable(void);
- -extern void set_sched_clock_stable(void);
- -extern void clear_sched_clock_stable(void);
- -
- -extern void sched_clock_tick(void);
- -extern void sched_clock_idle_sleep_event(void);
- -extern void sched_clock_idle_wakeup_event(u64 delta_ns);
- -#endif
- -
- -#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- -/*
- - * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
- - * The reason for this explicit opt-in is not to have perf penalty with
- - * slow sched_clocks.
- - */
- -extern void enable_sched_clock_irqtime(void);
- -extern void disable_sched_clock_irqtime(void);
- -#else
- -static inline void enable_sched_clock_irqtime(void) {}
- -static inline void disable_sched_clock_irqtime(void) {}
- -#endif
- -
- -extern unsigned long long
- -task_sched_runtime(struct task_struct *task);
- -
- -/* sched_exec is called by processes performing an exec */
- -#ifdef CONFIG_SMP
- -extern void sched_exec(void);
- -#else
- -#define sched_exec() {}
- -#endif
- -
- -extern void sched_clock_idle_sleep_event(void);
- -extern void sched_clock_idle_wakeup_event(u64 delta_ns);
- -
- -#ifdef CONFIG_HOTPLUG_CPU
- -extern void idle_task_exit(void);
- -#else
- -static inline void idle_task_exit(void) {}
- -#endif
- -
- -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
- -extern void wake_up_nohz_cpu(int cpu);
- -#else
- -static inline void wake_up_nohz_cpu(int cpu) { }
- -#endif
- -
- -#ifdef CONFIG_NO_HZ_FULL
- -extern bool sched_can_stop_tick(void);
- -extern u64 scheduler_tick_max_deferment(void);
- -#else
- -static inline bool sched_can_stop_tick(void) { return false; }
- -#endif
- -
- -#ifdef CONFIG_SCHED_AUTOGROUP
- -extern void sched_autogroup_create_attach(struct task_struct *p);
- -extern void sched_autogroup_detach(struct task_struct *p);
- -extern void sched_autogroup_fork(struct signal_struct *sig);
- -extern void sched_autogroup_exit(struct signal_struct *sig);
- -#ifdef CONFIG_PROC_FS
- -extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
- -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
- -#endif
- -#else
- -static inline void sched_autogroup_create_attach(struct task_struct *p) { }
- -static inline void sched_autogroup_detach(struct task_struct *p) { }
- -static inline void sched_autogroup_fork(struct signal_struct *sig) { }
- -static inline void sched_autogroup_exit(struct signal_struct *sig) { }
- -#endif
- -
- -extern int yield_to(struct task_struct *p, bool preempt);
- -extern void set_user_nice(struct task_struct *p, long nice);
- -extern int task_prio(const struct task_struct *p);
- -/**
- - * task_nice - return the nice value of a given task.
- - * @p: the task in question.
- - *
- - * Return: The nice value [ -20 ... 0 ... 19 ].
- - */
- -static inline int task_nice(const struct task_struct *p)
- -{
- - return PRIO_TO_NICE((p)->static_prio);
- -}
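
task_nice() is just the inverse of the nice-to-priority mapping: with the stock MAX_RT_PRIO of 100, nice n maps to static_prio = 100 + 20 + n, so nice 0 is priority 120 and nice -20 is 100. A standalone round-trip check under those assumed constants:

#include <stdio.h>

#define MAX_RT_PRIO     100                        /* assumed stock value */
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
#define PRIO_TO_NICE(p) ((p) - MAX_RT_PRIO - 20)

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice++)
		if (PRIO_TO_NICE(NICE_TO_PRIO(nice)) != nice)
			printf("round-trip mismatch at nice %d\n", nice);

	printf("nice 0 -> static_prio %d\n", NICE_TO_PRIO(0));   /* 120 */
	return 0;
}
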
- -extern int can_nice(const struct task_struct *p, const int nice);
- -extern int task_curr(const struct task_struct *p);
- -extern int idle_cpu(int cpu);
- -extern int sched_setscheduler(struct task_struct *, int,
- - const struct sched_param *);
- -extern int sched_setscheduler_nocheck(struct task_struct *, int,
- - const struct sched_param *);
- -extern int sched_setattr(struct task_struct *,
- - const struct sched_attr *);
- -extern struct task_struct *idle_task(int cpu);
- -/**
- - * is_idle_task - is the specified task an idle task?
- - * @p: the task in question.
- - *
- - * Return: 1 if @p is an idle task. 0 otherwise.
- - */
- -static inline bool is_idle_task(const struct task_struct *p)
- -{
- - return p->pid == 0;
- -}
- -extern struct task_struct *curr_task(int cpu);
- -extern void set_curr_task(int cpu, struct task_struct *p);
- -
- -void yield(void);
- -
- -union thread_union {
- - struct thread_info thread_info;
- - unsigned long stack[THREAD_SIZE/sizeof(long)];
- -};
- -
- -#ifndef __HAVE_ARCH_KSTACK_END
- -static inline int kstack_end(void *addr)
- -{
- - /* Reliable end of stack detection:
- - * Some APM bios versions misalign the stack
- - */
- - return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
- -}
- -#endif
- -
- -extern union thread_union init_thread_union;
- -extern struct task_struct init_task;
- -
- -extern struct mm_struct init_mm;
- -
- -extern struct pid_namespace init_pid_ns;
- -
- -/*
- - * find a task by one of its numerical ids
- - *
- - * find_task_by_pid_ns():
- - * finds a task by its pid in the specified namespace
- - * find_task_by_vpid():
- - * finds a task by its virtual pid
- - *
- - * see also find_vpid() etc in include/linux/pid.h
- - */
- -
- -extern struct task_struct *find_task_by_vpid(pid_t nr);
- -extern struct task_struct *find_task_by_pid_ns(pid_t nr,
- - struct pid_namespace *ns);
- -
- -/* per-UID process charging. */
- -extern struct user_struct * alloc_uid(kuid_t);
- -static inline struct user_struct *get_uid(struct user_struct *u)
- -{
- - atomic_inc(&u->__count);
- - return u;
- -}
- -extern void free_uid(struct user_struct *);
- -
- -#include <asm/current.h>
- -
- -extern void xtime_update(unsigned long ticks);
- -
- -extern int wake_up_state(struct task_struct *tsk, unsigned int state);
- -extern int wake_up_process(struct task_struct *tsk);
- -extern void wake_up_new_task(struct task_struct *tsk);
- -#ifdef CONFIG_SMP
- - extern void kick_process(struct task_struct *tsk);
- -#else
- - static inline void kick_process(struct task_struct *tsk) { }
- -#endif
- -extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
- -extern void sched_dead(struct task_struct *p);
- -
- -extern void proc_caches_init(void);
- -extern void flush_signals(struct task_struct *);
- -extern void ignore_signals(struct task_struct *);
- -extern void flush_signal_handlers(struct task_struct *, int force_default);
- -extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
- -
- -static inline int kernel_dequeue_signal(siginfo_t *info)
- -{
- - struct task_struct *tsk = current;
- - siginfo_t __info;
- - int ret;
- -
- - spin_lock_irq(&tsk->sighand->siglock);
- - ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
- - spin_unlock_irq(&tsk->sighand->siglock);
- -
- - return ret;
- -}
- -
- -static inline void kernel_signal_stop(void)
- -{
- - spin_lock_irq(&current->sighand->siglock);
- - if (current->jobctl & JOBCTL_STOP_DEQUEUED)
- - __set_current_state(TASK_STOPPED);
- - spin_unlock_irq(&current->sighand->siglock);
- -
- - schedule();
- -}
- -
- -extern void release_task(struct task_struct * p);
- -extern int send_sig_info(int, struct siginfo *, struct task_struct *);
- -extern int force_sigsegv(int, struct task_struct *);
- -extern int force_sig_info(int, struct siginfo *, struct task_struct *);
- -extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
- -extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
- -extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
- - const struct cred *, u32);
- -extern int kill_pgrp(struct pid *pid, int sig, int priv);
- -extern int kill_pid(struct pid *pid, int sig, int priv);
- -extern int kill_proc_info(int, struct siginfo *, pid_t);
- -extern __must_check bool do_notify_parent(struct task_struct *, int);
- -extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
- -extern void force_sig(int, struct task_struct *);
- -extern int send_sig(int, struct task_struct *, int);
- -extern int zap_other_threads(struct task_struct *p);
- -extern struct sigqueue *sigqueue_alloc(void);
- -extern void sigqueue_free(struct sigqueue *);
- -extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
- -extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
- -
- -static inline void restore_saved_sigmask(void)
- -{
- - if (test_and_clear_restore_sigmask())
- - __set_current_blocked(&current->saved_sigmask);
- -}
- -
- -static inline sigset_t *sigmask_to_save(void)
- -{
- - sigset_t *res = &current->blocked;
- - if (unlikely(test_restore_sigmask()))
- - res = &current->saved_sigmask;
- - return res;
- -}
- -
- -static inline int kill_cad_pid(int sig, int priv)
- -{
- - return kill_pid(cad_pid, sig, priv);
- -}
- -
- -/* These can be the second arg to send_sig_info/send_group_sig_info. */
- -#define SEND_SIG_NOINFO ((struct siginfo *) 0)
- -#define SEND_SIG_PRIV ((struct siginfo *) 1)
- -#define SEND_SIG_FORCED ((struct siginfo *) 2)
- -
- -/*
- - * True if we are on the alternate signal stack.
- - */
- -static inline int on_sig_stack(unsigned long sp)
- -{
- -#ifdef CONFIG_STACK_GROWSUP
- - return sp >= current->sas_ss_sp &&
- - sp - current->sas_ss_sp < current->sas_ss_size;
- -#else
- - return sp > current->sas_ss_sp &&
- - sp - current->sas_ss_sp <= current->sas_ss_size;
- -#endif
- -}
- -
- -static inline int sas_ss_flags(unsigned long sp)
- -{
- - if (!current->sas_ss_size)
- - return SS_DISABLE;
- -
- - return on_sig_stack(sp) ? SS_ONSTACK : 0;
- -}
- -
- -static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
- -{
- - if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
- -#ifdef CONFIG_STACK_GROWSUP
- - return current->sas_ss_sp;
- -#else
- - return current->sas_ss_sp + current->sas_ss_size;
- -#endif
- - return sp;
- -}
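
sigsp() only picks the alternate stack when the handler was registered with SA_ONSTACK and the task is not already running on it; the alternate stack itself comes from userspace via sigaltstack(2). A minimal userspace sketch of that setup:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static void handler(int sig)
{
	/* runs on the alternate stack because of SA_ONSTACK */
	(void)sig;
}

int main(void)
{
	stack_t ss = {
		.ss_sp    = malloc(SIGSTKSZ),
		.ss_size  = SIGSTKSZ,
		.ss_flags = 0,
	};
	struct sigaction sa = {
		.sa_handler = handler,
		.sa_flags   = SA_ONSTACK,
	};

	if (!ss.ss_sp || sigaltstack(&ss, NULL) || sigaction(SIGUSR1, &sa, NULL)) {
		perror("setup");
		return 1;
	}
	raise(SIGUSR1);     /* handler entered with sp on the alternate stack */
	return 0;
}
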
- -
- -/*
- - * Routines for handling mm_structs
- - */
- -extern struct mm_struct * mm_alloc(void);
- -
- -/* mmdrop drops the mm and the page tables */
- -extern void __mmdrop(struct mm_struct *);
- -static inline void mmdrop(struct mm_struct * mm)
- -{
- - if (unlikely(atomic_dec_and_test(&mm->mm_count)))
- - __mmdrop(mm);
- -}
- -
- -/* mmput gets rid of the mappings and all user-space */
- -extern void mmput(struct mm_struct *);
- -/* Grab a reference to a task's mm, if it is not already going away */
- -extern struct mm_struct *get_task_mm(struct task_struct *task);
- -/*
- - * Grab a reference to a task's mm, if it is not already going away
- - * and ptrace_may_access with the mode parameter passed to it
- - * succeeds.
- - */
- -extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
- -/* Remove the current tasks stale references to the old mm_struct */
- -extern void mm_release(struct task_struct *, struct mm_struct *);
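
get_task_mm() takes a reference on the task's mm only if the task still has one (kernel threads get NULL), and the caller must drop it with mmput(). The usual pattern, as a sketch (inspect_mm() is hypothetical, and reading map_count without mmap_sem is only acceptable for a debug print):

static void inspect_mm(struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);

	if (!mm)
		return;                 /* kernel thread, or mm already gone */

	pr_info("%s: %d VMAs\n", task->comm, mm->map_count);
	mmput(mm);                      /* drop the reference taken above */
}
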
- -
- -#ifdef CONFIG_HAVE_COPY_THREAD_TLS
- -extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
- - struct task_struct *, unsigned long);
- -#else
- -extern int copy_thread(unsigned long, unsigned long, unsigned long,
- - struct task_struct *);
- -
- -/* Architectures that haven't opted into copy_thread_tls get the tls argument
- - * via pt_regs, so ignore the tls argument passed via C. */
- -static inline int copy_thread_tls(
- - unsigned long clone_flags, unsigned long sp, unsigned long arg,
- - struct task_struct *p, unsigned long tls)
- -{
- - return copy_thread(clone_flags, sp, arg, p);
- -}
- -#endif
- -extern void flush_thread(void);
- -extern void exit_thread(void);
- -
- -extern void exit_files(struct task_struct *);
- -extern void __cleanup_sighand(struct sighand_struct *);
- -
- -extern void exit_itimers(struct signal_struct *);
- -extern void flush_itimer_signals(void);
- -
- -extern void do_group_exit(int);
- -
- -extern int do_execve(struct filename *,
- - const char __user * const __user *,
- - const char __user * const __user *);
- -extern int do_execveat(int, struct filename *,
- - const char __user * const __user *,
- - const char __user * const __user *,
- - int);
- -extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
- -extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
- -struct task_struct *fork_idle(int);
- -extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
- -
- -extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
- -static inline void set_task_comm(struct task_struct *tsk, const char *from)
- -{
- - __set_task_comm(tsk, from, false);
- -}
- -extern char *get_task_comm(char *to, struct task_struct *tsk);
- -
- -#ifdef CONFIG_SMP
- -void scheduler_ipi(void);
- -extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
- -#else
- -static inline void scheduler_ipi(void) { }
- -static inline unsigned long wait_task_inactive(struct task_struct *p,
- - long match_state)
- -{
- - return 1;
- -}
- -#endif
- -
- -#define tasklist_empty() \
- - list_empty(&init_task.tasks)
- -
- -#define next_task(p) \
- - list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
- -
- -#define for_each_process(p) \
- - for (p = &init_task ; (p = next_task(p)) != &init_task ; )
- -
- -extern bool current_is_single_threaded(void);
- -
- -/*
- - * Careful: do_each_thread/while_each_thread is a double loop so
- - * 'break' will not work as expected - use goto instead.
- - */
- -#define do_each_thread(g, t) \
- - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
- -
- -#define while_each_thread(g, t) \
- - while ((t = next_thread(t)) != g)
- -
- -#define __for_each_thread(signal, t) \
- - list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
- -
- -#define for_each_thread(p, t) \
- - __for_each_thread((p)->signal, t)
- -
- -/* Careful: this is a double loop, 'break' won't work as expected. */
- -#define for_each_process_thread(p, t) \
- - for_each_process(p) for_each_thread(p, t)
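
These iterators walk RCU-protected lists, so a reader must hold rcu_read_lock() (or tasklist_lock) for the duration of the walk; and as the comments warn, the do_each_thread/while_each_thread form is a double loop in which break does not exit cleanly. A sketch of walking every thread (the counting function is hypothetical):

static int count_running_threads(void)
{
	struct task_struct *p, *t;
	int running = 0;

	rcu_read_lock();
	for_each_process_thread(p, t)
		if (t->state == TASK_RUNNING)
			running++;
	rcu_read_unlock();

	return running;
}
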
- -
- -static inline int get_nr_threads(struct task_struct *tsk)
- -{
- - return tsk->signal->nr_threads;
- -}
- -
- -static inline bool thread_group_leader(struct task_struct *p)
- -{
- - return p->exit_signal >= 0;
- -}
- -
- -/* Due to the insanities of de_thread it is possible for a process
- - * to have the pid of the thread group leader without actually being
- - * the thread group leader. For iteration through the pids in proc
- - * all we care about is that we have a task with the appropriate
- - * pid, we don't actually care if we have the right task.
- - */
- -static inline bool has_group_leader_pid(struct task_struct *p)
- -{
- - return task_pid(p) == p->signal->leader_pid;
- -}
- -
- -static inline
- -bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
- -{
- - return p1->signal == p2->signal;
- -}
- -
- -static inline struct task_struct *next_thread(const struct task_struct *p)
- -{
- - return list_entry_rcu(p->thread_group.next,
- - struct task_struct, thread_group);
- -}
- -
- -static inline int thread_group_empty(struct task_struct *p)
- -{
- - return list_empty(&p->thread_group);
- -}
- -
- -#define delay_group_leader(p) \
- - (thread_group_leader(p) && !thread_group_empty(p))
- -
- -/*
- - * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
- - * subscriptions and synchronises with wait4(). Also used in procfs. Also
- - * pins the final release of task.io_context. Also protects ->cpuset and
- - * ->cgroup.subsys[]. And ->vfork_done.
- - *
- - * Nests both inside and outside of read_lock(&tasklist_lock).
- - * It must not be nested with write_lock_irq(&tasklist_lock),
- - * neither inside nor outside.
- - */
- -static inline void task_lock(struct task_struct *p)
- -{
- - spin_lock(&p->alloc_lock);
- -}
- -
- -static inline void task_unlock(struct task_struct *p)
- -{
- - spin_unlock(&p->alloc_lock);
- -}
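
task_lock() takes the alloc_lock documented above; the classic use is stabilizing ->comm (exactly what get_task_comm() does) or ->fs/->files while they are read. A sketch under that assumption (print_comm() is a hypothetical helper):

static void print_comm(struct task_struct *p)
{
	char name[TASK_COMM_LEN];

	task_lock(p);                   /* keeps p->comm from changing under us */
	strncpy(name, p->comm, sizeof(name));
	name[sizeof(name) - 1] = '\0';
	task_unlock(p);

	pr_info("task name: %s\n", name);
}
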
- -
- -extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
- - unsigned long *flags);
- -
- -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
- - unsigned long *flags)
- -{
- - struct sighand_struct *ret;
- -
- - ret = __lock_task_sighand(tsk, flags);
- - (void)__cond_lock(&tsk->sighand->siglock, ret);
- - return ret;
- -}
- -
- -static inline void unlock_task_sighand(struct task_struct *tsk,
- - unsigned long *flags)
- -{
- - spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
- -}
- -
- -/**
- - * threadgroup_change_begin - mark the beginning of changes to a threadgroup
- - * @tsk: task causing the changes
- - *
- - * All operations which modify a threadgroup - a new thread joining the
- - * group, death of a member thread (the assertion of PF_EXITING) and
- - * exec(2) dethreading the process and replacing the leader - are wrapped
- - * by threadgroup_change_{begin|end}(). This is to provide a place which
- - * subsystems needing threadgroup stability can hook into for
- - * synchronization.
- - */
- -static inline void threadgroup_change_begin(struct task_struct *tsk)
- -{
- - might_sleep();
- - cgroup_threadgroup_change_begin(tsk);
- -}
- -
- -/**
- - * threadgroup_change_end - mark the end of changes to a threadgroup
- - * @tsk: task causing the changes
- - *
- - * See threadgroup_change_begin().
- - */
- -static inline void threadgroup_change_end(struct task_struct *tsk)
- -{
- - cgroup_threadgroup_change_end(tsk);
- -}
- -
- -#ifndef __HAVE_THREAD_FUNCTIONS
- -
- -#define task_thread_info(task) ((struct thread_info *)(task)->stack)
- -#define task_stack_page(task) ((task)->stack)
- -
- -static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
- -{
- - *task_thread_info(p) = *task_thread_info(org);
- - task_thread_info(p)->task = p;
- -}
- -
- -/*
- - * Return the address of the last usable long on the stack.
- - *
- - * When the stack grows down, this is just above the thread
- - * info struct. Going any lower will corrupt the threadinfo.
- - *
- - * When the stack grows up, this is the highest address.
- - * Beyond that position, we corrupt data on the next page.
- - */
- -static inline unsigned long *end_of_stack(struct task_struct *p)
- -{
- -#ifdef CONFIG_STACK_GROWSUP
- - return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
- -#else
- - return (unsigned long *)(task_thread_info(p) + 1);
- -#endif
- -}
- -
- -#endif
- -#define task_stack_end_corrupted(task) \
- - (*(end_of_stack(task)) != STACK_END_MAGIC)
- -
- -static inline int object_is_on_stack(void *obj)
- -{
- - void *stack = task_stack_page(current);
- -
- - return (obj >= stack) && (obj < (stack + THREAD_SIZE));
- -}
- -
- -extern void thread_info_cache_init(void);
- -
- -#ifdef CONFIG_DEBUG_STACK_USAGE
- -static inline unsigned long stack_not_used(struct task_struct *p)
- -{
- - unsigned long *n = end_of_stack(p);
- -
- - do { /* Skip over canary */
- - n++;
- - } while (!*n);
- -
- - return (unsigned long)n - (unsigned long)end_of_stack(p);
- -}
- -#endif
- -extern void set_task_stack_end_magic(struct task_struct *tsk);
- -
- -/* set thread flags in other task's structures
- - * - see asm/thread_info.h for TIF_xxxx flags available
- - */
- -static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - set_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - clear_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
- -{
- - return test_ti_thread_flag(task_thread_info(tsk), flag);
- -}
- -
- -static inline void set_tsk_need_resched(struct task_struct *tsk)
- -{
- - set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
- -}
- -
- -static inline void clear_tsk_need_resched(struct task_struct *tsk)
- -{
- - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
- -}
- -
- -static inline int test_tsk_need_resched(struct task_struct *tsk)
- -{
- - return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
- -}
- -
- -static inline int restart_syscall(void)
- -{
- - set_tsk_thread_flag(current, TIF_SIGPENDING);
- - return -ERESTARTNOINTR;
- -}
- -
- -static inline int signal_pending(struct task_struct *p)
- -{
- - return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
- -}
- -
- -static inline int __fatal_signal_pending(struct task_struct *p)
- -{
- - return unlikely(sigismember(&p->pending.signal, SIGKILL));
- -}
- -
- -static inline int fatal_signal_pending(struct task_struct *p)
- -{
- - return signal_pending(p) && __fatal_signal_pending(p);
- -}
- -
- -static inline int signal_pending_state(long state, struct task_struct *p)
- -{
- - if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
- - return 0;
- - if (!signal_pending(p))
- - return 0;
- -
- - return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
- -}
- -
- -/*
- - * cond_resched() and cond_resched_lock(): latency reduction via
- - * explicit rescheduling in places that are safe. The return
- - * value indicates whether a reschedule was done in fact.
- - * cond_resched_lock() will drop the spinlock before scheduling,
- - * cond_resched_softirq() will enable bhs before scheduling.
- - */
- -extern int _cond_resched(void);
- -
- -#define cond_resched() ({ \
- - ___might_sleep(__FILE__, __LINE__, 0); \
- - _cond_resched(); \
- -})
- -
- -extern int __cond_resched_lock(spinlock_t *lock);
- -
- -#define cond_resched_lock(lock) ({ \
- - ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
- - __cond_resched_lock(lock); \
- -})
- -
- -extern int __cond_resched_softirq(void);
- -
- -#define cond_resched_softirq() ({ \
- - ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
- - __cond_resched_softirq(); \
- -})
- -
- -static inline void cond_resched_rcu(void)
- -{
- -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
- - rcu_read_unlock();
- - cond_resched();
- - rcu_read_lock();
- -#endif
- -}
- -
- -/*
- - * Does a critical section need to be broken due to another
- - * task waiting?: (technically does not depend on CONFIG_PREEMPT,
- - * but a general need for low latency)
- - */
- -static inline int spin_needbreak(spinlock_t *lock)
- -{
- -#ifdef CONFIG_PREEMPT
- - return spin_is_contended(lock);
- -#else
- - return 0;
- -#endif
- -}
- -
- -/*
- - * Idle thread specific functions to determine the need_resched
- - * polling state.
- - */
- -#ifdef TIF_POLLING_NRFLAG
- -static inline int tsk_is_polling(struct task_struct *p)
- -{
- - return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
- -}
- -
- -static inline void __current_set_polling(void)
- -{
- - set_thread_flag(TIF_POLLING_NRFLAG);
- -}
- -
- -static inline bool __must_check current_set_polling_and_test(void)
- -{
- - __current_set_polling();
- -
- - /*
- - * Polling state must be visible before we test NEED_RESCHED,
- - * paired by resched_curr()
- - */
- - smp_mb__after_atomic();
- -
- - return unlikely(tif_need_resched());
- -}
- -
- -static inline void __current_clr_polling(void)
- -{
- - clear_thread_flag(TIF_POLLING_NRFLAG);
- -}
- -
- -static inline bool __must_check current_clr_polling_and_test(void)
- -{
- - __current_clr_polling();
- -
- - /*
- - * Polling state must be visible before we test NEED_RESCHED,
- - * paired by resched_curr()
- - */
- - smp_mb__after_atomic();
- -
- - return unlikely(tif_need_resched());
- -}
- -
- -#else
- -static inline int tsk_is_polling(struct task_struct *p) { return 0; }
- -static inline void __current_set_polling(void) { }
- -static inline void __current_clr_polling(void) { }
- -
- -static inline bool __must_check current_set_polling_and_test(void)
- -{
- - return unlikely(tif_need_resched());
- -}
- -static inline bool __must_check current_clr_polling_and_test(void)
- -{
- - return unlikely(tif_need_resched());
- -}
- -#endif
- -
- -static inline void current_clr_polling(void)
- -{
- - __current_clr_polling();
- -
- - /*
- - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
- - * Once the bit is cleared, we'll get IPIs with every new
- - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
- - * fold.
- - */
- - smp_mb(); /* paired with resched_curr() */
- -
- - preempt_fold_need_resched();
- -}
- -
- -static __always_inline bool need_resched(void)
- -{
- - return unlikely(tif_need_resched());
- -}
- -
- -/*
- - * Thread group CPU time accounting.
- - */
- -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
- -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
- -
- -/*
- - * Reevaluate whether the task has signals pending delivery.
- - * Wake the task if so.
- - * This is required every time the blocked sigset_t changes.
- - * callers must hold sighand->siglock.
- - */
- -extern void recalc_sigpending_and_wake(struct task_struct *t);
- -extern void recalc_sigpending(void);
- -
- -extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
- -
- -static inline void signal_wake_up(struct task_struct *t, bool resume)
- -{
- - signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
- -}
- -static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
- -{
- - signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
- -}
- -
- -/*
- - * Wrappers for p->thread_info->cpu access. No-op on UP.
- - */
- -#ifdef CONFIG_SMP
- -
- -static inline unsigned int task_cpu(const struct task_struct *p)
- -{
- - return task_thread_info(p)->cpu;
- -}
- -
- -static inline int task_node(const struct task_struct *p)
- -{
- - return cpu_to_node(task_cpu(p));
- -}
- -
- -extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
- -
- -#else
- -
- -static inline unsigned int task_cpu(const struct task_struct *p)
- -{
- - return 0;
- -}
- -
- -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
- -{
- -}
- -
- -#endif /* CONFIG_SMP */
- -
- -extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
- -extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
- -
- -#ifdef CONFIG_CGROUP_SCHED
- -extern struct task_group root_task_group;
- -#endif /* CONFIG_CGROUP_SCHED */
- -
- -extern int task_can_switch_user(struct user_struct *up,
- - struct task_struct *tsk);
- -
- -#ifdef CONFIG_TASK_XACCT
- -static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
- -{
- - tsk->ioac.rchar += amt;
- -}
- -
- -static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
- -{
- - tsk->ioac.wchar += amt;
- -}
- -
- -static inline void inc_syscr(struct task_struct *tsk)
- -{
- - tsk->ioac.syscr++;
- -}
- -
- -static inline void inc_syscw(struct task_struct *tsk)
- -{
- - tsk->ioac.syscw++;
- -}
- -#else
- -static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
- -{
- -}
- -
- -static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
- -{
- -}
- -
- -static inline void inc_syscr(struct task_struct *tsk)
- -{
- -}
- -
- -static inline void inc_syscw(struct task_struct *tsk)
- -{
- -}
- -#endif
- -
- -#ifndef TASK_SIZE_OF
- -#define TASK_SIZE_OF(tsk) TASK_SIZE
- -#endif
- -
- -#ifdef CONFIG_MEMCG
- -extern void mm_update_next_owner(struct mm_struct *mm);
- -#else
- -static inline void mm_update_next_owner(struct mm_struct *mm)
- -{
- -}
- -#endif /* CONFIG_MEMCG */
- -
- -static inline unsigned long task_rlimit(const struct task_struct *tsk,
- - unsigned int limit)
- -{
- - return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
- -}
- -
- -static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
- - unsigned int limit)
- -{
- - return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
- -}
- -
- -static inline unsigned long rlimit(unsigned int limit)
- -{
- - return task_rlimit(current, limit);
- -}
- -
- -static inline unsigned long rlimit_max(unsigned int limit)
- -{
- - return task_rlimit_max(current, limit);
- -}
- -
- -#endif
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/wbt.h linux-4.4.6-gentoo-patched/include/linux/wbt.h
- --- linux-4.4.6-gentoo-orig/include/linux/wbt.h 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/wbt.h 2016-05-04 11:03:27.411730745 +0300
- @@ -0,0 +1,95 @@
- +#ifndef WB_THROTTLE_H
- +#define WB_THROTTLE_H
- +
- +#include <linux/atomic.h>
- +#include <linux/wait.h>
- +#include <linux/timer.h>
- +#include <linux/ktime.h>
- +
- +#define ISSUE_STAT_MASK (1ULL << 63)
- +#define ISSUE_STAT_TIME_MASK ~ISSUE_STAT_MASK
- +
- +struct wb_issue_stat {
- + u64 time;
- +};
- +
- +static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
- +{
- + stat->time = (stat->time & ISSUE_STAT_MASK) |
- + (ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
- +}
- +
- +static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
- +{
- + return stat->time & ISSUE_STAT_TIME_MASK;
- +}
- +
- +static inline void wbt_mark_tracked(struct wb_issue_stat *stat)
- +{
- + stat->time |= ISSUE_STAT_MASK;
- +}
- +
- +static inline void wbt_clear_tracked(struct wb_issue_stat *stat)
- +{
- + stat->time &= ~ISSUE_STAT_MASK;
- +}
- +
- +static inline bool wbt_tracked(struct wb_issue_stat *stat)
- +{
- + return (stat->time & ISSUE_STAT_MASK) != 0;
- +}
- +
- +struct wb_stat_ops {
- + void (*get)(void *, struct blk_rq_stat *);
- + void (*clear)(void *);
- +};
- +
- +struct rq_wb {
- + /*
- + * Settings that govern how we throttle
- + */
- + unsigned int wb_background; /* background writeback */
- + unsigned int wb_normal; /* normal writeback */
- + unsigned int wb_max; /* max throughput writeback */
- + unsigned int scale_step;
- +
- + u64 win_nsec; /* default window size */
- + u64 cur_win_nsec; /* current window size */
- +
- + unsigned int unknown_cnt;
- +
- + struct timer_list window_timer;
- +
- + s64 sync_issue;
- + void *sync_cookie;
- +
- + unsigned int wc;
- + unsigned int queue_depth;
- +
- + unsigned long last_issue; /* last non-throttled issue */
- + unsigned long last_comp; /* last non-throttled comp */
- + unsigned long min_lat_nsec;
- + struct backing_dev_info *bdi;
- + struct request_queue *q;
- + wait_queue_head_t wait;
- + atomic_t inflight;
- +
- + struct wb_stat_ops *stat_ops;
- + void *ops_data;
- +};
- +
- +struct backing_dev_info;
- +
- +void __wbt_done(struct rq_wb *);
- +void wbt_done(struct rq_wb *, struct wb_issue_stat *);
- +bool wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
- +struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
- +void wbt_exit(struct rq_wb *);
- +void wbt_update_limits(struct rq_wb *);
- +void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
- +void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
- +
- +void wbt_set_queue_depth(struct rq_wb *, unsigned int);
- +void wbt_set_write_cache(struct rq_wb *, bool);
- +
- +#endif
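The wb_issue_stat helpers in the new wbt.h above pack two things into a single u64: bit 63 flags a request tracked (accounted) by the throttler, and the low 63 bits hold the issue time in nanoseconds. A minimal user-space sketch of that packing, illustrative only and not part of the patch:

#include <stdint.h>
#include <stdio.h>

#define STAT_MASK      (1ULL << 63)             /* ISSUE_STAT_MASK      */
#define STAT_TIME_MASK (~STAT_MASK)              /* ISSUE_STAT_TIME_MASK */

int main(void)
{
	uint64_t stat = 0;
	uint64_t now_ns = 123456789ULL;          /* stand-in for ktime_get() */

	/* wbt_issue_stat_set_time(): keep the flag bit, update the time */
	stat = (stat & STAT_MASK) | (now_ns & STAT_TIME_MASK);
	/* wbt_mark_tracked(): request counts against the throttling depth */
	stat |= STAT_MASK;

	printf("tracked=%d time=%llu\n", (stat & STAT_MASK) != 0,
	       (unsigned long long)(stat & STAT_TIME_MASK));
	return 0;
}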
- diff -Naur linux-4.4.6-gentoo-orig/include/linux/writeback.h linux-4.4.6-gentoo-patched/include/linux/writeback.h
- --- linux-4.4.6-gentoo-orig/include/linux/writeback.h 2016-05-04 11:19:37.618649827 +0300
- +++ linux-4.4.6-gentoo-patched/include/linux/writeback.h 2016-05-04 11:03:27.411730745 +0300
- @@ -106,6 +106,16 @@
- #endif
- };
- +static inline int wbc_to_write_cmd(struct writeback_control *wbc)
- +{
- + if (wbc->sync_mode == WB_SYNC_ALL)
- + return WRITE_SYNC;
- + else if (wbc->for_kupdate || wbc->for_background)
- + return WRITE_BG;
- +
- + return WRITE;
- +}
- +
- /*
- * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
- * and are measured against each other in. There always is one global
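The wbc_to_write_cmd() helper added above is how writeback callers tell the block layer which throttling class a buffered write belongs to: WB_SYNC_ALL (integrity) writeback becomes WRITE_SYNC, periodic/background writeback becomes the new WRITE_BG, and everything else stays plain WRITE. A small user-space model of that precedence, with made-up names, just to show that sync wins over background:

#include <stdio.h>

enum cmd { CMD_WRITE, CMD_WRITE_BG, CMD_WRITE_SYNC };

struct wbc_model {
	int sync_all;		/* WB_SYNC_ALL          */
	int for_kupdate;	/* periodic writeback   */
	int for_background;	/* background writeback */
};

static enum cmd to_write_cmd(const struct wbc_model *wbc)
{
	if (wbc->sync_all)
		return CMD_WRITE_SYNC;		/* sync always wins          */
	if (wbc->for_kupdate || wbc->for_background)
		return CMD_WRITE_BG;		/* throttled hardest by wbt  */
	return CMD_WRITE;
}

int main(void)
{
	struct wbc_model bg = { .for_background = 1 };
	struct wbc_model sync_bg = { .sync_all = 1, .for_background = 1 };

	printf("%d %d\n", to_write_cmd(&bg), to_write_cmd(&sync_bg)); /* 1 2 */
	return 0;
}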
- diff -Naur linux-4.4.6-gentoo-orig/include/trace/events/wbt.h linux-4.4.6-gentoo-patched/include/trace/events/wbt.h
- --- linux-4.4.6-gentoo-orig/include/trace/events/wbt.h 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/include/trace/events/wbt.h 2016-05-04 11:03:27.411730745 +0300
- @@ -0,0 +1,122 @@
- +#undef TRACE_SYSTEM
- +#define TRACE_SYSTEM wbt
- +
- +#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
- +#define _TRACE_WBT_H
- +
- +#include <linux/tracepoint.h>
- +#include <linux/wbt.h>
- +
- +/**
- + * wbt_stat - trace stats for blk_wb
- + * @stat: array of read/write stats
- + */
- +TRACE_EVENT(wbt_stat,
- +
- + TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
- +
- + TP_ARGS(bdi, stat),
- +
- + TP_STRUCT__entry(
- + __array(char, name, 32)
- + __field(s64, rmean)
- + __field(u64, rmin)
- + __field(u64, rmax)
- + __field(s64, rnr_samples)
- + __field(s64, rtime)
- + __field(s64, wmean)
- + __field(u64, wmin)
- + __field(u64, wmax)
- + __field(s64, wnr_samples)
- + __field(s64, wtime)
- + ),
- +
- + TP_fast_assign(
- + strncpy(__entry->name, dev_name(bdi->dev), 32);
- + __entry->rmean = stat[0].mean;
- + __entry->rmin = stat[0].min;
- + __entry->rmax = stat[0].max;
- + __entry->rnr_samples = stat[0].nr_samples;
- + __entry->wmean = stat[1].mean;
- + __entry->wmin = stat[1].min;
- + __entry->wmax = stat[1].max;
- + __entry->wnr_samples = stat[1].nr_samples;
- + ),
- +
- + TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
- + "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
- + __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
- + __entry->rnr_samples, __entry->wmean, __entry->wmin,
- + __entry->wmax, __entry->wnr_samples)
- +);
- +
- +/**
- + * wbt_lat - trace latency event
- + * @lat: latency trigger
- + */
- +TRACE_EVENT(wbt_lat,
- +
- + TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
- +
- + TP_ARGS(bdi, lat),
- +
- + TP_STRUCT__entry(
- + __array(char, name, 32)
- + __field(unsigned long, lat)
- + ),
- +
- + TP_fast_assign(
- + strncpy(__entry->name, dev_name(bdi->dev), 32);
- + __entry->lat = lat;
- + ),
- +
- + TP_printk("%s: latency %llu\n", __entry->name,
- + (unsigned long long) __entry->lat)
- +);
- +
- +/**
- + * wbt_step - trace wb event step
- + * @msg: context message
- + * @step: the current scale step count
- + * @window: the current monitoring window
- + * @bg: the current background queue limit
- + * @normal: the current normal writeback limit
- + * @max: the current max throughput writeback limit
- + */
- +TRACE_EVENT(wbt_step,
- +
- + TP_PROTO(struct backing_dev_info *bdi, const char *msg,
- + unsigned int step, unsigned long window, unsigned int bg,
- + unsigned int normal, unsigned int max),
- +
- + TP_ARGS(bdi, msg, step, window, bg, normal, max),
- +
- + TP_STRUCT__entry(
- + __array(char, name, 32)
- + __field(const char *, msg)
- + __field(unsigned int, step)
- + __field(unsigned long, window)
- + __field(unsigned int, bg)
- + __field(unsigned int, normal)
- + __field(unsigned int, max)
- + ),
- +
- + TP_fast_assign(
- + strncpy(__entry->name, dev_name(bdi->dev), 32);
- + __entry->msg = msg;
- + __entry->step = step;
- + __entry->window = window;
- + __entry->bg = bg;
- + __entry->normal = normal;
- + __entry->max = max;
- + ),
- +
- + TP_printk("%s: %s: step=%u, window=%lu, background=%u, normal=%u, max=%u\n",
- + __entry->name, __entry->msg, __entry->step, __entry->window,
- + __entry->bg, __entry->normal, __entry->max)
- +);
- +
- +#endif /* _TRACE_WBT_H */
- +
- +/* This part must be outside protection */
- +#include <trace/define_trace.h>
- diff -Naur linux-4.4.6-gentoo-orig/lib/Kconfig linux-4.4.6-gentoo-patched/lib/Kconfig
- --- linux-4.4.6-gentoo-orig/lib/Kconfig 2016-05-04 11:19:37.619649827 +0300
- +++ linux-4.4.6-gentoo-patched/lib/Kconfig 2016-05-04 11:03:27.411730745 +0300
- @@ -531,4 +531,7 @@
- config ARCH_HAS_MMIO_FLUSH
- bool
- +config WBT
- + bool
- +
- endmenu
- diff -Naur linux-4.4.6-gentoo-orig/lib/Makefile linux-4.4.6-gentoo-patched/lib/Makefile
- --- linux-4.4.6-gentoo-orig/lib/Makefile 2016-05-04 11:19:37.619649827 +0300
- +++ linux-4.4.6-gentoo-patched/lib/Makefile 2016-05-04 11:08:23.874706019 +0300
- @@ -164,6 +164,7 @@
- obj-$(CONFIG_SG_SPLIT) += sg_split.o
- obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
- +obj-$(CONFIG_WBT) += wbt.o
- libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
- fdt_empty_tree.o
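lib/wbt.c (next hunk) never reads request statistics directly; it goes through the wb_stat_ops get/clear callbacks declared in wbt.h, so whoever instantiates it can hand over per-window read/write latency numbers. A user-space model of that indirection follows; all names below are illustrative, not from the patch:

#include <stdio.h>

struct stat_model { long long min, max, mean, nr_samples; };

struct stat_ops_model {
	void (*get)(void *data, struct stat_model *stat); /* [0]=reads, [1]=writes */
	void (*clear)(void *data);
};

struct provider { struct stat_model window[2]; };

static void provider_get(void *data, struct stat_model *stat)
{
	struct provider *p = data;

	stat[0] = p->window[0];
	stat[1] = p->window[1];
}

static void provider_clear(void *data)
{
	struct provider *p = data;

	p->window[0] = (struct stat_model){ 0 };
	p->window[1] = (struct stat_model){ 0 };
}

int main(void)
{
	struct provider p = {
		.window = {
			{ .min = 120000, .nr_samples = 8 },	/* reads  */
			{ .min = 250000, .nr_samples = 40 },	/* writes */
		},
	};
	struct stat_ops_model ops = { .get = provider_get, .clear = provider_clear };
	struct stat_model stat[2];

	ops.get(&p, stat);	/* what latency_exceeded() does once per window */
	printf("read min %lld ns, %lld samples\n", stat[0].min, stat[0].nr_samples);
	ops.clear(&p);		/* what scale_up()/scale_down() do after a step  */
	return 0;
}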
- diff -Naur linux-4.4.6-gentoo-orig/lib/wbt.c linux-4.4.6-gentoo-patched/lib/wbt.c
- --- linux-4.4.6-gentoo-orig/lib/wbt.c 1970-01-01 03:00:00.000000000 +0300
- +++ linux-4.4.6-gentoo-patched/lib/wbt.c 2016-05-04 11:03:27.412730745 +0300
- @@ -0,0 +1,524 @@
- +/*
- + * buffered writeback throttling. loosely based on CoDel. We can't drop
- + * packets for IO scheduling, so the logic is something like this:
- + *
- + * - Monitor latencies in a defined window of time.
- + * - If the minimum latency in the above window exceeds some target, increment
- + * scaling step and scale down queue depth by a factor of 2x. The monitoring
- + * window is then shrunk to 100 / sqrt(scaling step + 1).
- + * - For any window where we don't have solid data on what the latencies
- + * look like, retain status quo.
- + * - If latencies look good, decrement scaling step.
- + *
- + * Copyright (C) 2016 Jens Axboe
- + *
- + * Things that (may) need changing:
- + *
- + * - Different scaling of background/normal/high priority writeback.
- + * We may have to violate guarantees for max.
- + * - We can have mismatches between the stat window and our window.
- + *
- + */
- +#include <linux/kernel.h>
- +#include <linux/blk_types.h>
- +#include <linux/slab.h>
- +#include <linux/backing-dev.h>
- +#include <linux/wbt.h>
- +
- +#define CREATE_TRACE_POINTS
- +#include <trace/events/wbt.h>
- +
- +enum {
- + /*
- + * Might need to be higher
- + */
- + RWB_MAX_DEPTH = 64,
- +
- + /*
- + * 100msec window
- + */
- + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
- +
- + /*
- + * Disregard stats, if we don't meet these minimums
- + */
- + RWB_MIN_WRITE_SAMPLES = 3,
- + RWB_MIN_READ_SAMPLES = 1,
- +
- + RWB_UNKNOWN_BUMP = 5,
- +};
- +
- +static inline bool rwb_enabled(struct rq_wb *rwb)
- +{
- + return rwb && rwb->wb_normal != 0;
- +}
- +
- +/*
- + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
- + * false if 'v' + 1 would be bigger than 'below'.
- + */
- +static bool atomic_inc_below(atomic_t *v, int below)
- +{
- + int cur = atomic_read(v);
- +
- + for (;;) {
- + int old;
- +
- + if (cur >= below)
- + return false;
- + old = atomic_cmpxchg(v, cur, cur + 1);
- + if (old == cur)
- + break;
- + cur = old;
- + }
- +
- + return true;
- +}
- +
- +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
- +{
- + if (rwb_enabled(rwb)) {
- + const unsigned long cur = jiffies;
- +
- + if (cur != *var)
- + *var = cur;
- + }
- +}
- +
- +void __wbt_done(struct rq_wb *rwb)
- +{
- + int inflight, limit = rwb->wb_normal;
- +
- + /*
- + * If the device does write back caching, drop further down
- + * before we wake people up.
- + */
- + if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
- + limit = 0;
- + else
- + limit = rwb->wb_normal;
- +
- + /*
- + * Don't wake anyone up if we are above the normal limit. If
- + * throttling got disabled (limit == 0) with waiters, ensure
- + * that we wake them up.
- + */
- + inflight = atomic_dec_return(&rwb->inflight);
- + if (limit && inflight >= limit) {
- + if (!rwb->wb_max)
- + wake_up_all(&rwb->wait);
- + return;
- + }
- +
- + if (waitqueue_active(&rwb->wait)) {
- + int diff = limit - inflight;
- +
- + if (!inflight || diff >= rwb->wb_background / 2)
- + wake_up_nr(&rwb->wait, 1);
- + }
- +}
- +
- +/*
- + * Called on completion of a request. Note that it's also called when
- + * a request is merged, when the request gets freed.
- + */
- +void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
- +{
- + if (!rwb)
- + return;
- +
- + if (!wbt_tracked(stat)) {
- + if (rwb->sync_cookie == stat) {
- + rwb->sync_issue = 0;
- + rwb->sync_cookie = NULL;
- + }
- +
- + wb_timestamp(rwb, &rwb->last_comp);
- + } else {
- + WARN_ON_ONCE(stat == rwb->sync_cookie);
- + __wbt_done(rwb);
- + wbt_clear_tracked(stat);
- + }
- +}
- +
- +static void calc_wb_limits(struct rq_wb *rwb)
- +{
- + unsigned int depth;
- +
- + if (!rwb->min_lat_nsec) {
- + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
- + return;
- + }
- +
- + depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
- +
- + /*
- + * Reduce max depth by 50%, and re-calculate normal/bg based on that
- + */
- + rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
- + rwb->wb_normal = (rwb->wb_max + 1) / 2;
- + rwb->wb_background = (rwb->wb_max + 3) / 4;
- +}
- +
- +static inline bool stat_sample_valid(struct blk_rq_stat *stat)
- +{
- + /*
- + * We need at least one read sample, and a minimum of
- + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
- + * that it's writes impacting us, and not just some sole read on
- + * a device that is in a lower power state.
- + */
- + return stat[0].nr_samples >= 1 &&
- + stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
- +}
- +
- +static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
- +{
- + u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
- +
- + if (!issue || !rwb->sync_cookie)
- + return 0;
- +
- + now = ktime_to_ns(ktime_get());
- + return now - issue;
- +}
- +
- +enum {
- + LAT_OK,
- + LAT_UNKNOWN,
- + LAT_EXCEEDED,
- +};
- +
- +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
- +{
- + u64 thislat;
- +
- + /*
- + * If our stored sync issue exceeds the window size, or it
- + * exceeds our min target AND we haven't logged any entries,
- + * flag the latency as exceeded.
- + */
- + thislat = rwb_sync_issue_lat(rwb);
- + if (thislat > rwb->cur_win_nsec ||
- + (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
- + trace_wbt_lat(rwb->bdi, thislat);
- + return LAT_EXCEEDED;
- + }
- +
- + if (!stat_sample_valid(stat))
- + return LAT_UNKNOWN;
- +
- + /*
- + * If the 'min' latency exceeds our target, step down.
- + */
- + if (stat[0].min > rwb->min_lat_nsec) {
- + trace_wbt_lat(rwb->bdi, stat[0].min);
- + trace_wbt_stat(rwb->bdi, stat);
- + return LAT_EXCEEDED;
- + }
- +
- + if (rwb->scale_step)
- + trace_wbt_stat(rwb->bdi, stat);
- +
- + return LAT_OK;
- +}
- +
- +static int latency_exceeded(struct rq_wb *rwb)
- +{
- + struct blk_rq_stat stat[2];
- +
- + rwb->stat_ops->get(rwb->ops_data, stat);
- + return __latency_exceeded(rwb, stat);
- +}
- +
- +static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
- +{
- + trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
- + rwb->wb_background, rwb->wb_normal, rwb->wb_max);
- +}
- +
- +static void scale_up(struct rq_wb *rwb)
- +{
- + /*
- + * If we're at 0, we can't go lower.
- + */
- + if (!rwb->scale_step)
- + return;
- +
- + rwb->scale_step--;
- + rwb->unknown_cnt = 0;
- + rwb->stat_ops->clear(rwb->ops_data);
- + calc_wb_limits(rwb);
- +
- + if (waitqueue_active(&rwb->wait))
- + wake_up_all(&rwb->wait);
- +
- + rwb_trace_step(rwb, "step up");
- +}
- +
- +static void scale_down(struct rq_wb *rwb)
- +{
- + /*
- + * Stop scaling down when we've hit the limit. This also prevents
- + * ->scale_step from going to crazy values, if the device can't
- + * keep up.
- + */
- + if (rwb->wb_max == 1)
- + return;
- +
- + rwb->scale_step++;
- + rwb->unknown_cnt = 0;
- + rwb->stat_ops->clear(rwb->ops_data);
- + calc_wb_limits(rwb);
- + rwb_trace_step(rwb, "step down");
- +}
- +
- +static void rwb_arm_timer(struct rq_wb *rwb)
- +{
- + unsigned long expires;
- +
- + /*
- + * We should speed this up, using some variant of a fast integer
- + * inverse square root calculation. Since we only do this for
- + * every window expiration, it's not a huge deal, though.
- + */
- + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
- + int_sqrt((rwb->scale_step + 1) << 8));
- + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
- + mod_timer(&rwb->window_timer, expires);
- +}
- +
- +static void wb_timer_fn(unsigned long data)
- +{
- + struct rq_wb *rwb = (struct rq_wb *) data;
- + int status;
- +
- + /*
- + * If we exceeded the latency target, step down. If we did not,
- + * step one level up. If we don't know enough to say either exceeded
- + * or ok, then don't do anything.
- + */
- + status = latency_exceeded(rwb);
- + switch (status) {
- + case LAT_EXCEEDED:
- + scale_down(rwb);
- + break;
- + case LAT_OK:
- + scale_up(rwb);
- + break;
- + case LAT_UNKNOWN:
- + /*
- + * We had no read samples, start bumping up the write
- + * depth slowly
- + */
- + if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP)
- + scale_up(rwb);
- + break;
- + default:
- + break;
- + }
- +
- + /*
- + * Re-arm timer, if we have IO in flight
- + */
- + if (rwb->scale_step || atomic_read(&rwb->inflight))
- + rwb_arm_timer(rwb);
- +}
- +
- +void wbt_update_limits(struct rq_wb *rwb)
- +{
- + rwb->scale_step = 0;
- + calc_wb_limits(rwb);
- +
- + if (waitqueue_active(&rwb->wait))
- + wake_up_all(&rwb->wait);
- +}
- +
- +static bool close_io(struct rq_wb *rwb)
- +{
- + const unsigned long now = jiffies;
- +
- + return time_before(now, rwb->last_issue + HZ / 10) ||
- + time_before(now, rwb->last_comp + HZ / 10);
- +}
- +
- +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
- +
- +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
- +{
- + unsigned int limit;
- +
- + /*
- + * At this point we know it's a buffered write. If REQ_SYNC is
- + * set, then it's WB_SYNC_ALL writeback, and we'll use the max
- + * limit for that. If the write is marked as a background write,
- + * then use the idle limit, or go to normal if we haven't had
- + * competing IO for a bit.
- + */
- + if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping))
- + limit = rwb->wb_max;
- + else if ((rw & REQ_BG) || close_io(rwb)) {
- + /*
- + * If less than 100ms since we completed unrelated IO,
- + * limit us to half the depth for background writeback.
- + */
- + limit = rwb->wb_background;
- + } else
- + limit = rwb->wb_normal;
- +
- + return limit;
- +}
- +
- +static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
- +{
- + /*
- + * inc it here even if disabled, since we'll dec it at completion.
- + * this only happens if the task was sleeping in __wbt_wait(),
- + * and someone turned it off at the same time.
- + */
- + if (!rwb_enabled(rwb)) {
- + atomic_inc(&rwb->inflight);
- + return true;
- + }
- +
- + return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw));
- +}
- +
- +/*
- + * Block if we will exceed our limit, or if we are currently waiting for
- + * the timer to kick off queuing again.
- + */
- +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
- +{
- + DEFINE_WAIT(wait);
- +
- + if (may_queue(rwb, rw))
- + return;
- +
- + do {
- + prepare_to_wait_exclusive(&rwb->wait, &wait,
- + TASK_UNINTERRUPTIBLE);
- +
- + if (may_queue(rwb, rw))
- + break;
- +
- + if (lock)
- + spin_unlock_irq(lock);
- +
- + io_schedule();
- +
- + if (lock)
- + spin_lock_irq(lock);
- + } while (1);
- +
- + finish_wait(&rwb->wait, &wait);
- +}
- +
- +static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
- +{
- + /*
- + * If not a WRITE (or a discard), do nothing
- + */
- + if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD))
- + return false;
- +
- + /*
- + * Don't throttle WRITE_ODIRECT
- + */
- + if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
- + return false;
- +
- + return true;
- +}
- +
- +/*
- + * Returns true if the IO request should be accounted, false if not.
- + * May sleep, if we have exceeded the writeback limits. Caller can pass
- + * in an irq held spinlock, if it holds one when calling this function.
- + * If we do sleep, we'll release and re-grab it.
- + */
- +bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
- +{
- + if (!rwb_enabled(rwb))
- + return false;
- +
- + if (!wbt_should_throttle(rwb, rw)) {
- + wb_timestamp(rwb, &rwb->last_issue);
- + return false;
- + }
- +
- + __wbt_wait(rwb, rw, lock);
- +
- + if (!timer_pending(&rwb->window_timer))
- + rwb_arm_timer(rwb);
- +
- + return true;
- +}
- +
- +void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
- +{
- + if (!rwb_enabled(rwb))
- + return;
- +
- + wbt_issue_stat_set_time(stat);
- +
- + if (!wbt_tracked(stat) && !rwb->sync_issue) {
- + rwb->sync_cookie = stat;
- + rwb->sync_issue = wbt_issue_stat_get_time(stat);
- + }
- +}
- +
- +void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
- +{
- + if (!rwb_enabled(rwb))
- + return;
- + if (stat == rwb->sync_cookie) {
- + rwb->sync_issue = 0;
- + rwb->sync_cookie = NULL;
- + }
- +}
- +
- +void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
- +{
- + if (rwb) {
- + rwb->queue_depth = depth;
- + wbt_update_limits(rwb);
- + }
- +}
- +
- +void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
- +{
- + if (rwb)
- + rwb->wc = write_cache_on;
- +}
- +
- +struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
- + void *ops_data)
- +{
- + struct rq_wb *rwb;
- +
- + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
- + if (!rwb)
- + return ERR_PTR(-ENOMEM);
- +
- + atomic_set(&rwb->inflight, 0);
- + init_waitqueue_head(&rwb->wait);
- + setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
- + rwb->wc = 1;
- + rwb->queue_depth = RWB_MAX_DEPTH;
- + rwb->last_comp = rwb->last_issue = jiffies;
- + rwb->bdi = bdi;
- + rwb->win_nsec = RWB_WINDOW_NSEC;
- + rwb->stat_ops = ops;
- + rwb->ops_data = ops_data;
- + wbt_update_limits(rwb);
- + return rwb;
- +}
- +
- +void wbt_exit(struct rq_wb *rwb)
- +{
- + if (rwb) {
- + del_timer_sync(&rwb->window_timer);
- + kfree(rwb);
- + }
- +}
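The scaling behaviour in lib/wbt.c above is easiest to see with numbers: calc_wb_limits() roughly halves the maximum write depth per scale step (wb_max = 1 + ((depth - 1) >> step)) and derives the normal and background limits from it, while rwb_arm_timer() shrinks the monitoring window to win_nsec / sqrt(step + 1). A user-space sketch of those two formulas, using a naive integer square root as a stand-in for the kernel's int_sqrt():

#include <stdio.h>

static unsigned long isqrt(unsigned long x)	/* stand-in for int_sqrt() */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	const unsigned int depth = 64;				/* RWB_MAX_DEPTH   */
	const unsigned long long win_nsec = 100000000ULL;	/* RWB_WINDOW_NSEC */
	unsigned int step;

	for (step = 0; step < 4; step++) {
		unsigned int max = 1 + ((depth - 1) >> (step < 31 ? step : 31));
		unsigned int normal = (max + 1) / 2;
		unsigned int bg = (max + 3) / 4;
		unsigned long long win = (win_nsec << 4) / isqrt((step + 1) << 8);

		printf("step=%u max=%u normal=%u background=%u window=%llums\n",
		       step, max, normal, bg, win / 1000000);
	}
	return 0;
}

For the defaults in the patch this prints a depth sequence of 64/32/16/8 and a window sequence of roughly 100/72/59/50 ms as the scale step climbs.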
- diff -Naur linux-4.4.6-gentoo-orig/mm/backing-dev.c linux-4.4.6-gentoo-patched/mm/backing-dev.c
- --- linux-4.4.6-gentoo-orig/mm/backing-dev.c 2016-05-04 11:19:37.620649827 +0300
- +++ linux-4.4.6-gentoo-patched/mm/backing-dev.c 2016-05-04 11:03:27.412730745 +0300
- @@ -310,6 +310,7 @@
- spin_lock_init(&wb->work_lock);
- INIT_LIST_HEAD(&wb->work_list);
- INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
- + atomic_set(&wb->dirty_sleeping, 0);
- wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested)
- diff -Naur linux-4.4.6-gentoo-orig/mm/page-writeback.c linux-4.4.6-gentoo-patched/mm/page-writeback.c
- --- linux-4.4.6-gentoo-orig/mm/page-writeback.c 2016-05-04 11:19:37.621649827 +0300
- +++ linux-4.4.6-gentoo-patched/mm/page-writeback.c 2016-05-04 11:03:27.412730745 +0300
- @@ -1735,7 +1735,9 @@
- pause,
- start_time);
- __set_current_state(TASK_KILLABLE);
- + atomic_inc(&wb->dirty_sleeping);
- io_schedule_timeout(pause);
- + atomic_dec(&wb->dirty_sleeping);
- current->dirty_paused_when = now + pause;
- current->nr_dirtied = 0;
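The two mm/ hunks close the loop: balance_dirty_pages() bumps wb->dirty_sleeping around its io_schedule_timeout() nap, and lib/wbt.c reads that counter in get_limit() and __wbt_done(), so that while a dirtier is actually blocked waiting for writeback, writes are allowed the full wb_max depth instead of being throttled down. A small user-space model of that limit selection (illustrative only, ignoring the close_io() case):

#include <stdio.h>

struct rwb_model {
	unsigned int wb_background, wb_normal, wb_max;
};

/* Mirrors get_limit(): pick the inflight limit for a buffered write. */
static unsigned int pick_limit(const struct rwb_model *rwb,
			       int hiprio, int background, int dirty_sleeping)
{
	if (hiprio || dirty_sleeping)
		return rwb->wb_max;	/* someone is stuck in balance_dirty_pages() */
	if (background)
		return rwb->wb_background;
	return rwb->wb_normal;
}

int main(void)
{
	struct rwb_model rwb = { .wb_background = 16, .wb_normal = 32,
				 .wb_max = 64 };

	printf("background write, nobody sleeping:  %u\n",
	       pick_limit(&rwb, 0, 1, 0));		/* 16 */
	printf("background write, dirtier sleeping: %u\n",
	       pick_limit(&rwb, 0, 1, 1));		/* 64 */
	return 0;
}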