kosmik2001

wb-4.4.6-v5.patch

May 4th, 2016
  1. diff -Naur linux-4.4.6-gentoo-orig/arch/um/drivers/ubd_kern.c linux-4.4.6-gentoo-patched/arch/um/drivers/ubd_kern.c
  2. --- linux-4.4.6-gentoo-orig/arch/um/drivers/ubd_kern.c  2016-05-04 11:19:37.591649829 +0300
  3. +++ linux-4.4.6-gentoo-patched/arch/um/drivers/ubd_kern.c   2016-05-04 11:02:48.599733982 +0300
  4. @@ -866,7 +866,7 @@
  5.         goto out;
  6.     }
  7.     ubd_dev->queue->queuedata = ubd_dev;
  8. -   blk_queue_flush(ubd_dev->queue, REQ_FLUSH);
  9. +   blk_queue_write_cache(ubd_dev->queue, true, false);
  10.  
  11.     blk_queue_max_segments(ubd_dev->queue, MAX_SG);
  12.     err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
  13. diff -Naur linux-4.4.6-gentoo-orig/block/blk-core.c linux-4.4.6-gentoo-patched/block/blk-core.c
  14. --- linux-4.4.6-gentoo-orig/block/blk-core.c    2016-05-04 11:19:37.593649829 +0300
  15. +++ linux-4.4.6-gentoo-patched/block/blk-core.c 2016-05-04 11:02:48.599733982 +0300
  16. @@ -33,6 +33,7 @@
  17.  #include <linux/ratelimit.h>
  18.  #include <linux/pm_runtime.h>
  19.  #include <linux/blk-cgroup.h>
  20. +#include <linux/wbt.h>
  21.  
  22.  #define CREATE_TRACE_POINTS
  23.  #include <trace/events/block.h>
  24. @@ -872,6 +873,8 @@
  25.  
  26.  fail:
  27.     blk_free_flush_queue(q->fq);
  28. +   wbt_exit(q->rq_wb);
  29. +   q->rq_wb = NULL;
  30.     return NULL;
  31.  }
  32.  EXPORT_SYMBOL(blk_init_allocated_queue);
  33. @@ -1385,6 +1388,7 @@
  34.     blk_delete_timer(rq);
  35.     blk_clear_rq_complete(rq);
  36.     trace_block_rq_requeue(q, rq);
  37. +   wbt_requeue(q->rq_wb, &rq->wb_stat);
  38.  
  39.     if (rq->cmd_flags & REQ_QUEUED)
  40.         blk_queue_end_tag(q, rq);
  41. @@ -1475,6 +1479,8 @@
  42.     /* this is a bio leak */
  43.     WARN_ON(req->bio != NULL);
  44.  
  45. +   wbt_done(q->rq_wb, &req->wb_stat);
  46. +
  47.     /*
  48.      * Request may not have originated from ll_rw_blk. if not,
  49.      * it didn't come out of our reserved rq pools
  50. @@ -1704,6 +1710,7 @@
  51.     int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
  52.     struct request *req;
  53.     unsigned int request_count = 0;
  54. +   bool wb_acct;
  55.  
  56.     /*
  57.      * low level driver can indicate that it wants pages above a
  58. @@ -1756,6 +1763,8 @@
  59.     }
  60.  
  61.  get_rq:
  62. +   wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, q->queue_lock);
  63. +
  64.     /*
  65.      * This sync check and mask will be re-done in init_request_from_bio(),
  66.      * but we need to set it earlier to expose the sync flag to the
  67. @@ -1771,11 +1780,16 @@
  68.      */
  69.     req = get_request(q, rw_flags, bio, GFP_NOIO);
  70.     if (IS_ERR(req)) {
  71. +       if (wb_acct)
  72. +           __wbt_done(q->rq_wb);
  73.         bio->bi_error = PTR_ERR(req);
  74.         bio_endio(bio);
  75.         goto out_unlock;
  76.     }
  77.  
  78. +   if (wb_acct)
  79. +       wbt_mark_tracked(&req->wb_stat);
  80. +
  81.     /*
  82.      * After dropping the lock and possibly sleeping here, our request
  83.      * may now be mergeable after it had proven unmergeable (above).
  84. @@ -1953,7 +1967,8 @@
  85.      * drivers without flush support don't have to worry
  86.      * about them.
  87.      */
  88. -   if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
  89. +   if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
  90. +       !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
  91.         bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
  92.         if (!nr_sectors) {
  93.             err = 0;
  94. @@ -2502,6 +2517,8 @@
  95.  {
  96.     blk_dequeue_request(req);
  97.  
  98. +   wbt_issue(req->q->rq_wb, &req->wb_stat);
  99. +
  100.     /*
  101.      * We are now handing the request to the hardware, initialize
  102.      * resid_len to full count and add the timeout handler.
  103. @@ -2569,6 +2586,8 @@
  104.  
  105.     trace_block_rq_complete(req->q, req, nr_bytes);
  106.  
  107. +   blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
  108. +
  109.     if (!req->bio)
  110.         return false;
  111.  
  112. @@ -2736,9 +2755,10 @@
  113.  
  114.     blk_account_io_done(req);
  115.  
  116. -   if (req->end_io)
  117. +   if (req->end_io) {
  118. +       wbt_done(req->q->rq_wb, &req->wb_stat);
  119.         req->end_io(req, error);
  120. -   else {
  121. +   } else {
  122.         if (blk_bidi_rq(req))
  123.             __blk_put_request(req->next_rq->q, req->next_rq);
  124.  
  125. diff -Naur linux-4.4.6-gentoo-orig/block/blk-flush.c linux-4.4.6-gentoo-patched/block/blk-flush.c
  126. --- linux-4.4.6-gentoo-orig/block/blk-flush.c   2016-05-04 11:19:37.593649829 +0300
  127. +++ linux-4.4.6-gentoo-patched/block/blk-flush.c    2016-05-04 11:02:48.599733982 +0300
  128. @@ -95,17 +95,18 @@
  129.  static bool blk_kick_flush(struct request_queue *q,
  130.                struct blk_flush_queue *fq);
  131.  
  132. -static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
  133. +static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
  134.  {
  135.     unsigned int policy = 0;
  136.  
  137.     if (blk_rq_sectors(rq))
  138.         policy |= REQ_FSEQ_DATA;
  139.  
  140. -   if (fflags & REQ_FLUSH) {
  141. +   if (fflags & (1UL << QUEUE_FLAG_WC)) {
  142.         if (rq->cmd_flags & REQ_FLUSH)
  143.             policy |= REQ_FSEQ_PREFLUSH;
  144. -       if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
  145. +       if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
  146. +           (rq->cmd_flags & REQ_FUA))
  147.             policy |= REQ_FSEQ_POSTFLUSH;
  148.     }
  149.     return policy;
  150. @@ -384,7 +385,7 @@
  151.  void blk_insert_flush(struct request *rq)
  152.  {
  153.     struct request_queue *q = rq->q;
  154. -   unsigned int fflags = q->flush_flags;   /* may change, cache */
  155. +   unsigned long fflags = q->queue_flags;  /* may change, cache */
  156.     unsigned int policy = blk_flush_policy(fflags, rq);
  157.     struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
  158.  
  159. @@ -393,7 +394,7 @@
  160.      * REQ_FLUSH and FUA for the driver.
  161.      */
  162.     rq->cmd_flags &= ~REQ_FLUSH;
  163. -   if (!(fflags & REQ_FUA))
  164. +   if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
  165.         rq->cmd_flags &= ~REQ_FUA;
  166.  
  167.     /*
  168. diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq.c linux-4.4.6-gentoo-patched/block/blk-mq.c
  169. --- linux-4.4.6-gentoo-orig/block/blk-mq.c  2016-05-04 11:19:37.594649829 +0300
  170. +++ linux-4.4.6-gentoo-patched/block/blk-mq.c   2016-05-04 11:02:48.600733982 +0300
  171. @@ -22,6 +22,7 @@
  172.  #include <linux/sched/sysctl.h>
  173.  #include <linux/delay.h>
  174.  #include <linux/crash_dump.h>
  175. +#include <linux/wbt.h>
  176.  
  177.  #include <trace/events/block.h>
  178.  
  179. @@ -29,6 +30,7 @@
  180.  #include "blk.h"
  181.  #include "blk-mq.h"
  182.  #include "blk-mq-tag.h"
  183. +#include "blk-stat.h"
  184.  
  185.  static DEFINE_MUTEX(all_q_mutex);
  186.  static LIST_HEAD(all_q_list);
  187. @@ -276,6 +278,8 @@
  188.  
  189.     if (rq->cmd_flags & REQ_MQ_INFLIGHT)
  190.         atomic_dec(&hctx->nr_active);
  191. +
  192. +   wbt_done(q->rq_wb, &rq->wb_stat);
  193.     rq->cmd_flags = 0;
  194.  
  195.     clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
  196. @@ -308,6 +312,7 @@
  197.     blk_account_io_done(rq);
  198.  
  199.     if (rq->end_io) {
  200. +       wbt_done(rq->q->rq_wb, &rq->wb_stat);
  201.         rq->end_io(rq, error);
  202.     } else {
  203.         if (unlikely(blk_bidi_rq(rq)))
  204. @@ -358,10 +363,19 @@
  205.     put_cpu();
  206.  }
  207.  
  208. +static void blk_mq_stat_add(struct request *rq)
  209. +{
  210. +   struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
  211. +
  212. +   blk_stat_add(stat, rq);
  213. +}
  214. +
  215.  static void __blk_mq_complete_request(struct request *rq)
  216.  {
  217.     struct request_queue *q = rq->q;
  218.  
  219. +   blk_mq_stat_add(rq);
  220. +
  221.     if (!q->softirq_done_fn)
  222.         blk_mq_end_request(rq, rq->errors);
  223.     else
  224. @@ -405,6 +419,8 @@
  225.     if (unlikely(blk_bidi_rq(rq)))
  226.         rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
  227.  
  228. +   wbt_issue(q->rq_wb, &rq->wb_stat);
  229. +
  230.     blk_add_timer(rq);
  231.  
  232.     /*
  233. @@ -440,6 +456,7 @@
  234.     struct request_queue *q = rq->q;
  235.  
  236.     trace_block_rq_requeue(q, rq);
  237. +   wbt_requeue(q->rq_wb, &rq->wb_stat);
  238.  
  239.     if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
  240.         if (q->dma_drain_size && blk_rq_bytes(rq))
  241. @@ -1249,6 +1266,7 @@
  242.     struct blk_plug *plug;
  243.     struct request *same_queue_rq = NULL;
  244.     blk_qc_t cookie;
  245. +   bool wb_acct;
  246.  
  247.     blk_queue_bounce(q, &bio);
  248.  
  249. @@ -1266,9 +1284,17 @@
  250.     } else
  251.         request_count = blk_plug_queued_count(q);
  252.  
  253. +   wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL);
  254. +
  255.     rq = blk_mq_map_request(q, bio, &data);
  256. -   if (unlikely(!rq))
  257. +   if (unlikely(!rq)) {
  258. +       if (wb_acct)
  259. +           __wbt_done(q->rq_wb);
  260.         return BLK_QC_T_NONE;
  261. +   }
  262. +
  263. +   if (wb_acct)
  264. +       wbt_mark_tracked(&rq->wb_stat);
  265.  
  266.     cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
  267.  
  268. @@ -1345,6 +1371,7 @@
  269.     struct blk_map_ctx data;
  270.     struct request *rq;
  271.     blk_qc_t cookie;
  272. +   bool wb_acct;
  273.  
  274.     blk_queue_bounce(q, &bio);
  275.  
  276. @@ -1359,9 +1386,17 @@
  277.         blk_attempt_plug_merge(q, bio, &request_count, NULL))
  278.         return BLK_QC_T_NONE;
  279.  
  280. +   wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL);
  281. +
  282.     rq = blk_mq_map_request(q, bio, &data);
  283. -   if (unlikely(!rq))
  284. +   if (unlikely(!rq)) {
  285. +       if (wb_acct)
  286. +           __wbt_done(q->rq_wb);
  287.         return BLK_QC_T_NONE;
  288. +   }
  289. +
  290. +   if (wb_acct)
  291. +       wbt_mark_tracked(&rq->wb_stat);
  292.  
  293.     cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
  294.  
  295. @@ -1782,6 +1817,8 @@
  296.         spin_lock_init(&__ctx->lock);
  297.         INIT_LIST_HEAD(&__ctx->rq_list);
  298.         __ctx->queue = q;
  299. +       blk_stat_init(&__ctx->stat[0]);
  300. +       blk_stat_init(&__ctx->stat[1]);
  301.  
  302.         /* If the cpu isn't online, the cpu is mapped to first hctx */
  303.         if (!cpu_online(i))
  304. @@ -2095,6 +2132,9 @@
  305.     list_del_init(&q->all_q_node);
  306.     mutex_unlock(&all_q_mutex);
  307.  
  308. +   wbt_exit(q->rq_wb);
  309. +   q->rq_wb = NULL;
  310. +
  311.     blk_mq_del_queue_tag_set(q);
  312.  
  313.     blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
  314. diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq.h linux-4.4.6-gentoo-patched/block/blk-mq.h
  315. --- linux-4.4.6-gentoo-orig/block/blk-mq.h  2016-05-04 11:19:37.594649829 +0300
  316. +++ linux-4.4.6-gentoo-patched/block/blk-mq.h   2016-05-04 11:02:48.600733982 +0300
  317. @@ -1,6 +1,8 @@
  318.  #ifndef INT_BLK_MQ_H
  319.  #define INT_BLK_MQ_H
  320.  
  321. +#include "blk-stat.h"
  322. +
  323.  struct blk_mq_tag_set;
  324.  
  325.  struct blk_mq_ctx {
  326. @@ -20,6 +22,7 @@
  327.  
  328.     /* incremented at completion time */
  329.     unsigned long       ____cacheline_aligned_in_smp rq_completed[2];
  330. +   struct blk_rq_stat  stat[2];
  331.  
  332.     struct request_queue    *queue;
  333.     struct kobject      kobj;
  334. diff -Naur linux-4.4.6-gentoo-orig/block/blk-mq-sysfs.c linux-4.4.6-gentoo-patched/block/blk-mq-sysfs.c
  335. --- linux-4.4.6-gentoo-orig/block/blk-mq-sysfs.c    2016-05-04 11:19:37.595649829 +0300
  336. +++ linux-4.4.6-gentoo-patched/block/blk-mq-sysfs.c 2016-05-04 11:02:48.599733982 +0300
  337. @@ -247,6 +247,47 @@
  338.     return ret;
  339.  }
  340.  
  341. +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
  342. +{
  343. +   struct blk_mq_ctx *ctx;
  344. +   unsigned int i;
  345. +
  346. +   hctx_for_each_ctx(hctx, ctx, i) {
  347. +       blk_stat_init(&ctx->stat[0]);
  348. +       blk_stat_init(&ctx->stat[1]);
  349. +   }
  350. +}
  351. +
  352. +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
  353. +                     const char *page, size_t count)
  354. +{
  355. +   blk_mq_stat_clear(hctx);
  356. +   return count;
  357. +}
  358. +
  359. +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
  360. +{
  361. +   return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
  362. +           pre, (long long) stat->nr_samples,
  363. +           (long long) stat->mean, (long long) stat->min,
  364. +           (long long) stat->max);
  365. +}
  366. +
  367. +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
  368. +{
  369. +   struct blk_rq_stat stat[2];
  370. +   ssize_t ret;
  371. +
  372. +   blk_stat_init(&stat[0]);
  373. +   blk_stat_init(&stat[1]);
  374. +
  375. +   blk_hctx_stat_get(hctx, stat);
  376. +
  377. +   ret = print_stat(page, &stat[0], "read :");
  378. +   ret += print_stat(page + ret, &stat[1], "write:");
  379. +   return ret;
  380. +}
  381. +
  382.  static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
  383.     .attr = {.name = "dispatched", .mode = S_IRUGO },
  384.     .show = blk_mq_sysfs_dispatched_show,
  385. @@ -304,6 +345,11 @@
  386.     .attr = {.name = "io_poll", .mode = S_IRUGO },
  387.     .show = blk_mq_hw_sysfs_poll_show,
  388.  };
  389. +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
  390. +   .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
  391. +   .show = blk_mq_hw_sysfs_stat_show,
  392. +   .store = blk_mq_hw_sysfs_stat_store,
  393. +};
  394.  
  395.  static struct attribute *default_hw_ctx_attrs[] = {
  396.     &blk_mq_hw_sysfs_queued.attr,
  397. @@ -314,6 +360,7 @@
  398.     &blk_mq_hw_sysfs_cpus.attr,
  399.     &blk_mq_hw_sysfs_active.attr,
  400.     &blk_mq_hw_sysfs_poll.attr,
  401. +   &blk_mq_hw_sysfs_stat.attr,
  402.     NULL,
  403.  };
  404.  
  405. diff -Naur linux-4.4.6-gentoo-orig/block/blk-settings.c linux-4.4.6-gentoo-patched/block/blk-settings.c
  406. --- linux-4.4.6-gentoo-orig/block/blk-settings.c    2016-05-04 11:19:37.595649829 +0300
  407. +++ linux-4.4.6-gentoo-patched/block/blk-settings.c 2016-05-04 11:02:48.600733982 +0300
  408. @@ -820,31 +820,54 @@
  409.  }
  410.  EXPORT_SYMBOL(blk_queue_update_dma_alignment);
  411.  
  412. +void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
  413. +{
  414. +   spin_lock_irq(q->queue_lock);
  415. +   if (queueable)
  416. +       clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
  417. +   else
  418. +       set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
  419. +   spin_unlock_irq(q->queue_lock);
  420. +}
  421. +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
  422. +
  423.  /**
  424. - * blk_queue_flush - configure queue's cache flush capability
  425. + * blk_set_queue_depth - tell the block layer about the device queue depth
  426.   * @q:     the request queue for the device
  427. - * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
  428. + * @depth:     queue depth
  429.   *
  430. - * Tell block layer cache flush capability of @q.  If it supports
  431. - * flushing, REQ_FLUSH should be set.  If it supports bypassing
  432. - * write cache for individual writes, REQ_FUA should be set.
  433.   */
  434. -void blk_queue_flush(struct request_queue *q, unsigned int flush)
  435. +void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
  436.  {
  437. -   WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
  438. -
  439. -   if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
  440. -       flush &= ~REQ_FUA;
  441. -
  442. -   q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
  443. +   q->queue_depth = depth;
  444. +   wbt_set_queue_depth(q->rq_wb, depth);
  445.  }
  446. -EXPORT_SYMBOL_GPL(blk_queue_flush);
  447. +EXPORT_SYMBOL(blk_set_queue_depth);
  448.  
  449. -void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
  450. +/**
  451. + * blk_queue_write_cache - configure queue's write cache
  452. + * @q:     the request queue for the device
  453. + * @wc:        write back cache on or off
  454. + * @fua:   device supports FUA writes, if true
  455. + *
  456. + * Tell the block layer about the write cache of @q.
  457. + */
  458. +void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
  459.  {
  460. -   q->flush_not_queueable = !queueable;
  461. +   spin_lock_irq(q->queue_lock);
  462. +   if (wc)
  463. +       queue_flag_set(QUEUE_FLAG_WC, q);
  464. +   else
  465. +       queue_flag_clear(QUEUE_FLAG_WC, q);
  466. +   if (fua)
  467. +       queue_flag_set(QUEUE_FLAG_FUA, q);
  468. +   else
  469. +       queue_flag_clear(QUEUE_FLAG_FUA, q);
  470. +   spin_unlock_irq(q->queue_lock);
  471. +
  472. +   wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
  473.  }
  474. -EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
  475. +EXPORT_SYMBOL_GPL(blk_queue_write_cache);
  476.  
  477.  static int __init blk_settings_init(void)
  478.  {
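
The new blk_queue_write_cache() above replaces blk_queue_flush(); throughout the rest of this patch the old flag argument maps onto the two booleans. A minimal sketch of that mapping, assuming a caller that still holds the old REQ_FLUSH/REQ_FUA flags (the helper name is illustrative and not part of the patch):

    /* illustrative compatibility helper, not part of the patch */
    static inline void compat_queue_flush(struct request_queue *q,
                                          unsigned int flush)
    {
        /* REQ_FLUSH -> write back cache present, REQ_FUA -> FUA supported */
        blk_queue_write_cache(q, flush & REQ_FLUSH, flush & REQ_FUA);
    }

The xen-blkfront hunk later in this patch performs exactly this translation inline.
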
  479. diff -Naur linux-4.4.6-gentoo-orig/block/blk-stat.c linux-4.4.6-gentoo-patched/block/blk-stat.c
  480. --- linux-4.4.6-gentoo-orig/block/blk-stat.c    1970-01-01 03:00:00.000000000 +0300
  481. +++ linux-4.4.6-gentoo-patched/block/blk-stat.c 2016-05-04 11:02:48.600733982 +0300
  482. @@ -0,0 +1,185 @@
  483. +/*
  484. + * Block stat tracking code
  485. + *
  486. + * Copyright (C) 2016 Jens Axboe
  487. + */
  488. +#include <linux/kernel.h>
  489. +#include <linux/blk-mq.h>
  490. +
  491. +#include "blk-stat.h"
  492. +#include "blk-mq.h"
  493. +
  494. +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
  495. +{
  496. +   if (!src->nr_samples)
  497. +       return;
  498. +
  499. +   dst->min = min(dst->min, src->min);
  500. +   dst->max = max(dst->max, src->max);
  501. +
  502. +   if (!dst->nr_samples)
  503. +       dst->mean = src->mean;
  504. +   else {
  505. +       dst->mean = div64_s64((src->mean * src->nr_samples) +
  506. +                   (dst->mean * dst->nr_samples),
  507. +                   dst->nr_samples + src->nr_samples);
  508. +   }
  509. +   dst->nr_samples += src->nr_samples;
  510. +}
  511. +
  512. +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
  513. +{
  514. +   struct blk_mq_hw_ctx *hctx;
  515. +   struct blk_mq_ctx *ctx;
  516. +   int i, j, nr;
  517. +
  518. +   blk_stat_init(&dst[0]);
  519. +   blk_stat_init(&dst[1]);
  520. +
  521. +   nr = 0;
  522. +   do {
  523. +       uint64_t newest = 0;
  524. +
  525. +       queue_for_each_hw_ctx(q, hctx, i) {
  526. +           hctx_for_each_ctx(hctx, ctx, j) {
  527. +               if (!ctx->stat[0].nr_samples &&
  528. +                   !ctx->stat[1].nr_samples)
  529. +                   continue;
  530. +               if (ctx->stat[0].time > newest)
  531. +                   newest = ctx->stat[0].time;
  532. +               if (ctx->stat[1].time > newest)
  533. +                   newest = ctx->stat[1].time;
  534. +           }
  535. +       }
  536. +
  537. +       /*
  538. +        * No samples
  539. +        */
  540. +       if (!newest)
  541. +           break;
  542. +
  543. +       queue_for_each_hw_ctx(q, hctx, i) {
  544. +           hctx_for_each_ctx(hctx, ctx, j) {
  545. +               if (ctx->stat[0].time == newest) {
  546. +                   blk_stat_sum(&dst[0], &ctx->stat[0]);
  547. +                   nr++;
  548. +               }
  549. +               if (ctx->stat[1].time == newest) {
  550. +                   blk_stat_sum(&dst[1], &ctx->stat[1]);
  551. +                   nr++;
  552. +               }
  553. +           }
  554. +       }
  555. +       /*
  556. +        * If we race on finding an entry, just loop back again.
  557. +        * Should be very rare.
  558. +        */
  559. +   } while (!nr);
  560. +}
  561. +
  562. +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
  563. +{
  564. +   if (q->mq_ops)
  565. +       blk_mq_stat_get(q, dst);
  566. +   else {
  567. +       memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
  568. +       memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
  569. +   }
  570. +}
  571. +
  572. +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
  573. +{
  574. +   struct blk_mq_ctx *ctx;
  575. +   unsigned int i, nr;
  576. +
  577. +   nr = 0;
  578. +   do {
  579. +       uint64_t newest = 0;
  580. +
  581. +       hctx_for_each_ctx(hctx, ctx, i) {
  582. +           if (!ctx->stat[0].nr_samples &&
  583. +               !ctx->stat[1].nr_samples)
  584. +               continue;
  585. +
  586. +           if (ctx->stat[0].time > newest)
  587. +               newest = ctx->stat[0].time;
  588. +           if (ctx->stat[1].time > newest)
  589. +               newest = ctx->stat[1].time;
  590. +       }
  591. +
  592. +       if (!newest)
  593. +           break;
  594. +
  595. +       hctx_for_each_ctx(hctx, ctx, i) {
  596. +           if (ctx->stat[0].time == newest) {
  597. +               blk_stat_sum(&dst[0], &ctx->stat[0]);
  598. +               nr++;
  599. +           }
  600. +           if (ctx->stat[1].time == newest) {
  601. +               blk_stat_sum(&dst[1], &ctx->stat[1]);
  602. +               nr++;
  603. +           }
  604. +       }
  605. +       /*
  606. +        * If we race on finding an entry, just loop back again.
  607. +        * Should be very rare, as the window is only updated
  608. +        * occasionally
  609. +        */
  610. +   } while (!nr);
  611. +}
  612. +
  613. +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
  614. +{
  615. +   stat->min = -1ULL;
  616. +   stat->max = stat->nr_samples = stat->mean = 0;
  617. +   stat->time = time_now & BLK_STAT_MASK;
  618. +}
  619. +
  620. +void blk_stat_init(struct blk_rq_stat *stat)
  621. +{
  622. +   __blk_stat_init(stat, ktime_to_ns(ktime_get()));
  623. +}
  624. +
  625. +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
  626. +{
  627. +   s64 delta, now, value;
  628. +   u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat);
  629. +
  630. +   now = ktime_to_ns(ktime_get());
  631. +   if (now < rq_time)
  632. +       return;
  633. +
  634. +   if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK))
  635. +       __blk_stat_init(stat, now);
  636. +
  637. +   value = now - rq_time;
  638. +   if (value > stat->max)
  639. +       stat->max = value;
  640. +   if (value < stat->min)
  641. +       stat->min = value;
  642. +
  643. +   delta = value - stat->mean;
  644. +   if (delta)
  645. +       stat->mean += div64_s64(delta, stat->nr_samples + 1);
  646. +
  647. +   stat->nr_samples++;
  648. +}
  649. +
  650. +void blk_stat_clear(struct request_queue *q)
  651. +{
  652. +   if (q->mq_ops) {
  653. +       struct blk_mq_hw_ctx *hctx;
  654. +       struct blk_mq_ctx *ctx;
  655. +       int i, j;
  656. +
  657. +       queue_for_each_hw_ctx(q, hctx, i) {
  658. +           hctx_for_each_ctx(hctx, ctx, j) {
  659. +               blk_stat_init(&ctx->stat[0]);
  660. +               blk_stat_init(&ctx->stat[1]);
  661. +           }
  662. +       }
  663. +   } else {
  664. +       blk_stat_init(&q->rq_stats[0]);
  665. +       blk_stat_init(&q->rq_stats[1]);
  666. +   }
  667. +}
  668. diff -Naur linux-4.4.6-gentoo-orig/block/blk-stat.h linux-4.4.6-gentoo-patched/block/blk-stat.h
  669. --- linux-4.4.6-gentoo-orig/block/blk-stat.h    1970-01-01 03:00:00.000000000 +0300
  670. +++ linux-4.4.6-gentoo-patched/block/blk-stat.h 2016-05-04 11:02:48.600733982 +0300
  671. @@ -0,0 +1,17 @@
  672. +#ifndef BLK_STAT_H
  673. +#define BLK_STAT_H
  674. +
  675. +/*
  676. + * ~0.13s window as a power-of-2 (2^27 nsecs)
  677. + */
  678. +#define BLK_STAT_NSEC  134217728ULL
  679. +#define BLK_STAT_MASK  ~(BLK_STAT_NSEC - 1)
  680. +
  681. +void blk_stat_add(struct blk_rq_stat *, struct request *);
  682. +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
  683. +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
  684. +void blk_stat_clear(struct request_queue *q);
  685. +void blk_stat_init(struct blk_rq_stat *);
  686. +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
  687. +
  688. +#endif
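
blk_stat_add() above keeps a running mean per ~0.13s window (BLK_STAT_NSEC), restarting the bucket whenever a sample lands in a new window. A small stand-alone C sketch of the same incremental-mean update, with illustrative names and made-up sample values (user-space code, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define WIN_NSEC 134217728ULL           /* ~0.13s, power of two */
    #define WIN_MASK (~(WIN_NSEC - 1))

    struct rq_stat {
        int64_t mean, min, max;
        uint64_t nr_samples;
        uint64_t window;                    /* masked start of current window */
    };

    static void stat_init(struct rq_stat *s, uint64_t now)
    {
        s->min = INT64_MAX;
        s->max = 0;
        s->mean = 0;
        s->nr_samples = 0;
        s->window = now & WIN_MASK;
    }

    static void stat_add(struct rq_stat *s, uint64_t now, int64_t value)
    {
        if ((now & WIN_MASK) != s->window)  /* sample belongs to a new window */
            stat_init(s, now);
        if (value > s->max)
            s->max = value;
        if (value < s->min)
            s->min = value;
        /* incremental mean: mean += (value - mean) / (n + 1) */
        s->mean += (value - s->mean) / (int64_t)(s->nr_samples + 1);
        s->nr_samples++;
    }

    int main(void)
    {
        struct rq_stat s;
        int64_t samples[] = { 100, 200, 600 };   /* made-up latencies, ns */

        stat_init(&s, 0);
        for (unsigned int i = 0; i < 3; i++)
            stat_add(&s, i, samples[i]);
        printf("samples=%llu mean=%lld min=%lld max=%lld\n",
               (unsigned long long)s.nr_samples, (long long)s.mean,
               (long long)s.min, (long long)s.max);
        return 0;
    }
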
  689. diff -Naur linux-4.4.6-gentoo-orig/block/blk-sysfs.c linux-4.4.6-gentoo-patched/block/blk-sysfs.c
  690. --- linux-4.4.6-gentoo-orig/block/blk-sysfs.c   2016-05-04 11:19:37.596649829 +0300
  691. +++ linux-4.4.6-gentoo-patched/block/blk-sysfs.c    2016-05-04 11:02:48.600733982 +0300
  692. @@ -10,6 +10,7 @@
  693.  #include <linux/blktrace_api.h>
  694.  #include <linux/blk-mq.h>
  695.  #include <linux/blk-cgroup.h>
  696. +#include <linux/wbt.h>
  697.  
  698.  #include "blk.h"
  699.  #include "blk-mq.h"
  700. @@ -41,6 +42,19 @@
  701.     return count;
  702.  }
  703.  
  704. +static ssize_t queue_var_store64(u64 *var, const char *page)
  705. +{
  706. +   int err;
  707. +   u64 v;
  708. +
  709. +   err = kstrtou64(page, 10, &v);
  710. +   if (err < 0)
  711. +       return err;
  712. +
  713. +   *var = v;
  714. +   return 0;
  715. +}
  716. +
  717.  static ssize_t queue_requests_show(struct request_queue *q, char *page)
  718.  {
  719.     return queue_var_show(q->nr_requests, (page));
  720. @@ -348,6 +362,110 @@
  721.     return ret;
  722.  }
  723.  
  724. +static ssize_t queue_wb_win_show(struct request_queue *q, char *page)
  725. +{
  726. +   if (!q->rq_wb)
  727. +       return -EINVAL;
  728. +
  729. +   return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000));
  730. +}
  731. +
  732. +static ssize_t queue_wb_win_store(struct request_queue *q, const char *page,
  733. +                 size_t count)
  734. +{
  735. +   ssize_t ret;
  736. +   u64 val;
  737. +
  738. +   if (!q->rq_wb)
  739. +       return -EINVAL;
  740. +
  741. +   ret = queue_var_store64(&val, page);
  742. +   if (ret < 0)
  743. +       return ret;
  744. +
  745. +   q->rq_wb->win_nsec = val * 1000ULL;
  746. +   wbt_update_limits(q->rq_wb);
  747. +   return count;
  748. +}
  749. +
  750. +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
  751. +{
  752. +   if (!q->rq_wb)
  753. +       return -EINVAL;
  754. +
  755. +   return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
  756. +}
  757. +
  758. +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
  759. +                 size_t count)
  760. +{
  761. +   ssize_t ret;
  762. +   u64 val;
  763. +
  764. +   if (!q->rq_wb)
  765. +       return -EINVAL;
  766. +
  767. +   ret = queue_var_store64(&val, page);
  768. +   if (ret < 0)
  769. +       return ret;
  770. +
  771. +   q->rq_wb->min_lat_nsec = val * 1000ULL;
  772. +   wbt_update_limits(q->rq_wb);
  773. +   return count;
  774. +}
  775. +
  776. +static ssize_t queue_wc_show(struct request_queue *q, char *page)
  777. +{
  778. +   if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
  779. +       return sprintf(page, "write back\n");
  780. +
  781. +   return sprintf(page, "write through\n");
  782. +}
  783. +
  784. +static ssize_t queue_wc_store(struct request_queue *q, const char *page,
  785. +                 size_t count)
  786. +{
  787. +   int set = -1;
  788. +
  789. +   if (!strncmp(page, "write back", 10))
  790. +       set = 1;
  791. +   else if (!strncmp(page, "write through", 13) ||
  792. +        !strncmp(page, "none", 4))
  793. +       set = 0;
  794. +
  795. +   if (set == -1)
  796. +       return -EINVAL;
  797. +
  798. +   spin_lock_irq(q->queue_lock);
  799. +   if (set)
  800. +       queue_flag_set(QUEUE_FLAG_WC, q);
  801. +   else
  802. +       queue_flag_clear(QUEUE_FLAG_WC, q);
  803. +   spin_unlock_irq(q->queue_lock);
  804. +
  805. +   return count;
  806. +}
  807. +
  808. +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
  809. +{
  810. +   return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
  811. +           pre, (long long) stat->nr_samples,
  812. +           (long long) stat->mean, (long long) stat->min,
  813. +           (long long) stat->max);
  814. +}
  815. +
  816. +static ssize_t queue_stats_show(struct request_queue *q, char *page)
  817. +{
  818. +   struct blk_rq_stat stat[2];
  819. +   ssize_t ret;
  820. +
  821. +   blk_queue_stat_get(q, stat);
  822. +
  823. +   ret = print_stat(page, &stat[0], "read :");
  824. +   ret += print_stat(page + ret, &stat[1], "write:");
  825. +   return ret;
  826. +}
  827. +
  828.  static struct queue_sysfs_entry queue_requests_entry = {
  829.     .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
  830.     .show = queue_requests_show,
  831. @@ -479,6 +597,29 @@
  832.     .store = queue_poll_store,
  833.  };
  834.  
  835. +static struct queue_sysfs_entry queue_wc_entry = {
  836. +   .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
  837. +   .show = queue_wc_show,
  838. +   .store = queue_wc_store,
  839. +};
  840. +
  841. +static struct queue_sysfs_entry queue_stats_entry = {
  842. +   .attr = {.name = "stats", .mode = S_IRUGO },
  843. +   .show = queue_stats_show,
  844. +};
  845. +
  846. +static struct queue_sysfs_entry queue_wb_lat_entry = {
  847. +   .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
  848. +   .show = queue_wb_lat_show,
  849. +   .store = queue_wb_lat_store,
  850. +};
  851. +
  852. +static struct queue_sysfs_entry queue_wb_win_entry = {
  853. +   .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR },
  854. +   .show = queue_wb_win_show,
  855. +   .store = queue_wb_win_store,
  856. +};
  857. +
  858.  static struct attribute *default_attrs[] = {
  859.     &queue_requests_entry.attr,
  860.     &queue_ra_entry.attr,
  861. @@ -504,6 +645,10 @@
  862.     &queue_iostats_entry.attr,
  863.     &queue_random_entry.attr,
  864.     &queue_poll_entry.attr,
  865. +   &queue_wc_entry.attr,
  866. +   &queue_stats_entry.attr,
  867. +   &queue_wb_lat_entry.attr,
  868. +   &queue_wb_win_entry.attr,
  869.     NULL,
  870.  };
  871.  
  872. @@ -618,6 +763,43 @@
  873.     .release    = blk_release_queue,
  874.  };
  875.  
  876. +static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
  877. +{
  878. +   blk_queue_stat_get(data, stat);
  879. +}
  880. +
  881. +static void blk_wb_stat_clear(void *data)
  882. +{
  883. +   blk_stat_clear(data);
  884. +}
  885. +
  886. +static struct wb_stat_ops wb_stat_ops = {
  887. +   .get    = blk_wb_stat_get,
  888. +   .clear  = blk_wb_stat_clear,
  889. +};
  890. +
  891. +static void blk_wb_init(struct request_queue *q)
  892. +{
  893. +   struct rq_wb *rwb;
  894. +
  895. +   rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q);
  896. +
  897. +   /*
  898. +    * If this fails, we don't get throttling
  899. +    */
  900. +   if (IS_ERR(rwb))
  901. +       return;
  902. +
  903. +   if (blk_queue_nonrot(q))
  904. +       rwb->min_lat_nsec = 2000000ULL;
  905. +   else
  906. +       rwb->min_lat_nsec = 75000000ULL;
  907. +
  908. +   wbt_set_queue_depth(rwb, blk_queue_depth(q));
  909. +   wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
  910. +   q->rq_wb = rwb;
  911. +}
  912. +
  913.  int blk_register_queue(struct gendisk *disk)
  914.  {
  915.     int ret;
  916. @@ -657,6 +839,8 @@
  917.     if (q->mq_ops)
  918.         blk_mq_register_disk(disk);
  919.  
  920. +   blk_wb_init(q);
  921. +
  922.     if (!q->request_fn)
  923.         return 0;
  924.  
  925. diff -Naur linux-4.4.6-gentoo-orig/block/Kconfig linux-4.4.6-gentoo-patched/block/Kconfig
  926. --- linux-4.4.6-gentoo-orig/block/Kconfig   2016-05-04 11:19:37.596649829 +0300
  927. +++ linux-4.4.6-gentoo-patched/block/Kconfig    2016-05-04 11:02:48.599733982 +0300
  928. @@ -4,6 +4,7 @@
  929.  menuconfig BLOCK
  930.         bool "Enable the block layer" if EXPERT
  931.         default y
  932. +       select WBT
  933.         help
  934.      Provide block layer support for the kernel.
  935.  
  936. diff -Naur linux-4.4.6-gentoo-orig/block/Makefile linux-4.4.6-gentoo-patched/block/Makefile
  937. --- linux-4.4.6-gentoo-orig/block/Makefile  2016-05-04 11:19:37.596649829 +0300
  938. +++ linux-4.4.6-gentoo-patched/block/Makefile   2016-05-04 11:10:18.790696435 +0300
  939. @@ -5,7 +5,7 @@
  940.  obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
  941.             blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
  942.             blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
  943. -           blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
  944. +           blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
  945.             blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
  946.             genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
  947.             partitions/
  948. diff -Naur linux-4.4.6-gentoo-orig/Documentation/block/queue-sysfs.txt linux-4.4.6-gentoo-patched/Documentation/block/queue-sysfs.txt
  949. --- linux-4.4.6-gentoo-orig/Documentation/block/queue-sysfs.txt 2016-05-04 11:19:37.597649829 +0300
  950. +++ linux-4.4.6-gentoo-patched/Documentation/block/queue-sysfs.txt  2016-05-04 11:02:48.598733982 +0300
  951. @@ -141,6 +141,28 @@
  952.  an IO scheduler name to this file will attempt to load that IO scheduler
  953.  module, if it isn't already present in the system.
  954.  
  955. +write_cache (RW)
  956. +----------------
  957. +When read, this file will display whether the device has write back
  958. +caching enabled or not. It will return "write back" for the former
  959. +case, and "write through" for the latter. Writing to this file can
  960. +change the kernel's view of the device, but it doesn't alter the
  961. +device state. This means that it might not be safe to toggle the
  962. +setting from "write back" to "write through", since that will also
  963. +eliminate cache flushes issued by the kernel.
  964. +
  965. +wbt_lat_usec (RW)
  966. +-----------------
  967. +If the device is registered for writeback throttling, then this file shows
  968. +the target minimum read latency. If this latency is exceeded in a given
  969. +window of time (see wbt_window_usec), then the writeback throttling will start
  970. +scaling back writes.
  971. +
  972. +wbt_window_usec (RW)
  973. +--------------------
  974. +If the device is registered for writeback throttling, then this file shows
  975. +the value of the monitoring window in which we'll look at the target
  976. +latency. See wbt_lat_usec.
  977.  
  978.  
  979.  Jens Axboe <jens.axboe@oracle.com>, February 2009
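
The attributes documented above (plus the per-queue "stats" file added in blk-sysfs.c) all live under /sys/block/<dev>/queue/. A small user-space C sketch that dumps them, assuming a device named sda (adjust the path for your system; not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        /* queue attributes added by this patch; the device name is an assumption */
        static const char *attrs[] = {
            "write_cache", "stats", "wbt_lat_usec", "wbt_window_usec"
        };
        char path[128], line[256];

        for (unsigned int i = 0; i < 4; i++) {
            FILE *f;

            snprintf(path, sizeof(path), "/sys/block/sda/queue/%s", attrs[i]);
            f = fopen(path, "r");
            if (!f) {
                perror(path);
                continue;
            }
            printf("%s:\n", attrs[i]);
            while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
            fclose(f);
        }
        return 0;
    }
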
  980. diff -Naur linux-4.4.6-gentoo-orig/Documentation/block/writeback_cache_control.txt linux-4.4.6-gentoo-patched/Documentation/block/writeback_cache_control.txt
  981. --- linux-4.4.6-gentoo-orig/Documentation/block/writeback_cache_control.txt 2016-05-04 11:19:37.597649829 +0300
  982. +++ linux-4.4.6-gentoo-patched/Documentation/block/writeback_cache_control.txt  2016-05-04 11:02:48.598733982 +0300
  983. @@ -71,7 +71,7 @@
  984.  driver needs to tell the block layer that it supports flushing caches by
  985.  doing:
  986.  
  987. -   blk_queue_flush(sdkp->disk->queue, REQ_FLUSH);
  988. +   blk_queue_write_cache(sdkp->disk->queue, true, false);
  989.  
  990.  and handle empty REQ_FLUSH requests in its prep_fn/request_fn.  Note that
  991.  REQ_FLUSH requests with a payload are automatically turned into a sequence
  992. @@ -79,7 +79,7 @@
  993.  layer.  For devices that also support the FUA bit the block layer needs
  994.  to be told to pass through the REQ_FUA bit using:
  995.  
  996. -   blk_queue_flush(sdkp->disk->queue, REQ_FLUSH | REQ_FUA);
  997. +   blk_queue_write_cache(sdkp->disk->queue, true, true);
  998.  
  999.  and the driver must handle write requests that have the REQ_FUA bit set
  1000.  in prep_fn/request_fn.  If the FUA bit is not natively supported the block
  1001. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/drbd/drbd_main.c linux-4.4.6-gentoo-patched/drivers/block/drbd/drbd_main.c
  1002. --- linux-4.4.6-gentoo-orig/drivers/block/drbd/drbd_main.c  2016-05-04 11:19:37.598649829 +0300
  1003. +++ linux-4.4.6-gentoo-patched/drivers/block/drbd/drbd_main.c   2016-05-04 11:02:48.601733981 +0300
  1004. @@ -2769,7 +2769,7 @@
  1005.     q->backing_dev_info.congested_data = device;
  1006.  
  1007.     blk_queue_make_request(q, drbd_make_request);
  1008. -   blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
  1009. +   blk_queue_write_cache(q, true, true);
  1010.     /* Setting the max_hw_sectors to an odd value of 8kibyte here
  1011.        This triggers a max_bio_size message upon first attach or connect */
  1012.     blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
  1013. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/loop.c linux-4.4.6-gentoo-patched/drivers/block/loop.c
  1014. --- linux-4.4.6-gentoo-orig/drivers/block/loop.c    2016-05-04 11:19:37.598649829 +0300
  1015. +++ linux-4.4.6-gentoo-patched/drivers/block/loop.c 2016-05-04 11:02:48.601733981 +0300
  1016. @@ -937,7 +937,7 @@
  1017.     mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
  1018.  
  1019.     if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
  1020. -       blk_queue_flush(lo->lo_queue, REQ_FLUSH);
  1021. +       blk_queue_write_cache(lo->lo_queue, true, false);
  1022.  
  1023.     loop_update_dio(lo);
  1024.     set_capacity(lo->lo_disk, size);
  1025. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/mtip32xx/mtip32xx.c linux-4.4.6-gentoo-patched/drivers/block/mtip32xx/mtip32xx.c
  1026. --- linux-4.4.6-gentoo-orig/drivers/block/mtip32xx/mtip32xx.c   2016-05-04 11:19:37.599649829 +0300
  1027. +++ linux-4.4.6-gentoo-patched/drivers/block/mtip32xx/mtip32xx.c    2016-05-04 11:02:48.602733981 +0300
  1028. @@ -3913,12 +3913,6 @@
  1029.     blk_queue_io_min(dd->queue, 4096);
  1030.     blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask);
  1031.  
  1032. -   /*
  1033. -    * write back cache is not supported in the device. FUA depends on
  1034. -    * write back cache support, hence setting flush support to zero.
  1035. -    */
  1036. -   blk_queue_flush(dd->queue, 0);
  1037. -
  1038.     /* Signal trim support */
  1039.     if (dd->trim_supp == true) {
  1040.         set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags);
  1041. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/nbd.c linux-4.4.6-gentoo-patched/drivers/block/nbd.c
  1042. --- linux-4.4.6-gentoo-orig/drivers/block/nbd.c 2016-05-04 11:19:37.600649829 +0300
  1043. +++ linux-4.4.6-gentoo-patched/drivers/block/nbd.c  2016-05-04 11:02:48.602733981 +0300
  1044. @@ -750,9 +750,9 @@
  1045.             queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
  1046.                 nbd->disk->queue);
  1047.         if (nbd->flags & NBD_FLAG_SEND_FLUSH)
  1048. -           blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
  1049. +           blk_queue_write_cache(nbd->disk->queue, true, false);
  1050.         else
  1051. -           blk_queue_flush(nbd->disk->queue, 0);
  1052. +           blk_queue_write_cache(nbd->disk->queue, false, false);
  1053.  
  1054.         thread = kthread_run(nbd_thread_send, nbd, "%s",
  1055.                      nbd_name(nbd));
  1056. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/osdblk.c linux-4.4.6-gentoo-patched/drivers/block/osdblk.c
  1057. --- linux-4.4.6-gentoo-orig/drivers/block/osdblk.c  2016-05-04 11:19:37.600649829 +0300
  1058. +++ linux-4.4.6-gentoo-patched/drivers/block/osdblk.c   2016-05-04 11:02:48.602733981 +0300
  1059. @@ -437,7 +437,7 @@
  1060.     blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
  1061.  
  1062.     blk_queue_prep_rq(q, blk_queue_start_tag);
  1063. -   blk_queue_flush(q, REQ_FLUSH);
  1064. +   blk_queue_write_cache(q, true, false);
  1065.  
  1066.     disk->queue = q;
  1067.  
  1068. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/ps3disk.c linux-4.4.6-gentoo-patched/drivers/block/ps3disk.c
  1069. --- linux-4.4.6-gentoo-orig/drivers/block/ps3disk.c 2016-05-04 11:19:37.601649829 +0300
  1070. +++ linux-4.4.6-gentoo-patched/drivers/block/ps3disk.c  2016-05-04 11:02:48.602733981 +0300
  1071. @@ -468,7 +468,7 @@
  1072.     blk_queue_dma_alignment(queue, dev->blk_size-1);
  1073.     blk_queue_logical_block_size(queue, dev->blk_size);
  1074.  
  1075. -   blk_queue_flush(queue, REQ_FLUSH);
  1076. +   blk_queue_write_cache(queue, true, false);
  1077.  
  1078.     blk_queue_max_segments(queue, -1);
  1079.     blk_queue_max_segment_size(queue, dev->bounce_size);
  1080. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/skd_main.c linux-4.4.6-gentoo-patched/drivers/block/skd_main.c
  1081. --- linux-4.4.6-gentoo-orig/drivers/block/skd_main.c    2016-05-04 11:19:37.601649829 +0300
  1082. +++ linux-4.4.6-gentoo-patched/drivers/block/skd_main.c 2016-05-04 11:02:48.603733981 +0300
  1083. @@ -4412,7 +4412,7 @@
  1084.     disk->queue = q;
  1085.     q->queuedata = skdev;
  1086.  
  1087. -   blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
  1088. +   blk_queue_write_cache(q, true, true);
  1089.     blk_queue_max_segments(q, skdev->sgs_per_request);
  1090.     blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
  1091.  
  1092. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/virtio_blk.c linux-4.4.6-gentoo-patched/drivers/block/virtio_blk.c
  1093. --- linux-4.4.6-gentoo-orig/drivers/block/virtio_blk.c  2016-05-04 11:19:37.602649828 +0300
  1094. +++ linux-4.4.6-gentoo-patched/drivers/block/virtio_blk.c   2016-05-04 11:02:48.603733981 +0300
  1095. @@ -488,11 +488,7 @@
  1096.     u8 writeback = virtblk_get_cache_mode(vdev);
  1097.     struct virtio_blk *vblk = vdev->priv;
  1098.  
  1099. -   if (writeback)
  1100. -       blk_queue_flush(vblk->disk->queue, REQ_FLUSH);
  1101. -   else
  1102. -       blk_queue_flush(vblk->disk->queue, 0);
  1103. -
  1104. +   blk_queue_write_cache(vblk->disk->queue, writeback, false);
  1105.     revalidate_disk(vblk->disk);
  1106.  }
  1107.  
  1108. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/xen-blkback/xenbus.c linux-4.4.6-gentoo-patched/drivers/block/xen-blkback/xenbus.c
  1109. --- linux-4.4.6-gentoo-orig/drivers/block/xen-blkback/xenbus.c  2016-05-04 11:19:37.603649828 +0300
  1110. +++ linux-4.4.6-gentoo-patched/drivers/block/xen-blkback/xenbus.c   2016-05-04 11:02:48.603733981 +0300
  1111. @@ -413,7 +413,7 @@
  1112.         vbd->type |= VDISK_REMOVABLE;
  1113.  
  1114.     q = bdev_get_queue(bdev);
  1115. -   if (q && q->flush_flags)
  1116. +   if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
  1117.         vbd->flush_support = true;
  1118.  
  1119.     if (q && blk_queue_secdiscard(q))
  1120. diff -Naur linux-4.4.6-gentoo-orig/drivers/block/xen-blkfront.c linux-4.4.6-gentoo-patched/drivers/block/xen-blkfront.c
  1121. --- linux-4.4.6-gentoo-orig/drivers/block/xen-blkfront.c    2016-05-04 11:19:37.603649828 +0300
  1122. +++ linux-4.4.6-gentoo-patched/drivers/block/xen-blkfront.c 2016-05-04 11:02:48.603733981 +0300
  1123. @@ -851,7 +851,8 @@
  1124.  
  1125.  static void xlvbd_flush(struct blkfront_info *info)
  1126.  {
  1127. -   blk_queue_flush(info->rq, info->feature_flush);
  1128. +   blk_queue_write_cache(info->rq, info->feature_flush & REQ_FLUSH,
  1129. +               info->feature_flush & REQ_FUA);
  1130.     pr_info("blkfront: %s: %s %s %s %s %s\n",
  1131.         info->gd->disk_name, flush_info(info->feature_flush),
  1132.         "persistent grants:", info->feature_persistent ?
  1133. diff -Naur linux-4.4.6-gentoo-orig/drivers/ide/ide-disk.c linux-4.4.6-gentoo-patched/drivers/ide/ide-disk.c
  1134. --- linux-4.4.6-gentoo-orig/drivers/ide/ide-disk.c  2016-05-04 11:19:37.604649828 +0300
  1135. +++ linux-4.4.6-gentoo-patched/drivers/ide/ide-disk.c   2016-05-04 11:02:48.603733981 +0300
  1136. @@ -522,7 +522,7 @@
  1137.  static void update_flush(ide_drive_t *drive)
  1138.  {
  1139.     u16 *id = drive->id;
  1140. -   unsigned flush = 0;
  1141. +   bool wc = false;
  1142.  
  1143.     if (drive->dev_flags & IDE_DFLAG_WCACHE) {
  1144.         unsigned long long capacity;
  1145. @@ -546,12 +546,12 @@
  1146.                drive->name, barrier ? "" : "not ");
  1147.  
  1148.         if (barrier) {
  1149. -           flush = REQ_FLUSH;
  1150. +           wc = true;
  1151.             blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
  1152.         }
  1153.     }
  1154.  
  1155. -   blk_queue_flush(drive->queue, flush);
  1156. +   blk_queue_write_cache(drive->queue, wc, false);
  1157.  }
  1158.  
  1159.  ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
  1160. diff -Naur linux-4.4.6-gentoo-orig/drivers/md/bcache/super.c linux-4.4.6-gentoo-patched/drivers/md/bcache/super.c
  1161. --- linux-4.4.6-gentoo-orig/drivers/md/bcache/super.c   2016-05-04 11:19:37.604649828 +0300
  1162. +++ linux-4.4.6-gentoo-patched/drivers/md/bcache/super.c    2016-05-04 11:02:48.604733981 +0300
  1163. @@ -816,7 +816,7 @@
  1164.     clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
  1165.     set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
  1166.  
  1167. -   blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
  1168. +   blk_queue_write_cache(q, true, true);
  1169.  
  1170.     return 0;
  1171.  }
  1172. diff -Naur linux-4.4.6-gentoo-orig/drivers/md/dm-table.c linux-4.4.6-gentoo-patched/drivers/md/dm-table.c
  1173. --- linux-4.4.6-gentoo-orig/drivers/md/dm-table.c   2016-05-04 11:19:37.605649828 +0300
  1174. +++ linux-4.4.6-gentoo-patched/drivers/md/dm-table.c    2016-05-04 11:02:48.604733981 +0300
  1175. @@ -1312,13 +1312,13 @@
  1176.  static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
  1177.                 sector_t start, sector_t len, void *data)
  1178.  {
  1179. -   unsigned flush = (*(unsigned *)data);
  1180. +   unsigned long flush = (unsigned long) data;
  1181.     struct request_queue *q = bdev_get_queue(dev->bdev);
  1182.  
  1183. -   return q && (q->flush_flags & flush);
  1184. +   return q && (q->queue_flags & flush);
  1185.  }
  1186.  
  1187. -static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
  1188. +static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
  1189.  {
  1190.     struct dm_target *ti;
  1191.     unsigned i = 0;
  1192. @@ -1339,7 +1339,7 @@
  1193.             return true;
  1194.  
  1195.         if (ti->type->iterate_devices &&
  1196. -           ti->type->iterate_devices(ti, device_flush_capable, &flush))
  1197. +           ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
  1198.             return true;
  1199.     }
  1200.  
  1201. @@ -1470,7 +1470,7 @@
  1202.  void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
  1203.                    struct queue_limits *limits)
  1204.  {
  1205. -   unsigned flush = 0;
  1206. +   bool wc = false, fua = false;
  1207.  
  1208.     /*
  1209.      * Copy table's limits to the DM device's request_queue
  1210. @@ -1482,12 +1482,12 @@
  1211.     else
  1212.         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
  1213.  
  1214. -   if (dm_table_supports_flush(t, REQ_FLUSH)) {
  1215. -       flush |= REQ_FLUSH;
  1216. -       if (dm_table_supports_flush(t, REQ_FUA))
  1217. -           flush |= REQ_FUA;
  1218. +   if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
  1219. +       wc = true;
  1220. +       if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
  1221. +           fua = true;
  1222.     }
  1223. -   blk_queue_flush(q, flush);
  1224. +   blk_queue_write_cache(q, wc, fua);
  1225.  
  1226.     if (!dm_table_discard_zeroes_data(t))
  1227.         q->limits.discard_zeroes_data = 0;
  1228. diff -Naur linux-4.4.6-gentoo-orig/drivers/md/md.c linux-4.4.6-gentoo-patched/drivers/md/md.c
  1229. --- linux-4.4.6-gentoo-orig/drivers/md/md.c 2016-05-04 11:19:37.606649828 +0300
  1230. +++ linux-4.4.6-gentoo-patched/drivers/md/md.c  2016-05-04 11:02:48.605733981 +0300
  1231. @@ -5037,7 +5037,7 @@
  1232.     disk->fops = &md_fops;
  1233.     disk->private_data = mddev;
  1234.     disk->queue = mddev->queue;
  1235. -   blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
  1236. +   blk_queue_write_cache(mddev->queue, true, true);
  1237.     /* Allow extended partitions.  This makes the
  1238.      * 'mdp' device redundant, but we can't really
  1239.      * remove it now.
  1240. diff -Naur linux-4.4.6-gentoo-orig/drivers/md/raid5-cache.c linux-4.4.6-gentoo-patched/drivers/md/raid5-cache.c
  1241. --- linux-4.4.6-gentoo-orig/drivers/md/raid5-cache.c    2016-05-04 11:19:37.607649828 +0300
  1242. +++ linux-4.4.6-gentoo-patched/drivers/md/raid5-cache.c 2016-05-04 11:02:48.605733981 +0300
  1243. @@ -1133,6 +1133,7 @@
  1244.  
  1245.  int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
  1246.  {
  1247. +   struct request_queue *q = bdev_get_queue(rdev->bdev);
  1248.     struct r5l_log *log;
  1249.  
  1250.     if (PAGE_SIZE != 4096)
  1251. @@ -1142,7 +1143,7 @@
  1252.         return -ENOMEM;
  1253.     log->rdev = rdev;
  1254.  
  1255. -   log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
  1256. +   log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
  1257.  
  1258.     log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
  1259.                        sizeof(rdev->mddev->uuid));
  1260. diff -Naur linux-4.4.6-gentoo-orig/drivers/mmc/card/block.c linux-4.4.6-gentoo-patched/drivers/mmc/card/block.c
  1261. --- linux-4.4.6-gentoo-orig/drivers/mmc/card/block.c    2016-05-04 11:19:37.608649828 +0300
  1262. +++ linux-4.4.6-gentoo-patched/drivers/mmc/card/block.c 2016-05-04 11:02:48.605733981 +0300
  1263. @@ -2282,7 +2282,7 @@
  1264.         ((card->ext_csd.rel_param & EXT_CSD_WR_REL_PARAM_EN) ||
  1265.          card->ext_csd.rel_sectors)) {
  1266.         md->flags |= MMC_BLK_REL_WR;
  1267. -       blk_queue_flush(md->queue.queue, REQ_FLUSH | REQ_FUA);
  1268. +       blk_queue_write_cache(md->queue.queue, true, true);
  1269.     }
  1270.  
  1271.     if (mmc_card_mmc(card) &&
  1272. diff -Naur linux-4.4.6-gentoo-orig/drivers/mtd/mtd_blkdevs.c linux-4.4.6-gentoo-patched/drivers/mtd/mtd_blkdevs.c
  1273. --- linux-4.4.6-gentoo-orig/drivers/mtd/mtd_blkdevs.c   2016-05-04 11:19:37.608649828 +0300
  1274. +++ linux-4.4.6-gentoo-patched/drivers/mtd/mtd_blkdevs.c    2016-05-04 11:02:48.605733981 +0300
  1275. @@ -409,7 +409,7 @@
  1276.         goto error3;
  1277.  
  1278.     if (tr->flush)
  1279. -       blk_queue_flush(new->rq, REQ_FLUSH);
  1280. +       blk_queue_write_cache(new->rq, true, false);
  1281.  
  1282.     new->rq->queuedata = new;
  1283.     blk_queue_logical_block_size(new->rq, tr->blksize);
  1284. diff -Naur linux-4.4.6-gentoo-orig/drivers/nvme/host/pci.c linux-4.4.6-gentoo-patched/drivers/nvme/host/pci.c
  1285. --- linux-4.4.6-gentoo-orig/drivers/nvme/host/pci.c 2016-01-11 02:01:32.000000000 +0300
  1286. +++ linux-4.4.6-gentoo-patched/drivers/nvme/host/pci.c  2016-05-04 11:48:03.179507579 +0300
  1287. @@ -2272,6 +2272,7 @@
  1288.     list_add_tail(&ns->list, &dev->namespaces);
  1289.  
  1290.     blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
  1291. +   bool vwc = false;
  1292.     if (dev->max_hw_sectors) {
  1293.         blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
  1294.         blk_queue_max_segments(ns->queue,
  1295. @@ -2279,8 +2280,10 @@
  1296.     }
  1297.     if (dev->stripe_size)
  1298.         blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
  1299. -   if (dev->vwc & NVME_CTRL_VWC_PRESENT)
  1300. -       blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
  1301. +   if (dev->vwc & NVME_CTRL_VWC_PRESENT)
  1302. +       vwc = true;
  1303. +   blk_queue_write_cache(ns->queue, vwc, vwc);
  1304. +
  1305.     blk_queue_virt_boundary(ns->queue, dev->page_size - 1);
  1306.  
  1307.     disk->major = nvme_major;
  1308. diff -Naur linux-4.4.6-gentoo-orig/drivers/scsi/scsi.c linux-4.4.6-gentoo-patched/drivers/scsi/scsi.c
  1309. --- linux-4.4.6-gentoo-orig/drivers/scsi/scsi.c 2016-05-04 11:19:37.609649828 +0300
  1310. +++ linux-4.4.6-gentoo-patched/drivers/scsi/scsi.c  2016-05-04 11:03:27.408730745 +0300
  1311. @@ -621,6 +621,9 @@
  1312.         wmb();
  1313.     }
  1314.  
  1315. +   if (sdev->request_queue)
  1316. +       blk_set_queue_depth(sdev->request_queue, depth);
  1317. +
  1318.     return sdev->queue_depth;
  1319.  }
  1320.  EXPORT_SYMBOL(scsi_change_queue_depth);
  1321. diff -Naur linux-4.4.6-gentoo-orig/drivers/scsi/sd.c linux-4.4.6-gentoo-patched/drivers/scsi/sd.c
  1322. --- linux-4.4.6-gentoo-orig/drivers/scsi/sd.c   2016-05-04 11:19:37.609649828 +0300
  1323. +++ linux-4.4.6-gentoo-patched/drivers/scsi/sd.c    2016-05-04 11:03:27.408730745 +0300
  1324. @@ -137,15 +137,15 @@
  1325.  
  1326.  static void sd_set_flush_flag(struct scsi_disk *sdkp)
  1327.  {
  1328. -   unsigned flush = 0;
  1329. +   bool wc = false, fua = false;
  1330.  
  1331.     if (sdkp->WCE) {
  1332. -       flush |= REQ_FLUSH;
  1333. +       wc = true;
  1334.         if (sdkp->DPOFUA)
  1335. -           flush |= REQ_FUA;
  1336. +           fua = true;
  1337.     }
  1338.  
  1339. -   blk_queue_flush(sdkp->disk->queue, flush);
  1340. +   blk_queue_write_cache(sdkp->disk->queue, wc, fua);
  1341.  }
  1342.  
  1343.  static ssize_t
  1344. diff -Naur linux-4.4.6-gentoo-orig/drivers/target/target_core_iblock.c linux-4.4.6-gentoo-patched/drivers/target/target_core_iblock.c
  1345. --- linux-4.4.6-gentoo-orig/drivers/target/target_core_iblock.c 2016-05-04 11:19:37.610649828 +0300
  1346. +++ linux-4.4.6-gentoo-patched/drivers/target/target_core_iblock.c  2016-05-04 11:03:27.409730745 +0300
  1347. @@ -653,10 +653,10 @@
  1348.          * Force writethrough using WRITE_FUA if a volatile write cache
  1349.          * is not enabled, or if initiator set the Force Unit Access bit.
  1350.          */
  1351. -       if (q->flush_flags & REQ_FUA) {
  1352. +       if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) {
  1353.             if (cmd->se_cmd_flags & SCF_FUA)
  1354.                 rw = WRITE_FUA;
  1355. -           else if (!(q->flush_flags & REQ_FLUSH))
  1356. +           else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
  1357.                 rw = WRITE_FUA;
  1358.             else
  1359.                 rw = WRITE;
  1360. @@ -802,7 +802,7 @@
  1361.     struct block_device *bd = ib_dev->ibd_bd;
  1362.     struct request_queue *q = bdev_get_queue(bd);
  1363.  
  1364. -   return q->flush_flags & REQ_FLUSH;
  1365. +   return test_bit(QUEUE_FLAG_WC, &q->queue_flags);
  1366.  }
  1367.  
  1368.  static const struct target_backend_ops iblock_ops = {
  1369. diff -Naur linux-4.4.6-gentoo-orig/fs/block_dev.c linux-4.4.6-gentoo-patched/fs/block_dev.c
  1370. --- linux-4.4.6-gentoo-orig/fs/block_dev.c  2016-05-04 11:19:37.610649828 +0300
  1371. +++ linux-4.4.6-gentoo-patched/fs/block_dev.c   2016-05-04 11:03:27.409730745 +0300
  1372. @@ -427,7 +427,7 @@
  1373.             struct page *page, struct writeback_control *wbc)
  1374.  {
  1375.     int result;
  1376. -   int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
  1377. +   int rw = wbc_to_write_cmd(wbc);
  1378.     const struct block_device_operations *ops = bdev->bd_disk->fops;
  1379.  
  1380.     if (!ops->rw_page || bdev_get_integrity(bdev))
  1381. diff -Naur linux-4.4.6-gentoo-orig/fs/buffer.c linux-4.4.6-gentoo-patched/fs/buffer.c
  1382. --- linux-4.4.6-gentoo-orig/fs/buffer.c 2016-05-04 11:19:37.611649828 +0300
  1383. +++ linux-4.4.6-gentoo-patched/fs/buffer.c  2016-05-04 11:03:27.409730745 +0300
  1384. @@ -1708,7 +1708,7 @@
  1385.     struct buffer_head *bh, *head;
  1386.     unsigned int blocksize, bbits;
  1387.     int nr_underway = 0;
  1388. -   int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
  1389. +   int write_op = wbc_to_write_cmd(wbc);
  1390.  
  1391.     head = create_page_buffers(page, inode,
  1392.                     (1 << BH_Dirty)|(1 << BH_Uptodate));
  1393. diff -Naur linux-4.4.6-gentoo-orig/fs/f2fs/data.c linux-4.4.6-gentoo-patched/fs/f2fs/data.c
  1394. --- linux-4.4.6-gentoo-orig/fs/f2fs/data.c  2016-05-04 11:19:37.612649828 +0300
  1395. +++ linux-4.4.6-gentoo-patched/fs/f2fs/data.c   2016-05-04 11:03:27.409730745 +0300
  1396. @@ -1115,7 +1115,7 @@
  1397.     struct f2fs_io_info fio = {
  1398.         .sbi = sbi,
  1399.         .type = DATA,
  1400. -       .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
  1401. +       .rw = wbc_to_write_cmd(wbc),
  1402.         .page = page,
  1403.         .encrypted_page = NULL,
  1404.     };
  1405. diff -Naur linux-4.4.6-gentoo-orig/fs/f2fs/node.c linux-4.4.6-gentoo-patched/fs/f2fs/node.c
  1406. --- linux-4.4.6-gentoo-orig/fs/f2fs/node.c  2016-05-04 11:19:37.612649828 +0300
  1407. +++ linux-4.4.6-gentoo-patched/fs/f2fs/node.c   2016-05-04 11:03:27.409730745 +0300
  1408. @@ -1305,7 +1305,7 @@
  1409.     struct f2fs_io_info fio = {
  1410.         .sbi = sbi,
  1411.         .type = NODE,
  1412. -       .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
  1413. +       .rw = wbc_to_write_cmd(wbc),
  1414.         .page = page,
  1415.         .encrypted_page = NULL,
  1416.     };
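
The hunk that follows does not modify fs/fs-writeback.c itself: it deletes fs/fs-writeback.c.orig, a 2394-line backup file apparently left behind by an earlier patch run in the author's tree (note the epoch timestamp on the "+++" side, which is how diff -N marks a file that no longer exists). It carries no functional change and can be skipped when reading the patch.
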
  1417. diff -Naur linux-4.4.6-gentoo-orig/fs/fs-writeback.c.orig linux-4.4.6-gentoo-patched/fs/fs-writeback.c.orig
  1418. --- linux-4.4.6-gentoo-orig/fs/fs-writeback.c.orig  2016-05-04 11:19:37.613649828 +0300
  1419. +++ linux-4.4.6-gentoo-patched/fs/fs-writeback.c.orig   1970-01-01 03:00:00.000000000 +0300
  1420. @@ -1,2394 +0,0 @@
  1421. -/*
  1422. - * fs/fs-writeback.c
  1423. - *
  1424. - * Copyright (C) 2002, Linus Torvalds.
  1425. - *
  1426. - * Contains all the functions related to writing back and waiting
  1427. - * upon dirty inodes against superblocks, and writing back dirty
  1428. - * pages against inodes.  ie: data writeback.  Writeout of the
  1429. - * inode itself is not handled here.
  1430. - *
  1431. - * 10Apr2002   Andrew Morton
  1432. - *     Split out of fs/inode.c
  1433. - *     Additions for address_space-based writeback
  1434. - */
  1435. -
  1436. -#include <linux/kernel.h>
  1437. -#include <linux/export.h>
  1438. -#include <linux/spinlock.h>
  1439. -#include <linux/slab.h>
  1440. -#include <linux/sched.h>
  1441. -#include <linux/fs.h>
  1442. -#include <linux/mm.h>
  1443. -#include <linux/pagemap.h>
  1444. -#include <linux/kthread.h>
  1445. -#include <linux/writeback.h>
  1446. -#include <linux/blkdev.h>
  1447. -#include <linux/backing-dev.h>
  1448. -#include <linux/tracepoint.h>
  1449. -#include <linux/device.h>
  1450. -#include <linux/memcontrol.h>
  1451. -#include "internal.h"
  1452. -
  1453. -/*
  1454. - * 4MB minimal write chunk size
  1455. - */
  1456. -#define MIN_WRITEBACK_PAGES    (4096UL >> (PAGE_CACHE_SHIFT - 10))
  1457. -
  1458. -struct wb_completion {
  1459. -   atomic_t        cnt;
  1460. -};
  1461. -
  1462. -/*
  1463. - * Passed into wb_writeback(), essentially a subset of writeback_control
  1464. - */
  1465. -struct wb_writeback_work {
  1466. -   long nr_pages;
  1467. -   struct super_block *sb;
  1468. -   unsigned long *older_than_this;
  1469. -   enum writeback_sync_modes sync_mode;
  1470. -   unsigned int tagged_writepages:1;
  1471. -   unsigned int for_kupdate:1;
  1472. -   unsigned int range_cyclic:1;
  1473. -   unsigned int for_background:1;
  1474. -   unsigned int for_sync:1;    /* sync(2) WB_SYNC_ALL writeback */
  1475. -   unsigned int auto_free:1;   /* free on completion */
  1476. -   enum wb_reason reason;      /* why was writeback initiated? */
  1477. -
  1478. -   struct list_head list;      /* pending work list */
  1479. -   struct wb_completion *done; /* set if the caller waits */
  1480. -};
  1481. -
  1482. -/*
  1483. - * If one wants to wait for one or more wb_writeback_works, each work's
  1484. - * ->done should be set to a wb_completion defined using the following
  1485. - * macro.  Once all work items are issued with wb_queue_work(), the caller
  1486. - * can wait for the completion of all using wb_wait_for_completion().  Work
  1487. - * items which are waited upon aren't freed automatically on completion.
  1488. - */
  1489. -#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)             \
  1490. -   struct wb_completion cmpl = {                   \
  1491. -       .cnt        = ATOMIC_INIT(1),           \
  1492. -   }
  1493. -
  1494. -
  1495. -/*
  1496. - * If an inode is constantly having its pages dirtied, but then the
  1497. - * updates stop dirtytime_expire_interval seconds in the past, it's
  1498. - * possible for the worst case time between when an inode has its
  1499. - * timestamps updated and when they finally get written out to be two
  1500. - * dirtytime_expire_intervals.  We set the default to 12 hours (in
  1501. - * seconds), which means most of the time inodes will have their
  1502. - * timestamps written to disk after 12 hours, but in the worst case a
  1503. - * few inodes might not their timestamps updated for 24 hours.
  1504. - */
  1505. -unsigned int dirtytime_expire_interval = 12 * 60 * 60;
  1506. -
  1507. -static inline struct inode *wb_inode(struct list_head *head)
  1508. -{
  1509. -   return list_entry(head, struct inode, i_io_list);
  1510. -}
  1511. -
  1512. -/*
  1513. - * Include the creation of the trace points after defining the
  1514. - * wb_writeback_work structure and inline functions so that the definition
  1515. - * remains local to this file.
  1516. - */
  1517. -#define CREATE_TRACE_POINTS
  1518. -#include <trace/events/writeback.h>
  1519. -
  1520. -EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
  1521. -
  1522. -static bool wb_io_lists_populated(struct bdi_writeback *wb)
  1523. -{
  1524. -   if (wb_has_dirty_io(wb)) {
  1525. -       return false;
  1526. -   } else {
  1527. -       set_bit(WB_has_dirty_io, &wb->state);
  1528. -       WARN_ON_ONCE(!wb->avg_write_bandwidth);
  1529. -       atomic_long_add(wb->avg_write_bandwidth,
  1530. -               &wb->bdi->tot_write_bandwidth);
  1531. -       return true;
  1532. -   }
  1533. -}
  1534. -
  1535. -static void wb_io_lists_depopulated(struct bdi_writeback *wb)
  1536. -{
  1537. -   if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
  1538. -       list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
  1539. -       clear_bit(WB_has_dirty_io, &wb->state);
  1540. -       WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
  1541. -                   &wb->bdi->tot_write_bandwidth) < 0);
  1542. -   }
  1543. -}
  1544. -
  1545. -/**
  1546. - * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
  1547. - * @inode: inode to be moved
  1548. - * @wb: target bdi_writeback
  1549. - * @head: one of @wb->b_{dirty|io|more_io}
  1550. - *
  1551. - * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
  1552. - * Returns %true if @inode is the first occupant of the !dirty_time IO
  1553. - * lists; otherwise, %false.
  1554. - */
  1555. -static bool inode_io_list_move_locked(struct inode *inode,
  1556. -                     struct bdi_writeback *wb,
  1557. -                     struct list_head *head)
  1558. -{
  1559. -   assert_spin_locked(&wb->list_lock);
  1560. -
  1561. -   list_move(&inode->i_io_list, head);
  1562. -
  1563. -   /* dirty_time doesn't count as dirty_io until expiration */
  1564. -   if (head != &wb->b_dirty_time)
  1565. -       return wb_io_lists_populated(wb);
  1566. -
  1567. -   wb_io_lists_depopulated(wb);
  1568. -   return false;
  1569. -}
  1570. -
  1571. -/**
  1572. - * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
  1573. - * @inode: inode to be removed
  1574. - * @wb: bdi_writeback @inode is being removed from
  1575. - *
  1576. - * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
  1577. - * clear %WB_has_dirty_io if all are empty afterwards.
  1578. - */
  1579. -static void inode_io_list_del_locked(struct inode *inode,
  1580. -                    struct bdi_writeback *wb)
  1581. -{
  1582. -   assert_spin_locked(&wb->list_lock);
  1583. -
  1584. -   list_del_init(&inode->i_io_list);
  1585. -   wb_io_lists_depopulated(wb);
  1586. -}
  1587. -
  1588. -static void wb_wakeup(struct bdi_writeback *wb)
  1589. -{
  1590. -   spin_lock_bh(&wb->work_lock);
  1591. -   if (test_bit(WB_registered, &wb->state))
  1592. -       mod_delayed_work(bdi_wq, &wb->dwork, 0);
  1593. -   spin_unlock_bh(&wb->work_lock);
  1594. -}
  1595. -
  1596. -static void wb_queue_work(struct bdi_writeback *wb,
  1597. -             struct wb_writeback_work *work)
  1598. -{
  1599. -   trace_writeback_queue(wb, work);
  1600. -
  1601. -   spin_lock_bh(&wb->work_lock);
  1602. -   if (!test_bit(WB_registered, &wb->state))
  1603. -       goto out_unlock;
  1604. -   if (work->done)
  1605. -       atomic_inc(&work->done->cnt);
  1606. -   list_add_tail(&work->list, &wb->work_list);
  1607. -   mod_delayed_work(bdi_wq, &wb->dwork, 0);
  1608. -out_unlock:
  1609. -   spin_unlock_bh(&wb->work_lock);
  1610. -}
  1611. -
  1612. -/**
  1613. - * wb_wait_for_completion - wait for completion of bdi_writeback_works
  1614. - * @bdi: bdi work items were issued to
  1615. - * @done: target wb_completion
  1616. - *
  1617. - * Wait for one or more work items issued to @bdi with their ->done field
  1618. - * set to @done, which should have been defined with
  1619. - * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
  1620. - * work items are completed.  Work items which are waited upon aren't freed
  1621. - * automatically on completion.
  1622. - */
  1623. -static void wb_wait_for_completion(struct backing_dev_info *bdi,
  1624. -                  struct wb_completion *done)
  1625. -{
  1626. -   atomic_dec(&done->cnt);     /* put down the initial count */
  1627. -   wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
  1628. -}
  1629. -
  1630. -#ifdef CONFIG_CGROUP_WRITEBACK
  1631. -
  1632. -/* parameters for foreign inode detection, see wb_detach_inode() */
  1633. -#define WB_FRN_TIME_SHIFT  13  /* 1s = 2^13, upto 8 secs w/ 16bit */
  1634. -#define WB_FRN_TIME_AVG_SHIFT  3   /* avg = avg * 7/8 + new * 1/8 */
  1635. -#define WB_FRN_TIME_CUT_DIV    2   /* ignore rounds < avg / 2 */
  1636. -#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT))  /* 2s */
  1637. -
  1638. -#define WB_FRN_HIST_SLOTS  16  /* inode->i_wb_frn_history is 16bit */
  1639. -#define WB_FRN_HIST_UNIT   (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
  1640. -                   /* each slot's duration is 2s / 16 */
  1641. -#define WB_FRN_HIST_THR_SLOTS  (WB_FRN_HIST_SLOTS / 2)
  1642. -                   /* if foreign slots >= 8, switch */
  1643. -#define WB_FRN_HIST_MAX_SLOTS  (WB_FRN_HIST_THR_SLOTS / 2 + 1)
  1644. -                   /* one round can affect upto 5 slots */
  1645. -
  1646. -static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
  1647. -static struct workqueue_struct *isw_wq;
  1648. -
  1649. -void __inode_attach_wb(struct inode *inode, struct page *page)
  1650. -{
  1651. -   struct backing_dev_info *bdi = inode_to_bdi(inode);
  1652. -   struct bdi_writeback *wb = NULL;
  1653. -
  1654. -   if (inode_cgwb_enabled(inode)) {
  1655. -       struct cgroup_subsys_state *memcg_css;
  1656. -
  1657. -       if (page) {
  1658. -           memcg_css = mem_cgroup_css_from_page(page);
  1659. -           wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
  1660. -       } else {
  1661. -           /* must pin memcg_css, see wb_get_create() */
  1662. -           memcg_css = task_get_css(current, memory_cgrp_id);
  1663. -           wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
  1664. -           css_put(memcg_css);
  1665. -       }
  1666. -   }
  1667. -
  1668. -   if (!wb)
  1669. -       wb = &bdi->wb;
  1670. -
  1671. -   /*
  1672. -    * There may be multiple instances of this function racing to
  1673. -    * update the same inode.  Use cmpxchg() to tell the winner.
  1674. -    */
  1675. -   if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
  1676. -       wb_put(wb);
  1677. -}
  1678. -
  1679. -/**
  1680. - * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
  1681. - * @inode: inode of interest with i_lock held
  1682. - *
  1683. - * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
  1684. - * held on entry and is released on return.  The returned wb is guaranteed
  1685. - * to stay @inode's associated wb until its list_lock is released.
  1686. - */
  1687. -static struct bdi_writeback *
  1688. -locked_inode_to_wb_and_lock_list(struct inode *inode)
  1689. -   __releases(&inode->i_lock)
  1690. -   __acquires(&wb->list_lock)
  1691. -{
  1692. -   while (true) {
  1693. -       struct bdi_writeback *wb = inode_to_wb(inode);
  1694. -
  1695. -       /*
  1696. -        * inode_to_wb() association is protected by both
  1697. -        * @inode->i_lock and @wb->list_lock but list_lock nests
  1698. -        * outside i_lock.  Drop i_lock and verify that the
  1699. -        * association hasn't changed after acquiring list_lock.
  1700. -        */
  1701. -       wb_get(wb);
  1702. -       spin_unlock(&inode->i_lock);
  1703. -       spin_lock(&wb->list_lock);
  1704. -       wb_put(wb);     /* not gonna deref it anymore */
  1705. -
  1706. -       /* i_wb may have changed inbetween, can't use inode_to_wb() */
  1707. -       if (likely(wb == inode->i_wb))
  1708. -           return wb;  /* @inode already has ref */
  1709. -
  1710. -       spin_unlock(&wb->list_lock);
  1711. -       cpu_relax();
  1712. -       spin_lock(&inode->i_lock);
  1713. -   }
  1714. -}
  1715. -
  1716. -/**
  1717. - * inode_to_wb_and_lock_list - determine an inode's wb and lock it
  1718. - * @inode: inode of interest
  1719. - *
  1720. - * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
  1721. - * on entry.
  1722. - */
  1723. -static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
  1724. -   __acquires(&wb->list_lock)
  1725. -{
  1726. -   spin_lock(&inode->i_lock);
  1727. -   return locked_inode_to_wb_and_lock_list(inode);
  1728. -}
  1729. -
  1730. -struct inode_switch_wbs_context {
  1731. -   struct inode        *inode;
  1732. -   struct bdi_writeback    *new_wb;
  1733. -
  1734. -   struct rcu_head     rcu_head;
  1735. -   struct work_struct  work;
  1736. -};
  1737. -
  1738. -static void inode_switch_wbs_work_fn(struct work_struct *work)
  1739. -{
  1740. -   struct inode_switch_wbs_context *isw =
  1741. -       container_of(work, struct inode_switch_wbs_context, work);
  1742. -   struct inode *inode = isw->inode;
  1743. -   struct address_space *mapping = inode->i_mapping;
  1744. -   struct bdi_writeback *old_wb = inode->i_wb;
  1745. -   struct bdi_writeback *new_wb = isw->new_wb;
  1746. -   struct radix_tree_iter iter;
  1747. -   bool switched = false;
  1748. -   void **slot;
  1749. -
  1750. -   /*
  1751. -    * By the time control reaches here, RCU grace period has passed
  1752. -    * since I_WB_SWITCH assertion and all wb stat update transactions
  1753. -    * between unlocked_inode_to_wb_begin/end() are guaranteed to be
  1754. -    * synchronizing against mapping->tree_lock.
  1755. -    *
  1756. -    * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
  1757. -    * gives us exclusion against all wb related operations on @inode
  1758. -    * including IO list manipulations and stat updates.
  1759. -    */
  1760. -   if (old_wb < new_wb) {
  1761. -       spin_lock(&old_wb->list_lock);
  1762. -       spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
  1763. -   } else {
  1764. -       spin_lock(&new_wb->list_lock);
  1765. -       spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
  1766. -   }
  1767. -   spin_lock(&inode->i_lock);
  1768. -   spin_lock_irq(&mapping->tree_lock);
  1769. -
  1770. -   /*
  1771. -    * Once I_FREEING is visible under i_lock, the eviction path owns
  1772. -    * the inode and we shouldn't modify ->i_io_list.
  1773. -    */
  1774. -   if (unlikely(inode->i_state & I_FREEING))
  1775. -       goto skip_switch;
  1776. -
  1777. -   /*
  1778. -    * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
  1779. -    * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
   1780. -    * pages actually under writeback.
  1781. -    */
  1782. -   radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
  1783. -                  PAGECACHE_TAG_DIRTY) {
  1784. -       struct page *page = radix_tree_deref_slot_protected(slot,
  1785. -                           &mapping->tree_lock);
  1786. -       if (likely(page) && PageDirty(page)) {
  1787. -           __dec_wb_stat(old_wb, WB_RECLAIMABLE);
  1788. -           __inc_wb_stat(new_wb, WB_RECLAIMABLE);
  1789. -       }
  1790. -   }
  1791. -
  1792. -   radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
  1793. -                  PAGECACHE_TAG_WRITEBACK) {
  1794. -       struct page *page = radix_tree_deref_slot_protected(slot,
  1795. -                           &mapping->tree_lock);
  1796. -       if (likely(page)) {
  1797. -           WARN_ON_ONCE(!PageWriteback(page));
  1798. -           __dec_wb_stat(old_wb, WB_WRITEBACK);
  1799. -           __inc_wb_stat(new_wb, WB_WRITEBACK);
  1800. -       }
  1801. -   }
  1802. -
  1803. -   wb_get(new_wb);
  1804. -
  1805. -   /*
  1806. -    * Transfer to @new_wb's IO list if necessary.  The specific list
  1807. -    * @inode was on is ignored and the inode is put on ->b_dirty which
  1808. -    * is always correct including from ->b_dirty_time.  The transfer
  1809. -    * preserves @inode->dirtied_when ordering.
  1810. -    */
  1811. -   if (!list_empty(&inode->i_io_list)) {
  1812. -       struct inode *pos;
  1813. -
  1814. -       inode_io_list_del_locked(inode, old_wb);
  1815. -       inode->i_wb = new_wb;
  1816. -       list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
  1817. -           if (time_after_eq(inode->dirtied_when,
  1818. -                     pos->dirtied_when))
  1819. -               break;
  1820. -       inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
  1821. -   } else {
  1822. -       inode->i_wb = new_wb;
  1823. -   }
  1824. -
  1825. -   /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
  1826. -   inode->i_wb_frn_winner = 0;
  1827. -   inode->i_wb_frn_avg_time = 0;
  1828. -   inode->i_wb_frn_history = 0;
  1829. -   switched = true;
  1830. -skip_switch:
  1831. -   /*
  1832. -    * Paired with load_acquire in unlocked_inode_to_wb_begin() and
  1833. -    * ensures that the new wb is visible if they see !I_WB_SWITCH.
  1834. -    */
  1835. -   smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
  1836. -
  1837. -   spin_unlock_irq(&mapping->tree_lock);
  1838. -   spin_unlock(&inode->i_lock);
  1839. -   spin_unlock(&new_wb->list_lock);
  1840. -   spin_unlock(&old_wb->list_lock);
  1841. -
  1842. -   if (switched) {
  1843. -       wb_wakeup(new_wb);
  1844. -       wb_put(old_wb);
  1845. -   }
  1846. -   wb_put(new_wb);
  1847. -
  1848. -   iput(inode);
  1849. -   kfree(isw);
  1850. -
  1851. -   atomic_dec(&isw_nr_in_flight);
  1852. -}
  1853. -
  1854. -static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
  1855. -{
  1856. -   struct inode_switch_wbs_context *isw = container_of(rcu_head,
  1857. -               struct inode_switch_wbs_context, rcu_head);
  1858. -
  1859. -   /* needs to grab bh-unsafe locks, bounce to work item */
  1860. -   INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
  1861. -   queue_work(isw_wq, &isw->work);
  1862. -}
  1863. -
  1864. -/**
  1865. - * inode_switch_wbs - change the wb association of an inode
  1866. - * @inode: target inode
  1867. - * @new_wb_id: ID of the new wb
  1868. - *
  1869. - * Switch @inode's wb association to the wb identified by @new_wb_id.  The
  1870. - * switching is performed asynchronously and may fail silently.
  1871. - */
  1872. -static void inode_switch_wbs(struct inode *inode, int new_wb_id)
  1873. -{
  1874. -   struct backing_dev_info *bdi = inode_to_bdi(inode);
  1875. -   struct cgroup_subsys_state *memcg_css;
  1876. -   struct inode_switch_wbs_context *isw;
  1877. -
  1878. -   /* noop if seems to be already in progress */
  1879. -   if (inode->i_state & I_WB_SWITCH)
  1880. -       return;
  1881. -
  1882. -   isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
  1883. -   if (!isw)
  1884. -       return;
  1885. -
  1886. -   /* find and pin the new wb */
  1887. -   rcu_read_lock();
  1888. -   memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
  1889. -   if (memcg_css)
  1890. -       isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
  1891. -   rcu_read_unlock();
  1892. -   if (!isw->new_wb)
  1893. -       goto out_free;
  1894. -
  1895. -   /* while holding I_WB_SWITCH, no one else can update the association */
  1896. -   spin_lock(&inode->i_lock);
  1897. -   if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
  1898. -       inode->i_state & (I_WB_SWITCH | I_FREEING) ||
  1899. -       inode_to_wb(inode) == isw->new_wb) {
  1900. -       spin_unlock(&inode->i_lock);
  1901. -       goto out_free;
  1902. -   }
  1903. -   inode->i_state |= I_WB_SWITCH;
  1904. -   spin_unlock(&inode->i_lock);
  1905. -
  1906. -   ihold(inode);
  1907. -   isw->inode = inode;
  1908. -
  1909. -   atomic_inc(&isw_nr_in_flight);
  1910. -
  1911. -   /*
  1912. -    * In addition to synchronizing among switchers, I_WB_SWITCH tells
  1913. -    * the RCU protected stat update paths to grab the mapping's
  1914. -    * tree_lock so that stat transfer can synchronize against them.
  1915. -    * Let's continue after I_WB_SWITCH is guaranteed to be visible.
  1916. -    */
  1917. -   call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
  1918. -   return;
  1919. -
  1920. -out_free:
  1921. -   if (isw->new_wb)
  1922. -       wb_put(isw->new_wb);
  1923. -   kfree(isw);
  1924. -}
  1925. -
  1926. -/**
  1927. - * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
  1928. - * @wbc: writeback_control of interest
  1929. - * @inode: target inode
  1930. - *
  1931. - * @inode is locked and about to be written back under the control of @wbc.
  1932. - * Record @inode's writeback context into @wbc and unlock the i_lock.  On
  1933. - * writeback completion, wbc_detach_inode() should be called.  This is used
  1934. - * to track the cgroup writeback context.
  1935. - */
  1936. -void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
  1937. -                struct inode *inode)
  1938. -{
  1939. -   if (!inode_cgwb_enabled(inode)) {
  1940. -       spin_unlock(&inode->i_lock);
  1941. -       return;
  1942. -   }
  1943. -
  1944. -   wbc->wb = inode_to_wb(inode);
  1945. -   wbc->inode = inode;
  1946. -
  1947. -   wbc->wb_id = wbc->wb->memcg_css->id;
  1948. -   wbc->wb_lcand_id = inode->i_wb_frn_winner;
  1949. -   wbc->wb_tcand_id = 0;
  1950. -   wbc->wb_bytes = 0;
  1951. -   wbc->wb_lcand_bytes = 0;
  1952. -   wbc->wb_tcand_bytes = 0;
  1953. -
  1954. -   wb_get(wbc->wb);
  1955. -   spin_unlock(&inode->i_lock);
  1956. -
  1957. -   /*
  1958. -    * A dying wb indicates that the memcg-blkcg mapping has changed
  1959. -    * and a new wb is already serving the memcg.  Switch immediately.
  1960. -    */
  1961. -   if (unlikely(wb_dying(wbc->wb)))
  1962. -       inode_switch_wbs(inode, wbc->wb_id);
  1963. -}
  1964. -
  1965. -/**
  1966. - * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
  1967. - * @wbc: writeback_control of the just finished writeback
  1968. - *
  1969. - * To be called after a writeback attempt of an inode finishes and undoes
  1970. - * wbc_attach_and_unlock_inode().  Can be called under any context.
  1971. - *
  1972. - * As concurrent write sharing of an inode is expected to be very rare and
  1973. - * memcg only tracks page ownership on first-use basis severely confining
  1974. - * the usefulness of such sharing, cgroup writeback tracks ownership
  1975. - * per-inode.  While the support for concurrent write sharing of an inode
  1976. - * is deemed unnecessary, an inode being written to by different cgroups at
  1977. - * different points in time is a lot more common, and, more importantly,
  1978. - * charging only by first-use can too readily lead to grossly incorrect
  1979. - * behaviors (single foreign page can lead to gigabytes of writeback to be
  1980. - * incorrectly attributed).
  1981. - *
  1982. - * To resolve this issue, cgroup writeback detects the majority dirtier of
  1983. - * an inode and transfers the ownership to it.  To avoid unnnecessary
  1984. - * oscillation, the detection mechanism keeps track of history and gives
  1985. - * out the switch verdict only if the foreign usage pattern is stable over
  1986. - * a certain amount of time and/or writeback attempts.
  1987. - *
  1988. - * On each writeback attempt, @wbc tries to detect the majority writer
  1989. - * using Boyer-Moore majority vote algorithm.  In addition to the byte
  1990. - * count from the majority voting, it also counts the bytes written for the
  1991. - * current wb and the last round's winner wb (max of last round's current
  1992. - * wb, the winner from two rounds ago, and the last round's majority
  1993. - * candidate).  Keeping track of the historical winner helps the algorithm
  1994. - * to semi-reliably detect the most active writer even when it's not the
  1995. - * absolute majority.
  1996. - *
  1997. - * Once the winner of the round is determined, whether the winner is
  1998. - * foreign or not and how much IO time the round consumed is recorded in
  1999. - * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
  2000. - * over a certain threshold, the switch verdict is given.
  2001. - */
  2002. -void wbc_detach_inode(struct writeback_control *wbc)
  2003. -{
  2004. -   struct bdi_writeback *wb = wbc->wb;
  2005. -   struct inode *inode = wbc->inode;
  2006. -   unsigned long avg_time, max_bytes, max_time;
  2007. -   u16 history;
  2008. -   int max_id;
  2009. -
  2010. -   if (!wb)
  2011. -       return;
  2012. -
  2013. -   history = inode->i_wb_frn_history;
  2014. -   avg_time = inode->i_wb_frn_avg_time;
  2015. -
  2016. -   /* pick the winner of this round */
  2017. -   if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
  2018. -       wbc->wb_bytes >= wbc->wb_tcand_bytes) {
  2019. -       max_id = wbc->wb_id;
  2020. -       max_bytes = wbc->wb_bytes;
  2021. -   } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
  2022. -       max_id = wbc->wb_lcand_id;
  2023. -       max_bytes = wbc->wb_lcand_bytes;
  2024. -   } else {
  2025. -       max_id = wbc->wb_tcand_id;
  2026. -       max_bytes = wbc->wb_tcand_bytes;
  2027. -   }
  2028. -
  2029. -   /*
  2030. -    * Calculate the amount of IO time the winner consumed and fold it
  2031. -    * into the running average kept per inode.  If the consumed IO
  2032. -    * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for
  2033. -    * deciding whether to switch or not.  This is to prevent one-off
  2034. -    * small dirtiers from skewing the verdict.
  2035. -    */
  2036. -   max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
  2037. -               wb->avg_write_bandwidth);
  2038. -   if (avg_time)
  2039. -       avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
  2040. -               (avg_time >> WB_FRN_TIME_AVG_SHIFT);
  2041. -   else
  2042. -       avg_time = max_time;    /* immediate catch up on first run */
  2043. -
  2044. -   if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
  2045. -       int slots;
  2046. -
  2047. -       /*
  2048. -        * The switch verdict is reached if foreign wb's consume
  2049. -        * more than a certain proportion of IO time in a
  2050. -        * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
  2051. -        * history mask where each bit represents one sixteenth of
  2052. -        * the period.  Determine the number of slots to shift into
  2053. -        * history from @max_time.
  2054. -        */
  2055. -       slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
  2056. -               (unsigned long)WB_FRN_HIST_MAX_SLOTS);
  2057. -       history <<= slots;
  2058. -       if (wbc->wb_id != max_id)
  2059. -           history |= (1U << slots) - 1;
  2060. -
  2061. -       /*
  2062. -        * Switch if the current wb isn't the consistent winner.
  2063. -        * If there are multiple closely competing dirtiers, the
  2064. -        * inode may switch across them repeatedly over time, which
  2065. -        * is okay.  The main goal is avoiding keeping an inode on
  2066. -        * the wrong wb for an extended period of time.
  2067. -        */
  2068. -       if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
  2069. -           inode_switch_wbs(inode, max_id);
  2070. -   }
  2071. -
  2072. -   /*
  2073. -    * Multiple instances of this function may race to update the
  2074. -    * following fields but we don't mind occassional inaccuracies.
  2075. -    */
  2076. -   inode->i_wb_frn_winner = max_id;
  2077. -   inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
  2078. -   inode->i_wb_frn_history = history;
  2079. -
  2080. -   wb_put(wbc->wb);
  2081. -   wbc->wb = NULL;
  2082. -}
  2083. -
  2084. -/**
  2085. - * wbc_account_io - account IO issued during writeback
  2086. - * @wbc: writeback_control of the writeback in progress
  2087. - * @page: page being written out
  2088. - * @bytes: number of bytes being written out
  2089. - *
  2090. - * @bytes from @page are about to written out during the writeback
  2091. - * controlled by @wbc.  Keep the book for foreign inode detection.  See
  2092. - * wbc_detach_inode().
  2093. - */
  2094. -void wbc_account_io(struct writeback_control *wbc, struct page *page,
  2095. -           size_t bytes)
  2096. -{
  2097. -   int id;
  2098. -
  2099. -   /*
  2100. -    * pageout() path doesn't attach @wbc to the inode being written
  2101. -    * out.  This is intentional as we don't want the function to block
  2102. -    * behind a slow cgroup.  Ultimately, we want pageout() to kick off
  2103. -    * regular writeback instead of writing things out itself.
  2104. -    */
  2105. -   if (!wbc->wb)
  2106. -       return;
  2107. -
  2108. -   rcu_read_lock();
  2109. -   id = mem_cgroup_css_from_page(page)->id;
  2110. -   rcu_read_unlock();
  2111. -
  2112. -   if (id == wbc->wb_id) {
  2113. -       wbc->wb_bytes += bytes;
  2114. -       return;
  2115. -   }
  2116. -
  2117. -   if (id == wbc->wb_lcand_id)
  2118. -       wbc->wb_lcand_bytes += bytes;
  2119. -
  2120. -   /* Boyer-Moore majority vote algorithm */
  2121. -   if (!wbc->wb_tcand_bytes)
  2122. -       wbc->wb_tcand_id = id;
  2123. -   if (id == wbc->wb_tcand_id)
  2124. -       wbc->wb_tcand_bytes += bytes;
  2125. -   else
  2126. -       wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
  2127. -}
  2128. -EXPORT_SYMBOL_GPL(wbc_account_io);
  2129. -
  2130. -/**
  2131. - * inode_congested - test whether an inode is congested
  2132. - * @inode: inode to test for congestion (may be NULL)
  2133. - * @cong_bits: mask of WB_[a]sync_congested bits to test
  2134. - *
  2135. - * Tests whether @inode is congested.  @cong_bits is the mask of congestion
  2136. - * bits to test and the return value is the mask of set bits.
  2137. - *
  2138. - * If cgroup writeback is enabled for @inode, the congestion state is
  2139. - * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
  2140. - * associated with @inode is congested; otherwise, the root wb's congestion
  2141. - * state is used.
  2142. - *
  2143. - * @inode is allowed to be NULL as this function is often called on
  2144. - * mapping->host which is NULL for the swapper space.
  2145. - */
  2146. -int inode_congested(struct inode *inode, int cong_bits)
  2147. -{
  2148. -   /*
  2149. -    * Once set, ->i_wb never becomes NULL while the inode is alive.
  2150. -    * Start transaction iff ->i_wb is visible.
  2151. -    */
  2152. -   if (inode && inode_to_wb_is_valid(inode)) {
  2153. -       struct bdi_writeback *wb;
  2154. -       bool locked, congested;
  2155. -
  2156. -       wb = unlocked_inode_to_wb_begin(inode, &locked);
  2157. -       congested = wb_congested(wb, cong_bits);
  2158. -       unlocked_inode_to_wb_end(inode, locked);
  2159. -       return congested;
  2160. -   }
  2161. -
  2162. -   return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
  2163. -}
  2164. -EXPORT_SYMBOL_GPL(inode_congested);
  2165. -
  2166. -/**
  2167. - * wb_split_bdi_pages - split nr_pages to write according to bandwidth
  2168. - * @wb: target bdi_writeback to split @nr_pages to
  2169. - * @nr_pages: number of pages to write for the whole bdi
  2170. - *
  2171. - * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
  2172. - * relation to the total write bandwidth of all wb's w/ dirty inodes on
  2173. - * @wb->bdi.
  2174. - */
  2175. -static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
  2176. -{
  2177. -   unsigned long this_bw = wb->avg_write_bandwidth;
  2178. -   unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
  2179. -
  2180. -   if (nr_pages == LONG_MAX)
  2181. -       return LONG_MAX;
  2182. -
  2183. -   /*
  2184. -    * This may be called on clean wb's and proportional distribution
  2185. -    * may not make sense, just use the original @nr_pages in those
  2186. -    * cases.  In general, we wanna err on the side of writing more.
  2187. -    */
  2188. -   if (!tot_bw || this_bw >= tot_bw)
  2189. -       return nr_pages;
  2190. -   else
  2191. -       return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
  2192. -}
  2193. -
  2194. -/**
  2195. - * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
  2196. - * @bdi: target backing_dev_info
  2197. - * @base_work: wb_writeback_work to issue
  2198. - * @skip_if_busy: skip wb's which already have writeback in progress
  2199. - *
  2200. - * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
  2201. - * have dirty inodes.  If @base_work->nr_page isn't %LONG_MAX, it's
  2202. - * distributed to the busy wbs according to each wb's proportion in the
  2203. - * total active write bandwidth of @bdi.
  2204. - */
  2205. -static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
  2206. -                 struct wb_writeback_work *base_work,
  2207. -                 bool skip_if_busy)
  2208. -{
  2209. -   struct bdi_writeback *last_wb = NULL;
  2210. -   struct bdi_writeback *wb = list_entry(&bdi->wb_list,
  2211. -                         struct bdi_writeback, bdi_node);
  2212. -
  2213. -   might_sleep();
  2214. -restart:
  2215. -   rcu_read_lock();
  2216. -   list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
  2217. -       DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
  2218. -       struct wb_writeback_work fallback_work;
  2219. -       struct wb_writeback_work *work;
  2220. -       long nr_pages;
  2221. -
  2222. -       if (last_wb) {
  2223. -           wb_put(last_wb);
  2224. -           last_wb = NULL;
  2225. -       }
  2226. -
  2227. -       /* SYNC_ALL writes out I_DIRTY_TIME too */
  2228. -       if (!wb_has_dirty_io(wb) &&
  2229. -           (base_work->sync_mode == WB_SYNC_NONE ||
  2230. -            list_empty(&wb->b_dirty_time)))
  2231. -           continue;
  2232. -       if (skip_if_busy && writeback_in_progress(wb))
  2233. -           continue;
  2234. -
  2235. -       nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
  2236. -
  2237. -       work = kmalloc(sizeof(*work), GFP_ATOMIC);
  2238. -       if (work) {
  2239. -           *work = *base_work;
  2240. -           work->nr_pages = nr_pages;
  2241. -           work->auto_free = 1;
  2242. -           wb_queue_work(wb, work);
  2243. -           continue;
  2244. -       }
  2245. -
  2246. -       /* alloc failed, execute synchronously using on-stack fallback */
  2247. -       work = &fallback_work;
  2248. -       *work = *base_work;
  2249. -       work->nr_pages = nr_pages;
  2250. -       work->auto_free = 0;
  2251. -       work->done = &fallback_work_done;
  2252. -
  2253. -       wb_queue_work(wb, work);
  2254. -
  2255. -       /*
  2256. -        * Pin @wb so that it stays on @bdi->wb_list.  This allows
  2257. -        * continuing iteration from @wb after dropping and
  2258. -        * regrabbing rcu read lock.
  2259. -        */
  2260. -       wb_get(wb);
  2261. -       last_wb = wb;
  2262. -
  2263. -       rcu_read_unlock();
  2264. -       wb_wait_for_completion(bdi, &fallback_work_done);
  2265. -       goto restart;
  2266. -   }
  2267. -   rcu_read_unlock();
  2268. -
  2269. -   if (last_wb)
  2270. -       wb_put(last_wb);
  2271. -}
  2272. -
  2273. -/**
  2274. - * cgroup_writeback_umount - flush inode wb switches for umount
  2275. - *
  2276. - * This function is called when a super_block is about to be destroyed and
  2277. - * flushes in-flight inode wb switches.  An inode wb switch goes through
  2278. - * RCU and then workqueue, so the two need to be flushed in order to ensure
  2279. - * that all previously scheduled switches are finished.  As wb switches are
  2280. - * rare occurrences and synchronize_rcu() can take a while, perform
  2281. - * flushing iff wb switches are in flight.
  2282. - */
  2283. -void cgroup_writeback_umount(void)
  2284. -{
  2285. -   if (atomic_read(&isw_nr_in_flight)) {
  2286. -       synchronize_rcu();
  2287. -       flush_workqueue(isw_wq);
  2288. -   }
  2289. -}
  2290. -
  2291. -static int __init cgroup_writeback_init(void)
  2292. -{
  2293. -   isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
  2294. -   if (!isw_wq)
  2295. -       return -ENOMEM;
  2296. -   return 0;
  2297. -}
  2298. -fs_initcall(cgroup_writeback_init);
  2299. -
  2300. -#else  /* CONFIG_CGROUP_WRITEBACK */
  2301. -
  2302. -static struct bdi_writeback *
  2303. -locked_inode_to_wb_and_lock_list(struct inode *inode)
  2304. -   __releases(&inode->i_lock)
  2305. -   __acquires(&wb->list_lock)
  2306. -{
  2307. -   struct bdi_writeback *wb = inode_to_wb(inode);
  2308. -
  2309. -   spin_unlock(&inode->i_lock);
  2310. -   spin_lock(&wb->list_lock);
  2311. -   return wb;
  2312. -}
  2313. -
  2314. -static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
  2315. -   __acquires(&wb->list_lock)
  2316. -{
  2317. -   struct bdi_writeback *wb = inode_to_wb(inode);
  2318. -
  2319. -   spin_lock(&wb->list_lock);
  2320. -   return wb;
  2321. -}
  2322. -
  2323. -static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
  2324. -{
  2325. -   return nr_pages;
  2326. -}
  2327. -
  2328. -static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
  2329. -                 struct wb_writeback_work *base_work,
  2330. -                 bool skip_if_busy)
  2331. -{
  2332. -   might_sleep();
  2333. -
  2334. -   if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
  2335. -       base_work->auto_free = 0;
  2336. -       wb_queue_work(&bdi->wb, base_work);
  2337. -   }
  2338. -}
  2339. -
  2340. -#endif /* CONFIG_CGROUP_WRITEBACK */
  2341. -
  2342. -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
  2343. -           bool range_cyclic, enum wb_reason reason)
  2344. -{
  2345. -   struct wb_writeback_work *work;
  2346. -
  2347. -   if (!wb_has_dirty_io(wb))
  2348. -       return;
  2349. -
  2350. -   /*
  2351. -    * This is WB_SYNC_NONE writeback, so if allocation fails just
  2352. -    * wakeup the thread for old dirty data writeback
  2353. -    */
  2354. -   work = kzalloc(sizeof(*work), GFP_ATOMIC);
  2355. -   if (!work) {
  2356. -       trace_writeback_nowork(wb);
  2357. -       wb_wakeup(wb);
  2358. -       return;
  2359. -   }
  2360. -
  2361. -   work->sync_mode = WB_SYNC_NONE;
  2362. -   work->nr_pages  = nr_pages;
  2363. -   work->range_cyclic = range_cyclic;
  2364. -   work->reason    = reason;
  2365. -   work->auto_free = 1;
  2366. -
  2367. -   wb_queue_work(wb, work);
  2368. -}
  2369. -
  2370. -/**
  2371. - * wb_start_background_writeback - start background writeback
  2372. - * @wb: bdi_writback to write from
  2373. - *
  2374. - * Description:
  2375. - *   This makes sure WB_SYNC_NONE background writeback happens. When
  2376. - *   this function returns, it is only guaranteed that for given wb
  2377. - *   some IO is happening if we are over background dirty threshold.
  2378. - *   Caller need not hold sb s_umount semaphore.
  2379. - */
  2380. -void wb_start_background_writeback(struct bdi_writeback *wb)
  2381. -{
  2382. -   /*
  2383. -    * We just wake up the flusher thread. It will perform background
  2384. -    * writeback as soon as there is no other work to do.
  2385. -    */
  2386. -   trace_writeback_wake_background(wb);
  2387. -   wb_wakeup(wb);
  2388. -}
  2389. -
  2390. -/*
  2391. - * Remove the inode from the writeback list it is on.
  2392. - */
  2393. -void inode_io_list_del(struct inode *inode)
  2394. -{
  2395. -   struct bdi_writeback *wb;
  2396. -
  2397. -   wb = inode_to_wb_and_lock_list(inode);
  2398. -   inode_io_list_del_locked(inode, wb);
  2399. -   spin_unlock(&wb->list_lock);
  2400. -}
  2401. -
  2402. -/*
  2403. - * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  2404. - * furthest end of its superblock's dirty-inode list.
  2405. - *
  2406. - * Before stamping the inode's ->dirtied_when, we check to see whether it is
  2407. - * already the most-recently-dirtied inode on the b_dirty list.  If that is
  2408. - * the case then the inode must have been redirtied while it was being written
  2409. - * out and we don't reset its dirtied_when.
  2410. - */
  2411. -static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  2412. -{
  2413. -   if (!list_empty(&wb->b_dirty)) {
  2414. -       struct inode *tail;
  2415. -
  2416. -       tail = wb_inode(wb->b_dirty.next);
  2417. -       if (time_before(inode->dirtied_when, tail->dirtied_when))
  2418. -           inode->dirtied_when = jiffies;
  2419. -   }
  2420. -   inode_io_list_move_locked(inode, wb, &wb->b_dirty);
  2421. -}
  2422. -
  2423. -/*
  2424. - * requeue inode for re-scanning after bdi->b_io list is exhausted.
  2425. - */
  2426. -static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
  2427. -{
  2428. -   inode_io_list_move_locked(inode, wb, &wb->b_more_io);
  2429. -}
  2430. -
  2431. -static void inode_sync_complete(struct inode *inode)
  2432. -{
  2433. -   inode->i_state &= ~I_SYNC;
  2434. -   /* If inode is clean an unused, put it into LRU now... */
  2435. -   inode_add_lru(inode);
  2436. -   /* Waiters must see I_SYNC cleared before being woken up */
  2437. -   smp_mb();
  2438. -   wake_up_bit(&inode->i_state, __I_SYNC);
  2439. -}
  2440. -
  2441. -static bool inode_dirtied_after(struct inode *inode, unsigned long t)
  2442. -{
  2443. -   bool ret = time_after(inode->dirtied_when, t);
  2444. -#ifndef CONFIG_64BIT
  2445. -   /*
  2446. -    * For inodes being constantly redirtied, dirtied_when can get stuck.
  2447. -    * It _appears_ to be in the future, but is actually in distant past.
  2448. -    * This test is necessary to prevent such wrapped-around relative times
  2449. -    * from permanently stopping the whole bdi writeback.
  2450. -    */
  2451. -   ret = ret && time_before_eq(inode->dirtied_when, jiffies);
  2452. -#endif
  2453. -   return ret;
  2454. -}
  2455. -
  2456. -#define EXPIRE_DIRTY_ATIME 0x0001
  2457. -
  2458. -/*
  2459. - * Move expired (dirtied before work->older_than_this) dirty inodes from
  2460. - * @delaying_queue to @dispatch_queue.
  2461. - */
  2462. -static int move_expired_inodes(struct list_head *delaying_queue,
  2463. -                  struct list_head *dispatch_queue,
  2464. -                  int flags,
  2465. -                  struct wb_writeback_work *work)
  2466. -{
  2467. -   unsigned long *older_than_this = NULL;
  2468. -   unsigned long expire_time;
  2469. -   LIST_HEAD(tmp);
  2470. -   struct list_head *pos, *node;
  2471. -   struct super_block *sb = NULL;
  2472. -   struct inode *inode;
  2473. -   int do_sb_sort = 0;
  2474. -   int moved = 0;
  2475. -
  2476. -   if ((flags & EXPIRE_DIRTY_ATIME) == 0)
  2477. -       older_than_this = work->older_than_this;
  2478. -   else if (!work->for_sync) {
  2479. -       expire_time = jiffies - (dirtytime_expire_interval * HZ);
  2480. -       older_than_this = &expire_time;
  2481. -   }
  2482. -   while (!list_empty(delaying_queue)) {
  2483. -       inode = wb_inode(delaying_queue->prev);
  2484. -       if (older_than_this &&
  2485. -           inode_dirtied_after(inode, *older_than_this))
  2486. -           break;
  2487. -       list_move(&inode->i_io_list, &tmp);
  2488. -       moved++;
  2489. -       if (flags & EXPIRE_DIRTY_ATIME)
  2490. -           set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
  2491. -       if (sb_is_blkdev_sb(inode->i_sb))
  2492. -           continue;
  2493. -       if (sb && sb != inode->i_sb)
  2494. -           do_sb_sort = 1;
  2495. -       sb = inode->i_sb;
  2496. -   }
  2497. -
  2498. -   /* just one sb in list, splice to dispatch_queue and we're done */
  2499. -   if (!do_sb_sort) {
  2500. -       list_splice(&tmp, dispatch_queue);
  2501. -       goto out;
  2502. -   }
  2503. -
  2504. -   /* Move inodes from one superblock together */
  2505. -   while (!list_empty(&tmp)) {
  2506. -       sb = wb_inode(tmp.prev)->i_sb;
  2507. -       list_for_each_prev_safe(pos, node, &tmp) {
  2508. -           inode = wb_inode(pos);
  2509. -           if (inode->i_sb == sb)
  2510. -               list_move(&inode->i_io_list, dispatch_queue);
  2511. -       }
  2512. -   }
  2513. -out:
  2514. -   return moved;
  2515. -}
  2516. -
  2517. -/*
  2518. - * Queue all expired dirty inodes for io, eldest first.
  2519. - * Before
  2520. - *         newly dirtied     b_dirty    b_io    b_more_io
  2521. - *         =============>    gf         edc     BA
  2522. - * After
  2523. - *         newly dirtied     b_dirty    b_io    b_more_io
  2524. - *         =============>    g          fBAedc
  2525. - *                                           |
  2526. - *                                           +--> dequeue for IO
  2527. - */
  2528. -static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
  2529. -{
  2530. -   int moved;
  2531. -
  2532. -   assert_spin_locked(&wb->list_lock);
  2533. -   list_splice_init(&wb->b_more_io, &wb->b_io);
  2534. -   moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
  2535. -   moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
  2536. -                    EXPIRE_DIRTY_ATIME, work);
  2537. -   if (moved)
  2538. -       wb_io_lists_populated(wb);
  2539. -   trace_writeback_queue_io(wb, work, moved);
  2540. -}
  2541. -
  2542. -static int write_inode(struct inode *inode, struct writeback_control *wbc)
  2543. -{
  2544. -   int ret;
  2545. -
  2546. -   if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
  2547. -       trace_writeback_write_inode_start(inode, wbc);
  2548. -       ret = inode->i_sb->s_op->write_inode(inode, wbc);
  2549. -       trace_writeback_write_inode(inode, wbc);
  2550. -       return ret;
  2551. -   }
  2552. -   return 0;
  2553. -}
  2554. -
  2555. -/*
  2556. - * Wait for writeback on an inode to complete. Called with i_lock held.
  2557. - * Caller must make sure inode cannot go away when we drop i_lock.
  2558. - */
  2559. -static void __inode_wait_for_writeback(struct inode *inode)
  2560. -   __releases(inode->i_lock)
  2561. -   __acquires(inode->i_lock)
  2562. -{
  2563. -   DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
  2564. -   wait_queue_head_t *wqh;
  2565. -
  2566. -   wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
  2567. -   while (inode->i_state & I_SYNC) {
  2568. -       spin_unlock(&inode->i_lock);
  2569. -       __wait_on_bit(wqh, &wq, bit_wait,
  2570. -                 TASK_UNINTERRUPTIBLE);
  2571. -       spin_lock(&inode->i_lock);
  2572. -   }
  2573. -}
  2574. -
  2575. -/*
  2576. - * Wait for writeback on an inode to complete. Caller must have inode pinned.
  2577. - */
  2578. -void inode_wait_for_writeback(struct inode *inode)
  2579. -{
  2580. -   spin_lock(&inode->i_lock);
  2581. -   __inode_wait_for_writeback(inode);
  2582. -   spin_unlock(&inode->i_lock);
  2583. -}
  2584. -
  2585. -/*
  2586. - * Sleep until I_SYNC is cleared. This function must be called with i_lock
  2587. - * held and drops it. It is aimed for callers not holding any inode reference
  2588. - * so once i_lock is dropped, inode can go away.
  2589. - */
  2590. -static void inode_sleep_on_writeback(struct inode *inode)
  2591. -   __releases(inode->i_lock)
  2592. -{
  2593. -   DEFINE_WAIT(wait);
  2594. -   wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
  2595. -   int sleep;
  2596. -
  2597. -   prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
  2598. -   sleep = inode->i_state & I_SYNC;
  2599. -   spin_unlock(&inode->i_lock);
  2600. -   if (sleep)
  2601. -       schedule();
  2602. -   finish_wait(wqh, &wait);
  2603. -}
  2604. -
  2605. -/*
  2606. - * Find proper writeback list for the inode depending on its current state and
  2607. - * possibly also change of its state while we were doing writeback.  Here we
  2608. - * handle things such as livelock prevention or fairness of writeback among
  2609. - * inodes. This function can be called only by flusher thread - noone else
  2610. - * processes all inodes in writeback lists and requeueing inodes behind flusher
  2611. - * thread's back can have unexpected consequences.
  2612. - */
  2613. -static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
  2614. -             struct writeback_control *wbc)
  2615. -{
  2616. -   if (inode->i_state & I_FREEING)
  2617. -       return;
  2618. -
  2619. -   /*
  2620. -    * Sync livelock prevention. Each inode is tagged and synced in one
  2621. -    * shot. If still dirty, it will be redirty_tail()'ed below.  Update
  2622. -    * the dirty time to prevent enqueue and sync it again.
  2623. -    */
  2624. -   if ((inode->i_state & I_DIRTY) &&
  2625. -       (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
  2626. -       inode->dirtied_when = jiffies;
  2627. -
  2628. -   if (wbc->pages_skipped) {
  2629. -       /*
  2630. -        * writeback is not making progress due to locked
  2631. -        * buffers. Skip this inode for now.
  2632. -        */
  2633. -       redirty_tail(inode, wb);
  2634. -       return;
  2635. -   }
  2636. -
  2637. -   if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
  2638. -       /*
  2639. -        * We didn't write back all the pages.  nfs_writepages()
  2640. -        * sometimes bales out without doing anything.
  2641. -        */
  2642. -       if (wbc->nr_to_write <= 0) {
  2643. -           /* Slice used up. Queue for next turn. */
  2644. -           requeue_io(inode, wb);
  2645. -       } else {
  2646. -           /*
  2647. -            * Writeback blocked by something other than
  2648. -            * congestion. Delay the inode for some time to
  2649. -            * avoid spinning on the CPU (100% iowait)
  2650. -            * retrying writeback of the dirty page/inode
  2651. -            * that cannot be performed immediately.
  2652. -            */
  2653. -           redirty_tail(inode, wb);
  2654. -       }
  2655. -   } else if (inode->i_state & I_DIRTY) {
  2656. -       /*
  2657. -        * Filesystems can dirty the inode during writeback operations,
  2658. -        * such as delayed allocation during submission or metadata
  2659. -        * updates after data IO completion.
  2660. -        */
  2661. -       redirty_tail(inode, wb);
  2662. -   } else if (inode->i_state & I_DIRTY_TIME) {
  2663. -       inode->dirtied_when = jiffies;
  2664. -       inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
  2665. -   } else {
  2666. -       /* The inode is clean. Remove from writeback lists. */
  2667. -       inode_io_list_del_locked(inode, wb);
  2668. -   }
  2669. -}
  2670. -
  2671. -/*
  2672. - * Write out an inode and its dirty pages. Do not update the writeback list
  2673. - * linkage. That is left to the caller. The caller is also responsible for
  2674. - * setting I_SYNC flag and calling inode_sync_complete() to clear it.
  2675. - */
  2676. -static int
  2677. -__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  2678. -{
  2679. -   struct address_space *mapping = inode->i_mapping;
  2680. -   long nr_to_write = wbc->nr_to_write;
  2681. -   unsigned dirty;
  2682. -   int ret;
  2683. -
  2684. -   WARN_ON(!(inode->i_state & I_SYNC));
  2685. -
  2686. -   trace_writeback_single_inode_start(inode, wbc, nr_to_write);
  2687. -
  2688. -   ret = do_writepages(mapping, wbc);
  2689. -
  2690. -   /*
  2691. -    * Make sure to wait on the data before writing out the metadata.
  2692. -    * This is important for filesystems that modify metadata on data
  2693. -    * I/O completion. We don't do it for sync(2) writeback because it has a
  2694. -    * separate, external IO completion path and ->sync_fs for guaranteeing
  2695. -    * inode metadata is written back correctly.
  2696. -    */
  2697. -   if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
  2698. -       int err = filemap_fdatawait(mapping);
  2699. -       if (ret == 0)
  2700. -           ret = err;
  2701. -   }
  2702. -
  2703. -   /*
  2704. -    * Some filesystems may redirty the inode during the writeback
  2705. -    * due to delalloc, clear dirty metadata flags right before
  2706. -    * write_inode()
  2707. -    */
  2708. -   spin_lock(&inode->i_lock);
  2709. -
  2710. -   dirty = inode->i_state & I_DIRTY;
  2711. -   if (inode->i_state & I_DIRTY_TIME) {
  2712. -       if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
  2713. -           unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
  2714. -           unlikely(time_after(jiffies,
  2715. -                   (inode->dirtied_time_when +
  2716. -                    dirtytime_expire_interval * HZ)))) {
  2717. -           dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
  2718. -           trace_writeback_lazytime(inode);
  2719. -       }
  2720. -   } else
  2721. -       inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
  2722. -   inode->i_state &= ~dirty;
  2723. -
  2724. -   /*
  2725. -    * Paired with smp_mb() in __mark_inode_dirty().  This allows
  2726. -    * __mark_inode_dirty() to test i_state without grabbing i_lock -
  2727. -    * either they see the I_DIRTY bits cleared or we see the dirtied
  2728. -    * inode.
  2729. -    *
  2730. -    * I_DIRTY_PAGES is always cleared together above even if @mapping
  2731. -    * still has dirty pages.  The flag is reinstated after smp_mb() if
  2732. -    * necessary.  This guarantees that either __mark_inode_dirty()
  2733. -    * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
  2734. -    */
  2735. -   smp_mb();
  2736. -
  2737. -   if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
  2738. -       inode->i_state |= I_DIRTY_PAGES;
  2739. -
  2740. -   spin_unlock(&inode->i_lock);
  2741. -
  2742. -   if (dirty & I_DIRTY_TIME)
  2743. -       mark_inode_dirty_sync(inode);
  2744. -   /* Don't write the inode if only I_DIRTY_PAGES was set */
  2745. -   if (dirty & ~I_DIRTY_PAGES) {
  2746. -       int err = write_inode(inode, wbc);
  2747. -       if (ret == 0)
  2748. -           ret = err;
  2749. -   }
  2750. -   trace_writeback_single_inode(inode, wbc, nr_to_write);
  2751. -   return ret;
  2752. -}
  2753. -
  2754. -/*
  2755. - * Write out an inode's dirty pages. Either the caller has an active reference
  2756. - * on the inode or the inode has I_WILL_FREE set.
  2757. - *
  2758. - * This function is designed to be called for writing back one inode which
  2759. - * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
  2760. - * and does more profound writeback list handling in writeback_sb_inodes().
  2761. - */
  2762. -static int
  2763. -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
  2764. -              struct writeback_control *wbc)
  2765. -{
  2766. -   int ret = 0;
  2767. -
  2768. -   spin_lock(&inode->i_lock);
  2769. -   if (!atomic_read(&inode->i_count))
  2770. -       WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
  2771. -   else
  2772. -       WARN_ON(inode->i_state & I_WILL_FREE);
  2773. -
  2774. -   if (inode->i_state & I_SYNC) {
  2775. -       if (wbc->sync_mode != WB_SYNC_ALL)
  2776. -           goto out;
  2777. -       /*
  2778. -        * It's a data-integrity sync. We must wait. Since callers hold
  2779. -        * inode reference or inode has I_WILL_FREE set, it cannot go
  2780. -        * away under us.
  2781. -        */
  2782. -       __inode_wait_for_writeback(inode);
  2783. -   }
  2784. -   WARN_ON(inode->i_state & I_SYNC);
  2785. -   /*
  2786. -    * Skip inode if it is clean and we have no outstanding writeback in
  2787. -    * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
  2788. -    * function since flusher thread may be doing for example sync in
  2789. -    * parallel and if we move the inode, it could get skipped. So here we
  2790. -    * make sure inode is on some writeback list and leave it there unless
  2791. -    * we have completely cleaned the inode.
  2792. -    */
  2793. -   if (!(inode->i_state & I_DIRTY_ALL) &&
  2794. -       (wbc->sync_mode != WB_SYNC_ALL ||
  2795. -        !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
  2796. -       goto out;
  2797. -   inode->i_state |= I_SYNC;
  2798. -   wbc_attach_and_unlock_inode(wbc, inode);
  2799. -
  2800. -   ret = __writeback_single_inode(inode, wbc);
  2801. -
  2802. -   wbc_detach_inode(wbc);
  2803. -   spin_lock(&wb->list_lock);
  2804. -   spin_lock(&inode->i_lock);
  2805. -   /*
  2806. -    * If inode is clean, remove it from writeback lists. Otherwise don't
  2807. -    * touch it. See comment above for explanation.
  2808. -    */
  2809. -   if (!(inode->i_state & I_DIRTY_ALL))
  2810. -       inode_io_list_del_locked(inode, wb);
  2811. -   spin_unlock(&wb->list_lock);
  2812. -   inode_sync_complete(inode);
  2813. -out:
  2814. -   spin_unlock(&inode->i_lock);
  2815. -   return ret;
  2816. -}
  2817. -
  2818. -static long writeback_chunk_size(struct bdi_writeback *wb,
  2819. -                struct wb_writeback_work *work)
  2820. -{
  2821. -   long pages;
  2822. -
  2823. -   /*
  2824. -    * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
  2825. -    * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
  2826. -    * here avoids calling into writeback_inodes_wb() more than once.
  2827. -    *
  2828. -    * The intended call sequence for WB_SYNC_ALL writeback is:
  2829. -    *
  2830. -    *      wb_writeback()
  2831. -    *          writeback_sb_inodes()       <== called only once
  2832. -    *              write_cache_pages()     <== called once for each inode
  2833. -    *                   (quickly) tag currently dirty pages
  2834. -    *                   (maybe slowly) sync all tagged pages
  2835. -    */
  2836. -   if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
  2837. -       pages = LONG_MAX;
  2838. -   else {
  2839. -       pages = min(wb->avg_write_bandwidth / 2,
  2840. -               global_wb_domain.dirty_limit / DIRTY_SCOPE);
  2841. -       pages = min(pages, work->nr_pages);
  2842. -       pages = round_down(pages + MIN_WRITEBACK_PAGES,
  2843. -                  MIN_WRITEBACK_PAGES);
  2844. -   }
  2845. -
  2846. -   return pages;
  2847. -}
  2848. -
  2849. -/*
  2850. - * Write a portion of b_io inodes which belong to @sb.
  2851. - *
  2852. - * Return the number of pages and/or inodes written.
  2853. - *
  2854. - * NOTE! This is called with wb->list_lock held, and will
  2855. - * unlock and relock that for each inode it ends up doing
  2856. - * IO for.
  2857. - */
  2858. -static long writeback_sb_inodes(struct super_block *sb,
  2859. -               struct bdi_writeback *wb,
  2860. -               struct wb_writeback_work *work)
  2861. -{
  2862. -   struct writeback_control wbc = {
  2863. -       .sync_mode      = work->sync_mode,
  2864. -       .tagged_writepages  = work->tagged_writepages,
  2865. -       .for_kupdate        = work->for_kupdate,
  2866. -       .for_background     = work->for_background,
  2867. -       .for_sync       = work->for_sync,
  2868. -       .range_cyclic       = work->range_cyclic,
  2869. -       .range_start        = 0,
  2870. -       .range_end      = LLONG_MAX,
  2871. -   };
  2872. -   unsigned long start_time = jiffies;
  2873. -   long write_chunk;
  2874. -   long wrote = 0;  /* count both pages and inodes */
  2875. -
  2876. -   while (!list_empty(&wb->b_io)) {
  2877. -       struct inode *inode = wb_inode(wb->b_io.prev);
  2878. -
  2879. -       if (inode->i_sb != sb) {
  2880. -           if (work->sb) {
  2881. -               /*
  2882. -                * We only want to write back data for this
  2883. -                * superblock, move all inodes not belonging
  2884. -                * to it back onto the dirty list.
  2885. -                */
  2886. -               redirty_tail(inode, wb);
  2887. -               continue;
  2888. -           }
  2889. -
  2890. -           /*
  2891. -            * The inode belongs to a different superblock.
  2892. -            * Bounce back to the caller to unpin this and
  2893. -            * pin the next superblock.
  2894. -            */
  2895. -           break;
  2896. -       }
  2897. -
  2898. -       /*
  2899. -        * Don't bother with new inodes or inodes being freed, first
  2900. -        * kind does not need periodic writeout yet, and for the latter
  2901. -        * kind writeout is handled by the freer.
  2902. -        */
  2903. -       spin_lock(&inode->i_lock);
  2904. -       if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
  2905. -           spin_unlock(&inode->i_lock);
  2906. -           redirty_tail(inode, wb);
  2907. -           continue;
  2908. -       }
  2909. -       if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
  2910. -           /*
  2911. -            * If this inode is locked for writeback and we are not
  2912. -            * doing writeback-for-data-integrity, move it to
  2913. -            * b_more_io so that writeback can proceed with the
  2914. -            * other inodes on s_io.
  2915. -            *
  2916. -            * We'll have another go at writing back this inode
  2917. -            * when we completed a full scan of b_io.
  2918. -            */
  2919. -           spin_unlock(&inode->i_lock);
  2920. -           requeue_io(inode, wb);
  2921. -           trace_writeback_sb_inodes_requeue(inode);
  2922. -           continue;
  2923. -       }
  2924. -       spin_unlock(&wb->list_lock);
  2925. -
  2926. -       /*
  2927. -        * We already requeued the inode if it had I_SYNC set and we
  2928. -        * are doing WB_SYNC_NONE writeback. So this catches only the
  2929. -        * WB_SYNC_ALL case.
  2930. -        */
  2931. -       if (inode->i_state & I_SYNC) {
  2932. -           /* Wait for I_SYNC. This function drops i_lock... */
  2933. -           inode_sleep_on_writeback(inode);
  2934. -           /* Inode may be gone, start again */
  2935. -           spin_lock(&wb->list_lock);
  2936. -           continue;
  2937. -       }
  2938. -       inode->i_state |= I_SYNC;
  2939. -       wbc_attach_and_unlock_inode(&wbc, inode);
  2940. -
  2941. -       write_chunk = writeback_chunk_size(wb, work);
  2942. -       wbc.nr_to_write = write_chunk;
  2943. -       wbc.pages_skipped = 0;
  2944. -
  2945. -       /*
  2946. -        * We use I_SYNC to pin the inode in memory. While it is set
  2947. -        * evict_inode() will wait so the inode cannot be freed.
  2948. -        */
  2949. -       __writeback_single_inode(inode, &wbc);
  2950. -
  2951. -       wbc_detach_inode(&wbc);
  2952. -       work->nr_pages -= write_chunk - wbc.nr_to_write;
  2953. -       wrote += write_chunk - wbc.nr_to_write;
  2954. -
  2955. -       if (need_resched()) {
  2956. -           /*
  2957. -            * We're trying to balance between building up a nice
  2958. -            * long list of IOs to improve our merge rate, and
  2959. -            * getting those IOs out quickly for anyone throttling
  2960. -            * in balance_dirty_pages().  cond_resched() doesn't
  2961. -            * unplug, so get our IOs out the door before we
  2962. -            * give up the CPU.
  2963. -            */
  2964. -           blk_flush_plug(current);
  2965. -           cond_resched();
  2966. -       }
  2967. -
  2968. -
  2969. -       spin_lock(&wb->list_lock);
  2970. -       spin_lock(&inode->i_lock);
  2971. -       if (!(inode->i_state & I_DIRTY_ALL))
  2972. -           wrote++;
  2973. -       requeue_inode(inode, wb, &wbc);
  2974. -       inode_sync_complete(inode);
  2975. -       spin_unlock(&inode->i_lock);
  2976. -
  2977. -       /*
  2978. -        * bail out to wb_writeback() often enough to check
  2979. -        * background threshold and other termination conditions.
  2980. -        */
  2981. -       if (wrote) {
  2982. -           if (time_is_before_jiffies(start_time + HZ / 10UL))
  2983. -               break;
  2984. -           if (work->nr_pages <= 0)
  2985. -               break;
  2986. -       }
  2987. -   }
  2988. -   return wrote;
  2989. -}
  2990. -
  2991. -static long __writeback_inodes_wb(struct bdi_writeback *wb,
  2992. -                 struct wb_writeback_work *work)
  2993. -{
  2994. -   unsigned long start_time = jiffies;
  2995. -   long wrote = 0;
  2996. -
  2997. -   while (!list_empty(&wb->b_io)) {
  2998. -       struct inode *inode = wb_inode(wb->b_io.prev);
  2999. -       struct super_block *sb = inode->i_sb;
  3000. -
  3001. -       if (!trylock_super(sb)) {
  3002. -           /*
  3003. -            * trylock_super() may fail consistently due to
  3004. -            * s_umount being grabbed by someone else. Don't use
  3005. -            * requeue_io() to avoid busy retrying the inode/sb.
  3006. -            */
  3007. -           redirty_tail(inode, wb);
  3008. -           continue;
  3009. -       }
  3010. -       wrote += writeback_sb_inodes(sb, wb, work);
  3011. -       up_read(&sb->s_umount);
  3012. -
  3013. -       /* refer to the same tests at the end of writeback_sb_inodes */
  3014. -       if (wrote) {
  3015. -           if (time_is_before_jiffies(start_time + HZ / 10UL))
  3016. -               break;
  3017. -           if (work->nr_pages <= 0)
  3018. -               break;
  3019. -       }
  3020. -   }
  3021. -   /* Leave any unwritten inodes on b_io */
  3022. -   return wrote;
  3023. -}
  3024. -
  3025. -static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
  3026. -               enum wb_reason reason)
  3027. -{
  3028. -   struct wb_writeback_work work = {
  3029. -       .nr_pages   = nr_pages,
  3030. -       .sync_mode  = WB_SYNC_NONE,
  3031. -       .range_cyclic   = 1,
  3032. -       .reason     = reason,
  3033. -   };
  3034. -   struct blk_plug plug;
  3035. -
  3036. -   blk_start_plug(&plug);
  3037. -   spin_lock(&wb->list_lock);
  3038. -   if (list_empty(&wb->b_io))
  3039. -       queue_io(wb, &work);
  3040. -   __writeback_inodes_wb(wb, &work);
  3041. -   spin_unlock(&wb->list_lock);
  3042. -   blk_finish_plug(&plug);
  3043. -
  3044. -   return nr_pages - work.nr_pages;
  3045. -}
  3046. -
  3047. -/*
  3048. - * Explicit flushing or periodic writeback of "old" data.
  3049. - *
  3050. - * Define "old": the first time one of an inode's pages is dirtied, we mark the
  3051. - * dirtying-time in the inode's address_space.  So this periodic writeback code
  3052. - * just walks the superblock inode list, writing back any inodes which are
  3053. - * older than a specific point in time.
  3054. - *
  3055. - * Try to run once per dirty_writeback_interval.  But if a writeback event
  3056. - * takes longer than a dirty_writeback_interval interval, then leave a
  3057. - * one-second gap.
  3058. - *
  3059. - * older_than_this takes precedence over nr_to_write.  So we'll only write back
  3060. - * all dirty pages if they are all attached to "old" mappings.
  3061. - */
  3062. -static long wb_writeback(struct bdi_writeback *wb,
  3063. -            struct wb_writeback_work *work)
  3064. -{
  3065. -   unsigned long wb_start = jiffies;
  3066. -   long nr_pages = work->nr_pages;
  3067. -   unsigned long oldest_jif;
  3068. -   struct inode *inode;
  3069. -   long progress;
  3070. -   struct blk_plug plug;
  3071. -
  3072. -   oldest_jif = jiffies;
  3073. -   work->older_than_this = &oldest_jif;
  3074. -
  3075. -   blk_start_plug(&plug);
  3076. -   spin_lock(&wb->list_lock);
  3077. -   for (;;) {
  3078. -       /*
  3079. -        * Stop writeback when nr_pages has been consumed
  3080. -        */
  3081. -       if (work->nr_pages <= 0)
  3082. -           break;
  3083. -
  3084. -       /*
  3085. -        * Background writeout and kupdate-style writeback may
  3086. -        * run forever. Stop them if there is other work to do
  3087. -        * so that e.g. sync can proceed. They'll be restarted
  3088. -        * after the other works are all done.
  3089. -        */
  3090. -       if ((work->for_background || work->for_kupdate) &&
  3091. -           !list_empty(&wb->work_list))
  3092. -           break;
  3093. -
  3094. -       /*
  3095. -        * For background writeout, stop when we are below the
  3096. -        * background dirty threshold
  3097. -        */
  3098. -       if (work->for_background && !wb_over_bg_thresh(wb))
  3099. -           break;
  3100. -
  3101. -       /*
  3102. -        * Kupdate and background works are special and we want to
  3103. -        * include all inodes that need writing. Livelock avoidance is
  3104. -        * handled by these works yielding to any other work so we are
  3105. -        * safe.
  3106. -        */
  3107. -       if (work->for_kupdate) {
  3108. -           oldest_jif = jiffies -
  3109. -               msecs_to_jiffies(dirty_expire_interval * 10);
  3110. -       } else if (work->for_background)
  3111. -           oldest_jif = jiffies;
  3112. -
  3113. -       trace_writeback_start(wb, work);
  3114. -       if (list_empty(&wb->b_io))
  3115. -           queue_io(wb, work);
  3116. -       if (work->sb)
  3117. -           progress = writeback_sb_inodes(work->sb, wb, work);
  3118. -       else
  3119. -           progress = __writeback_inodes_wb(wb, work);
  3120. -       trace_writeback_written(wb, work);
  3121. -
  3122. -       wb_update_bandwidth(wb, wb_start);
  3123. -
  3124. -       /*
  3125. -        * Did we write something? Try for more
  3126. -        *
  3127. -        * Dirty inodes are moved to b_io for writeback in batches.
  3128. -        * The completion of the current batch does not necessarily
  3129. -        * mean the overall work is done. So we keep looping as long
  3130. -        * as made some progress on cleaning pages or inodes.
  3131. -        */
  3132. -       if (progress)
  3133. -           continue;
  3134. -       /*
  3135. -        * No more inodes for IO, bail
  3136. -        */
  3137. -       if (list_empty(&wb->b_more_io))
  3138. -           break;
  3139. -       /*
  3140. -        * Nothing written. Wait for some inode to
  3141. -        * become available for writeback. Otherwise
  3142. -        * we'll just busyloop.
  3143. -        */
  3144. -       if (!list_empty(&wb->b_more_io))  {
  3145. -           trace_writeback_wait(wb, work);
  3146. -           inode = wb_inode(wb->b_more_io.prev);
  3147. -           spin_lock(&inode->i_lock);
  3148. -           spin_unlock(&wb->list_lock);
  3149. -           /* This function drops i_lock... */
  3150. -           inode_sleep_on_writeback(inode);
  3151. -           spin_lock(&wb->list_lock);
  3152. -       }
  3153. -   }
  3154. -   spin_unlock(&wb->list_lock);
  3155. -   blk_finish_plug(&plug);
  3156. -
  3157. -   return nr_pages - work->nr_pages;
  3158. -}
  3159. -
  3160. -/*
  3161. - * Return the next wb_writeback_work struct that hasn't been processed yet.
  3162. - */
  3163. -static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
  3164. -{
  3165. -   struct wb_writeback_work *work = NULL;
  3166. -
  3167. -   spin_lock_bh(&wb->work_lock);
  3168. -   if (!list_empty(&wb->work_list)) {
  3169. -       work = list_entry(wb->work_list.next,
  3170. -                 struct wb_writeback_work, list);
  3171. -       list_del_init(&work->list);
  3172. -   }
  3173. -   spin_unlock_bh(&wb->work_lock);
  3174. -   return work;
  3175. -}
  3176. -
  3177. -/*
  3178. - * Add in the number of potentially dirty inodes, because each inode
  3179. - * write can dirty pagecache in the underlying blockdev.
  3180. - */
  3181. -static unsigned long get_nr_dirty_pages(void)
  3182. -{
  3183. -   return global_page_state(NR_FILE_DIRTY) +
  3184. -       global_page_state(NR_UNSTABLE_NFS) +
  3185. -       get_nr_dirty_inodes();
  3186. -}
  3187. -
  3188. -static long wb_check_background_flush(struct bdi_writeback *wb)
  3189. -{
  3190. -   if (wb_over_bg_thresh(wb)) {
  3191. -
  3192. -       struct wb_writeback_work work = {
  3193. -           .nr_pages   = LONG_MAX,
  3194. -           .sync_mode  = WB_SYNC_NONE,
  3195. -           .for_background = 1,
  3196. -           .range_cyclic   = 1,
  3197. -           .reason     = WB_REASON_BACKGROUND,
  3198. -       };
  3199. -
  3200. -       return wb_writeback(wb, &work);
  3201. -   }
  3202. -
  3203. -   return 0;
  3204. -}
  3205. -
  3206. -static long wb_check_old_data_flush(struct bdi_writeback *wb)
  3207. -{
  3208. -   unsigned long expired;
  3209. -   long nr_pages;
  3210. -
  3211. -   /*
  3212. -    * When set to zero, disable periodic writeback
  3213. -    */
  3214. -   if (!dirty_writeback_interval)
  3215. -       return 0;
  3216. -
  3217. -   expired = wb->last_old_flush +
  3218. -           msecs_to_jiffies(dirty_writeback_interval * 10);
  3219. -   if (time_before(jiffies, expired))
  3220. -       return 0;
  3221. -
  3222. -   wb->last_old_flush = jiffies;
  3223. -   nr_pages = get_nr_dirty_pages();
  3224. -
  3225. -   if (nr_pages) {
  3226. -       struct wb_writeback_work work = {
  3227. -           .nr_pages   = nr_pages,
  3228. -           .sync_mode  = WB_SYNC_NONE,
  3229. -           .for_kupdate    = 1,
  3230. -           .range_cyclic   = 1,
  3231. -           .reason     = WB_REASON_PERIODIC,
  3232. -       };
  3233. -
  3234. -       return wb_writeback(wb, &work);
  3235. -   }
  3236. -
  3237. -   return 0;
  3238. -}
  3239. -
  3240. -/*
  3241. - * Retrieve work items and do the writeback they describe
  3242. - */
  3243. -static long wb_do_writeback(struct bdi_writeback *wb)
  3244. -{
  3245. -   struct wb_writeback_work *work;
  3246. -   long wrote = 0;
  3247. -
  3248. -   set_bit(WB_writeback_running, &wb->state);
  3249. -   while ((work = get_next_work_item(wb)) != NULL) {
  3250. -       struct wb_completion *done = work->done;
  3251. -
  3252. -       trace_writeback_exec(wb, work);
  3253. -
  3254. -       wrote += wb_writeback(wb, work);
  3255. -
  3256. -       if (work->auto_free)
  3257. -           kfree(work);
  3258. -       if (done && atomic_dec_and_test(&done->cnt))
  3259. -           wake_up_all(&wb->bdi->wb_waitq);
  3260. -   }
  3261. -
  3262. -   /*
  3263. -    * Check for periodic writeback, kupdated() style
  3264. -    */
  3265. -   wrote += wb_check_old_data_flush(wb);
  3266. -   wrote += wb_check_background_flush(wb);
  3267. -   clear_bit(WB_writeback_running, &wb->state);
  3268. -
  3269. -   return wrote;
  3270. -}
  3271. -
  3272. -/*
  3273. - * Handle writeback of dirty data for the device backed by this bdi. Also
  3274. - * reschedules periodically and does kupdated style flushing.
  3275. - */
  3276. -void wb_workfn(struct work_struct *work)
  3277. -{
  3278. -   struct bdi_writeback *wb = container_of(to_delayed_work(work),
  3279. -                       struct bdi_writeback, dwork);
  3280. -   long pages_written;
  3281. -
  3282. -   set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
  3283. -   current->flags |= PF_SWAPWRITE;
  3284. -
  3285. -   if (likely(!current_is_workqueue_rescuer() ||
  3286. -          !test_bit(WB_registered, &wb->state))) {
  3287. -       /*
  3288. -        * The normal path.  Keep writing back @wb until its
  3289. -        * work_list is empty.  Note that this path is also taken
  3290. -        * if @wb is shutting down even when we're running off the
  3291. -        * rescuer as work_list needs to be drained.
  3292. -        */
  3293. -       do {
  3294. -           pages_written = wb_do_writeback(wb);
  3295. -           trace_writeback_pages_written(pages_written);
  3296. -       } while (!list_empty(&wb->work_list));
  3297. -   } else {
  3298. -       /*
  3299. -        * bdi_wq can't get enough workers and we're running off
  3300. -        * the emergency worker.  Don't hog it.  Hopefully, 1024 is
  3301. -        * enough for efficient IO.
  3302. -        */
  3303. -       pages_written = writeback_inodes_wb(wb, 1024,
  3304. -                           WB_REASON_FORKER_THREAD);
  3305. -       trace_writeback_pages_written(pages_written);
  3306. -   }
  3307. -
  3308. -   if (!list_empty(&wb->work_list))
  3309. -       mod_delayed_work(bdi_wq, &wb->dwork, 0);
  3310. -   else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
  3311. -       wb_wakeup_delayed(wb);
  3312. -
  3313. -   current->flags &= ~PF_SWAPWRITE;
  3314. -}
  3315. -
  3316. -/*
  3317. - * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  3318. - * the whole world.
  3319. - */
  3320. -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
  3321. -{
  3322. -   struct backing_dev_info *bdi;
  3323. -
  3324. -   if (!nr_pages)
  3325. -       nr_pages = get_nr_dirty_pages();
  3326. -
  3327. -   rcu_read_lock();
  3328. -   list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
  3329. -       struct bdi_writeback *wb;
  3330. -
  3331. -       if (!bdi_has_dirty_io(bdi))
  3332. -           continue;
  3333. -
  3334. -       list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
  3335. -           wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
  3336. -                      false, reason);
  3337. -   }
  3338. -   rcu_read_unlock();
  3339. -}
  3340. -
  3341. -/*
  3342. - * Wake up bdi's periodically to make sure dirtytime inodes gets
  3343. - * written back periodically.  We deliberately do *not* check the
  3344. - * b_dirtytime list in wb_has_dirty_io(), since this would cause the
  3345. - * kernel to be constantly waking up once there are any dirtytime
  3346. - * inodes on the system.  So instead we define a separate delayed work
  3347. - * function which gets called much more rarely.  (By default, only
  3348. - * once every 12 hours.)
  3349. - *
  3350. - * If there is any other write activity going on in the file system,
  3351. - * this function won't be necessary.  But if the only thing that has
  3352. - * happened on the file system is a dirtytime inode caused by an atime
  3353. - * update, we need this infrastructure below to make sure that inode
  3354. - * eventually gets pushed out to disk.
  3355. - */
  3356. -static void wakeup_dirtytime_writeback(struct work_struct *w);
  3357. -static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
  3358. -
  3359. -static void wakeup_dirtytime_writeback(struct work_struct *w)
  3360. -{
  3361. -   struct backing_dev_info *bdi;
  3362. -
  3363. -   rcu_read_lock();
  3364. -   list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
  3365. -       struct bdi_writeback *wb;
  3366. -
  3367. -       list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
  3368. -           if (!list_empty(&wb->b_dirty_time))
  3369. -               wb_wakeup(wb);
  3370. -   }
  3371. -   rcu_read_unlock();
  3372. -   schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
  3373. -}
  3374. -
  3375. -static int __init start_dirtytime_writeback(void)
  3376. -{
  3377. -   schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
  3378. -   return 0;
  3379. -}
  3380. -__initcall(start_dirtytime_writeback);
  3381. -
  3382. -int dirtytime_interval_handler(struct ctl_table *table, int write,
  3383. -                  void __user *buffer, size_t *lenp, loff_t *ppos)
  3384. -{
  3385. -   int ret;
  3386. -
  3387. -   ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  3388. -   if (ret == 0 && write)
  3389. -       mod_delayed_work(system_wq, &dirtytime_work, 0);
  3390. -   return ret;
  3391. -}
  3392. -
  3393. -static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  3394. -{
  3395. -   if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
  3396. -       struct dentry *dentry;
  3397. -       const char *name = "?";
  3398. -
  3399. -       dentry = d_find_alias(inode);
  3400. -       if (dentry) {
  3401. -           spin_lock(&dentry->d_lock);
  3402. -           name = (const char *) dentry->d_name.name;
  3403. -       }
  3404. -       printk(KERN_DEBUG
  3405. -              "%s(%d): dirtied inode %lu (%s) on %s\n",
  3406. -              current->comm, task_pid_nr(current), inode->i_ino,
  3407. -              name, inode->i_sb->s_id);
  3408. -       if (dentry) {
  3409. -           spin_unlock(&dentry->d_lock);
  3410. -           dput(dentry);
  3411. -       }
  3412. -   }
  3413. -}
  3414. -
  3415. -/**
  3416. - * __mark_inode_dirty -    internal function
  3417. - * @inode: inode to mark
  3418. - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
  3419. - * Mark an inode as dirty. Callers should use mark_inode_dirty or
  3420. - *     mark_inode_dirty_sync.
  3421. - *
  3422. - * Put the inode on the super block's dirty list.
  3423. - *
  3424. - * CAREFUL! We mark it dirty unconditionally, but move it onto the
  3425. - * dirty list only if it is hashed or if it refers to a blockdev.
  3426. - * If it was not hashed, it will never be added to the dirty list
  3427. - * even if it is later hashed, as it will have been marked dirty already.
  3428. - *
  3429. - * In short, make sure you hash any inodes _before_ you start marking
  3430. - * them dirty.
  3431. - *
  3432. - * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
  3433. - * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
  3434. - * the kernel-internal blockdev inode represents the dirtying time of the
  3435. - * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
  3436. - * page->mapping->host, so the page-dirtying time is recorded in the internal
  3437. - * blockdev inode.
  3438. - */
  3439. -void __mark_inode_dirty(struct inode *inode, int flags)
  3440. -{
  3441. -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
  3442. -   struct super_block *sb = inode->i_sb;
  3443. -   int dirtytime;
  3444. -
  3445. -   trace_writeback_mark_inode_dirty(inode, flags);
  3446. -
  3447. -   /*
  3448. -    * Don't do this for I_DIRTY_PAGES - that doesn't actually
  3449. -    * dirty the inode itself
  3450. -    */
  3451. -   if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
  3452. -       trace_writeback_dirty_inode_start(inode, flags);
  3453. -
  3454. -       if (sb->s_op->dirty_inode)
  3455. -           sb->s_op->dirty_inode(inode, flags);
  3456. -
  3457. -       trace_writeback_dirty_inode(inode, flags);
  3458. -   }
  3459. -   if (flags & I_DIRTY_INODE)
  3460. -       flags &= ~I_DIRTY_TIME;
  3461. -   dirtytime = flags & I_DIRTY_TIME;
  3462. -
  3463. -   /*
  3464. -    * Paired with smp_mb() in __writeback_single_inode() for the
  3465. -    * following lockless i_state test.  See there for details.
  3466. -    */
  3467. -   smp_mb();
  3468. -
  3469. -   if (((inode->i_state & flags) == flags) ||
  3470. -       (dirtytime && (inode->i_state & I_DIRTY_INODE)))
  3471. -       return;
  3472. -
  3473. -   if (unlikely(block_dump))
  3474. -       block_dump___mark_inode_dirty(inode);
  3475. -
  3476. -   spin_lock(&inode->i_lock);
  3477. -   if (dirtytime && (inode->i_state & I_DIRTY_INODE))
  3478. -       goto out_unlock_inode;
  3479. -   if ((inode->i_state & flags) != flags) {
  3480. -       const int was_dirty = inode->i_state & I_DIRTY;
  3481. -
  3482. -       inode_attach_wb(inode, NULL);
  3483. -
  3484. -       if (flags & I_DIRTY_INODE)
  3485. -           inode->i_state &= ~I_DIRTY_TIME;
  3486. -       inode->i_state |= flags;
  3487. -
  3488. -       /*
  3489. -        * If the inode is being synced, just update its dirty state.
  3490. -        * The unlocker will place the inode on the appropriate
  3491. -        * superblock list, based upon its state.
  3492. -        */
  3493. -       if (inode->i_state & I_SYNC)
  3494. -           goto out_unlock_inode;
  3495. -
  3496. -       /*
  3497. -        * Only add valid (hashed) inodes to the superblock's
  3498. -        * dirty list.  Add blockdev inodes as well.
  3499. -        */
  3500. -       if (!S_ISBLK(inode->i_mode)) {
  3501. -           if (inode_unhashed(inode))
  3502. -               goto out_unlock_inode;
  3503. -       }
  3504. -       if (inode->i_state & I_FREEING)
  3505. -           goto out_unlock_inode;
  3506. -
  3507. -       /*
  3508. -        * If the inode was already on b_dirty/b_io/b_more_io, don't
  3509. -        * reposition it (that would break b_dirty time-ordering).
  3510. -        */
  3511. -       if (!was_dirty) {
  3512. -           struct bdi_writeback *wb;
  3513. -           struct list_head *dirty_list;
  3514. -           bool wakeup_bdi = false;
  3515. -
  3516. -           wb = locked_inode_to_wb_and_lock_list(inode);
  3517. -
  3518. -           WARN(bdi_cap_writeback_dirty(wb->bdi) &&
  3519. -                !test_bit(WB_registered, &wb->state),
  3520. -                "bdi-%s not registered\n", wb->bdi->name);
  3521. -
  3522. -           inode->dirtied_when = jiffies;
  3523. -           if (dirtytime)
  3524. -               inode->dirtied_time_when = jiffies;
  3525. -
  3526. -           if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
  3527. -               dirty_list = &wb->b_dirty;
  3528. -           else
  3529. -               dirty_list = &wb->b_dirty_time;
  3530. -
  3531. -           wakeup_bdi = inode_io_list_move_locked(inode, wb,
  3532. -                                  dirty_list);
  3533. -
  3534. -           spin_unlock(&wb->list_lock);
  3535. -           trace_writeback_dirty_inode_enqueue(inode);
  3536. -
  3537. -           /*
  3538. -            * If this is the first dirty inode for this bdi,
  3539. -            * we have to wake-up the corresponding bdi thread
  3540. -            * to make sure background write-back happens
  3541. -            * later.
  3542. -            */
  3543. -           if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
  3544. -               wb_wakeup_delayed(wb);
  3545. -           return;
  3546. -       }
  3547. -   }
  3548. -out_unlock_inode:
  3549. -   spin_unlock(&inode->i_lock);
  3550. -
  3551. -#undef I_DIRTY_INODE
  3552. -}
  3553. -EXPORT_SYMBOL(__mark_inode_dirty);
  3554. -
  3555. -/*
  3556. - * The @s_sync_lock is used to serialise concurrent sync operations
  3557. - * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
  3558. - * Concurrent callers will block on the s_sync_lock rather than doing contending
  3559. - * walks. The queueing maintains sync(2) required behaviour as all the IO that
  3560. - * has been issued up to the time this function is enter is guaranteed to be
  3561. - * completed by the time we have gained the lock and waited for all IO that is
  3562. - * in progress regardless of the order callers are granted the lock.
  3563. - */
  3564. -static void wait_sb_inodes(struct super_block *sb)
  3565. -{
  3566. -   struct inode *inode, *old_inode = NULL;
  3567. -
  3568. -   /*
  3569. -    * We need to be protected against the filesystem going from
  3570. -    * r/o to r/w or vice versa.
  3571. -    */
  3572. -   WARN_ON(!rwsem_is_locked(&sb->s_umount));
  3573. -
  3574. -   mutex_lock(&sb->s_sync_lock);
  3575. -   spin_lock(&sb->s_inode_list_lock);
  3576. -
  3577. -   /*
  3578. -    * Data integrity sync. Must wait for all pages under writeback,
  3579. -    * because there may have been pages dirtied before our sync
  3580. -    * call, but which had writeout started before we write it out.
  3581. -    * In which case, the inode may not be on the dirty list, but
  3582. -    * we still have to wait for that writeout.
  3583. -    */
  3584. -   list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
  3585. -       struct address_space *mapping = inode->i_mapping;
  3586. -
  3587. -       spin_lock(&inode->i_lock);
  3588. -       if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
  3589. -           (mapping->nrpages == 0)) {
  3590. -           spin_unlock(&inode->i_lock);
  3591. -           continue;
  3592. -       }
  3593. -       __iget(inode);
  3594. -       spin_unlock(&inode->i_lock);
  3595. -       spin_unlock(&sb->s_inode_list_lock);
  3596. -
  3597. -       /*
  3598. -        * We hold a reference to 'inode' so it couldn't have been
  3599. -        * removed from s_inodes list while we dropped the
  3600. -        * s_inode_list_lock.  We cannot iput the inode now as we can
  3601. -        * be holding the last reference and we cannot iput it under
  3602. -        * s_inode_list_lock. So we keep the reference and iput it
  3603. -        * later.
  3604. -        */
  3605. -       iput(old_inode);
  3606. -       old_inode = inode;
  3607. -
  3608. -       /*
  3609. -        * We keep the error status of individual mapping so that
  3610. -        * applications can catch the writeback error using fsync(2).
  3611. -        * See filemap_fdatawait_keep_errors() for details.
  3612. -        */
  3613. -       filemap_fdatawait_keep_errors(mapping);
  3614. -
  3615. -       cond_resched();
  3616. -
  3617. -       spin_lock(&sb->s_inode_list_lock);
  3618. -   }
  3619. -   spin_unlock(&sb->s_inode_list_lock);
  3620. -   iput(old_inode);
  3621. -   mutex_unlock(&sb->s_sync_lock);
  3622. -}
  3623. -
  3624. -static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
  3625. -                    enum wb_reason reason, bool skip_if_busy)
  3626. -{
  3627. -   DEFINE_WB_COMPLETION_ONSTACK(done);
  3628. -   struct wb_writeback_work work = {
  3629. -       .sb         = sb,
  3630. -       .sync_mode      = WB_SYNC_NONE,
  3631. -       .tagged_writepages  = 1,
  3632. -       .done           = &done,
  3633. -       .nr_pages       = nr,
  3634. -       .reason         = reason,
  3635. -   };
  3636. -   struct backing_dev_info *bdi = sb->s_bdi;
  3637. -
  3638. -   if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
  3639. -       return;
  3640. -   WARN_ON(!rwsem_is_locked(&sb->s_umount));
  3641. -
  3642. -   bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
  3643. -   wb_wait_for_completion(bdi, &done);
  3644. -}
  3645. -
  3646. -/**
  3647. - * writeback_inodes_sb_nr -    writeback dirty inodes from given super_block
  3648. - * @sb: the superblock
  3649. - * @nr: the number of pages to write
  3650. - * @reason: reason why some writeback work initiated
  3651. - *
  3652. - * Start writeback on some inodes on this super_block. No guarantees are made
  3653. - * on how many (if any) will be written, and this function does not wait
  3654. - * for IO completion of submitted IO.
  3655. - */
  3656. -void writeback_inodes_sb_nr(struct super_block *sb,
  3657. -               unsigned long nr,
  3658. -               enum wb_reason reason)
  3659. -{
  3660. -   __writeback_inodes_sb_nr(sb, nr, reason, false);
  3661. -}
  3662. -EXPORT_SYMBOL(writeback_inodes_sb_nr);
  3663. -
  3664. -/**
  3665. - * writeback_inodes_sb -   writeback dirty inodes from given super_block
  3666. - * @sb: the superblock
  3667. - * @reason: reason why some writeback work was initiated
  3668. - *
  3669. - * Start writeback on some inodes on this super_block. No guarantees are made
  3670. - * on how many (if any) will be written, and this function does not wait
  3671. - * for IO completion of submitted IO.
  3672. - */
  3673. -void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
  3674. -{
  3675. -   return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
  3676. -}
  3677. -EXPORT_SYMBOL(writeback_inodes_sb);
  3678. -
  3679. -/**
  3680. - * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
  3681. - * @sb: the superblock
  3682. - * @nr: the number of pages to write
  3683. - * @reason: the reason of writeback
  3684. - *
  3685. - * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
  3686. - * Returns 1 if writeback was started, 0 if not.
  3687. - */
  3688. -bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
  3689. -                  enum wb_reason reason)
  3690. -{
  3691. -   if (!down_read_trylock(&sb->s_umount))
  3692. -       return false;
  3693. -
  3694. -   __writeback_inodes_sb_nr(sb, nr, reason, true);
  3695. -   up_read(&sb->s_umount);
  3696. -   return true;
  3697. -}
  3698. -EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
  3699. -
  3700. -/**
  3701. - * try_to_writeback_inodes_sb - try to start writeback if none underway
  3702. - * @sb: the superblock
  3703. - * @reason: reason why some writeback work was initiated
  3704. - *
  3705. - * Implement by try_to_writeback_inodes_sb_nr()
  3706. - * Returns 1 if writeback was started, 0 if not.
  3707. - */
  3708. -bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
  3709. -{
  3710. -   return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
  3711. -}
  3712. -EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  3713. -
  3714. -/**
  3715. - * sync_inodes_sb  -   sync sb inode pages
  3716. - * @sb: the superblock
  3717. - *
  3718. - * This function writes and waits on any dirty inode belonging to this
  3719. - * super_block.
  3720. - */
  3721. -void sync_inodes_sb(struct super_block *sb)
  3722. -{
  3723. -   DEFINE_WB_COMPLETION_ONSTACK(done);
  3724. -   struct wb_writeback_work work = {
  3725. -       .sb     = sb,
  3726. -       .sync_mode  = WB_SYNC_ALL,
  3727. -       .nr_pages   = LONG_MAX,
  3728. -       .range_cyclic   = 0,
  3729. -       .done       = &done,
  3730. -       .reason     = WB_REASON_SYNC,
  3731. -       .for_sync   = 1,
  3732. -   };
  3733. -   struct backing_dev_info *bdi = sb->s_bdi;
  3734. -
  3735. -   /*
  3736. -    * Can't skip on !bdi_has_dirty() because we should wait for !dirty
  3737. -    * inodes under writeback and I_DIRTY_TIME inodes ignored by
  3738. -    * bdi_has_dirty() need to be written out too.
  3739. -    */
  3740. -   if (bdi == &noop_backing_dev_info)
  3741. -       return;
  3742. -   WARN_ON(!rwsem_is_locked(&sb->s_umount));
  3743. -
  3744. -   bdi_split_work_to_wbs(bdi, &work, false);
  3745. -   wb_wait_for_completion(bdi, &done);
  3746. -
  3747. -   wait_sb_inodes(sb);
  3748. -}
  3749. -EXPORT_SYMBOL(sync_inodes_sb);
  3750. -
  3751. -/**
  3752. - * write_inode_now -   write an inode to disk
  3753. - * @inode: inode to write to disk
  3754. - * @sync: whether the write should be synchronous or not
  3755. - *
  3756. - * This function commits an inode to disk immediately if it is dirty. This is
  3757. - * primarily needed by knfsd.
  3758. - *
  3759. - * The caller must either have a ref on the inode or must have set I_WILL_FREE.
  3760. - */
  3761. -int write_inode_now(struct inode *inode, int sync)
  3762. -{
  3763. -   struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
  3764. -   struct writeback_control wbc = {
  3765. -       .nr_to_write = LONG_MAX,
  3766. -       .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
  3767. -       .range_start = 0,
  3768. -       .range_end = LLONG_MAX,
  3769. -   };
  3770. -
  3771. -   if (!mapping_cap_writeback_dirty(inode->i_mapping))
  3772. -       wbc.nr_to_write = 0;
  3773. -
  3774. -   might_sleep();
  3775. -   return writeback_single_inode(inode, wb, &wbc);
  3776. -}
  3777. -EXPORT_SYMBOL(write_inode_now);
  3778. -
  3779. -/**
  3780. - * sync_inode - write an inode and its pages to disk.
  3781. - * @inode: the inode to sync
  3782. - * @wbc: controls the writeback mode
  3783. - *
  3784. - * sync_inode() will write an inode and its pages to disk.  It will also
  3785. - * correctly update the inode on its superblock's dirty inode lists and will
  3786. - * update inode->i_state.
  3787. - *
  3788. - * The caller must have a ref on the inode.
  3789. - */
  3790. -int sync_inode(struct inode *inode, struct writeback_control *wbc)
  3791. -{
  3792. -   return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
  3793. -}
  3794. -EXPORT_SYMBOL(sync_inode);
  3795. -
  3796. -/**
  3797. - * sync_inode_metadata - write an inode to disk
  3798. - * @inode: the inode to sync
  3799. - * @wait: wait for I/O to complete.
  3800. - *
  3801. - * Write an inode to disk and adjust its dirty state after completion.
  3802. - *
  3803. - * Note: only writes the actual inode, no associated data or other metadata.
  3804. - */
  3805. -int sync_inode_metadata(struct inode *inode, int wait)
  3806. -{
  3807. -   struct writeback_control wbc = {
  3808. -       .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
  3809. -       .nr_to_write = 0, /* metadata-only */
  3810. -   };
  3811. -
  3812. -   return sync_inode(inode, &wbc);
  3813. -}
  3814. -EXPORT_SYMBOL(sync_inode_metadata);
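
A quick worked example of the writeback_chunk_size() arithmetic removed above, for WB_SYNC_NONE writeback (illustrative numbers only; DIRTY_SCOPE == 8 and MIN_WRITEBACK_PAGES == 1024 pages with 4K pages are the stock 4.4 values and should be re-checked against the tree):

    /*
     * Assume avg_write_bandwidth ~= 25600 pages/s (~100 MB/s), a global
     * dirty_limit of 200000 pages, and work->nr_pages larger than both:
     *
     *   pages = min(25600 / 2, 200000 / 8)        = 12800
     *   pages = min(pages, work->nr_pages)        = 12800
     *   pages = round_down(12800 + 1024, 1024)    = 13312
     *
     * so one writeback_sb_inodes() pass writes at most ~52 MB (13312 pages)
     * from a single inode before re-checking the termination conditions,
     * while WB_SYNC_ALL/tagged_writepages passes use LONG_MAX instead.
     */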
  3815. diff -Naur linux-4.4.6-gentoo-orig/fs/gfs2/meta_io.c linux-4.4.6-gentoo-patched/fs/gfs2/meta_io.c
  3816. --- linux-4.4.6-gentoo-orig/fs/gfs2/meta_io.c   2016-05-04 11:19:37.613649828 +0300
  3817. +++ linux-4.4.6-gentoo-patched/fs/gfs2/meta_io.c    2016-05-04 11:03:27.410730745 +0300
  3818. @@ -37,8 +37,7 @@
  3819.  {
  3820.     struct buffer_head *bh, *head;
  3821.     int nr_underway = 0;
  3822. -   int write_op = REQ_META | REQ_PRIO |
  3823. -       (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
  3824. +   int write_op = REQ_META | REQ_PRIO | wbc_to_write_cmd(wbc);
  3825.  
  3826.     BUG_ON(!PageLocked(page));
  3827.     BUG_ON(!page_has_buffers(page));
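
This gfs2 hunk, and the fs/mpage.c and fs/xfs hunks that follow, replace the open-coded "sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE" choice with wbc_to_write_cmd(). The helper itself is introduced in the include/linux/writeback.h part of the patch, which is not in this excerpt; given the WRITE_BG flag added to include/linux/fs.h further down, it is expected to look roughly like the sketch below (an approximation, not the authoritative definition):

    static inline int wbc_to_write_cmd(struct writeback_control *wbc)
    {
    	if (wbc->sync_mode == WB_SYNC_ALL)
    		return WRITE_SYNC;	/* data-integrity writeback */
    	else if (wbc->for_kupdate || wbc->for_background)
    		return WRITE_BG;	/* periodic / background flush */

    	return WRITE;
    }

The point of the conversion is that the write command is derived in one place from the writeback_control, so kupdate and background writeback can be tagged with REQ_BG and throttled by the new rq_wb code without touching every filesystem call site again.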
  3828. diff -Naur linux-4.4.6-gentoo-orig/fs/mpage.c linux-4.4.6-gentoo-patched/fs/mpage.c
  3829. --- linux-4.4.6-gentoo-orig/fs/mpage.c  2016-05-04 11:19:37.614649827 +0300
  3830. +++ linux-4.4.6-gentoo-patched/fs/mpage.c   2016-05-04 11:03:27.410730745 +0300
  3831. @@ -485,7 +485,6 @@
  3832.     struct buffer_head map_bh;
  3833.     loff_t i_size = i_size_read(inode);
  3834.     int ret = 0;
  3835. -   int wr = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
  3836.  
  3837.     if (page_has_buffers(page)) {
  3838.         struct buffer_head *head = page_buffers(page);
  3839. @@ -594,7 +593,7 @@
  3840.      * This page will go to BIO.  Do we need to send this BIO off first?
  3841.      */
  3842.     if (bio && mpd->last_block_in_bio != blocks[0] - 1)
  3843. -       bio = mpage_bio_submit(wr, bio);
  3844. +       bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
  3845.  
  3846.  alloc_new:
  3847.     if (bio == NULL) {
  3848. @@ -621,7 +620,7 @@
  3849.     wbc_account_io(wbc, page, PAGE_SIZE);
  3850.     length = first_unmapped << blkbits;
  3851.     if (bio_add_page(bio, page, length, 0) < length) {
  3852. -       bio = mpage_bio_submit(wr, bio);
  3853. +       bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
  3854.         goto alloc_new;
  3855.     }
  3856.  
  3857. @@ -631,7 +630,7 @@
  3858.     set_page_writeback(page);
  3859.     unlock_page(page);
  3860.     if (boundary || (first_unmapped != blocks_per_page)) {
  3861. -       bio = mpage_bio_submit(wr, bio);
  3862. +       bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
  3863.         if (boundary_block) {
  3864.             write_boundary_block(boundary_bdev,
  3865.                     boundary_block, 1 << blkbits);
  3866. @@ -643,7 +642,7 @@
  3867.  
  3868.  confused:
  3869.     if (bio)
  3870. -       bio = mpage_bio_submit(wr, bio);
  3871. +       bio = mpage_bio_submit(wbc_to_write_cmd(wbc), bio);
  3872.  
  3873.     if (mpd->use_writepage) {
  3874.         ret = mapping->a_ops->writepage(page, wbc);
  3875. diff -Naur linux-4.4.6-gentoo-orig/fs/xfs/xfs_aops.c linux-4.4.6-gentoo-patched/fs/xfs/xfs_aops.c
  3876. --- linux-4.4.6-gentoo-orig/fs/xfs/xfs_aops.c   2016-05-04 11:19:37.614649827 +0300
  3877. +++ linux-4.4.6-gentoo-patched/fs/xfs/xfs_aops.c    2016-05-04 11:03:27.410730745 +0300
  3878. @@ -382,7 +382,7 @@
  3879.     atomic_inc(&ioend->io_remaining);
  3880.     bio->bi_private = ioend;
  3881.     bio->bi_end_io = xfs_end_bio;
  3882. -   submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
  3883. +   submit_bio(wbc_to_write_cmd(wbc), bio);
  3884.  }
  3885.  
  3886.  STATIC struct bio *
  3887. diff -Naur linux-4.4.6-gentoo-orig/include/linux/backing-dev-defs.h linux-4.4.6-gentoo-patched/include/linux/backing-dev-defs.h
  3888. --- linux-4.4.6-gentoo-orig/include/linux/backing-dev-defs.h    2016-05-04 11:19:37.615649827 +0300
  3889. +++ linux-4.4.6-gentoo-patched/include/linux/backing-dev-defs.h 2016-05-04 11:03:27.410730745 +0300
  3890. @@ -116,6 +116,8 @@
  3891.     struct list_head work_list;
  3892.     struct delayed_work dwork;  /* work item used for writeback */
  3893.  
  3894. +   atomic_t dirty_sleeping;    /* waiting on dirty limit exceeded */
  3895. +
  3896.     struct list_head bdi_node;  /* anchored at bdi->wb_list */
  3897.  
  3898.  #ifdef CONFIG_CGROUP_WRITEBACK
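
The new dirty_sleeping counter records how many tasks are currently blocked in the dirty-limit sleep, so the block-layer throttling can distinguish idle background writeback from writeback that someone is actively waiting on. The producer side is expected to live in the mm/page-writeback.c part of the patch (not in this excerpt); the assumed pattern is the hypothetical wrapper below (the name dirty_limit_sleep is illustrative only):

    /* Hypothetical sketch of the balance_dirty_pages() side: bracket the
     * throttle sleep so the block layer can see dirtiers are blocked now. */
    static void dirty_limit_sleep(struct bdi_writeback *wb, long pause)
    {
    	atomic_inc(&wb->dirty_sleeping);
    	io_schedule_timeout(pause);
    	atomic_dec(&wb->dirty_sleeping);
    }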
  3899. diff -Naur linux-4.4.6-gentoo-orig/include/linux/blkdev.h linux-4.4.6-gentoo-patched/include/linux/blkdev.h
  3900. --- linux-4.4.6-gentoo-orig/include/linux/blkdev.h  2016-05-04 11:19:37.615649827 +0300
  3901. +++ linux-4.4.6-gentoo-patched/include/linux/blkdev.h   2016-05-04 11:03:27.410730745 +0300
  3902. @@ -23,6 +23,7 @@
  3903.  #include <linux/rcupdate.h>
  3904.  #include <linux/percpu-refcount.h>
  3905.  #include <linux/scatterlist.h>
  3906. +#include <linux/wbt.h>
  3907.  
  3908.  struct module;
  3909.  struct scsi_ioctl_command;
  3910. @@ -36,6 +37,7 @@
  3911.  struct blkcg_gq;
  3912.  struct blk_flush_queue;
  3913.  struct pr_ops;
  3914. +struct rq_wb;
  3915.  
  3916.  #define BLKDEV_MIN_RQ  4
  3917.  #define BLKDEV_MAX_RQ  128 /* Default maximum */
  3918. @@ -152,6 +154,7 @@
  3919.     struct gendisk *rq_disk;
  3920.     struct hd_struct *part;
  3921.     unsigned long start_time;
  3922. +   struct wb_issue_stat wb_stat;
  3923.  #ifdef CONFIG_BLK_CGROUP
  3924.     struct request_list *rl;        /* rl this rq is alloced from */
  3925.     unsigned long long start_time_ns;
  3926. @@ -289,6 +292,8 @@
  3927.     int         nr_rqs[2];  /* # allocated [a]sync rqs */
  3928.     int         nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
  3929.  
  3930. +   struct rq_wb        *rq_wb;
  3931. +
  3932.     /*
  3933.      * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
  3934.      * is used, root blkg allocates from @q->root_rl and all other
  3935. @@ -314,6 +319,8 @@
  3936.     struct blk_mq_ctx __percpu  *queue_ctx;
  3937.     unsigned int        nr_queues;
  3938.  
  3939. +   unsigned int        queue_depth;
  3940. +
  3941.     /* hw dispatch queues */
  3942.     struct blk_mq_hw_ctx    **queue_hw_ctx;
  3943.     unsigned int        nr_hw_queues;
  3944. @@ -399,6 +406,9 @@
  3945.  
  3946.     unsigned int        nr_sorted;
  3947.     unsigned int        in_flight[2];
  3948. +
  3949. +   struct blk_rq_stat  rq_stats[2];
  3950. +
  3951.     /*
  3952.      * Number of active block driver functions for which blk_drain_queue()
  3953.      * must wait. Must be incremented around functions that unlock the
  3954. @@ -431,8 +441,6 @@
  3955.     /*
  3956.      * for flush operations
  3957.      */
  3958. -   unsigned int        flush_flags;
  3959. -   unsigned int        flush_not_queueable:1;
  3960.     struct blk_flush_queue  *fq;
  3961.  
  3962.     struct list_head    requeue_list;
  3963. @@ -489,6 +497,9 @@
  3964.  #define QUEUE_FLAG_INIT_DONE   20  /* queue is initialized */
  3965.  #define QUEUE_FLAG_NO_SG_MERGE 21  /* don't attempt to merge SG segments*/
  3966.  #define QUEUE_FLAG_POLL           22   /* IO polling enabled if set */
  3967. +#define QUEUE_FLAG_WC         23   /* Write back caching */
  3968. +#define QUEUE_FLAG_FUA        24   /* device supports FUA writes */
  3969. +#define QUEUE_FLAG_FLUSH_NQ    25  /* flush not queueuable */
  3970.  
  3971.  #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) |        \
  3972.                  (1 << QUEUE_FLAG_STACKABLE)    |   \
  3973. @@ -677,6 +688,14 @@
  3974.     return false;
  3975.  }
  3976.  
  3977. +static inline unsigned int blk_queue_depth(struct request_queue *q)
  3978. +{
  3979. +   if (q->queue_depth)
  3980. +       return q->queue_depth;
  3981. +
  3982. +   return q->nr_requests;
  3983. +}
  3984. +
  3985.  /*
  3986.   * q->prep_rq_fn return values
  3987.   */
  3988. @@ -977,6 +996,7 @@
  3989.  extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
  3990.  extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
  3991.  extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
  3992. +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
  3993.  extern void blk_set_default_limits(struct queue_limits *lim);
  3994.  extern void blk_set_stacking_limits(struct queue_limits *lim);
  3995.  extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
  3996. @@ -1001,8 +1021,8 @@
  3997.  extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
  3998.  extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
  3999.  extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
  4000. -extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
  4001.  extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
  4002. +extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
  4003.  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
  4004.  
  4005.  extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
  4006. @@ -1355,7 +1375,7 @@
  4007.  
  4008.  static inline bool queue_flush_queueable(struct request_queue *q)
  4009.  {
  4010. -   return !q->flush_not_queueable;
  4011. +   return !test_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
  4012.  }
  4013.  
  4014.  typedef struct {struct page *v;} Sector;
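
Taken together, the blkdev.h changes above give the queue a write-cache description (QUEUE_FLAG_WC / QUEUE_FLAG_FUA / QUEUE_FLAG_FLUSH_NQ replacing flush_flags and flush_not_queueable), an optional hardware queue depth, per-direction completion stats in rq_stats[], and the rq_wb throttling state plus per-request wb_stat. For drivers the visible part is the switch from blk_queue_flush() to blk_queue_write_cache(), optionally paired with blk_set_queue_depth(); a minimal sketch of that pattern (the function name and depth value are illustrative, not from the patch):

    /* Illustrative driver setup; 31 is an arbitrary example depth. */
    static void example_setup_queue(struct request_queue *q)
    {
    	/* volatile write cache present, no FUA support */
    	blk_queue_write_cache(q, true, false);

    	/* advertise the real hardware depth; blk_queue_depth() falls back
    	 * to q->nr_requests when this is left at 0 */
    	blk_set_queue_depth(q, 31);
    }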
  4015. diff -Naur linux-4.4.6-gentoo-orig/include/linux/blk_types.h linux-4.4.6-gentoo-patched/include/linux/blk_types.h
  4016. --- linux-4.4.6-gentoo-orig/include/linux/blk_types.h   2016-05-04 11:19:37.616649827 +0300
  4017. +++ linux-4.4.6-gentoo-patched/include/linux/blk_types.h    2016-05-04 11:03:27.410730745 +0300
  4018. @@ -161,6 +161,7 @@
  4019.     __REQ_INTEGRITY,    /* I/O includes block integrity payload */
  4020.     __REQ_FUA,      /* forced unit access */
  4021.     __REQ_FLUSH,        /* request for cache flush */
  4022. +   __REQ_BG,       /* background activity */
  4023.  
  4024.     /* bio only flags */
  4025.     __REQ_RAHEAD,       /* read ahead, can fail anytime */
  4026. @@ -209,7 +210,7 @@
  4027.  #define REQ_COMMON_MASK \
  4028.     (REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
  4029.      REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
  4030. -    REQ_SECURE | REQ_INTEGRITY)
  4031. +    REQ_SECURE | REQ_INTEGRITY | REQ_BG)
  4032.  #define REQ_CLONE_MASK     REQ_COMMON_MASK
  4033.  
  4034.  #define BIO_NO_ADVANCE_ITER_MASK   (REQ_DISCARD|REQ_WRITE_SAME)
  4035. @@ -236,6 +237,7 @@
  4036.  #define REQ_COPY_USER      (1ULL << __REQ_COPY_USER)
  4037.  #define REQ_FLUSH      (1ULL << __REQ_FLUSH)
  4038.  #define REQ_FLUSH_SEQ      (1ULL << __REQ_FLUSH_SEQ)
  4039. +#define REQ_BG         (1ULL << __REQ_BG)
  4040.  #define REQ_IO_STAT        (1ULL << __REQ_IO_STAT)
  4041.  #define REQ_MIXED_MERGE        (1ULL << __REQ_MIXED_MERGE)
  4042.  #define REQ_SECURE     (1ULL << __REQ_SECURE)
  4043. @@ -268,4 +270,12 @@
  4044.     return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
  4045.  }
  4046.  
  4047. +struct blk_rq_stat {
  4048. +   s64 mean;
  4049. +   u64 min;
  4050. +   u64 max;
  4051. +   s64 nr_samples;
  4052. +   s64 time;
  4053. +};
  4054. +
  4055.  #endif /* __LINUX_BLK_TYPES_H */
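
blk_types.h gains the two pieces the throttling builds on: the REQ_BG request flag (added to REQ_COMMON_MASK so it propagates from the bio into the struct request), and struct blk_rq_stat, which evidently holds min/mean/max over nr_samples for a time window and is kept per direction in q->rq_stats[] (added in the blkdev.h hunk above). A hypothetical classifier shows how REQ_BG is meant to be consumed; the real check lives in the wbt code elsewhere in the patch:

    /* Illustration only: a request built from a WRITE_BG bio carries both
     * REQ_WRITE and REQ_BG, which is what the buffered-writeback throttle
     * singles out. */
    static inline bool req_is_background_write(u64 cmd_flags)
    {
    	return (cmd_flags & (REQ_WRITE | REQ_BG)) == (REQ_WRITE | REQ_BG);
    }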
  4056. diff -Naur linux-4.4.6-gentoo-orig/include/linux/fs.h linux-4.4.6-gentoo-patched/include/linux/fs.h
  4057. --- linux-4.4.6-gentoo-orig/include/linux/fs.h  2016-05-04 11:19:37.616649827 +0300
  4058. +++ linux-4.4.6-gentoo-patched/include/linux/fs.h   2016-05-04 11:03:27.411730745 +0300
  4059. @@ -189,6 +189,9 @@
  4060.   * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded
  4061.   *         by a cache flush and data is guaranteed to be on
  4062.   *         non-volatile media on completion.
  4063. + * WRITE_BG        Background write. This is for background activity like
  4064. + *         the periodic flush and background threshold writeback
  4065. + *
  4066.   *
  4067.   */
  4068.  #define RW_MASK            REQ_WRITE
  4069. @@ -204,6 +207,7 @@
  4070.  #define WRITE_FLUSH        (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
  4071.  #define WRITE_FUA      (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
  4072.  #define WRITE_FLUSH_FUA        (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
  4073. +#define WRITE_BG       (WRITE | REQ_NOIDLE | REQ_BG)
  4074.  
  4075.  /*
  4076.   * Attribute flags.  These should be or-ed together to figure out what
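
For comparison with the existing write types above (stock 4.4 definitions), WRITE_SYNC is WRITE | REQ_SYNC | REQ_NOIDLE, so the new WRITE_BG differs from it only in carrying REQ_BG instead of REQ_SYNC:

    /*
     *   WRITE_SYNC = WRITE | REQ_SYNC | REQ_NOIDLE
     *   WRITE_BG   = WRITE | REQ_NOIDLE | REQ_BG
     *
     * Leaving out REQ_SYNC keeps kupdate/background writeback in the async
     * bucket, and REQ_BG marks it as the traffic the rq_wb throttle targets.
     */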
  4077. diff -Naur linux-4.4.6-gentoo-orig/include/linux/sched.h.orig linux-4.4.6-gentoo-patched/include/linux/sched.h.orig
  4078. --- linux-4.4.6-gentoo-orig/include/linux/sched.h.orig  2016-05-04 11:19:37.618649827 +0300
  4079. +++ linux-4.4.6-gentoo-patched/include/linux/sched.h.orig   1970-01-01 03:00:00.000000000 +0300
  4080. @@ -1,3194 +0,0 @@
  4081. -#ifndef _LINUX_SCHED_H
  4082. -#define _LINUX_SCHED_H
  4083. -
  4084. -#include <uapi/linux/sched.h>
  4085. -
  4086. -#include <linux/sched/prio.h>
  4087. -
  4088. -
  4089. -struct sched_param {
  4090. -   int sched_priority;
  4091. -};
  4092. -
  4093. -#include <asm/param.h> /* for HZ */
  4094. -
  4095. -#include <linux/capability.h>
  4096. -#include <linux/threads.h>
  4097. -#include <linux/kernel.h>
  4098. -#include <linux/types.h>
  4099. -#include <linux/timex.h>
  4100. -#include <linux/jiffies.h>
  4101. -#include <linux/plist.h>
  4102. -#include <linux/rbtree.h>
  4103. -#include <linux/thread_info.h>
  4104. -#include <linux/cpumask.h>
  4105. -#include <linux/errno.h>
  4106. -#include <linux/nodemask.h>
  4107. -#include <linux/mm_types.h>
  4108. -#include <linux/preempt.h>
  4109. -
  4110. -#include <asm/page.h>
  4111. -#include <asm/ptrace.h>
  4112. -#include <linux/cputime.h>
  4113. -
  4114. -#include <linux/smp.h>
  4115. -#include <linux/sem.h>
  4116. -#include <linux/shm.h>
  4117. -#include <linux/signal.h>
  4118. -#include <linux/compiler.h>
  4119. -#include <linux/completion.h>
  4120. -#include <linux/pid.h>
  4121. -#include <linux/percpu.h>
  4122. -#include <linux/topology.h>
  4123. -#include <linux/proportions.h>
  4124. -#include <linux/seccomp.h>
  4125. -#include <linux/rcupdate.h>
  4126. -#include <linux/rculist.h>
  4127. -#include <linux/rtmutex.h>
  4128. -
  4129. -#include <linux/time.h>
  4130. -#include <linux/param.h>
  4131. -#include <linux/resource.h>
  4132. -#include <linux/timer.h>
  4133. -#include <linux/hrtimer.h>
  4134. -#include <linux/task_io_accounting.h>
  4135. -#include <linux/latencytop.h>
  4136. -#include <linux/cred.h>
  4137. -#include <linux/llist.h>
  4138. -#include <linux/uidgid.h>
  4139. -#include <linux/gfp.h>
  4140. -#include <linux/magic.h>
  4141. -#include <linux/cgroup-defs.h>
  4142. -
  4143. -#include <asm/processor.h>
  4144. -
  4145. -#define SCHED_ATTR_SIZE_VER0   48  /* sizeof first published struct */
  4146. -
  4147. -/*
  4148. - * Extended scheduling parameters data structure.
  4149. - *
  4150. - * This is needed because the original struct sched_param can not be
  4151. - * altered without introducing ABI issues with legacy applications
  4152. - * (e.g., in sched_getparam()).
  4153. - *
  4154. - * However, the possibility of specifying more than just a priority for
  4155. - * the tasks may be useful for a wide variety of application fields, e.g.,
  4156. - * multimedia, streaming, automation and control, and many others.
  4157. - *
  4158. - * This variant (sched_attr) is meant at describing a so-called
  4159. - * sporadic time-constrained task. In such model a task is specified by:
  4160. - *  - the activation period or minimum instance inter-arrival time;
  4161. - *  - the maximum (or average, depending on the actual scheduling
  4162. - *    discipline) computation time of all instances, a.k.a. runtime;
  4163. - *  - the deadline (relative to the actual activation time) of each
  4164. - *    instance.
  4165. - * Very briefly, a periodic (sporadic) task asks for the execution of
  4166. - * some specific computation --which is typically called an instance--
  4167. - * (at most) every period. Moreover, each instance typically lasts no more
  4168. - * than the runtime and must be completed by time instant t equal to
  4169. - * the instance activation time + the deadline.
  4170. - *
  4171. - * This is reflected by the actual fields of the sched_attr structure:
  4172. - *
  4173. - *  @size      size of the structure, for fwd/bwd compat.
  4174. - *
  4175. - *  @sched_policy  task's scheduling policy
  4176. - *  @sched_flags   for customizing the scheduler behaviour
  4177. - *  @sched_nice        task's nice value      (SCHED_NORMAL/BATCH)
  4178. - *  @sched_priority    task's static priority (SCHED_FIFO/RR)
  4179. - *  @sched_deadline    representative of the task's deadline
  4180. - *  @sched_runtime representative of the task's runtime
  4181. - *  @sched_period  representative of the task's period
  4182. - *
  4183. - * Given this task model, there are a multiplicity of scheduling algorithms
  4184. - * and policies, that can be used to ensure all the tasks will make their
  4185. - * timing constraints.
  4186. - *
  4187. - * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
  4188. - * only user of this new interface. More information about the algorithm
  4189. - * available in the scheduling class file or in Documentation/.
  4190. - */
  4191. -struct sched_attr {
  4192. -   u32 size;
  4193. -
  4194. -   u32 sched_policy;
  4195. -   u64 sched_flags;
  4196. -
  4197. -   /* SCHED_NORMAL, SCHED_BATCH */
  4198. -   s32 sched_nice;
  4199. -
  4200. -   /* SCHED_FIFO, SCHED_RR */
  4201. -   u32 sched_priority;
  4202. -
  4203. -   /* SCHED_DEADLINE */
  4204. -   u64 sched_runtime;
  4205. -   u64 sched_deadline;
  4206. -   u64 sched_period;
  4207. -};
  4208. -
  4209. -struct futex_pi_state;
  4210. -struct robust_list_head;
  4211. -struct bio_list;
  4212. -struct fs_struct;
  4213. -struct perf_event_context;
  4214. -struct blk_plug;
  4215. -struct filename;
  4216. -struct nameidata;
  4217. -
  4218. -#define VMACACHE_BITS 2
  4219. -#define VMACACHE_SIZE (1U << VMACACHE_BITS)
  4220. -#define VMACACHE_MASK (VMACACHE_SIZE - 1)
  4221. -
  4222. -/*
  4223. - * These are the constant used to fake the fixed-point load-average
  4224. - * counting. Some notes:
  4225. - *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
  4226. - *    a load-average precision of 10 bits integer + 11 bits fractional
  4227. - *  - if you want to count load-averages more often, you need more
  4228. - *    precision, or rounding will get you. With 2-second counting freq,
  4229. - *    the EXP_n values would be 1981, 2034 and 2043 if still using only
  4230. - *    11 bit fractions.
  4231. - */
  4232. -extern unsigned long avenrun[];        /* Load averages */
  4233. -extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
  4234. -
  4235. -#define FSHIFT     11      /* nr of bits of precision */
  4236. -#define FIXED_1        (1<<FSHIFT) /* 1.0 as fixed-point */
  4237. -#define LOAD_FREQ  (5*HZ+1)    /* 5 sec intervals */
  4238. -#define EXP_1      1884        /* 1/exp(5sec/1min) as fixed-point */
  4239. -#define EXP_5      2014        /* 1/exp(5sec/5min) */
  4240. -#define EXP_15     2037        /* 1/exp(5sec/15min) */
  4241. -
  4242. -#define CALC_LOAD(load,exp,n) \
  4243. -   load *= exp; \
  4244. -   load += n*(FIXED_1-exp); \
  4245. -   load >>= FSHIFT;
  4246. -
  4247. -extern unsigned long total_forks;
  4248. -extern int nr_threads;
  4249. -DECLARE_PER_CPU(unsigned long, process_counts);
  4250. -extern int nr_processes(void);
  4251. -extern unsigned long nr_running(void);
  4252. -extern bool single_task_running(void);
  4253. -extern unsigned long nr_iowait(void);
  4254. -extern unsigned long nr_iowait_cpu(int cpu);
  4255. -extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
  4256. -
  4257. -extern void calc_global_load(unsigned long ticks);
  4258. -
  4259. -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  4260. -extern void update_cpu_load_nohz(void);
  4261. -#else
  4262. -static inline void update_cpu_load_nohz(void) { }
  4263. -#endif
  4264. -
  4265. -extern unsigned long get_parent_ip(unsigned long addr);
  4266. -
  4267. -extern void dump_cpu_task(int cpu);
  4268. -
  4269. -struct seq_file;
  4270. -struct cfs_rq;
  4271. -struct task_group;
  4272. -#ifdef CONFIG_SCHED_DEBUG
  4273. -extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
  4274. -extern void proc_sched_set_task(struct task_struct *p);
  4275. -#endif
  4276. -
  4277. -/*
  4278. - * Task state bitmask. NOTE! These bits are also
  4279. - * encoded in fs/proc/array.c: get_task_state().
  4280. - *
  4281. - * We have two separate sets of flags: task->state
  4282. - * is about runnability, while task->exit_state are
  4283. - * about the task exiting. Confusing, but this way
  4284. - * modifying one set can't modify the other one by
  4285. - * mistake.
  4286. - */
  4287. -#define TASK_RUNNING       0
  4288. -#define TASK_INTERRUPTIBLE 1
  4289. -#define TASK_UNINTERRUPTIBLE   2
  4290. -#define __TASK_STOPPED     4
  4291. -#define __TASK_TRACED      8
  4292. -/* in tsk->exit_state */
  4293. -#define EXIT_DEAD      16
  4294. -#define EXIT_ZOMBIE        32
  4295. -#define EXIT_TRACE     (EXIT_ZOMBIE | EXIT_DEAD)
  4296. -/* in tsk->state again */
  4297. -#define TASK_DEAD      64
  4298. -#define TASK_WAKEKILL      128
  4299. -#define TASK_WAKING        256
  4300. -#define TASK_PARKED        512
  4301. -#define TASK_NOLOAD        1024
  4302. -#define TASK_STATE_MAX     2048
  4303. -
  4304. -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
  4305. -
  4306. -extern char ___assert_task_state[1 - 2*!!(
  4307. -       sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
  4308. -
  4309. -/* Convenience macros for the sake of set_task_state */
  4310. -#define TASK_KILLABLE      (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
  4311. -#define TASK_STOPPED       (TASK_WAKEKILL | __TASK_STOPPED)
  4312. -#define TASK_TRACED        (TASK_WAKEKILL | __TASK_TRACED)
  4313. -
  4314. -#define TASK_IDLE      (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
  4315. -
  4316. -/* Convenience macros for the sake of wake_up */
  4317. -#define TASK_NORMAL        (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
  4318. -#define TASK_ALL       (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
  4319. -
  4320. -/* get_task_state() */
  4321. -#define TASK_REPORT        (TASK_RUNNING | TASK_INTERRUPTIBLE | \
  4322. -                TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
  4323. -                __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
  4324. -
  4325. -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
  4326. -#define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
  4327. -#define task_is_stopped_or_traced(task)    \
  4328. -           ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  4329. -#define task_contributes_to_load(task) \
  4330. -               ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
  4331. -                (task->flags & PF_FROZEN) == 0 && \
  4332. -                (task->state & TASK_NOLOAD) == 0)
  4333. -
  4334. -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  4335. -
  4336. -#define __set_task_state(tsk, state_value)         \
  4337. -   do {                            \
  4338. -       (tsk)->task_state_change = _THIS_IP_;       \
  4339. -       (tsk)->state = (state_value);           \
  4340. -   } while (0)
  4341. -#define set_task_state(tsk, state_value)           \
  4342. -   do {                            \
  4343. -       (tsk)->task_state_change = _THIS_IP_;       \
  4344. -       smp_store_mb((tsk)->state, (state_value));      \
  4345. -   } while (0)
  4346. -
  4347. -/*
  4348. - * set_current_state() includes a barrier so that the write of current->state
  4349. - * is correctly serialised wrt the caller's subsequent test of whether to
  4350. - * actually sleep:
  4351. - *
  4352. - * set_current_state(TASK_UNINTERRUPTIBLE);
  4353. - * if (do_i_need_to_sleep())
  4354. - *     schedule();
  4355. - *
  4356. - * If the caller does not need such serialisation then use __set_current_state()
  4357. - */
  4358. -#define __set_current_state(state_value)           \
  4359. -   do {                            \
  4360. -       current->task_state_change = _THIS_IP_;     \
  4361. -       current->state = (state_value);         \
  4362. -   } while (0)
  4363. -#define set_current_state(state_value)             \
  4364. -   do {                            \
  4365. -       current->task_state_change = _THIS_IP_;     \
  4366. -       smp_store_mb(current->state, (state_value));        \
  4367. -   } while (0)
  4368. -
  4369. -#else
  4370. -
  4371. -#define __set_task_state(tsk, state_value)     \
  4372. -   do { (tsk)->state = (state_value); } while (0)
  4373. -#define set_task_state(tsk, state_value)       \
  4374. -   smp_store_mb((tsk)->state, (state_value))
  4375. -
  4376. -/*
  4377. - * set_current_state() includes a barrier so that the write of current->state
  4378. - * is correctly serialised wrt the caller's subsequent test of whether to
  4379. - * actually sleep:
  4380. - *
  4381. - * set_current_state(TASK_UNINTERRUPTIBLE);
  4382. - * if (do_i_need_to_sleep())
  4383. - *     schedule();
  4384. - *
  4385. - * If the caller does not need such serialisation then use __set_current_state()
  4386. - */
  4387. -#define __set_current_state(state_value)       \
  4388. -   do { current->state = (state_value); } while (0)
  4389. -#define set_current_state(state_value)         \
  4390. -   smp_store_mb(current->state, (state_value))
  4391. -
  4392. -#endif
  4393. -
  4394. -/* Task command name length */
  4395. -#define TASK_COMM_LEN 16
  4396. -
  4397. -#include <linux/spinlock.h>
  4398. -
  4399. -/*
  4400. - * This serializes "schedule()" and also protects
  4401. - * the run-queue from deletions/modifications (but
  4402. - * _adding_ to the beginning of the run-queue has
  4403. - * a separate lock).
  4404. - */
  4405. -extern rwlock_t tasklist_lock;
  4406. -extern spinlock_t mmlist_lock;
  4407. -
  4408. -struct task_struct;
  4409. -
  4410. -#ifdef CONFIG_PROVE_RCU
  4411. -extern int lockdep_tasklist_lock_is_held(void);
  4412. -#endif /* #ifdef CONFIG_PROVE_RCU */
  4413. -
  4414. -extern void sched_init(void);
  4415. -extern void sched_init_smp(void);
  4416. -extern asmlinkage void schedule_tail(struct task_struct *prev);
  4417. -extern void init_idle(struct task_struct *idle, int cpu);
  4418. -extern void init_idle_bootup_task(struct task_struct *idle);
  4419. -
  4420. -extern cpumask_var_t cpu_isolated_map;
  4421. -
  4422. -extern int runqueue_is_locked(int cpu);
  4423. -
  4424. -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  4425. -extern void nohz_balance_enter_idle(int cpu);
  4426. -extern void set_cpu_sd_state_idle(void);
  4427. -extern int get_nohz_timer_target(void);
  4428. -#else
  4429. -static inline void nohz_balance_enter_idle(int cpu) { }
  4430. -static inline void set_cpu_sd_state_idle(void) { }
  4431. -#endif
  4432. -
  4433. -/*
  4434. - * Only dump TASK_* tasks. (0 for all tasks)
  4435. - */
  4436. -extern void show_state_filter(unsigned long state_filter);
  4437. -
  4438. -static inline void show_state(void)
  4439. -{
  4440. -   show_state_filter(0);
  4441. -}
  4442. -
  4443. -extern void show_regs(struct pt_regs *);
  4444. -
  4445. -/*
  4446. - * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
  4447. - * task), SP is the stack pointer of the first frame that should be shown in the back
  4448. - * trace (or NULL if the entire call-chain of the task should be shown).
  4449. - */
  4450. -extern void show_stack(struct task_struct *task, unsigned long *sp);
  4451. -
  4452. -extern void cpu_init (void);
  4453. -extern void trap_init(void);
  4454. -extern void update_process_times(int user);
  4455. -extern void scheduler_tick(void);
  4456. -
  4457. -extern void sched_show_task(struct task_struct *p);
  4458. -
  4459. -#ifdef CONFIG_LOCKUP_DETECTOR
  4460. -extern void touch_softlockup_watchdog(void);
  4461. -extern void touch_softlockup_watchdog_sync(void);
  4462. -extern void touch_all_softlockup_watchdogs(void);
  4463. -extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
  4464. -                 void __user *buffer,
  4465. -                 size_t *lenp, loff_t *ppos);
  4466. -extern unsigned int  softlockup_panic;
  4467. -extern unsigned int  hardlockup_panic;
  4468. -void lockup_detector_init(void);
  4469. -#else
  4470. -static inline void touch_softlockup_watchdog(void)
  4471. -{
  4472. -}
  4473. -static inline void touch_softlockup_watchdog_sync(void)
  4474. -{
  4475. -}
  4476. -static inline void touch_all_softlockup_watchdogs(void)
  4477. -{
  4478. -}
  4479. -static inline void lockup_detector_init(void)
  4480. -{
  4481. -}
  4482. -#endif
  4483. -
  4484. -#ifdef CONFIG_DETECT_HUNG_TASK
  4485. -void reset_hung_task_detector(void);
  4486. -#else
  4487. -static inline void reset_hung_task_detector(void)
  4488. -{
  4489. -}
  4490. -#endif
  4491. -
  4492. -/* Attach to any functions which should be ignored in wchan output. */
  4493. -#define __sched        __attribute__((__section__(".sched.text")))
  4494. -
  4495. -/* Linker adds these: start and end of __sched functions */
  4496. -extern char __sched_text_start[], __sched_text_end[];
  4497. -
  4498. -/* Is this address in the __sched functions? */
  4499. -extern int in_sched_functions(unsigned long addr);
  4500. -
  4501. -#define    MAX_SCHEDULE_TIMEOUT    LONG_MAX
  4502. -extern signed long schedule_timeout(signed long timeout);
  4503. -extern signed long schedule_timeout_interruptible(signed long timeout);
  4504. -extern signed long schedule_timeout_killable(signed long timeout);
  4505. -extern signed long schedule_timeout_uninterruptible(signed long timeout);
  4506. -asmlinkage void schedule(void);
  4507. -extern void schedule_preempt_disabled(void);
  4508. -
  4509. -extern long io_schedule_timeout(long timeout);
  4510. -
  4511. -static inline void io_schedule(void)
  4512. -{
  4513. -   io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
  4514. -}
  4515. -
  4516. -struct nsproxy;
  4517. -struct user_namespace;
  4518. -
  4519. -#ifdef CONFIG_MMU
  4520. -extern void arch_pick_mmap_layout(struct mm_struct *mm);
  4521. -extern unsigned long
  4522. -arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
  4523. -              unsigned long, unsigned long);
  4524. -extern unsigned long
  4525. -arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
  4526. -             unsigned long len, unsigned long pgoff,
  4527. -             unsigned long flags);
  4528. -#else
  4529. -static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
  4530. -#endif
  4531. -
  4532. -#define SUID_DUMP_DISABLE  0   /* No setuid dumping */
  4533. -#define SUID_DUMP_USER     1   /* Dump as user of process */
  4534. -#define SUID_DUMP_ROOT     2   /* Dump as root */
  4535. -
  4536. -/* mm flags */
  4537. -
  4538. -/* for SUID_DUMP_* above */
  4539. -#define MMF_DUMPABLE_BITS 2
  4540. -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
  4541. -
  4542. -extern void set_dumpable(struct mm_struct *mm, int value);
  4543. -/*
  4544. - * This returns the actual value of the suid_dumpable flag. For things
  4545. - * that are using this for checking for privilege transitions, it must
  4546. - * test against SUID_DUMP_USER rather than treating it as a boolean
  4547. - * value.
  4548. - */
  4549. -static inline int __get_dumpable(unsigned long mm_flags)
  4550. -{
  4551. -   return mm_flags & MMF_DUMPABLE_MASK;
  4552. -}
  4553. -
  4554. -static inline int get_dumpable(struct mm_struct *mm)
  4555. -{
  4556. -   return __get_dumpable(mm->flags);
  4557. -}
  4558. -
  4559. -/* coredump filter bits */
  4560. -#define MMF_DUMP_ANON_PRIVATE  2
  4561. -#define MMF_DUMP_ANON_SHARED   3
  4562. -#define MMF_DUMP_MAPPED_PRIVATE    4
  4563. -#define MMF_DUMP_MAPPED_SHARED 5
  4564. -#define MMF_DUMP_ELF_HEADERS   6
  4565. -#define MMF_DUMP_HUGETLB_PRIVATE 7
  4566. -#define MMF_DUMP_HUGETLB_SHARED  8
  4567. -#define MMF_DUMP_DAX_PRIVATE   9
  4568. -#define MMF_DUMP_DAX_SHARED    10
  4569. -
  4570. -#define MMF_DUMP_FILTER_SHIFT  MMF_DUMPABLE_BITS
  4571. -#define MMF_DUMP_FILTER_BITS   9
  4572. -#define MMF_DUMP_FILTER_MASK \
  4573. -   (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
  4574. -#define MMF_DUMP_FILTER_DEFAULT \
  4575. -   ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
  4576. -    (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
  4577. -
  4578. -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
  4579. -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
  4580. -#else
  4581. -# define MMF_DUMP_MASK_DEFAULT_ELF 0
  4582. -#endif
  4583. -                   /* leave room for more dump flags */
  4584. -#define MMF_VM_MERGEABLE   16  /* KSM may merge identical pages */
  4585. -#define MMF_VM_HUGEPAGE        17  /* set when VM_HUGEPAGE is set on vma */
  4586. -#define MMF_EXE_FILE_CHANGED   18  /* see prctl_set_mm_exe_file() */
  4587. -
  4588. -#define MMF_HAS_UPROBES        19  /* has uprobes */
  4589. -#define MMF_RECALC_UPROBES 20  /* MMF_HAS_UPROBES can be wrong */
  4590. -
  4591. -#define MMF_INIT_MASK      (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
  4592. -
  4593. -struct sighand_struct {
  4594. -   atomic_t        count;
  4595. -   struct k_sigaction  action[_NSIG];
  4596. -   spinlock_t      siglock;
  4597. -   wait_queue_head_t   signalfd_wqh;
  4598. -};
  4599. -
  4600. -struct pacct_struct {
  4601. -   int         ac_flag;
  4602. -   long            ac_exitcode;
  4603. -   unsigned long       ac_mem;
  4604. -   cputime_t       ac_utime, ac_stime;
  4605. -   unsigned long       ac_minflt, ac_majflt;
  4606. -};
  4607. -
  4608. -struct cpu_itimer {
  4609. -   cputime_t expires;
  4610. -   cputime_t incr;
  4611. -   u32 error;
  4612. -   u32 incr_error;
  4613. -};
  4614. -
  4615. -/**
  4616. - * struct prev_cputime - snaphsot of system and user cputime
  4617. - * @utime: time spent in user mode
  4618. - * @stime: time spent in system mode
  4619. - * @lock: protects the above two fields
  4620. - *
  4621. - * Stores previous user/system time values such that we can guarantee
  4622. - * monotonicity.
  4623. - */
  4624. -struct prev_cputime {
  4625. -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  4626. -   cputime_t utime;
  4627. -   cputime_t stime;
  4628. -   raw_spinlock_t lock;
  4629. -#endif
  4630. -};
  4631. -
  4632. -static inline void prev_cputime_init(struct prev_cputime *prev)
  4633. -{
  4634. -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
  4635. -   prev->utime = prev->stime = 0;
  4636. -   raw_spin_lock_init(&prev->lock);
  4637. -#endif
  4638. -}
  4639. -
  4640. -/**
  4641. - * struct task_cputime - collected CPU time counts
  4642. - * @utime:     time spent in user mode, in &cputime_t units
  4643. - * @stime:     time spent in kernel mode, in &cputime_t units
  4644. - * @sum_exec_runtime:  total time spent on the CPU, in nanoseconds
  4645. - *
  4646. - * This structure groups together three kinds of CPU time that are tracked for
  4647. - * threads and thread groups.  Most things considering CPU time want to group
  4648. - * these counts together and treat all three of them in parallel.
  4649. - */
  4650. -struct task_cputime {
  4651. -   cputime_t utime;
  4652. -   cputime_t stime;
  4653. -   unsigned long long sum_exec_runtime;
  4654. -};
  4655. -
  4656. -/* Alternate field names when used to cache expirations. */
  4657. -#define virt_exp   utime
  4658. -#define prof_exp   stime
  4659. -#define sched_exp  sum_exec_runtime
  4660. -
  4661. -#define INIT_CPUTIME   \
  4662. -   (struct task_cputime) {                 \
  4663. -       .utime = 0,                 \
  4664. -       .stime = 0,                 \
  4665. -       .sum_exec_runtime = 0,              \
  4666. -   }
  4667. -
  4668. -/*
  4669. - * This is the atomic variant of task_cputime, which can be used for
  4670. - * storing and updating task_cputime statistics without locking.
  4671. - */
  4672. -struct task_cputime_atomic {
  4673. -   atomic64_t utime;
  4674. -   atomic64_t stime;
  4675. -   atomic64_t sum_exec_runtime;
  4676. -};
  4677. -
  4678. -#define INIT_CPUTIME_ATOMIC \
  4679. -   (struct task_cputime_atomic) {              \
  4680. -       .utime = ATOMIC64_INIT(0),          \
  4681. -       .stime = ATOMIC64_INIT(0),          \
  4682. -       .sum_exec_runtime = ATOMIC64_INIT(0),       \
  4683. -   }
  4684. -
  4685. -#define PREEMPT_DISABLED   (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  4686. -
  4687. -/*
  4688. - * Disable preemption until the scheduler is running -- use an unconditional
  4689. - * value so that it also works on !PREEMPT_COUNT kernels.
  4690. - *
  4691. - * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
  4692. - */
  4693. -#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
  4694. -
  4695. -/*
  4696. - * Initial preempt_count value; reflects the preempt_count schedule invariant
  4697. - * which states that during context switches:
  4698. - *
  4699. - *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
  4700. - *
  4701. - * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
  4702. - * Note: See finish_task_switch().
  4703. - */
  4704. -#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  4705. -
  4706. -/**
  4707. - * struct thread_group_cputimer - thread group interval timer counts
  4708. - * @cputime_atomic:    atomic thread group interval timers.
  4709. - * @running:       true when there are timers running and
  4710. - *         @cputime_atomic receives updates.
  4711. - * @checking_timer:    true when a thread in the group is in the
  4712. - *         process of checking for thread group timers.
  4713. - *
  4714. - * This structure contains the version of task_cputime, above, that is
  4715. - * used for thread group CPU timer calculations.
  4716. - */
  4717. -struct thread_group_cputimer {
  4718. -   struct task_cputime_atomic cputime_atomic;
  4719. -   bool running;
  4720. -   bool checking_timer;
  4721. -};
  4722. -
  4723. -#include <linux/rwsem.h>
  4724. -struct autogroup;
  4725. -
  4726. -/*
  4727. - * NOTE! "signal_struct" does not have its own
  4728. - * locking, because a shared signal_struct always
  4729. - * implies a shared sighand_struct, so locking
  4730. - * sighand_struct is always a proper superset of
  4731. - * the locking of signal_struct.
  4732. - */
  4733. -struct signal_struct {
  4734. -   atomic_t        sigcnt;
  4735. -   atomic_t        live;
  4736. -   int         nr_threads;
  4737. -   struct list_head    thread_head;
  4738. -
  4739. -   wait_queue_head_t   wait_chldexit;  /* for wait4() */
  4740. -
  4741. -   /* current thread group signal load-balancing target: */
  4742. -   struct task_struct  *curr_target;
  4743. -
  4744. -   /* shared signal handling: */
  4745. -   struct sigpending   shared_pending;
  4746. -
  4747. -   /* thread group exit support */
  4748. -   int         group_exit_code;
  4749. -   /* overloaded:
  4750. -    * - notify group_exit_task when ->count is equal to notify_count
  4751. -    * - everyone except group_exit_task is stopped during signal delivery
  4752. -    *   of fatal signals, group_exit_task processes the signal.
  4753. -    */
  4754. -   int         notify_count;
  4755. -   struct task_struct  *group_exit_task;
  4756. -
  4757. -   /* thread group stop support, overloads group_exit_code too */
  4758. -   int         group_stop_count;
  4759. -   unsigned int        flags; /* see SIGNAL_* flags below */
  4760. -
  4761. -   /*
  4762. -    * PR_SET_CHILD_SUBREAPER marks a process, like a service
  4763. -    * manager, to re-parent orphan (double-forking) child processes
  4764. -    * to this process instead of 'init'. The service manager is
  4765. -    * able to receive SIGCHLD signals and is able to investigate
  4766. -    * the process until it calls wait(). All children of this
  4767. -    * process will inherit a flag if they should look for a
  4768. -    * child_subreaper process at exit.
  4769. -    */
  4770. -   unsigned int        is_child_subreaper:1;
  4771. -   unsigned int        has_child_subreaper:1;
  4772. -
  4773. -   /* POSIX.1b Interval Timers */
  4774. -   int         posix_timer_id;
  4775. -   struct list_head    posix_timers;
  4776. -
  4777. -   /* ITIMER_REAL timer for the process */
  4778. -   struct hrtimer real_timer;
  4779. -   struct pid *leader_pid;
  4780. -   ktime_t it_real_incr;
  4781. -
  4782. -   /*
  4783. -    * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
  4784. -    * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
  4785. -    * values are defined to 0 and 1 respectively
  4786. -    */
  4787. -   struct cpu_itimer it[2];
  4788. -
  4789. -   /*
  4790. -    * Thread group totals for process CPU timers.
  4791. -    * See thread_group_cputimer(), et al, for details.
  4792. -    */
  4793. -   struct thread_group_cputimer cputimer;
  4794. -
  4795. -   /* Earliest-expiration cache. */
  4796. -   struct task_cputime cputime_expires;
  4797. -
  4798. -   struct list_head cpu_timers[3];
  4799. -
  4800. -   struct pid *tty_old_pgrp;
  4801. -
  4802. -   /* boolean value for session group leader */
  4803. -   int leader;
  4804. -
  4805. -   struct tty_struct *tty; /* NULL if no tty */
  4806. -
  4807. -#ifdef CONFIG_SCHED_AUTOGROUP
  4808. -   struct autogroup *autogroup;
  4809. -#endif
  4810. -   /*
  4811. -    * Cumulative resource counters for dead threads in the group,
  4812. -    * and for reaped dead child processes forked by this group.
  4813. -    * Live threads maintain their own counters and add to these
  4814. -    * in __exit_signal, except for the group leader.
  4815. -    */
  4816. -   seqlock_t stats_lock;
  4817. -   cputime_t utime, stime, cutime, cstime;
  4818. -   cputime_t gtime;
  4819. -   cputime_t cgtime;
  4820. -   struct prev_cputime prev_cputime;
  4821. -   unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
  4822. -   unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
  4823. -   unsigned long inblock, oublock, cinblock, coublock;
  4824. -   unsigned long maxrss, cmaxrss;
  4825. -   struct task_io_accounting ioac;
  4826. -
  4827. -   /*
  4828. -    * Cumulative ns of schedule CPU time fo dead threads in the
  4829. -    * group, not including a zombie group leader, (This only differs
  4830. -    * from jiffies_to_ns(utime + stime) if sched_clock uses something
  4831. -    * other than jiffies.)
  4832. -    */
  4833. -   unsigned long long sum_sched_runtime;
  4834. -
  4835. -   /*
  4836. -    * We don't bother to synchronize most readers of this at all,
  4837. -    * because there is no reader checking a limit that actually needs
  4838. -    * to get both rlim_cur and rlim_max atomically, and either one
  4839. -    * alone is a single word that can safely be read normally.
  4840. -    * getrlimit/setrlimit use task_lock(current->group_leader) to
  4841. -    * protect this instead of the siglock, because they really
  4842. -    * have no need to disable irqs.
  4843. -    */
  4844. -   struct rlimit rlim[RLIM_NLIMITS];
  4845. -
  4846. -#ifdef CONFIG_BSD_PROCESS_ACCT
  4847. -   struct pacct_struct pacct;  /* per-process accounting information */
  4848. -#endif
  4849. -#ifdef CONFIG_TASKSTATS
  4850. -   struct taskstats *stats;
  4851. -#endif
  4852. -#ifdef CONFIG_AUDIT
  4853. -   unsigned audit_tty;
  4854. -   unsigned audit_tty_log_passwd;
  4855. -   struct tty_audit_buf *tty_audit_buf;
  4856. -#endif
  4857. -
  4858. -   oom_flags_t oom_flags;
  4859. -   short oom_score_adj;        /* OOM kill score adjustment */
  4860. -   short oom_score_adj_min;    /* OOM kill score adjustment min value.
  4861. -                    * Only settable by CAP_SYS_RESOURCE. */
  4862. -
  4863. -   struct mutex cred_guard_mutex;  /* guard against foreign influences on
  4864. -                    * credential calculations
  4865. -                    * (notably. ptrace) */
  4866. -};
  4867. -
  4868. -/*
  4869. - * Bits in flags field of signal_struct.
  4870. - */
  4871. -#define SIGNAL_STOP_STOPPED    0x00000001 /* job control stop in effect */
  4872. -#define SIGNAL_STOP_CONTINUED  0x00000002 /* SIGCONT since WCONTINUED reap */
  4873. -#define SIGNAL_GROUP_EXIT  0x00000004 /* group exit in progress */
  4874. -#define SIGNAL_GROUP_COREDUMP  0x00000008 /* coredump in progress */
  4875. -/*
  4876. - * Pending notifications to parent.
  4877. - */
  4878. -#define SIGNAL_CLD_STOPPED 0x00000010
  4879. -#define SIGNAL_CLD_CONTINUED   0x00000020
  4880. -#define SIGNAL_CLD_MASK        (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
  4881. -
  4882. -#define SIGNAL_UNKILLABLE  0x00000040 /* for init: ignore fatal signals */
  4883. -
  4884. -/* If true, all threads except ->group_exit_task have pending SIGKILL */
  4885. -static inline int signal_group_exit(const struct signal_struct *sig)
  4886. -{
  4887. -   return  (sig->flags & SIGNAL_GROUP_EXIT) ||
  4888. -       (sig->group_exit_task != NULL);
  4889. -}
  4890. -
  4891. -/*
  4892. - * Some day this will be a full-fledged user tracking system..
  4893. - */
  4894. -struct user_struct {
  4895. -   atomic_t __count;   /* reference count */
  4896. -   atomic_t processes; /* How many processes does this user have? */
  4897. -   atomic_t sigpending;    /* How many pending signals does this user have? */
  4898. -#ifdef CONFIG_INOTIFY_USER
  4899. -   atomic_t inotify_watches; /* How many inotify watches does this user have? */
  4900. -   atomic_t inotify_devs;  /* How many inotify devs does this user have opened? */
  4901. -#endif
  4902. -#ifdef CONFIG_FANOTIFY
  4903. -   atomic_t fanotify_listeners;
  4904. -#endif
  4905. -#ifdef CONFIG_EPOLL
  4906. -   atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
  4907. -#endif
  4908. -#ifdef CONFIG_POSIX_MQUEUE
  4909. -   /* protected by mq_lock */
  4910. -   unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
  4911. -#endif
  4912. -   unsigned long locked_shm; /* How many pages of mlocked shm ? */
  4913. -   unsigned long unix_inflight;    /* How many files in flight in unix sockets */
  4914. -
  4915. -#ifdef CONFIG_KEYS
  4916. -   struct key *uid_keyring;    /* UID specific keyring */
  4917. -   struct key *session_keyring;    /* UID's default session keyring */
  4918. -#endif
  4919. -
  4920. -   /* Hash table maintenance information */
  4921. -   struct hlist_node uidhash_node;
  4922. -   kuid_t uid;
  4923. -
  4924. -#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
  4925. -   atomic_long_t locked_vm;
  4926. -#endif
  4927. -};
  4928. -
  4929. -extern int uids_sysfs_init(void);
  4930. -
  4931. -extern struct user_struct *find_user(kuid_t);
  4932. -
  4933. -extern struct user_struct root_user;
  4934. -#define INIT_USER (&root_user)
  4935. -
  4936. -
  4937. -struct backing_dev_info;
  4938. -struct reclaim_state;
  4939. -
  4940. -#ifdef CONFIG_SCHED_INFO
  4941. -struct sched_info {
  4942. -   /* cumulative counters */
  4943. -   unsigned long pcount;         /* # of times run on this cpu */
  4944. -   unsigned long long run_delay; /* time spent waiting on a runqueue */
  4945. -
  4946. -   /* timestamps */
  4947. -   unsigned long long last_arrival,/* when we last ran on a cpu */
  4948. -              last_queued; /* when we were last queued to run */
  4949. -};
  4950. -#endif /* CONFIG_SCHED_INFO */
  4951. -
  4952. -#ifdef CONFIG_TASK_DELAY_ACCT
  4953. -struct task_delay_info {
  4954. -   spinlock_t  lock;
  4955. -   unsigned int    flags;  /* Private per-task flags */
  4956. -
  4957. -   /* For each stat XXX, add following, aligned appropriately
  4958. -    *
  4959. -    * struct timespec XXX_start, XXX_end;
  4960. -    * u64 XXX_delay;
  4961. -    * u32 XXX_count;
  4962. -    *
  4963. -    * Atomicity of updates to XXX_delay, XXX_count protected by
  4964. -    * single lock above (split into XXX_lock if contention is an issue).
  4965. -    */
  4966. -
  4967. -   /*
  4968. -    * XXX_count is incremented on every XXX operation, the delay
  4969. -    * associated with the operation is added to XXX_delay.
  4970. -    * XXX_delay contains the accumulated delay time in nanoseconds.
  4971. -    */
  4972. -   u64 blkio_start;    /* Shared by blkio, swapin */
  4973. -   u64 blkio_delay;    /* wait for sync block io completion */
  4974. -   u64 swapin_delay;   /* wait for swapin block io completion */
  4975. -   u32 blkio_count;    /* total count of the number of sync block */
  4976. -               /* io operations performed */
  4977. -   u32 swapin_count;   /* total count of the number of swapin block */
  4978. -               /* io operations performed */
  4979. -
  4980. -   u64 freepages_start;
  4981. -   u64 freepages_delay;    /* wait for memory reclaim */
  4982. -   u32 freepages_count;    /* total count of memory reclaim */
  4983. -};
  4984. -#endif /* CONFIG_TASK_DELAY_ACCT */
  4985. -
  4986. -static inline int sched_info_on(void)
  4987. -{
  4988. -#ifdef CONFIG_SCHEDSTATS
  4989. -   return 1;
  4990. -#elif defined(CONFIG_TASK_DELAY_ACCT)
  4991. -   extern int delayacct_on;
  4992. -   return delayacct_on;
  4993. -#else
  4994. -   return 0;
  4995. -#endif
  4996. -}
  4997. -
  4998. -enum cpu_idle_type {
  4999. -   CPU_IDLE,
  5000. -   CPU_NOT_IDLE,
  5001. -   CPU_NEWLY_IDLE,
  5002. -   CPU_MAX_IDLE_TYPES
  5003. -};
  5004. -
  5005. -/*
  5006. - * Increase resolution of cpu_capacity calculations
  5007. - */
  5008. -#define SCHED_CAPACITY_SHIFT   10
  5009. -#define SCHED_CAPACITY_SCALE   (1L << SCHED_CAPACITY_SHIFT)
  5010. -
  5011. -/*
  5012. - * Wake-queues are lists of tasks with a pending wakeup, whose
  5013. - * callers have already marked the task as woken internally,
  5014. - * and can thus carry on. A common use case is being able to
  5015. - * do the wakeups once the corresponding user lock as been
  5016. - * released.
  5017. - *
  5018. - * We hold reference to each task in the list across the wakeup,
  5019. - * thus guaranteeing that the memory is still valid by the time
  5020. - * the actual wakeups are performed in wake_up_q().
  5021. - *
  5022. - * One per task suffices, because there's never a need for a task to be
  5023. - * in two wake queues simultaneously; it is forbidden to abandon a task
  5024. - * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
  5025. - * already in a wake queue, the wakeup will happen soon and the second
  5026. - * waker can just skip it.
  5027. - *
  5028. - * The WAKE_Q macro declares and initializes the list head.
  5029. - * wake_up_q() does NOT reinitialize the list; it's expected to be
  5030. - * called near the end of a function, where the fact that the queue is
  5031. - * not used again will be easy to see by inspection.
  5032. - *
  5033. - * Note that this can cause spurious wakeups. schedule() callers
  5034. - * must ensure the call is done inside a loop, confirming that the
  5035. - * wakeup condition has in fact occurred.
  5036. - */
  5037. -struct wake_q_node {
  5038. -   struct wake_q_node *next;
  5039. -};
  5040. -
  5041. -struct wake_q_head {
  5042. -   struct wake_q_node *first;
  5043. -   struct wake_q_node **lastp;
  5044. -};
  5045. -
  5046. -#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
  5047. -
  5048. -#define WAKE_Q(name)                   \
  5049. -   struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
  5050. -
  5051. -extern void wake_q_add(struct wake_q_head *head,
  5052. -              struct task_struct *task);
  5053. -extern void wake_up_q(struct wake_q_head *head);
  5054. -
  5055. -/*
  5056. - * sched-domains (multiprocessor balancing) declarations:
  5057. - */
  5058. -#ifdef CONFIG_SMP
  5059. -#define SD_LOAD_BALANCE        0x0001  /* Do load balancing on this domain. */
  5060. -#define SD_BALANCE_NEWIDLE 0x0002  /* Balance when about to become idle */
  5061. -#define SD_BALANCE_EXEC        0x0004  /* Balance on exec */
  5062. -#define SD_BALANCE_FORK        0x0008  /* Balance on fork, clone */
  5063. -#define SD_BALANCE_WAKE        0x0010  /* Balance on wakeup */
  5064. -#define SD_WAKE_AFFINE     0x0020  /* Wake task to waking CPU */
  5065. -#define SD_SHARE_CPUCAPACITY   0x0080  /* Domain members share cpu power */
  5066. -#define SD_SHARE_POWERDOMAIN   0x0100  /* Domain members share power domain */
  5067. -#define SD_SHARE_PKG_RESOURCES 0x0200  /* Domain members share cpu pkg resources */
  5068. -#define SD_SERIALIZE       0x0400  /* Only a single load balancing instance */
  5069. -#define SD_ASYM_PACKING        0x0800  /* Place busy groups earlier in the domain */
  5070. -#define SD_PREFER_SIBLING  0x1000  /* Prefer to place tasks in a sibling domain */
  5071. -#define SD_OVERLAP     0x2000  /* sched_domains of this level overlap */
  5072. -#define SD_NUMA            0x4000  /* cross-node balancing */
  5073. -
  5074. -#ifdef CONFIG_SCHED_SMT
  5075. -static inline int cpu_smt_flags(void)
  5076. -{
  5077. -   return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
  5078. -}
  5079. -#endif
  5080. -
  5081. -#ifdef CONFIG_SCHED_MC
  5082. -static inline int cpu_core_flags(void)
  5083. -{
  5084. -   return SD_SHARE_PKG_RESOURCES;
  5085. -}
  5086. -#endif
  5087. -
  5088. -#ifdef CONFIG_NUMA
  5089. -static inline int cpu_numa_flags(void)
  5090. -{
  5091. -   return SD_NUMA;
  5092. -}
  5093. -#endif
  5094. -
  5095. -struct sched_domain_attr {
  5096. -   int relax_domain_level;
  5097. -};
  5098. -
  5099. -#define SD_ATTR_INIT   (struct sched_domain_attr) {    \
  5100. -   .relax_domain_level = -1,           \
  5101. -}
  5102. -
  5103. -extern int sched_domain_level_max;
  5104. -
  5105. -struct sched_group;
  5106. -
  5107. -struct sched_domain {
  5108. -   /* These fields must be setup */
  5109. -   struct sched_domain *parent;    /* top domain must be null terminated */
  5110. -   struct sched_domain *child; /* bottom domain must be null terminated */
  5111. -   struct sched_group *groups; /* the balancing groups of the domain */
  5112. -   unsigned long min_interval; /* Minimum balance interval ms */
  5113. -   unsigned long max_interval; /* Maximum balance interval ms */
  5114. -   unsigned int busy_factor;   /* less balancing by factor if busy */
  5115. -   unsigned int imbalance_pct; /* No balance until over watermark */
  5116. -   unsigned int cache_nice_tries;  /* Leave cache hot tasks for # tries */
  5117. -   unsigned int busy_idx;
  5118. -   unsigned int idle_idx;
  5119. -   unsigned int newidle_idx;
  5120. -   unsigned int wake_idx;
  5121. -   unsigned int forkexec_idx;
  5122. -   unsigned int smt_gain;
  5123. -
  5124. -   int nohz_idle;          /* NOHZ IDLE status */
  5125. -   int flags;          /* See SD_* */
  5126. -   int level;
  5127. -
  5128. -   /* Runtime fields. */
  5129. -   unsigned long last_balance; /* init to jiffies. units in jiffies */
  5130. -   unsigned int balance_interval;  /* initialise to 1. units in ms. */
  5131. -   unsigned int nr_balance_failed; /* initialise to 0 */
  5132. -
  5133. -   /* idle_balance() stats */
  5134. -   u64 max_newidle_lb_cost;
  5135. -   unsigned long next_decay_max_lb_cost;
  5136. -
  5137. -#ifdef CONFIG_SCHEDSTATS
  5138. -   /* load_balance() stats */
  5139. -   unsigned int lb_count[CPU_MAX_IDLE_TYPES];
  5140. -   unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
  5141. -   unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
  5142. -   unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
  5143. -   unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
  5144. -   unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
  5145. -   unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
  5146. -   unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
  5147. -
  5148. -   /* Active load balancing */
  5149. -   unsigned int alb_count;
  5150. -   unsigned int alb_failed;
  5151. -   unsigned int alb_pushed;
  5152. -
  5153. -   /* SD_BALANCE_EXEC stats */
  5154. -   unsigned int sbe_count;
  5155. -   unsigned int sbe_balanced;
  5156. -   unsigned int sbe_pushed;
  5157. -
  5158. -   /* SD_BALANCE_FORK stats */
  5159. -   unsigned int sbf_count;
  5160. -   unsigned int sbf_balanced;
  5161. -   unsigned int sbf_pushed;
  5162. -
  5163. -   /* try_to_wake_up() stats */
  5164. -   unsigned int ttwu_wake_remote;
  5165. -   unsigned int ttwu_move_affine;
  5166. -   unsigned int ttwu_move_balance;
  5167. -#endif
  5168. -#ifdef CONFIG_SCHED_DEBUG
  5169. -   char *name;
  5170. -#endif
  5171. -   union {
  5172. -       void *private;      /* used during construction */
  5173. -       struct rcu_head rcu;    /* used during destruction */
  5174. -   };
  5175. -
  5176. -   unsigned int span_weight;
  5177. -   /*
  5178. -    * Span of all CPUs in this domain.
  5179. -    *
  5180. -    * NOTE: this field is variable length. (Allocated dynamically
  5181. -    * by attaching extra space to the end of the structure,
  5182. -    * depending on how many CPUs the kernel has booted up with)
  5183. -    */
  5184. -   unsigned long span[0];
  5185. -};
  5186. -
  5187. -static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
  5188. -{
  5189. -   return to_cpumask(sd->span);
  5190. -}
  5191. -
  5192. -extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  5193. -                   struct sched_domain_attr *dattr_new);
  5194. -
  5195. -/* Allocate an array of sched domains, for partition_sched_domains(). */
  5196. -cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
  5197. -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
  5198. -
  5199. -bool cpus_share_cache(int this_cpu, int that_cpu);
  5200. -
  5201. -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
  5202. -typedef int (*sched_domain_flags_f)(void);
  5203. -
  5204. -#define SDTL_OVERLAP   0x01
  5205. -
  5206. -struct sd_data {
  5207. -   struct sched_domain **__percpu sd;
  5208. -   struct sched_group **__percpu sg;
  5209. -   struct sched_group_capacity **__percpu sgc;
  5210. -};
  5211. -
  5212. -struct sched_domain_topology_level {
  5213. -   sched_domain_mask_f mask;
  5214. -   sched_domain_flags_f sd_flags;
  5215. -   int         flags;
  5216. -   int         numa_level;
  5217. -   struct sd_data      data;
  5218. -#ifdef CONFIG_SCHED_DEBUG
  5219. -   char                *name;
  5220. -#endif
  5221. -};
  5222. -
  5223. -extern void set_sched_topology(struct sched_domain_topology_level *tl);
  5224. -extern void wake_up_if_idle(int cpu);
  5225. -
  5226. -#ifdef CONFIG_SCHED_DEBUG
  5227. -# define SD_INIT_NAME(type)        .name = #type
  5228. -#else
  5229. -# define SD_INIT_NAME(type)
  5230. -#endif
  5231. -
  5232. -#else /* CONFIG_SMP */
  5233. -
  5234. -struct sched_domain_attr;
  5235. -
  5236. -static inline void
  5237. -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  5238. -           struct sched_domain_attr *dattr_new)
  5239. -{
  5240. -}
  5241. -
  5242. -static inline bool cpus_share_cache(int this_cpu, int that_cpu)
  5243. -{
  5244. -   return true;
  5245. -}
  5246. -
  5247. -#endif /* !CONFIG_SMP */
  5248. -
  5249. -
  5250. -struct io_context;         /* See blkdev.h */
  5251. -
  5252. -
  5253. -#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
  5254. -extern void prefetch_stack(struct task_struct *t);
  5255. -#else
  5256. -static inline void prefetch_stack(struct task_struct *t) { }
  5257. -#endif
  5258. -
  5259. -struct audit_context;      /* See audit.c */
  5260. -struct mempolicy;
  5261. -struct pipe_inode_info;
  5262. -struct uts_namespace;
  5263. -
  5264. -struct load_weight {
  5265. -   unsigned long weight;
  5266. -   u32 inv_weight;
  5267. -};
  5268. -
  5269. -/*
  5270. - * The load_avg/util_avg accumulates an infinite geometric series.
  5271. - * 1) load_avg factors frequency scaling into the amount of time that a
  5272. - * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
  5273. - * aggregated such weights of all runnable and blocked sched_entities.
  5274. - * 2) util_avg factors frequency and cpu scaling into the amount of time
  5275. - * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
  5276. - * For cfs_rq, it is the aggregated such times of all runnable and
  5277. - * blocked sched_entities.
  5278. - * The 64 bit load_sum can:
  5279. - * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
  5280. - * the highest weight (=88761) always runnable, we should not overflow
  5281. - * 2) for entity, support any load.weight always runnable
  5282. - */
  5283. -struct sched_avg {
  5284. -   u64 last_update_time, load_sum;
  5285. -   u32 util_sum, period_contrib;
  5286. -   unsigned long load_avg, util_avg;
  5287. -};
  5288. -
  5289. -#ifdef CONFIG_SCHEDSTATS
  5290. -struct sched_statistics {
  5291. -   u64         wait_start;
  5292. -   u64         wait_max;
  5293. -   u64         wait_count;
  5294. -   u64         wait_sum;
  5295. -   u64         iowait_count;
  5296. -   u64         iowait_sum;
  5297. -
  5298. -   u64         sleep_start;
  5299. -   u64         sleep_max;
  5300. -   s64         sum_sleep_runtime;
  5301. -
  5302. -   u64         block_start;
  5303. -   u64         block_max;
  5304. -   u64         exec_max;
  5305. -   u64         slice_max;
  5306. -
  5307. -   u64         nr_migrations_cold;
  5308. -   u64         nr_failed_migrations_affine;
  5309. -   u64         nr_failed_migrations_running;
  5310. -   u64         nr_failed_migrations_hot;
  5311. -   u64         nr_forced_migrations;
  5312. -
  5313. -   u64         nr_wakeups;
  5314. -   u64         nr_wakeups_sync;
  5315. -   u64         nr_wakeups_migrate;
  5316. -   u64         nr_wakeups_local;
  5317. -   u64         nr_wakeups_remote;
  5318. -   u64         nr_wakeups_affine;
  5319. -   u64         nr_wakeups_affine_attempts;
  5320. -   u64         nr_wakeups_passive;
  5321. -   u64         nr_wakeups_idle;
  5322. -};
  5323. -#endif
  5324. -
  5325. -struct sched_entity {
  5326. -   struct load_weight  load;       /* for load-balancing */
  5327. -   struct rb_node      run_node;
  5328. -   struct list_head    group_node;
  5329. -   unsigned int        on_rq;
  5330. -
  5331. -   u64         exec_start;
  5332. -   u64         sum_exec_runtime;
  5333. -   u64         vruntime;
  5334. -   u64         prev_sum_exec_runtime;
  5335. -
  5336. -   u64         nr_migrations;
  5337. -
  5338. -#ifdef CONFIG_SCHEDSTATS
  5339. -   struct sched_statistics statistics;
  5340. -#endif
  5341. -
  5342. -#ifdef CONFIG_FAIR_GROUP_SCHED
  5343. -   int         depth;
  5344. -   struct sched_entity *parent;
  5345. -   /* rq on which this entity is (to be) queued: */
  5346. -   struct cfs_rq       *cfs_rq;
  5347. -   /* rq "owned" by this entity/group: */
  5348. -   struct cfs_rq       *my_q;
  5349. -#endif
  5350. -
  5351. -#ifdef CONFIG_SMP
  5352. -   /* Per entity load average tracking */
  5353. -   struct sched_avg    avg;
  5354. -#endif
  5355. -};
  5356. -
  5357. -struct sched_rt_entity {
  5358. -   struct list_head run_list;
  5359. -   unsigned long timeout;
  5360. -   unsigned long watchdog_stamp;
  5361. -   unsigned int time_slice;
  5362. -
  5363. -   struct sched_rt_entity *back;
  5364. -#ifdef CONFIG_RT_GROUP_SCHED
  5365. -   struct sched_rt_entity  *parent;
  5366. -   /* rq on which this entity is (to be) queued: */
  5367. -   struct rt_rq        *rt_rq;
  5368. -   /* rq "owned" by this entity/group: */
  5369. -   struct rt_rq        *my_q;
  5370. -#endif
  5371. -};
  5372. -
  5373. -struct sched_dl_entity {
  5374. -   struct rb_node  rb_node;
  5375. -
  5376. -   /*
  5377. -    * Original scheduling parameters. Copied here from sched_attr
  5378. -    * during sched_setattr(), they will remain the same until
  5379. -    * the next sched_setattr().
  5380. -    */
  5381. -   u64 dl_runtime;     /* maximum runtime for each instance    */
  5382. -   u64 dl_deadline;    /* relative deadline of each instance   */
  5383. -   u64 dl_period;      /* separation of two instances (period) */
  5384. -   u64 dl_bw;      /* dl_runtime / dl_deadline     */
  5385. -
  5386. -   /*
  5387. -    * Actual scheduling parameters. Initialized with the values above,
  5388. -    * they are continously updated during task execution. Note that
  5389. -    * the remaining runtime could be < 0 in case we are in overrun.
  5390. -    */
  5391. -   s64 runtime;        /* remaining runtime for this instance  */
  5392. -   u64 deadline;       /* absolute deadline for this instance  */
  5393. -   unsigned int flags; /* specifying the scheduler behaviour   */
  5394. -
  5395. -   /*
  5396. -    * Some bool flags:
  5397. -    *
  5398. -    * @dl_throttled tells if we exhausted the runtime. If so, the
  5399. -    * task has to wait for a replenishment to be performed at the
  5400. -    * next firing of dl_timer.
  5401. -    *
  5402. -    * @dl_new tells if a new instance arrived. If so we must
  5403. -    * start executing it with full runtime and reset its absolute
  5404. -    * deadline;
  5405. -    *
  5406. -    * @dl_boosted tells if we are boosted due to DI. If so we are
  5407. -    * outside bandwidth enforcement mechanism (but only until we
  5408. -    * exit the critical section);
  5409. -    *
  5410. -    * @dl_yielded tells if task gave up the cpu before consuming
  5411. -    * all its available runtime during the last job.
  5412. -    */
  5413. -   int dl_throttled, dl_new, dl_boosted, dl_yielded;
  5414. -
  5415. -   /*
  5416. -    * Bandwidth enforcement timer. Each -deadline task has its
  5417. -    * own bandwidth to be enforced, thus we need one timer per task.
  5418. -    */
  5419. -   struct hrtimer dl_timer;
  5420. -};
  5421. -
  5422. -union rcu_special {
  5423. -   struct {
  5424. -       u8 blocked;
  5425. -       u8 need_qs;
  5426. -       u8 exp_need_qs;
  5427. -       u8 pad; /* Otherwise the compiler can store garbage here. */
  5428. -   } b; /* Bits. */
  5429. -   u32 s; /* Set of bits. */
  5430. -};
  5431. -struct rcu_node;
  5432. -
  5433. -enum perf_event_task_context {
  5434. -   perf_invalid_context = -1,
  5435. -   perf_hw_context = 0,
  5436. -   perf_sw_context,
  5437. -   perf_nr_task_contexts,
  5438. -};
  5439. -
  5440. -/* Track pages that require TLB flushes */
  5441. -struct tlbflush_unmap_batch {
  5442. -   /*
  5443. -    * Each bit set is a CPU that potentially has a TLB entry for one of
  5444. -    * the PFNs being flushed. See set_tlb_ubc_flush_pending().
  5445. -    */
  5446. -   struct cpumask cpumask;
  5447. -
  5448. -   /* True if any bit in cpumask is set */
  5449. -   bool flush_required;
  5450. -
  5451. -   /*
  5452. -    * If true then the PTE was dirty when unmapped. The entry must be
  5453. -    * flushed before IO is initiated or a stale TLB entry potentially
  5454. -    * allows an update without redirtying the page.
  5455. -    */
  5456. -   bool writable;
  5457. -};
  5458. -
  5459. -struct task_struct {
  5460. -   volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
  5461. -   void *stack;
  5462. -   atomic_t usage;
  5463. -   unsigned int flags; /* per process flags, defined below */
  5464. -   unsigned int ptrace;
  5465. -
  5466. -#ifdef CONFIG_SMP
  5467. -   struct llist_node wake_entry;
  5468. -   int on_cpu;
  5469. -   unsigned int wakee_flips;
  5470. -   unsigned long wakee_flip_decay_ts;
  5471. -   struct task_struct *last_wakee;
  5472. -
  5473. -   int wake_cpu;
  5474. -#endif
  5475. -   int on_rq;
  5476. -
  5477. -   int prio, static_prio, normal_prio;
  5478. -   unsigned int rt_priority;
  5479. -   const struct sched_class *sched_class;
  5480. -   struct sched_entity se;
  5481. -   struct sched_rt_entity rt;
  5482. -#ifdef CONFIG_CGROUP_SCHED
  5483. -   struct task_group *sched_task_group;
  5484. -#endif
  5485. -   struct sched_dl_entity dl;
  5486. -
  5487. -#ifdef CONFIG_PREEMPT_NOTIFIERS
  5488. -   /* list of struct preempt_notifier: */
  5489. -   struct hlist_head preempt_notifiers;
  5490. -#endif
  5491. -
  5492. -#ifdef CONFIG_BLK_DEV_IO_TRACE
  5493. -   unsigned int btrace_seq;
  5494. -#endif
  5495. -
  5496. -   unsigned int policy;
  5497. -   int nr_cpus_allowed;
  5498. -   cpumask_t cpus_allowed;
  5499. -
  5500. -#ifdef CONFIG_PREEMPT_RCU
  5501. -   int rcu_read_lock_nesting;
  5502. -   union rcu_special rcu_read_unlock_special;
  5503. -   struct list_head rcu_node_entry;
  5504. -   struct rcu_node *rcu_blocked_node;
  5505. -#endif /* #ifdef CONFIG_PREEMPT_RCU */
  5506. -#ifdef CONFIG_TASKS_RCU
  5507. -   unsigned long rcu_tasks_nvcsw;
  5508. -   bool rcu_tasks_holdout;
  5509. -   struct list_head rcu_tasks_holdout_list;
  5510. -   int rcu_tasks_idle_cpu;
  5511. -#endif /* #ifdef CONFIG_TASKS_RCU */
  5512. -
  5513. -#ifdef CONFIG_SCHED_INFO
  5514. -   struct sched_info sched_info;
  5515. -#endif
  5516. -
  5517. -   struct list_head tasks;
  5518. -#ifdef CONFIG_SMP
  5519. -   struct plist_node pushable_tasks;
  5520. -   struct rb_node pushable_dl_tasks;
  5521. -#endif
  5522. -
  5523. -   struct mm_struct *mm, *active_mm;
  5524. -   /* per-thread vma caching */
  5525. -   u32 vmacache_seqnum;
  5526. -   struct vm_area_struct *vmacache[VMACACHE_SIZE];
  5527. -#if defined(SPLIT_RSS_COUNTING)
  5528. -   struct task_rss_stat    rss_stat;
  5529. -#endif
  5530. -/* task state */
  5531. -   int exit_state;
  5532. -   int exit_code, exit_signal;
  5533. -   int pdeath_signal;  /*  The signal sent when the parent dies  */
  5534. -   unsigned long jobctl;   /* JOBCTL_*, siglock protected */
  5535. -
  5536. -   /* Used for emulating ABI behavior of previous Linux versions */
  5537. -   unsigned int personality;
  5538. -
  5539. -   /* scheduler bits, serialized by scheduler locks */
  5540. -   unsigned sched_reset_on_fork:1;
  5541. -   unsigned sched_contributes_to_load:1;
  5542. -   unsigned sched_migrated:1;
  5543. -   unsigned :0; /* force alignment to the next boundary */
  5544. -
  5545. -   /* unserialized, strictly 'current' */
  5546. -   unsigned in_execve:1; /* bit to tell LSMs we're in execve */
  5547. -   unsigned in_iowait:1;
  5548. -#ifdef CONFIG_MEMCG
  5549. -   unsigned memcg_may_oom:1;
  5550. -#endif
  5551. -#ifdef CONFIG_MEMCG_KMEM
  5552. -   unsigned memcg_kmem_skip_account:1;
  5553. -#endif
  5554. -#ifdef CONFIG_COMPAT_BRK
  5555. -   unsigned brk_randomized:1;
  5556. -#endif
  5557. -
  5558. -   unsigned long atomic_flags; /* Flags needing atomic access. */
  5559. -
  5560. -   struct restart_block restart_block;
  5561. -
  5562. -   pid_t pid;
  5563. -   pid_t tgid;
  5564. -
  5565. -#ifdef CONFIG_CC_STACKPROTECTOR
  5566. -   /* Canary value for the -fstack-protector gcc feature */
  5567. -   unsigned long stack_canary;
  5568. -#endif
  5569. -   /*
  5570. -    * pointers to (original) parent process, youngest child, younger sibling,
  5571. -    * older sibling, respectively.  (p->father can be replaced with
  5572. -    * p->real_parent->pid)
  5573. -    */
  5574. -   struct task_struct __rcu *real_parent; /* real parent process */
  5575. -   struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
  5576. -   /*
  5577. -    * children/sibling forms the list of my natural children
  5578. -    */
  5579. -   struct list_head children;  /* list of my children */
  5580. -   struct list_head sibling;   /* linkage in my parent's children list */
  5581. -   struct task_struct *group_leader;   /* threadgroup leader */
  5582. -
  5583. -   /*
  5584. -    * ptraced is the list of tasks this task is using ptrace on.
  5585. -    * This includes both natural children and PTRACE_ATTACH targets.
  5586. -    * p->ptrace_entry is p's link on the p->parent->ptraced list.
  5587. -    */
  5588. -   struct list_head ptraced;
  5589. -   struct list_head ptrace_entry;
  5590. -
  5591. -   /* PID/PID hash table linkage. */
  5592. -   struct pid_link pids[PIDTYPE_MAX];
  5593. -   struct list_head thread_group;
  5594. -   struct list_head thread_node;
  5595. -
  5596. -   struct completion *vfork_done;      /* for vfork() */
  5597. -   int __user *set_child_tid;      /* CLONE_CHILD_SETTID */
  5598. -   int __user *clear_child_tid;        /* CLONE_CHILD_CLEARTID */
  5599. -
  5600. -   cputime_t utime, stime, utimescaled, stimescaled;
  5601. -   cputime_t gtime;
  5602. -   struct prev_cputime prev_cputime;
  5603. -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  5604. -   seqlock_t vtime_seqlock;
  5605. -   unsigned long long vtime_snap;
  5606. -   enum {
  5607. -       VTIME_SLEEPING = 0,
  5608. -       VTIME_USER,
  5609. -       VTIME_SYS,
  5610. -   } vtime_snap_whence;
  5611. -#endif
  5612. -   unsigned long nvcsw, nivcsw; /* context switch counts */
  5613. -   u64 start_time;     /* monotonic time in nsec */
  5614. -   u64 real_start_time;    /* boot based time in nsec */
  5615. -/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
  5616. -   unsigned long min_flt, maj_flt;
  5617. -
  5618. -   struct task_cputime cputime_expires;
  5619. -   struct list_head cpu_timers[3];
  5620. -
  5621. -/* process credentials */
  5622. -   const struct cred __rcu *real_cred; /* objective and real subjective task
  5623. -                    * credentials (COW) */
  5624. -   const struct cred __rcu *cred;  /* effective (overridable) subjective task
  5625. -                    * credentials (COW) */
  5626. -   char comm[TASK_COMM_LEN]; /* executable name excluding path
  5627. -                    - access with [gs]et_task_comm (which lock
  5628. -                      it with task_lock())
  5629. -                    - initialized normally by setup_new_exec */
  5630. -/* file system info */
  5631. -   struct nameidata *nameidata;
  5632. -#ifdef CONFIG_SYSVIPC
  5633. -/* ipc stuff */
  5634. -   struct sysv_sem sysvsem;
  5635. -   struct sysv_shm sysvshm;
  5636. -#endif
  5637. -#ifdef CONFIG_DETECT_HUNG_TASK
  5638. -/* hung task detection */
  5639. -   unsigned long last_switch_count;
  5640. -#endif
  5641. -/* filesystem information */
  5642. -   struct fs_struct *fs;
  5643. -/* open file information */
  5644. -   struct files_struct *files;
  5645. -/* namespaces */
  5646. -   struct nsproxy *nsproxy;
  5647. -/* signal handlers */
  5648. -   struct signal_struct *signal;
  5649. -   struct sighand_struct *sighand;
  5650. -
  5651. -   sigset_t blocked, real_blocked;
  5652. -   sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
  5653. -   struct sigpending pending;
  5654. -
  5655. -   unsigned long sas_ss_sp;
  5656. -   size_t sas_ss_size;
  5657. -
  5658. -   struct callback_head *task_works;
  5659. -
  5660. -   struct audit_context *audit_context;
  5661. -#ifdef CONFIG_AUDITSYSCALL
  5662. -   kuid_t loginuid;
  5663. -   unsigned int sessionid;
  5664. -#endif
  5665. -   struct seccomp seccomp;
  5666. -
  5667. -/* Thread group tracking */
  5668. -       u32 parent_exec_id;
  5669. -       u32 self_exec_id;
  5670. -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
  5671. - * mempolicy */
  5672. -   spinlock_t alloc_lock;
  5673. -
  5674. -   /* Protection of the PI data structures: */
  5675. -   raw_spinlock_t pi_lock;
  5676. -
  5677. -   struct wake_q_node wake_q;
  5678. -
  5679. -#ifdef CONFIG_RT_MUTEXES
  5680. -   /* PI waiters blocked on a rt_mutex held by this task */
  5681. -   struct rb_root pi_waiters;
  5682. -   struct rb_node *pi_waiters_leftmost;
  5683. -   /* Deadlock detection and priority inheritance handling */
  5684. -   struct rt_mutex_waiter *pi_blocked_on;
  5685. -#endif
  5686. -
  5687. -#ifdef CONFIG_DEBUG_MUTEXES
  5688. -   /* mutex deadlock detection */
  5689. -   struct mutex_waiter *blocked_on;
  5690. -#endif
  5691. -#ifdef CONFIG_TRACE_IRQFLAGS
  5692. -   unsigned int irq_events;
  5693. -   unsigned long hardirq_enable_ip;
  5694. -   unsigned long hardirq_disable_ip;
  5695. -   unsigned int hardirq_enable_event;
  5696. -   unsigned int hardirq_disable_event;
  5697. -   int hardirqs_enabled;
  5698. -   int hardirq_context;
  5699. -   unsigned long softirq_disable_ip;
  5700. -   unsigned long softirq_enable_ip;
  5701. -   unsigned int softirq_disable_event;
  5702. -   unsigned int softirq_enable_event;
  5703. -   int softirqs_enabled;
  5704. -   int softirq_context;
  5705. -#endif
  5706. -#ifdef CONFIG_LOCKDEP
  5707. -# define MAX_LOCK_DEPTH 48UL
  5708. -   u64 curr_chain_key;
  5709. -   int lockdep_depth;
  5710. -   unsigned int lockdep_recursion;
  5711. -   struct held_lock held_locks[MAX_LOCK_DEPTH];
  5712. -   gfp_t lockdep_reclaim_gfp;
  5713. -#endif
  5714. -
  5715. -/* journalling filesystem info */
  5716. -   void *journal_info;
  5717. -
  5718. -/* stacked block device info */
  5719. -   struct bio_list *bio_list;
  5720. -
  5721. -#ifdef CONFIG_BLOCK
  5722. -/* stack plugging */
  5723. -   struct blk_plug *plug;
  5724. -#endif
  5725. -
  5726. -/* VM state */
  5727. -   struct reclaim_state *reclaim_state;
  5728. -
  5729. -   struct backing_dev_info *backing_dev_info;
  5730. -
  5731. -   struct io_context *io_context;
  5732. -
  5733. -   unsigned long ptrace_message;
  5734. -   siginfo_t *last_siginfo; /* For ptrace use.  */
  5735. -   struct task_io_accounting ioac;
  5736. -#if defined(CONFIG_TASK_XACCT)
  5737. -   u64 acct_rss_mem1;  /* accumulated rss usage */
  5738. -   u64 acct_vm_mem1;   /* accumulated virtual memory usage */
  5739. -   cputime_t acct_timexpd; /* stime + utime since last update */
  5740. -#endif
  5741. -#ifdef CONFIG_CPUSETS
  5742. -   nodemask_t mems_allowed;    /* Protected by alloc_lock */
  5743. -   seqcount_t mems_allowed_seq;    /* Seqence no to catch updates */
  5744. -   int cpuset_mem_spread_rotor;
  5745. -   int cpuset_slab_spread_rotor;
  5746. -#endif
  5747. -#ifdef CONFIG_CGROUPS
  5748. -   /* Control Group info protected by css_set_lock */
  5749. -   struct css_set __rcu *cgroups;
  5750. -   /* cg_list protected by css_set_lock and tsk->alloc_lock */
  5751. -   struct list_head cg_list;
  5752. -#endif
  5753. -#ifdef CONFIG_FUTEX
  5754. -   struct robust_list_head __user *robust_list;
  5755. -#ifdef CONFIG_COMPAT
  5756. -   struct compat_robust_list_head __user *compat_robust_list;
  5757. -#endif
  5758. -   struct list_head pi_state_list;
  5759. -   struct futex_pi_state *pi_state_cache;
  5760. -#endif
  5761. -#ifdef CONFIG_PERF_EVENTS
  5762. -   struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
  5763. -   struct mutex perf_event_mutex;
  5764. -   struct list_head perf_event_list;
  5765. -#endif
  5766. -#ifdef CONFIG_DEBUG_PREEMPT
  5767. -   unsigned long preempt_disable_ip;
  5768. -#endif
  5769. -#ifdef CONFIG_NUMA
  5770. -   struct mempolicy *mempolicy;    /* Protected by alloc_lock */
  5771. -   short il_next;
  5772. -   short pref_node_fork;
  5773. -#endif
  5774. -#ifdef CONFIG_NUMA_BALANCING
  5775. -   int numa_scan_seq;
  5776. -   unsigned int numa_scan_period;
  5777. -   unsigned int numa_scan_period_max;
  5778. -   int numa_preferred_nid;
  5779. -   unsigned long numa_migrate_retry;
  5780. -   u64 node_stamp;         /* migration stamp  */
  5781. -   u64 last_task_numa_placement;
  5782. -   u64 last_sum_exec_runtime;
  5783. -   struct callback_head numa_work;
  5784. -
  5785. -   struct list_head numa_entry;
  5786. -   struct numa_group *numa_group;
  5787. -
  5788. -   /*
  5789. -    * numa_faults is an array split into four regions:
  5790. -    * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
  5791. -    * in this precise order.
  5792. -    *
  5793. -    * faults_memory: Exponential decaying average of faults on a per-node
  5794. -    * basis. Scheduling placement decisions are made based on these
  5795. -    * counts. The values remain static for the duration of a PTE scan.
  5796. -    * faults_cpu: Track the nodes the process was running on when a NUMA
  5797. -    * hinting fault was incurred.
  5798. -    * faults_memory_buffer and faults_cpu_buffer: Record faults per node
  5799. -    * during the current scan window. When the scan completes, the counts
  5800. -    * in faults_memory and faults_cpu decay and these values are copied.
  5801. -    */
  5802. -   unsigned long *numa_faults;
  5803. -   unsigned long total_numa_faults;
  5804. -
  5805. -   /*
  5806. -    * numa_faults_locality tracks if faults recorded during the last
  5807. -    * scan window were remote/local or failed to migrate. The task scan
  5808. -    * period is adapted based on the locality of the faults with different
  5809. -    * weights depending on whether they were shared or private faults
  5810. -    */
  5811. -   unsigned long numa_faults_locality[3];
  5812. -
  5813. -   unsigned long numa_pages_migrated;
  5814. -#endif /* CONFIG_NUMA_BALANCING */
  5815. -
  5816. -#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
  5817. -   struct tlbflush_unmap_batch tlb_ubc;
  5818. -#endif
  5819. -
  5820. -   struct rcu_head rcu;
  5821. -
  5822. -   /*
  5823. -    * cache last used pipe for splice
  5824. -    */
  5825. -   struct pipe_inode_info *splice_pipe;
  5826. -
  5827. -   struct page_frag task_frag;
  5828. -
  5829. -#ifdef CONFIG_TASK_DELAY_ACCT
  5830. -   struct task_delay_info *delays;
  5831. -#endif
  5832. -#ifdef CONFIG_FAULT_INJECTION
  5833. -   int make_it_fail;
  5834. -#endif
  5835. -   /*
  5836. -    * when (nr_dirtied >= nr_dirtied_pause), it's time to call
  5837. -    * balance_dirty_pages() for some dirty throttling pause
  5838. -    */
  5839. -   int nr_dirtied;
  5840. -   int nr_dirtied_pause;
  5841. -   unsigned long dirty_paused_when; /* start of a write-and-pause period */
  5842. -
  5843. -#ifdef CONFIG_LATENCYTOP
  5844. -   int latency_record_count;
  5845. -   struct latency_record latency_record[LT_SAVECOUNT];
  5846. -#endif
  5847. -   /*
  5848. -    * time slack values; these are used to round up poll() and
  5849. -    * select() etc timeout values. These are in nanoseconds.
  5850. -    */
  5851. -   unsigned long timer_slack_ns;
  5852. -   unsigned long default_timer_slack_ns;
  5853. -
  5854. -#ifdef CONFIG_KASAN
  5855. -   unsigned int kasan_depth;
  5856. -#endif
  5857. -#ifdef CONFIG_FUNCTION_GRAPH_TRACER
  5858. -   /* Index of current stored address in ret_stack */
  5859. -   int curr_ret_stack;
  5860. -   /* Stack of return addresses for return function tracing */
  5861. -   struct ftrace_ret_stack *ret_stack;
  5862. -   /* time stamp for last schedule */
  5863. -   unsigned long long ftrace_timestamp;
  5864. -   /*
  5865. -    * Number of functions that haven't been traced
  5866. -    * because of depth overrun.
  5867. -    */
  5868. -   atomic_t trace_overrun;
  5869. -   /* Pause for the tracing */
  5870. -   atomic_t tracing_graph_pause;
  5871. -#endif
  5872. -#ifdef CONFIG_TRACING
  5873. -   /* state flags for use by tracers */
  5874. -   unsigned long trace;
  5875. -   /* bitmask and counter of trace recursion */
  5876. -   unsigned long trace_recursion;
  5877. -#endif /* CONFIG_TRACING */
  5878. -#ifdef CONFIG_MEMCG
  5879. -   struct mem_cgroup *memcg_in_oom;
  5880. -   gfp_t memcg_oom_gfp_mask;
  5881. -   int memcg_oom_order;
  5882. -
  5883. -   /* number of pages to reclaim on returning to userland */
  5884. -   unsigned int memcg_nr_pages_over_high;
  5885. -#endif
  5886. -#ifdef CONFIG_UPROBES
  5887. -   struct uprobe_task *utask;
  5888. -#endif
  5889. -#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
  5890. -   unsigned int    sequential_io;
  5891. -   unsigned int    sequential_io_avg;
  5892. -#endif
  5893. -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  5894. -   unsigned long   task_state_change;
  5895. -#endif
  5896. -   int pagefault_disabled;
  5897. -/* CPU-specific state of this task */
  5898. -   struct thread_struct thread;
  5899. -/*
  5900. - * WARNING: on x86, 'thread_struct' contains a variable-sized
  5901. - * structure.  It *MUST* be at the end of 'task_struct'.
  5902. - *
  5903. - * Do not put anything below here!
  5904. - */
  5905. -};
  5906. -
  5907. -#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
  5908. -extern int arch_task_struct_size __read_mostly;
  5909. -#else
  5910. -# define arch_task_struct_size (sizeof(struct task_struct))
  5911. -#endif
  5912. -
  5913. -/* Future-safe accessor for struct task_struct's cpus_allowed. */
  5914. -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
  5915. -
  5916. -#define TNF_MIGRATED   0x01
  5917. -#define TNF_NO_GROUP   0x02
  5918. -#define TNF_SHARED 0x04
  5919. -#define TNF_FAULT_LOCAL    0x08
  5920. -#define TNF_MIGRATE_FAIL 0x10
  5921. -
  5922. -#ifdef CONFIG_NUMA_BALANCING
  5923. -extern void task_numa_fault(int last_node, int node, int pages, int flags);
  5924. -extern pid_t task_numa_group_id(struct task_struct *p);
  5925. -extern void set_numabalancing_state(bool enabled);
  5926. -extern void task_numa_free(struct task_struct *p);
  5927. -extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
  5928. -                   int src_nid, int dst_cpu);
  5929. -#else
  5930. -static inline void task_numa_fault(int last_node, int node, int pages,
  5931. -                  int flags)
  5932. -{
  5933. -}
  5934. -static inline pid_t task_numa_group_id(struct task_struct *p)
  5935. -{
  5936. -   return 0;
  5937. -}
  5938. -static inline void set_numabalancing_state(bool enabled)
  5939. -{
  5940. -}
  5941. -static inline void task_numa_free(struct task_struct *p)
  5942. -{
  5943. -}
  5944. -static inline bool should_numa_migrate_memory(struct task_struct *p,
  5945. -               struct page *page, int src_nid, int dst_cpu)
  5946. -{
  5947. -   return true;
  5948. -}
  5949. -#endif
  5950. -
  5951. -static inline struct pid *task_pid(struct task_struct *task)
  5952. -{
  5953. -   return task->pids[PIDTYPE_PID].pid;
  5954. -}
  5955. -
  5956. -static inline struct pid *task_tgid(struct task_struct *task)
  5957. -{
  5958. -   return task->group_leader->pids[PIDTYPE_PID].pid;
  5959. -}
  5960. -
  5961. -/*
  5962. - * Without tasklist or rcu lock it is not safe to dereference
  5963. - * the result of task_pgrp/task_session even if task == current,
  5964. - * we can race with another thread doing sys_setsid/sys_setpgid.
  5965. - */
  5966. -static inline struct pid *task_pgrp(struct task_struct *task)
  5967. -{
  5968. -   return task->group_leader->pids[PIDTYPE_PGID].pid;
  5969. -}
  5970. -
  5971. -static inline struct pid *task_session(struct task_struct *task)
  5972. -{
  5973. -   return task->group_leader->pids[PIDTYPE_SID].pid;
  5974. -}
  5975. -
  5976. -struct pid_namespace;
  5977. -
  5978. -/*
  5979. - * the helpers to get the task's different pids as they are seen
  5980. - * from various namespaces
  5981. - *
  5982. - * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
  5983. - * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
  5984. - *                     current.
  5985. - * task_xid_nr_ns()  : id seen from the ns specified;
  5986. - *
  5987. - * set_task_vxid()   : assigns a virtual id to a task;
  5988. - *
  5989. - * see also pid_nr() etc in include/linux/pid.h
  5990. - */
  5991. -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
  5992. -           struct pid_namespace *ns);
  5993. -
  5994. -static inline pid_t task_pid_nr(struct task_struct *tsk)
  5995. -{
  5996. -   return tsk->pid;
  5997. -}
  5998. -
  5999. -static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
  6000. -                   struct pid_namespace *ns)
  6001. -{
  6002. -   return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
  6003. -}
  6004. -
  6005. -static inline pid_t task_pid_vnr(struct task_struct *tsk)
  6006. -{
  6007. -   return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
  6008. -}
  6009. -
  6010. -
  6011. -static inline pid_t task_tgid_nr(struct task_struct *tsk)
  6012. -{
  6013. -   return tsk->tgid;
  6014. -}
  6015. -
  6016. -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
  6017. -
  6018. -static inline pid_t task_tgid_vnr(struct task_struct *tsk)
  6019. -{
  6020. -   return pid_vnr(task_tgid(tsk));
  6021. -}
  6022. -
  6023. -
  6024. -static inline int pid_alive(const struct task_struct *p);
  6025. -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
  6026. -{
  6027. -   pid_t pid = 0;
  6028. -
  6029. -   rcu_read_lock();
  6030. -   if (pid_alive(tsk))
  6031. -       pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
  6032. -   rcu_read_unlock();
  6033. -
  6034. -   return pid;
  6035. -}
  6036. -
  6037. -static inline pid_t task_ppid_nr(const struct task_struct *tsk)
  6038. -{
  6039. -   return task_ppid_nr_ns(tsk, &init_pid_ns);
  6040. -}
  6041. -
  6042. -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
  6043. -                   struct pid_namespace *ns)
  6044. -{
  6045. -   return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
  6046. -}
  6047. -
  6048. -static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
  6049. -{
  6050. -   return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
  6051. -}
  6052. -
  6053. -
  6054. -static inline pid_t task_session_nr_ns(struct task_struct *tsk,
  6055. -                   struct pid_namespace *ns)
  6056. -{
  6057. -   return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
  6058. -}
  6059. -
  6060. -static inline pid_t task_session_vnr(struct task_struct *tsk)
  6061. -{
  6062. -   return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
  6063. -}
  6064. -
  6065. -/* obsolete, do not use */
  6066. -static inline pid_t task_pgrp_nr(struct task_struct *tsk)
  6067. -{
  6068. -   return task_pgrp_nr_ns(tsk, &init_pid_ns);
  6069. -}
  6070. -
  6071. -/**
  6072. - * pid_alive - check that a task structure is not stale
  6073. - * @p: Task structure to be checked.
  6074. - *
  6075. - * Test if a process is not yet dead (at most zombie state)
  6076. - * If pid_alive fails, then pointers within the task structure
  6077. - * can be stale and must not be dereferenced.
  6078. - *
  6079. - * Return: 1 if the process is alive. 0 otherwise.
  6080. - */
  6081. -static inline int pid_alive(const struct task_struct *p)
  6082. -{
  6083. -   return p->pids[PIDTYPE_PID].pid != NULL;
  6084. -}
  6085. -
  6086. -/**
  6087. - * is_global_init - check if a task structure is init. Since init
  6088. - * is free to have sub-threads we need to check tgid.
  6089. - * @tsk: Task structure to be checked.
  6090. - *
  6091. - * Check if a task structure is the first user space task the kernel created.
  6092. - *
  6093. - * Return: 1 if the task structure is init. 0 otherwise.
  6094. - */
  6095. -static inline int is_global_init(struct task_struct *tsk)
  6096. -{
  6097. -   return task_tgid_nr(tsk) == 1;
  6098. -}
  6099. -
  6100. -extern struct pid *cad_pid;
  6101. -
  6102. -extern void free_task(struct task_struct *tsk);
  6103. -#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
  6104. -
  6105. -extern void __put_task_struct(struct task_struct *t);
  6106. -
  6107. -static inline void put_task_struct(struct task_struct *t)
  6108. -{
  6109. -   if (atomic_dec_and_test(&t->usage))
  6110. -       __put_task_struct(t);
  6111. -}
  6112. -
  6113. -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  6114. -extern void task_cputime(struct task_struct *t,
  6115. -            cputime_t *utime, cputime_t *stime);
  6116. -extern void task_cputime_scaled(struct task_struct *t,
  6117. -               cputime_t *utimescaled, cputime_t *stimescaled);
  6118. -extern cputime_t task_gtime(struct task_struct *t);
  6119. -#else
  6120. -static inline void task_cputime(struct task_struct *t,
  6121. -               cputime_t *utime, cputime_t *stime)
  6122. -{
  6123. -   if (utime)
  6124. -       *utime = t->utime;
  6125. -   if (stime)
  6126. -       *stime = t->stime;
  6127. -}
  6128. -
  6129. -static inline void task_cputime_scaled(struct task_struct *t,
  6130. -                      cputime_t *utimescaled,
  6131. -                      cputime_t *stimescaled)
  6132. -{
  6133. -   if (utimescaled)
  6134. -       *utimescaled = t->utimescaled;
  6135. -   if (stimescaled)
  6136. -       *stimescaled = t->stimescaled;
  6137. -}
  6138. -
  6139. -static inline cputime_t task_gtime(struct task_struct *t)
  6140. -{
  6141. -   return t->gtime;
  6142. -}
  6143. -#endif
  6144. -extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
  6145. -extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
  6146. -
  6147. -/*
  6148. - * Per process flags
  6149. - */
  6150. -#define PF_EXITING 0x00000004  /* getting shut down */
  6151. -#define PF_EXITPIDONE  0x00000008  /* pi exit done on shut down */
  6152. -#define PF_VCPU        0x00000010  /* I'm a virtual CPU */
  6153. -#define PF_WQ_WORKER   0x00000020  /* I'm a workqueue worker */
  6154. -#define PF_FORKNOEXEC  0x00000040  /* forked but didn't exec */
  6155. -#define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
  6156. -#define PF_SUPERPRIV   0x00000100  /* used super-user privileges */
  6157. -#define PF_DUMPCORE    0x00000200  /* dumped core */
  6158. -#define PF_SIGNALED    0x00000400  /* killed by a signal */
  6159. -#define PF_MEMALLOC    0x00000800  /* Allocating memory */
  6160. -#define PF_NPROC_EXCEEDED 0x00001000   /* set_user noticed that RLIMIT_NPROC was exceeded */
  6161. -#define PF_USED_MATH   0x00002000  /* if unset the fpu must be initialized before use */
  6162. -#define PF_USED_ASYNC  0x00004000  /* used async_schedule*(), used by module init */
  6163. -#define PF_NOFREEZE    0x00008000  /* this thread should not be frozen */
  6164. -#define PF_FROZEN  0x00010000  /* frozen for system suspend */
  6165. -#define PF_FSTRANS 0x00020000  /* inside a filesystem transaction */
  6166. -#define PF_KSWAPD  0x00040000  /* I am kswapd */
  6167. -#define PF_MEMALLOC_NOIO 0x00080000    /* Allocating memory without IO involved */
  6168. -#define PF_LESS_THROTTLE 0x00100000    /* Throttle me less: I clean memory */
  6169. -#define PF_KTHREAD 0x00200000  /* I am a kernel thread */
  6170. -#define PF_RANDOMIZE   0x00400000  /* randomize virtual address space */
  6171. -#define PF_SWAPWRITE   0x00800000  /* Allowed to write to swap */
  6172. -#define PF_NO_SETAFFINITY 0x04000000   /* Userland is not allowed to meddle with cpus_allowed */
  6173. -#define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
  6174. -#define PF_MUTEX_TESTER    0x20000000  /* Thread belongs to the rt mutex tester */
  6175. -#define PF_FREEZER_SKIP    0x40000000  /* Freezer should not count it as freezable */
  6176. -#define PF_SUSPEND_TASK 0x80000000      /* this thread called freeze_processes and should not be frozen */
  6177. -
  6178. -/*
  6179. - * Only the _current_ task can read/write to tsk->flags, but other
  6180. - * tasks can access tsk->flags in readonly mode for example
  6181. - * with tsk_used_math (like during threaded core dumping).
  6182. - * There is however an exception to this rule during ptrace
  6183. - * or during fork: the ptracer task is allowed to write to the
  6184. - * child->flags of its traced child (same goes for fork, the parent
  6185. - * can write to the child->flags), because we're guaranteed the
  6186. - * child is not running and in turn not changing child->flags
  6187. - * at the same time the parent does it.
  6188. - */
  6189. -#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
  6190. -#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
  6191. -#define clear_used_math() clear_stopped_child_used_math(current)
  6192. -#define set_used_math() set_stopped_child_used_math(current)
  6193. -#define conditional_stopped_child_used_math(condition, child) \
  6194. -   do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
  6195. -#define conditional_used_math(condition) \
  6196. -   conditional_stopped_child_used_math(condition, current)
  6197. -#define copy_to_stopped_child_used_math(child) \
  6198. -   do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
  6199. -/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
  6200. -#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
  6201. -#define used_math() tsk_used_math(current)
  6202. -
  6203. -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
  6204. - * __GFP_FS is also cleared as it implies __GFP_IO.
  6205. - */
  6206. -static inline gfp_t memalloc_noio_flags(gfp_t flags)
  6207. -{
  6208. -   if (unlikely(current->flags & PF_MEMALLOC_NOIO))
  6209. -       flags &= ~(__GFP_IO | __GFP_FS);
  6210. -   return flags;
  6211. -}
  6212. -
  6213. -static inline unsigned int memalloc_noio_save(void)
  6214. -{
  6215. -   unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
  6216. -   current->flags |= PF_MEMALLOC_NOIO;
  6217. -   return flags;
  6218. -}
  6219. -
  6220. -static inline void memalloc_noio_restore(unsigned int flags)
  6221. -{
  6222. -   current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
  6223. -}
  6224. -
  6225. -/* Per-process atomic flags. */
  6226. -#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */
  6227. -#define PFA_SPREAD_PAGE  1      /* Spread page cache over cpuset */
  6228. -#define PFA_SPREAD_SLAB  2      /* Spread some slab caches over cpuset */
  6229. -
  6230. -
  6231. -#define TASK_PFA_TEST(name, func)                  \
  6232. -   static inline bool task_##func(struct task_struct *p)       \
  6233. -   { return test_bit(PFA_##name, &p->atomic_flags); }
  6234. -#define TASK_PFA_SET(name, func)                   \
  6235. -   static inline void task_set_##func(struct task_struct *p)   \
  6236. -   { set_bit(PFA_##name, &p->atomic_flags); }
  6237. -#define TASK_PFA_CLEAR(name, func)                 \
  6238. -   static inline void task_clear_##func(struct task_struct *p) \
  6239. -   { clear_bit(PFA_##name, &p->atomic_flags); }
  6240. -
  6241. -TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
  6242. -TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)
  6243. -
  6244. -TASK_PFA_TEST(SPREAD_PAGE, spread_page)
  6245. -TASK_PFA_SET(SPREAD_PAGE, spread_page)
  6246. -TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)
  6247. -
  6248. -TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
  6249. -TASK_PFA_SET(SPREAD_SLAB, spread_slab)
  6250. -TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
  6251. -
  6252. -/*
  6253. - * task->jobctl flags
  6254. - */
  6255. -#define JOBCTL_STOP_SIGMASK    0xffff  /* signr of the last group stop */
  6256. -
  6257. -#define JOBCTL_STOP_DEQUEUED_BIT 16    /* stop signal dequeued */
  6258. -#define JOBCTL_STOP_PENDING_BIT    17  /* task should stop for group stop */
  6259. -#define JOBCTL_STOP_CONSUME_BIT    18  /* consume group stop count */
  6260. -#define JOBCTL_TRAP_STOP_BIT   19  /* trap for STOP */
  6261. -#define JOBCTL_TRAP_NOTIFY_BIT 20  /* trap for NOTIFY */
  6262. -#define JOBCTL_TRAPPING_BIT    21  /* switching to TRACED */
  6263. -#define JOBCTL_LISTENING_BIT   22  /* ptracer is listening for events */
  6264. -
  6265. -#define JOBCTL_STOP_DEQUEUED   (1UL << JOBCTL_STOP_DEQUEUED_BIT)
  6266. -#define JOBCTL_STOP_PENDING    (1UL << JOBCTL_STOP_PENDING_BIT)
  6267. -#define JOBCTL_STOP_CONSUME    (1UL << JOBCTL_STOP_CONSUME_BIT)
  6268. -#define JOBCTL_TRAP_STOP   (1UL << JOBCTL_TRAP_STOP_BIT)
  6269. -#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT)
  6270. -#define JOBCTL_TRAPPING        (1UL << JOBCTL_TRAPPING_BIT)
  6271. -#define JOBCTL_LISTENING   (1UL << JOBCTL_LISTENING_BIT)
  6272. -
  6273. -#define JOBCTL_TRAP_MASK   (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
  6274. -#define JOBCTL_PENDING_MASK    (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
  6275. -
  6276. -extern bool task_set_jobctl_pending(struct task_struct *task,
  6277. -                   unsigned long mask);
  6278. -extern void task_clear_jobctl_trapping(struct task_struct *task);
  6279. -extern void task_clear_jobctl_pending(struct task_struct *task,
  6280. -                     unsigned long mask);
  6281. -
  6282. -static inline void rcu_copy_process(struct task_struct *p)
  6283. -{
  6284. -#ifdef CONFIG_PREEMPT_RCU
  6285. -   p->rcu_read_lock_nesting = 0;
  6286. -   p->rcu_read_unlock_special.s = 0;
  6287. -   p->rcu_blocked_node = NULL;
  6288. -   INIT_LIST_HEAD(&p->rcu_node_entry);
  6289. -#endif /* #ifdef CONFIG_PREEMPT_RCU */
  6290. -#ifdef CONFIG_TASKS_RCU
  6291. -   p->rcu_tasks_holdout = false;
  6292. -   INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
  6293. -   p->rcu_tasks_idle_cpu = -1;
  6294. -#endif /* #ifdef CONFIG_TASKS_RCU */
  6295. -}
  6296. -
  6297. -static inline void tsk_restore_flags(struct task_struct *task,
  6298. -               unsigned long orig_flags, unsigned long flags)
  6299. -{
  6300. -   task->flags &= ~flags;
  6301. -   task->flags |= orig_flags & flags;
  6302. -}
  6303. -
  6304. -extern int cpuset_cpumask_can_shrink(const struct cpumask *cur,
  6305. -                    const struct cpumask *trial);
  6306. -extern int task_can_attach(struct task_struct *p,
  6307. -              const struct cpumask *cs_cpus_allowed);
  6308. -#ifdef CONFIG_SMP
  6309. -extern void do_set_cpus_allowed(struct task_struct *p,
  6310. -                  const struct cpumask *new_mask);
  6311. -
  6312. -extern int set_cpus_allowed_ptr(struct task_struct *p,
  6313. -               const struct cpumask *new_mask);
  6314. -#else
  6315. -static inline void do_set_cpus_allowed(struct task_struct *p,
  6316. -                     const struct cpumask *new_mask)
  6317. -{
  6318. -}
  6319. -static inline int set_cpus_allowed_ptr(struct task_struct *p,
  6320. -                      const struct cpumask *new_mask)
  6321. -{
  6322. -   if (!cpumask_test_cpu(0, new_mask))
  6323. -       return -EINVAL;
  6324. -   return 0;
  6325. -}
  6326. -#endif
  6327. -
  6328. -#ifdef CONFIG_NO_HZ_COMMON
  6329. -void calc_load_enter_idle(void);
  6330. -void calc_load_exit_idle(void);
  6331. -#else
  6332. -static inline void calc_load_enter_idle(void) { }
  6333. -static inline void calc_load_exit_idle(void) { }
  6334. -#endif /* CONFIG_NO_HZ_COMMON */
  6335. -
  6336. -/*
  6337. - * Do not use outside of architecture code which knows its limitations.
  6338. - *
  6339. - * sched_clock() has no promise of monotonicity or bounded drift between
  6340. - * CPUs, use (which you should not) requires disabling IRQs.
  6341. - *
  6342. - * Please use one of the three interfaces below.
  6343. - */
  6344. -extern unsigned long long notrace sched_clock(void);
  6345. -/*
  6346. - * See the comment in kernel/sched/clock.c
  6347. - */
  6348. -extern u64 cpu_clock(int cpu);
  6349. -extern u64 local_clock(void);
  6350. -extern u64 running_clock(void);
  6351. -extern u64 sched_clock_cpu(int cpu);
  6352. -
  6353. -
  6354. -extern void sched_clock_init(void);
  6355. -
  6356. -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
  6357. -static inline void sched_clock_tick(void)
  6358. -{
  6359. -}
  6360. -
  6361. -static inline void sched_clock_idle_sleep_event(void)
  6362. -{
  6363. -}
  6364. -
  6365. -static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
  6366. -{
  6367. -}
  6368. -#else
  6369. -/*
  6370. - * Architectures can set this to 1 if they have specified
  6371. - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
  6372. - * but then during bootup it turns out that sched_clock()
  6373. - * is reliable after all:
  6374. - */
  6375. -extern int sched_clock_stable(void);
  6376. -extern void set_sched_clock_stable(void);
  6377. -extern void clear_sched_clock_stable(void);
  6378. -
  6379. -extern void sched_clock_tick(void);
  6380. -extern void sched_clock_idle_sleep_event(void);
  6381. -extern void sched_clock_idle_wakeup_event(u64 delta_ns);
  6382. -#endif
  6383. -
  6384. -#ifdef CONFIG_IRQ_TIME_ACCOUNTING
  6385. -/*
  6386. - * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
  6387. - * The reason for this explicit opt-in is not to have perf penalty with
  6388. - * slow sched_clocks.
  6389. - */
  6390. -extern void enable_sched_clock_irqtime(void);
  6391. -extern void disable_sched_clock_irqtime(void);
  6392. -#else
  6393. -static inline void enable_sched_clock_irqtime(void) {}
  6394. -static inline void disable_sched_clock_irqtime(void) {}
  6395. -#endif
  6396. -
  6397. -extern unsigned long long
  6398. -task_sched_runtime(struct task_struct *task);
  6399. -
  6400. -/* sched_exec is called by processes performing an exec */
  6401. -#ifdef CONFIG_SMP
  6402. -extern void sched_exec(void);
  6403. -#else
  6404. -#define sched_exec()   {}
  6405. -#endif
  6406. -
  6407. -extern void sched_clock_idle_sleep_event(void);
  6408. -extern void sched_clock_idle_wakeup_event(u64 delta_ns);
  6409. -
  6410. -#ifdef CONFIG_HOTPLUG_CPU
  6411. -extern void idle_task_exit(void);
  6412. -#else
  6413. -static inline void idle_task_exit(void) {}
  6414. -#endif
  6415. -
  6416. -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
  6417. -extern void wake_up_nohz_cpu(int cpu);
  6418. -#else
  6419. -static inline void wake_up_nohz_cpu(int cpu) { }
  6420. -#endif
  6421. -
  6422. -#ifdef CONFIG_NO_HZ_FULL
  6423. -extern bool sched_can_stop_tick(void);
  6424. -extern u64 scheduler_tick_max_deferment(void);
  6425. -#else
  6426. -static inline bool sched_can_stop_tick(void) { return false; }
  6427. -#endif
  6428. -
  6429. -#ifdef CONFIG_SCHED_AUTOGROUP
  6430. -extern void sched_autogroup_create_attach(struct task_struct *p);
  6431. -extern void sched_autogroup_detach(struct task_struct *p);
  6432. -extern void sched_autogroup_fork(struct signal_struct *sig);
  6433. -extern void sched_autogroup_exit(struct signal_struct *sig);
  6434. -#ifdef CONFIG_PROC_FS
  6435. -extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
  6436. -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
  6437. -#endif
  6438. -#else
  6439. -static inline void sched_autogroup_create_attach(struct task_struct *p) { }
  6440. -static inline void sched_autogroup_detach(struct task_struct *p) { }
  6441. -static inline void sched_autogroup_fork(struct signal_struct *sig) { }
  6442. -static inline void sched_autogroup_exit(struct signal_struct *sig) { }
  6443. -#endif
  6444. -
  6445. -extern int yield_to(struct task_struct *p, bool preempt);
  6446. -extern void set_user_nice(struct task_struct *p, long nice);
  6447. -extern int task_prio(const struct task_struct *p);
  6448. -/**
  6449. - * task_nice - return the nice value of a given task.
  6450. - * @p: the task in question.
  6451. - *
  6452. - * Return: The nice value [ -20 ... 0 ... 19 ].
  6453. - */
  6454. -static inline int task_nice(const struct task_struct *p)
  6455. -{
  6456. -   return PRIO_TO_NICE((p)->static_prio);
  6457. -}
  6458. -extern int can_nice(const struct task_struct *p, const int nice);
  6459. -extern int task_curr(const struct task_struct *p);
  6460. -extern int idle_cpu(int cpu);
  6461. -extern int sched_setscheduler(struct task_struct *, int,
  6462. -                 const struct sched_param *);
  6463. -extern int sched_setscheduler_nocheck(struct task_struct *, int,
  6464. -                     const struct sched_param *);
  6465. -extern int sched_setattr(struct task_struct *,
  6466. -            const struct sched_attr *);
  6467. -extern struct task_struct *idle_task(int cpu);
  6468. -/**
  6469. - * is_idle_task - is the specified task an idle task?
  6470. - * @p: the task in question.
  6471. - *
  6472. - * Return: 1 if @p is an idle task. 0 otherwise.
  6473. - */
  6474. -static inline bool is_idle_task(const struct task_struct *p)
  6475. -{
  6476. -   return p->pid == 0;
  6477. -}
  6478. -extern struct task_struct *curr_task(int cpu);
  6479. -extern void set_curr_task(int cpu, struct task_struct *p);
  6480. -
  6481. -void yield(void);
  6482. -
  6483. -union thread_union {
  6484. -   struct thread_info thread_info;
  6485. -   unsigned long stack[THREAD_SIZE/sizeof(long)];
  6486. -};
  6487. -
  6488. -#ifndef __HAVE_ARCH_KSTACK_END
  6489. -static inline int kstack_end(void *addr)
  6490. -{
  6491. -   /* Reliable end of stack detection:
  6492. -    * Some APM bios versions misalign the stack
  6493. -    */
  6494. -   return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
  6495. -}
  6496. -#endif
  6497. -
  6498. -extern union thread_union init_thread_union;
  6499. -extern struct task_struct init_task;
  6500. -
  6501. -extern struct   mm_struct init_mm;
  6502. -
  6503. -extern struct pid_namespace init_pid_ns;
  6504. -
  6505. -/*
  6506. - * find a task by one of its numerical ids
  6507. - *
  6508. - * find_task_by_pid_ns():
  6509. - *      finds a task by its pid in the specified namespace
  6510. - * find_task_by_vpid():
  6511. - *      finds a task by its virtual pid
  6512. - *
  6513. - * see also find_vpid() etc in include/linux/pid.h
  6514. - */
  6515. -
  6516. -extern struct task_struct *find_task_by_vpid(pid_t nr);
  6517. -extern struct task_struct *find_task_by_pid_ns(pid_t nr,
  6518. -       struct pid_namespace *ns);
  6519. -
  6520. -/* per-UID process charging. */
  6521. -extern struct user_struct * alloc_uid(kuid_t);
  6522. -static inline struct user_struct *get_uid(struct user_struct *u)
  6523. -{
  6524. -   atomic_inc(&u->__count);
  6525. -   return u;
  6526. -}
  6527. -extern void free_uid(struct user_struct *);
  6528. -
  6529. -#include <asm/current.h>
  6530. -
  6531. -extern void xtime_update(unsigned long ticks);
  6532. -
  6533. -extern int wake_up_state(struct task_struct *tsk, unsigned int state);
  6534. -extern int wake_up_process(struct task_struct *tsk);
  6535. -extern void wake_up_new_task(struct task_struct *tsk);
  6536. -#ifdef CONFIG_SMP
  6537. - extern void kick_process(struct task_struct *tsk);
  6538. -#else
  6539. - static inline void kick_process(struct task_struct *tsk) { }
  6540. -#endif
  6541. -extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
  6542. -extern void sched_dead(struct task_struct *p);
  6543. -
  6544. -extern void proc_caches_init(void);
  6545. -extern void flush_signals(struct task_struct *);
  6546. -extern void ignore_signals(struct task_struct *);
  6547. -extern void flush_signal_handlers(struct task_struct *, int force_default);
  6548. -extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
  6549. -
  6550. -static inline int kernel_dequeue_signal(siginfo_t *info)
  6551. -{
  6552. -   struct task_struct *tsk = current;
  6553. -   siginfo_t __info;
  6554. -   int ret;
  6555. -
  6556. -   spin_lock_irq(&tsk->sighand->siglock);
  6557. -   ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
  6558. -   spin_unlock_irq(&tsk->sighand->siglock);
  6559. -
  6560. -   return ret;
  6561. -}
  6562. -
  6563. -static inline void kernel_signal_stop(void)
  6564. -{
  6565. -   spin_lock_irq(&current->sighand->siglock);
  6566. -   if (current->jobctl & JOBCTL_STOP_DEQUEUED)
  6567. -       __set_current_state(TASK_STOPPED);
  6568. -   spin_unlock_irq(&current->sighand->siglock);
  6569. -
  6570. -   schedule();
  6571. -}
  6572. -
  6573. -extern void release_task(struct task_struct * p);
  6574. -extern int send_sig_info(int, struct siginfo *, struct task_struct *);
  6575. -extern int force_sigsegv(int, struct task_struct *);
  6576. -extern int force_sig_info(int, struct siginfo *, struct task_struct *);
  6577. -extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
  6578. -extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
  6579. -extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
  6580. -               const struct cred *, u32);
  6581. -extern int kill_pgrp(struct pid *pid, int sig, int priv);
  6582. -extern int kill_pid(struct pid *pid, int sig, int priv);
  6583. -extern int kill_proc_info(int, struct siginfo *, pid_t);
  6584. -extern __must_check bool do_notify_parent(struct task_struct *, int);
  6585. -extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
  6586. -extern void force_sig(int, struct task_struct *);
  6587. -extern int send_sig(int, struct task_struct *, int);
  6588. -extern int zap_other_threads(struct task_struct *p);
  6589. -extern struct sigqueue *sigqueue_alloc(void);
  6590. -extern void sigqueue_free(struct sigqueue *);
  6591. -extern int send_sigqueue(struct sigqueue *,  struct task_struct *, int group);
  6592. -extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
  6593. -
  6594. -static inline void restore_saved_sigmask(void)
  6595. -{
  6596. -   if (test_and_clear_restore_sigmask())
  6597. -       __set_current_blocked(&current->saved_sigmask);
  6598. -}
  6599. -
  6600. -static inline sigset_t *sigmask_to_save(void)
  6601. -{
  6602. -   sigset_t *res = &current->blocked;
  6603. -   if (unlikely(test_restore_sigmask()))
  6604. -       res = &current->saved_sigmask;
  6605. -   return res;
  6606. -}
  6607. -
  6608. -static inline int kill_cad_pid(int sig, int priv)
  6609. -{
  6610. -   return kill_pid(cad_pid, sig, priv);
  6611. -}
  6612. -
  6613. -/* These can be the second arg to send_sig_info/send_group_sig_info.  */
  6614. -#define SEND_SIG_NOINFO ((struct siginfo *) 0)
  6615. -#define SEND_SIG_PRIV  ((struct siginfo *) 1)
  6616. -#define SEND_SIG_FORCED    ((struct siginfo *) 2)
  6617. -
  6618. -/*
  6619. - * True if we are on the alternate signal stack.
  6620. - */
  6621. -static inline int on_sig_stack(unsigned long sp)
  6622. -{
  6623. -#ifdef CONFIG_STACK_GROWSUP
  6624. -   return sp >= current->sas_ss_sp &&
  6625. -       sp - current->sas_ss_sp < current->sas_ss_size;
  6626. -#else
  6627. -   return sp > current->sas_ss_sp &&
  6628. -       sp - current->sas_ss_sp <= current->sas_ss_size;
  6629. -#endif
  6630. -}
  6631. -
  6632. -static inline int sas_ss_flags(unsigned long sp)
  6633. -{
  6634. -   if (!current->sas_ss_size)
  6635. -       return SS_DISABLE;
  6636. -
  6637. -   return on_sig_stack(sp) ? SS_ONSTACK : 0;
  6638. -}
  6639. -
  6640. -static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
  6641. -{
  6642. -   if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
  6643. -#ifdef CONFIG_STACK_GROWSUP
  6644. -       return current->sas_ss_sp;
  6645. -#else
  6646. -       return current->sas_ss_sp + current->sas_ss_size;
  6647. -#endif
  6648. -   return sp;
  6649. -}
  6650. -
  6651. -/*
  6652. - * Routines for handling mm_structs
  6653. - */
  6654. -extern struct mm_struct * mm_alloc(void);
  6655. -
  6656. -/* mmdrop drops the mm and the page tables */
  6657. -extern void __mmdrop(struct mm_struct *);
  6658. -static inline void mmdrop(struct mm_struct * mm)
  6659. -{
  6660. -   if (unlikely(atomic_dec_and_test(&mm->mm_count)))
  6661. -       __mmdrop(mm);
  6662. -}
  6663. -
  6664. -/* mmput gets rid of the mappings and all user-space */
  6665. -extern void mmput(struct mm_struct *);
  6666. -/* Grab a reference to a task's mm, if it is not already going away */
  6667. -extern struct mm_struct *get_task_mm(struct task_struct *task);
  6668. -/*
  6669. - * Grab a reference to a task's mm, if it is not already going away
  6670. - * and ptrace_may_access with the mode parameter passed to it
  6671. - * succeeds.
  6672. - */
  6673. -extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
  6674. -/* Remove the current tasks stale references to the old mm_struct */
  6675. -extern void mm_release(struct task_struct *, struct mm_struct *);
  6676. -
  6677. -#ifdef CONFIG_HAVE_COPY_THREAD_TLS
  6678. -extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
  6679. -           struct task_struct *, unsigned long);
  6680. -#else
  6681. -extern int copy_thread(unsigned long, unsigned long, unsigned long,
  6682. -           struct task_struct *);
  6683. -
  6684. -/* Architectures that haven't opted into copy_thread_tls get the tls argument
  6685. - * via pt_regs, so ignore the tls argument passed via C. */
  6686. -static inline int copy_thread_tls(
  6687. -       unsigned long clone_flags, unsigned long sp, unsigned long arg,
  6688. -       struct task_struct *p, unsigned long tls)
  6689. -{
  6690. -   return copy_thread(clone_flags, sp, arg, p);
  6691. -}
  6692. -#endif
  6693. -extern void flush_thread(void);
  6694. -extern void exit_thread(void);
  6695. -
  6696. -extern void exit_files(struct task_struct *);
  6697. -extern void __cleanup_sighand(struct sighand_struct *);
  6698. -
  6699. -extern void exit_itimers(struct signal_struct *);
  6700. -extern void flush_itimer_signals(void);
  6701. -
  6702. -extern void do_group_exit(int);
  6703. -
  6704. -extern int do_execve(struct filename *,
  6705. -            const char __user * const __user *,
  6706. -            const char __user * const __user *);
  6707. -extern int do_execveat(int, struct filename *,
  6708. -              const char __user * const __user *,
  6709. -              const char __user * const __user *,
  6710. -              int);
  6711. -extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
  6712. -extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
  6713. -struct task_struct *fork_idle(int);
  6714. -extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
  6715. -
  6716. -extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
  6717. -static inline void set_task_comm(struct task_struct *tsk, const char *from)
  6718. -{
  6719. -   __set_task_comm(tsk, from, false);
  6720. -}
  6721. -extern char *get_task_comm(char *to, struct task_struct *tsk);
  6722. -
  6723. -#ifdef CONFIG_SMP
  6724. -void scheduler_ipi(void);
  6725. -extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
  6726. -#else
  6727. -static inline void scheduler_ipi(void) { }
  6728. -static inline unsigned long wait_task_inactive(struct task_struct *p,
  6729. -                          long match_state)
  6730. -{
  6731. -   return 1;
  6732. -}
  6733. -#endif
  6734. -
  6735. -#define tasklist_empty() \
  6736. -   list_empty(&init_task.tasks)
  6737. -
  6738. -#define next_task(p) \
  6739. -   list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
  6740. -
  6741. -#define for_each_process(p) \
  6742. -   for (p = &init_task ; (p = next_task(p)) != &init_task ; )
  6743. -
  6744. -extern bool current_is_single_threaded(void);
  6745. -
  6746. -/*
  6747. - * Careful: do_each_thread/while_each_thread is a double loop so
  6748. - *          'break' will not work as expected - use goto instead.
  6749. - */
  6750. -#define do_each_thread(g, t) \
  6751. -   for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
  6752. -
  6753. -#define while_each_thread(g, t) \
  6754. -   while ((t = next_thread(t)) != g)
  6755. -
  6756. -#define __for_each_thread(signal, t)   \
  6757. -   list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
  6758. -
  6759. -#define for_each_thread(p, t)      \
  6760. -   __for_each_thread((p)->signal, t)
  6761. -
  6762. -/* Careful: this is a double loop, 'break' won't work as expected. */
  6763. -#define for_each_process_thread(p, t)  \
  6764. -   for_each_process(p) for_each_thread(p, t)
  6765. -
  6766. -static inline int get_nr_threads(struct task_struct *tsk)
  6767. -{
  6768. -   return tsk->signal->nr_threads;
  6769. -}
  6770. -
  6771. -static inline bool thread_group_leader(struct task_struct *p)
  6772. -{
  6773. -   return p->exit_signal >= 0;
  6774. -}
  6775. -
  6776. -/* Do to the insanities of de_thread it is possible for a process
  6777. - * to have the pid of the thread group leader without actually being
  6778. - * the thread group leader.  For iteration through the pids in proc
  6779. - * all we care about is that we have a task with the appropriate
  6780. - * pid, we don't actually care if we have the right task.
  6781. - */
  6782. -static inline bool has_group_leader_pid(struct task_struct *p)
  6783. -{
  6784. -   return task_pid(p) == p->signal->leader_pid;
  6785. -}
  6786. -
  6787. -static inline
  6788. -bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
  6789. -{
  6790. -   return p1->signal == p2->signal;
  6791. -}
  6792. -
  6793. -static inline struct task_struct *next_thread(const struct task_struct *p)
  6794. -{
  6795. -   return list_entry_rcu(p->thread_group.next,
  6796. -                 struct task_struct, thread_group);
  6797. -}
  6798. -
  6799. -static inline int thread_group_empty(struct task_struct *p)
  6800. -{
  6801. -   return list_empty(&p->thread_group);
  6802. -}
  6803. -
  6804. -#define delay_group_leader(p) \
  6805. -       (thread_group_leader(p) && !thread_group_empty(p))
  6806. -
  6807. -/*
  6808. - * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  6809. - * subscriptions and synchronises with wait4().  Also used in procfs.  Also
  6810. - * pins the final release of task.io_context.  Also protects ->cpuset and
  6811. - * ->cgroup.subsys[]. And ->vfork_done.
  6812. - *
  6813. - * Nests both inside and outside of read_lock(&tasklist_lock).
  6814. - * It must not be nested with write_lock_irq(&tasklist_lock),
  6815. - * neither inside nor outside.
  6816. - */
  6817. -static inline void task_lock(struct task_struct *p)
  6818. -{
  6819. -   spin_lock(&p->alloc_lock);
  6820. -}
  6821. -
  6822. -static inline void task_unlock(struct task_struct *p)
  6823. -{
  6824. -   spin_unlock(&p->alloc_lock);
  6825. -}
  6826. -
  6827. -extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
  6828. -                           unsigned long *flags);
  6829. -
  6830. -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
  6831. -                              unsigned long *flags)
  6832. -{
  6833. -   struct sighand_struct *ret;
  6834. -
  6835. -   ret = __lock_task_sighand(tsk, flags);
  6836. -   (void)__cond_lock(&tsk->sighand->siglock, ret);
  6837. -   return ret;
  6838. -}
  6839. -
  6840. -static inline void unlock_task_sighand(struct task_struct *tsk,
  6841. -                       unsigned long *flags)
  6842. -{
  6843. -   spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
  6844. -}
  6845. -
  6846. -/**
  6847. - * threadgroup_change_begin - mark the beginning of changes to a threadgroup
  6848. - * @tsk: task causing the changes
  6849. - *
  6850. - * All operations which modify a threadgroup - a new thread joining the
  6851. - * group, death of a member thread (the assertion of PF_EXITING) and
  6852. - * exec(2) dethreading the process and replacing the leader - are wrapped
  6853. - * by threadgroup_change_{begin|end}().  This is to provide a place which
  6854. - * subsystems needing threadgroup stability can hook into for
  6855. - * synchronization.
  6856. - */
  6857. -static inline void threadgroup_change_begin(struct task_struct *tsk)
  6858. -{
  6859. -   might_sleep();
  6860. -   cgroup_threadgroup_change_begin(tsk);
  6861. -}
  6862. -
  6863. -/**
  6864. - * threadgroup_change_end - mark the end of changes to a threadgroup
  6865. - * @tsk: task causing the changes
  6866. - *
  6867. - * See threadgroup_change_begin().
  6868. - */
  6869. -static inline void threadgroup_change_end(struct task_struct *tsk)
  6870. -{
  6871. -   cgroup_threadgroup_change_end(tsk);
  6872. -}
  6873. -
  6874. -#ifndef __HAVE_THREAD_FUNCTIONS
  6875. -
  6876. -#define task_thread_info(task) ((struct thread_info *)(task)->stack)
  6877. -#define task_stack_page(task)  ((task)->stack)
  6878. -
  6879. -static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
  6880. -{
  6881. -   *task_thread_info(p) = *task_thread_info(org);
  6882. -   task_thread_info(p)->task = p;
  6883. -}
  6884. -
  6885. -/*
  6886. - * Return the address of the last usable long on the stack.
  6887. - *
  6888. - * When the stack grows down, this is just above the thread
  6889. - * info struct. Going any lower will corrupt the threadinfo.
  6890. - *
  6891. - * When the stack grows up, this is the highest address.
  6892. - * Beyond that position, we corrupt data on the next page.
  6893. - */
  6894. -static inline unsigned long *end_of_stack(struct task_struct *p)
  6895. -{
  6896. -#ifdef CONFIG_STACK_GROWSUP
  6897. -   return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
  6898. -#else
  6899. -   return (unsigned long *)(task_thread_info(p) + 1);
  6900. -#endif
  6901. -}
  6902. -
  6903. -#endif
  6904. -#define task_stack_end_corrupted(task) \
  6905. -       (*(end_of_stack(task)) != STACK_END_MAGIC)
  6906. -
  6907. -static inline int object_is_on_stack(void *obj)
  6908. -{
  6909. -   void *stack = task_stack_page(current);
  6910. -
  6911. -   return (obj >= stack) && (obj < (stack + THREAD_SIZE));
  6912. -}
  6913. -
  6914. -extern void thread_info_cache_init(void);
  6915. -
  6916. -#ifdef CONFIG_DEBUG_STACK_USAGE
  6917. -static inline unsigned long stack_not_used(struct task_struct *p)
  6918. -{
  6919. -   unsigned long *n = end_of_stack(p);
  6920. -
  6921. -   do {    /* Skip over canary */
  6922. -       n++;
  6923. -   } while (!*n);
  6924. -
  6925. -   return (unsigned long)n - (unsigned long)end_of_stack(p);
  6926. -}
  6927. -#endif
  6928. -extern void set_task_stack_end_magic(struct task_struct *tsk);
  6929. -
  6930. -/* set thread flags in other task's structures
  6931. - * - see asm/thread_info.h for TIF_xxxx flags available
  6932. - */
  6933. -static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
  6934. -{
  6935. -   set_ti_thread_flag(task_thread_info(tsk), flag);
  6936. -}
  6937. -
  6938. -static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
  6939. -{
  6940. -   clear_ti_thread_flag(task_thread_info(tsk), flag);
  6941. -}
  6942. -
  6943. -static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
  6944. -{
  6945. -   return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
  6946. -}
  6947. -
  6948. -static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
  6949. -{
  6950. -   return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
  6951. -}
  6952. -
  6953. -static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
  6954. -{
  6955. -   return test_ti_thread_flag(task_thread_info(tsk), flag);
  6956. -}
  6957. -
  6958. -static inline void set_tsk_need_resched(struct task_struct *tsk)
  6959. -{
  6960. -   set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
  6961. -}
  6962. -
  6963. -static inline void clear_tsk_need_resched(struct task_struct *tsk)
  6964. -{
  6965. -   clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
  6966. -}
  6967. -
  6968. -static inline int test_tsk_need_resched(struct task_struct *tsk)
  6969. -{
  6970. -   return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
  6971. -}
  6972. -
  6973. -static inline int restart_syscall(void)
  6974. -{
  6975. -   set_tsk_thread_flag(current, TIF_SIGPENDING);
  6976. -   return -ERESTARTNOINTR;
  6977. -}
  6978. -
  6979. -static inline int signal_pending(struct task_struct *p)
  6980. -{
  6981. -   return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
  6982. -}
  6983. -
  6984. -static inline int __fatal_signal_pending(struct task_struct *p)
  6985. -{
  6986. -   return unlikely(sigismember(&p->pending.signal, SIGKILL));
  6987. -}
  6988. -
  6989. -static inline int fatal_signal_pending(struct task_struct *p)
  6990. -{
  6991. -   return signal_pending(p) && __fatal_signal_pending(p);
  6992. -}
  6993. -
  6994. -static inline int signal_pending_state(long state, struct task_struct *p)
  6995. -{
  6996. -   if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
  6997. -       return 0;
  6998. -   if (!signal_pending(p))
  6999. -       return 0;
  7000. -
  7001. -   return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
  7002. -}
  7003. -
  7004. -/*
  7005. - * cond_resched() and cond_resched_lock(): latency reduction via
  7006. - * explicit rescheduling in places that are safe. The return
  7007. - * value indicates whether a reschedule was done in fact.
  7008. - * cond_resched_lock() will drop the spinlock before scheduling,
  7009. - * cond_resched_softirq() will enable bhs before scheduling.
  7010. - */
  7011. -extern int _cond_resched(void);
  7012. -
  7013. -#define cond_resched() ({          \
  7014. -   ___might_sleep(__FILE__, __LINE__, 0);  \
  7015. -   _cond_resched();            \
  7016. -})
  7017. -
  7018. -extern int __cond_resched_lock(spinlock_t *lock);
  7019. -
  7020. -#define cond_resched_lock(lock) ({             \
  7021. -   ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
  7022. -   __cond_resched_lock(lock);              \
  7023. -})
  7024. -
  7025. -extern int __cond_resched_softirq(void);
  7026. -
  7027. -#define cond_resched_softirq() ({                  \
  7028. -   ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
  7029. -   __cond_resched_softirq();                   \
  7030. -})
  7031. -
  7032. -static inline void cond_resched_rcu(void)
  7033. -{
  7034. -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
  7035. -   rcu_read_unlock();
  7036. -   cond_resched();
  7037. -   rcu_read_lock();
  7038. -#endif
  7039. -}
  7040. -
  7041. -/*
  7042. - * Does a critical section need to be broken due to another
  7043. - * task waiting?: (technically does not depend on CONFIG_PREEMPT,
  7044. - * but a general need for low latency)
  7045. - */
  7046. -static inline int spin_needbreak(spinlock_t *lock)
  7047. -{
  7048. -#ifdef CONFIG_PREEMPT
  7049. -   return spin_is_contended(lock);
  7050. -#else
  7051. -   return 0;
  7052. -#endif
  7053. -}
  7054. -
  7055. -/*
  7056. - * Idle thread specific functions to determine the need_resched
  7057. - * polling state.
  7058. - */
  7059. -#ifdef TIF_POLLING_NRFLAG
  7060. -static inline int tsk_is_polling(struct task_struct *p)
  7061. -{
  7062. -   return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
  7063. -}
  7064. -
  7065. -static inline void __current_set_polling(void)
  7066. -{
  7067. -   set_thread_flag(TIF_POLLING_NRFLAG);
  7068. -}
  7069. -
  7070. -static inline bool __must_check current_set_polling_and_test(void)
  7071. -{
  7072. -   __current_set_polling();
  7073. -
  7074. -   /*
  7075. -    * Polling state must be visible before we test NEED_RESCHED,
  7076. -    * paired by resched_curr()
  7077. -    */
  7078. -   smp_mb__after_atomic();
  7079. -
  7080. -   return unlikely(tif_need_resched());
  7081. -}
  7082. -
  7083. -static inline void __current_clr_polling(void)
  7084. -{
  7085. -   clear_thread_flag(TIF_POLLING_NRFLAG);
  7086. -}
  7087. -
  7088. -static inline bool __must_check current_clr_polling_and_test(void)
  7089. -{
  7090. -   __current_clr_polling();
  7091. -
  7092. -   /*
  7093. -    * Polling state must be visible before we test NEED_RESCHED,
  7094. -    * paired by resched_curr()
  7095. -    */
  7096. -   smp_mb__after_atomic();
  7097. -
  7098. -   return unlikely(tif_need_resched());
  7099. -}
  7100. -
  7101. -#else
  7102. -static inline int tsk_is_polling(struct task_struct *p) { return 0; }
  7103. -static inline void __current_set_polling(void) { }
  7104. -static inline void __current_clr_polling(void) { }
  7105. -
  7106. -static inline bool __must_check current_set_polling_and_test(void)
  7107. -{
  7108. -   return unlikely(tif_need_resched());
  7109. -}
  7110. -static inline bool __must_check current_clr_polling_and_test(void)
  7111. -{
  7112. -   return unlikely(tif_need_resched());
  7113. -}
  7114. -#endif
  7115. -
  7116. -static inline void current_clr_polling(void)
  7117. -{
  7118. -   __current_clr_polling();
  7119. -
  7120. -   /*
  7121. -    * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
  7122. -    * Once the bit is cleared, we'll get IPIs with every new
  7123. -    * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
  7124. -    * fold.
  7125. -    */
  7126. -   smp_mb(); /* paired with resched_curr() */
  7127. -
  7128. -   preempt_fold_need_resched();
  7129. -}
  7130. -
  7131. -static __always_inline bool need_resched(void)
  7132. -{
  7133. -   return unlikely(tif_need_resched());
  7134. -}
  7135. -
  7136. -/*
  7137. - * Thread group CPU time accounting.
  7138. - */
  7139. -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
  7140. -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
  7141. -
  7142. -/*
  7143. - * Reevaluate whether the task has signals pending delivery.
  7144. - * Wake the task if so.
  7145. - * This is required every time the blocked sigset_t changes.
  7146. - * callers must hold sighand->siglock.
  7147. - */
  7148. -extern void recalc_sigpending_and_wake(struct task_struct *t);
  7149. -extern void recalc_sigpending(void);
  7150. -
  7151. -extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
  7152. -
  7153. -static inline void signal_wake_up(struct task_struct *t, bool resume)
  7154. -{
  7155. -   signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
  7156. -}
  7157. -static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
  7158. -{
  7159. -   signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
  7160. -}
  7161. -
  7162. -/*
  7163. - * Wrappers for p->thread_info->cpu access. No-op on UP.
  7164. - */
  7165. -#ifdef CONFIG_SMP
  7166. -
  7167. -static inline unsigned int task_cpu(const struct task_struct *p)
  7168. -{
  7169. -   return task_thread_info(p)->cpu;
  7170. -}
  7171. -
  7172. -static inline int task_node(const struct task_struct *p)
  7173. -{
  7174. -   return cpu_to_node(task_cpu(p));
  7175. -}
  7176. -
  7177. -extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
  7178. -
  7179. -#else
  7180. -
  7181. -static inline unsigned int task_cpu(const struct task_struct *p)
  7182. -{
  7183. -   return 0;
  7184. -}
  7185. -
  7186. -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
  7187. -{
  7188. -}
  7189. -
  7190. -#endif /* CONFIG_SMP */
  7191. -
  7192. -extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
  7193. -extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
  7194. -
  7195. -#ifdef CONFIG_CGROUP_SCHED
  7196. -extern struct task_group root_task_group;
  7197. -#endif /* CONFIG_CGROUP_SCHED */
  7198. -
  7199. -extern int task_can_switch_user(struct user_struct *up,
  7200. -                   struct task_struct *tsk);
  7201. -
  7202. -#ifdef CONFIG_TASK_XACCT
  7203. -static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
  7204. -{
  7205. -   tsk->ioac.rchar += amt;
  7206. -}
  7207. -
  7208. -static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
  7209. -{
  7210. -   tsk->ioac.wchar += amt;
  7211. -}
  7212. -
  7213. -static inline void inc_syscr(struct task_struct *tsk)
  7214. -{
  7215. -   tsk->ioac.syscr++;
  7216. -}
  7217. -
  7218. -static inline void inc_syscw(struct task_struct *tsk)
  7219. -{
  7220. -   tsk->ioac.syscw++;
  7221. -}
  7222. -#else
  7223. -static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
  7224. -{
  7225. -}
  7226. -
  7227. -static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
  7228. -{
  7229. -}
  7230. -
  7231. -static inline void inc_syscr(struct task_struct *tsk)
  7232. -{
  7233. -}
  7234. -
  7235. -static inline void inc_syscw(struct task_struct *tsk)
  7236. -{
  7237. -}
  7238. -#endif
  7239. -
  7240. -#ifndef TASK_SIZE_OF
  7241. -#define TASK_SIZE_OF(tsk)  TASK_SIZE
  7242. -#endif
  7243. -
  7244. -#ifdef CONFIG_MEMCG
  7245. -extern void mm_update_next_owner(struct mm_struct *mm);
  7246. -#else
  7247. -static inline void mm_update_next_owner(struct mm_struct *mm)
  7248. -{
  7249. -}
  7250. -#endif /* CONFIG_MEMCG */
  7251. -
  7252. -static inline unsigned long task_rlimit(const struct task_struct *tsk,
  7253. -       unsigned int limit)
  7254. -{
  7255. -   return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
  7256. -}
  7257. -
  7258. -static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
  7259. -       unsigned int limit)
  7260. -{
  7261. -   return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
  7262. -}
  7263. -
  7264. -static inline unsigned long rlimit(unsigned int limit)
  7265. -{
  7266. -   return task_rlimit(current, limit);
  7267. -}
  7268. -
  7269. -static inline unsigned long rlimit_max(unsigned int limit)
  7270. -{
  7271. -   return task_rlimit_max(current, limit);
  7272. -}
  7273. -
  7274. -#endif
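Editor's note: the comment block in the hunk above (part of the code this hunk removes) documents cond_resched() and cond_resched_lock(). For context, here is a kernel-style sketch of the pattern that comment describes: a long scan under a spinlock that voluntarily drops the lock and reschedules when someone else needs the CPU or the lock. This is illustrative only and will not build outside a kernel tree; my_lock, my_table, my_entry, MY_TABLE_SIZE and my_scan_one() are hypothetical names, not part of this patch.

static DEFINE_SPINLOCK(my_lock);                    /* hypothetical lock */
static struct my_entry my_table[MY_TABLE_SIZE];     /* hypothetical data */

static void my_scan_table(void)
{
        int i;

        spin_lock(&my_lock);
        for (i = 0; i < MY_TABLE_SIZE; i++) {
                my_scan_one(&my_table[i]);
                /*
                 * If a reschedule is due (or, under CONFIG_PREEMPT, the
                 * lock is contended), drop my_lock, schedule, re-acquire.
                 */
                cond_resched_lock(&my_lock);
        }
        spin_unlock(&my_lock);
}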
  7275. diff -Naur linux-4.4.6-gentoo-orig/include/linux/wbt.h linux-4.4.6-gentoo-patched/include/linux/wbt.h
  7276. --- linux-4.4.6-gentoo-orig/include/linux/wbt.h 1970-01-01 03:00:00.000000000 +0300
  7277. +++ linux-4.4.6-gentoo-patched/include/linux/wbt.h  2016-05-04 11:03:27.411730745 +0300
  7278. @@ -0,0 +1,95 @@
  7279. +#ifndef WB_THROTTLE_H
  7280. +#define WB_THROTTLE_H
  7281. +
  7282. +#include <linux/atomic.h>
  7283. +#include <linux/wait.h>
  7284. +#include <linux/timer.h>
  7285. +#include <linux/ktime.h>
  7286. +
  7287. +#define ISSUE_STAT_MASK        (1ULL << 63)
  7288. +#define ISSUE_STAT_TIME_MASK   ~ISSUE_STAT_MASK
  7289. +
  7290. +struct wb_issue_stat {
  7291. +   u64 time;
  7292. +};
  7293. +
  7294. +static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
  7295. +{
  7296. +   stat->time = (stat->time & ISSUE_STAT_MASK) |
  7297. +           (ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
  7298. +}
  7299. +
  7300. +static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
  7301. +{
  7302. +   return stat->time & ISSUE_STAT_TIME_MASK;
  7303. +}
  7304. +
  7305. +static inline void wbt_mark_tracked(struct wb_issue_stat *stat)
  7306. +{
  7307. +   stat->time |= ISSUE_STAT_MASK;
  7308. +}
  7309. +
  7310. +static inline void wbt_clear_tracked(struct wb_issue_stat *stat)
  7311. +{
  7312. +   stat->time &= ~ISSUE_STAT_MASK;
  7313. +}
  7314. +
  7315. +static inline bool wbt_tracked(struct wb_issue_stat *stat)
  7316. +{
  7317. +   return (stat->time & ISSUE_STAT_MASK) != 0;
  7318. +}
  7319. +
  7320. +struct wb_stat_ops {
  7321. +   void (*get)(void *, struct blk_rq_stat *);
  7322. +   void (*clear)(void *);
  7323. +};
  7324. +
  7325. +struct rq_wb {
  7326. +   /*
  7327. +    * Settings that govern how we throttle
  7328. +    */
  7329. +   unsigned int wb_background;     /* background writeback */
  7330. +   unsigned int wb_normal;         /* normal writeback */
  7331. +   unsigned int wb_max;            /* max throughput writeback */
  7332. +   unsigned int scale_step;
  7333. +
  7334. +   u64 win_nsec;               /* default window size */
  7335. +   u64 cur_win_nsec;           /* current window size */
  7336. +
  7337. +   unsigned int unknown_cnt;
  7338. +
  7339. +   struct timer_list window_timer;
  7340. +
  7341. +   s64 sync_issue;
  7342. +   void *sync_cookie;
  7343. +
  7344. +   unsigned int wc;
  7345. +   unsigned int queue_depth;
  7346. +
  7347. +   unsigned long last_issue;       /* last non-throttled issue */
  7348. +   unsigned long last_comp;        /* last non-throttled comp */
  7349. +   unsigned long min_lat_nsec;
  7350. +   struct backing_dev_info *bdi;
  7351. +   struct request_queue *q;
  7352. +   wait_queue_head_t wait;
  7353. +   atomic_t inflight;
  7354. +
  7355. +   struct wb_stat_ops *stat_ops;
  7356. +   void *ops_data;
  7357. +};
  7358. +
  7359. +struct backing_dev_info;
  7360. +
  7361. +void __wbt_done(struct rq_wb *);
  7362. +void wbt_done(struct rq_wb *, struct wb_issue_stat *);
  7363. +bool wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
  7364. +struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
  7365. +void wbt_exit(struct rq_wb *);
  7366. +void wbt_update_limits(struct rq_wb *);
  7367. +void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
  7368. +void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
  7369. +
  7370. +void wbt_set_queue_depth(struct rq_wb *, unsigned int);
  7371. +void wbt_set_write_cache(struct rq_wb *, bool);
  7372. +
  7373. +#endif
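Editor's note: the new header above packs two things into a single u64 per request: bit 63 records whether wbt is tracking (throttling) the request, and the low 63 bits hold the issue time in nanoseconds from ktime_get(), so no extra field is needed. A standalone userspace re-implementation of that encoding, purely for illustration (uint64_t stands in for the kernel's u64, and a fixed number stands in for ktime_get()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ISSUE_STAT_MASK       (1ULL << 63)
#define ISSUE_STAT_TIME_MASK  (~ISSUE_STAT_MASK)

struct wb_issue_stat { uint64_t time; };

/* Mirror of wbt_issue_stat_set_time(): keep the tracked bit, replace the time. */
static void issue_stat_set_time(struct wb_issue_stat *stat, uint64_t now_ns)
{
        stat->time = (stat->time & ISSUE_STAT_MASK) |
                     (now_ns & ISSUE_STAT_TIME_MASK);
}

static uint64_t issue_stat_get_time(const struct wb_issue_stat *stat)
{
        return stat->time & ISSUE_STAT_TIME_MASK;
}

static bool issue_stat_tracked(const struct wb_issue_stat *stat)
{
        return (stat->time & ISSUE_STAT_MASK) != 0;
}

int main(void)
{
        struct wb_issue_stat stat = { .time = 0 };

        stat.time |= ISSUE_STAT_MASK;            /* wbt_mark_tracked() */
        issue_stat_set_time(&stat, 123456789);   /* wbt_issue_stat_set_time() */

        printf("tracked=%d time=%llu\n", issue_stat_tracked(&stat),
               (unsigned long long)issue_stat_get_time(&stat));
        return 0;
}

Running it prints "tracked=1 time=123456789": the timestamp update does not disturb the tracked flag.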
  7374. diff -Naur linux-4.4.6-gentoo-orig/include/linux/writeback.h linux-4.4.6-gentoo-patched/include/linux/writeback.h
  7375. --- linux-4.4.6-gentoo-orig/include/linux/writeback.h   2016-05-04 11:19:37.618649827 +0300
  7376. +++ linux-4.4.6-gentoo-patched/include/linux/writeback.h    2016-05-04 11:03:27.411730745 +0300
  7377. @@ -106,6 +106,16 @@
  7378.  #endif
  7379.  };
  7380.  
  7381. +static inline int wbc_to_write_cmd(struct writeback_control *wbc)
  7382. +{
  7383. +   if (wbc->sync_mode == WB_SYNC_ALL)
  7384. +       return WRITE_SYNC;
  7385. +   else if (wbc->for_kupdate || wbc->for_background)
  7386. +       return WRITE_BG;
  7387. +
  7388. +   return WRITE;
  7389. +}
  7390. +
  7391.  /*
  7392.   * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
  7393.   * and are measured against each other in.  There always is one global
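Editor's note: wbc_to_write_cmd() centralizes the mapping from writeback_control state to a write command: WB_SYNC_ALL integrity writeback becomes WRITE_SYNC, kupdate/background writeback becomes the new WRITE_BG, and everything else stays plain WRITE. A hypothetical call site (kernel-style sketch, not taken verbatim from this patch; my_writepage() and my_submit_page_io() are made-up names) would look like:

/* Illustrative only: pick the write command for a page under writeback. */
static int my_writepage(struct page *page, struct writeback_control *wbc)
{
        int write_cmd = wbc_to_write_cmd(wbc);  /* WRITE, WRITE_BG or WRITE_SYNC */

        /* Hypothetical helper that builds and submits the bio with write_cmd. */
        return my_submit_page_io(page, write_cmd);
}

Tagging background writeback as WRITE_BG is what lets the throttling code below treat it less aggressively than sync or foreground writes.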
  7394. diff -Naur linux-4.4.6-gentoo-orig/include/trace/events/wbt.h linux-4.4.6-gentoo-patched/include/trace/events/wbt.h
  7395. --- linux-4.4.6-gentoo-orig/include/trace/events/wbt.h  1970-01-01 03:00:00.000000000 +0300
  7396. +++ linux-4.4.6-gentoo-patched/include/trace/events/wbt.h   2016-05-04 11:03:27.411730745 +0300
  7397. @@ -0,0 +1,122 @@
  7398. +#undef TRACE_SYSTEM
  7399. +#define TRACE_SYSTEM wbt
  7400. +
  7401. +#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
  7402. +#define _TRACE_WBT_H
  7403. +
  7404. +#include <linux/tracepoint.h>
  7405. +#include <linux/wbt.h>
  7406. +
  7407. +/**
  7408. + * wbt_stat - trace stats for blk_wb
  7409. + * @stat: array of read/write stats
  7410. + */
  7411. +TRACE_EVENT(wbt_stat,
  7412. +
  7413. +   TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
  7414. +
  7415. +   TP_ARGS(bdi, stat),
  7416. +
  7417. +   TP_STRUCT__entry(
  7418. +       __array(char, name, 32)
  7419. +       __field(s64, rmean)
  7420. +       __field(u64, rmin)
  7421. +       __field(u64, rmax)
  7422. +       __field(s64, rnr_samples)
  7423. +       __field(s64, rtime)
  7424. +       __field(s64, wmean)
  7425. +       __field(u64, wmin)
  7426. +       __field(u64, wmax)
  7427. +       __field(s64, wnr_samples)
  7428. +       __field(s64, wtime)
  7429. +   ),
  7430. +
  7431. +   TP_fast_assign(
  7432. +       strncpy(__entry->name, dev_name(bdi->dev), 32);
  7433. +       __entry->rmean      = stat[0].mean;
  7434. +       __entry->rmin       = stat[0].min;
  7435. +       __entry->rmax       = stat[0].max;
  7436. +       __entry->rnr_samples    = stat[0].nr_samples;
  7437. +       __entry->wmean      = stat[1].mean;
  7438. +       __entry->wmin       = stat[1].min;
  7439. +       __entry->wmax       = stat[1].max;
  7440. +       __entry->wnr_samples    = stat[1].nr_samples;
  7441. +   ),
  7442. +
  7443. +   TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
  7444. +         "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
  7445. +         __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
  7446. +         __entry->rnr_samples, __entry->wmean, __entry->wmin,
  7447. +         __entry->wmax, __entry->wnr_samples)
  7448. +);
  7449. +
  7450. +/**
  7451. + * wbt_lat - trace latency event
  7452. + * @lat: latency trigger
  7453. + */
  7454. +TRACE_EVENT(wbt_lat,
  7455. +
  7456. +   TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
  7457. +
  7458. +   TP_ARGS(bdi, lat),
  7459. +
  7460. +   TP_STRUCT__entry(
  7461. +       __array(char, name, 32)
  7462. +       __field(unsigned long, lat)
  7463. +   ),
  7464. +
  7465. +   TP_fast_assign(
  7466. +       strncpy(__entry->name, dev_name(bdi->dev), 32);
  7467. +       __entry->lat = lat;
  7468. +   ),
  7469. +
  7470. +   TP_printk("%s: latency %llu\n", __entry->name,
  7471. +           (unsigned long long) __entry->lat)
  7472. +);
  7473. +
  7474. +/**
  7475. + * wbt_step - trace wb event step
  7476. + * @msg: context message
  7477. + * @step: the current scale step count
  7478. + * @window: the current monitoring window
  7479. + * @bg: the current background queue limit
  7480. + * @normal: the current normal writeback limit
  7481. + * @max: the current max throughput writeback limit
  7482. + */
  7483. +TRACE_EVENT(wbt_step,
  7484. +
  7485. +   TP_PROTO(struct backing_dev_info *bdi, const char *msg,
  7486. +        unsigned int step, unsigned long window, unsigned int bg,
  7487. +        unsigned int normal, unsigned int max),
  7488. +
  7489. +   TP_ARGS(bdi, msg, step, window, bg, normal, max),
  7490. +
  7491. +   TP_STRUCT__entry(
  7492. +       __array(char, name, 32)
  7493. +       __field(const char *, msg)
  7494. +       __field(unsigned int, step)
  7495. +       __field(unsigned long, window)
  7496. +       __field(unsigned int, bg)
  7497. +       __field(unsigned int, normal)
  7498. +       __field(unsigned int, max)
  7499. +   ),
  7500. +
  7501. +   TP_fast_assign(
  7502. +       strncpy(__entry->name, dev_name(bdi->dev), 32);
  7503. +       __entry->msg    = msg;
  7504. +       __entry->step   = step;
  7505. +       __entry->window = window;
  7506. +       __entry->bg = bg;
  7507. +       __entry->normal = normal;
  7508. +       __entry->max    = max;
  7509. +   ),
  7510. +
  7511. +   TP_printk("%s: %s: step=%u, window=%lu, background=%u, normal=%u, max=%u\n",
  7512. +         __entry->name, __entry->msg, __entry->step, __entry->window,
  7513. +         __entry->bg, __entry->normal, __entry->max)
  7514. +);
  7515. +
  7516. +#endif /* _TRACE_WBT_H */
  7517. +
  7518. +/* This part must be outside protection */
  7519. +#include <trace/define_trace.h>
  7520. diff -Naur linux-4.4.6-gentoo-orig/lib/Kconfig linux-4.4.6-gentoo-patched/lib/Kconfig
  7521. --- linux-4.4.6-gentoo-orig/lib/Kconfig 2016-05-04 11:19:37.619649827 +0300
  7522. +++ linux-4.4.6-gentoo-patched/lib/Kconfig  2016-05-04 11:03:27.411730745 +0300
  7523. @@ -531,4 +531,7 @@
  7524.  config ARCH_HAS_MMIO_FLUSH
  7525.     bool
  7526.  
  7527. +config WBT
  7528. +   bool
  7529. +
  7530.  endmenu
  7531. diff -Naur linux-4.4.6-gentoo-orig/lib/Makefile linux-4.4.6-gentoo-patched/lib/Makefile
  7532. --- linux-4.4.6-gentoo-orig/lib/Makefile    2016-05-04 11:19:37.619649827 +0300
  7533. +++ linux-4.4.6-gentoo-patched/lib/Makefile 2016-05-04 11:08:23.874706019 +0300
  7534. @@ -164,6 +164,7 @@
  7535.  
  7536.  obj-$(CONFIG_SG_SPLIT) += sg_split.o
  7537.  obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
  7538. +obj-$(CONFIG_WBT) += wbt.o
  7539.  
  7540.  libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
  7541.            fdt_empty_tree.o
  7542. diff -Naur linux-4.4.6-gentoo-orig/lib/wbt.c linux-4.4.6-gentoo-patched/lib/wbt.c
  7543. --- linux-4.4.6-gentoo-orig/lib/wbt.c   1970-01-01 03:00:00.000000000 +0300
  7544. +++ linux-4.4.6-gentoo-patched/lib/wbt.c    2016-05-04 11:03:27.412730745 +0300
  7545. @@ -0,0 +1,524 @@
  7546. +/*
  7547. + * buffered writeback throttling. loosely based on CoDel. We can't drop
  7548. + * packets for IO scheduling, so the logic is something like this:
  7549. + *
  7550. + * - Monitor latencies in a defined window of time.
  7551. + * - If the minimum latency in the above window exceeds some target, increment
  7552. + *   scaling step and scale down queue depth by a factor of 2x. The monitoring
  7553. + *   window is then shrunk to 100 / sqrt(scaling step + 1).
  7554. + * - For any window where we don't have solid data on what the latencies
  7555. + *   look like, retain status quo.
  7556. + * - If latencies look good, decrement scaling step.
  7557. + *
  7558. + * Copyright (C) 2016 Jens Axboe
  7559. + *
  7560. + * Things that (may) need changing:
  7561. + *
  7562. + * - Different scaling of background/normal/high priority writeback.
  7563. + *   We may have to violate guarantees for max.
  7564. + * - We can have mismatches between the stat window and our window.
  7565. + *
  7566. + */
  7567. +#include <linux/kernel.h>
  7568. +#include <linux/blk_types.h>
  7569. +#include <linux/slab.h>
  7570. +#include <linux/backing-dev.h>
  7571. +#include <linux/wbt.h>
  7572. +
  7573. +#define CREATE_TRACE_POINTS
  7574. +#include <trace/events/wbt.h>
  7575. +
  7576. +enum {
  7577. +   /*
  7578. +    * Might need to be higher
  7579. +    */
  7580. +   RWB_MAX_DEPTH   = 64,
  7581. +
  7582. +   /*
  7583. +    * 100msec window
  7584. +    */
  7585. +   RWB_WINDOW_NSEC     = 100 * 1000 * 1000ULL,
  7586. +
  7587. +   /*
  7588. +    * Disregard stats, if we don't meet these minimums
  7589. +    */
  7590. +   RWB_MIN_WRITE_SAMPLES   = 3,
  7591. +   RWB_MIN_READ_SAMPLES    = 1,
  7592. +
  7593. +   RWB_UNKNOWN_BUMP    = 5,
  7594. +};
  7595. +
  7596. +static inline bool rwb_enabled(struct rq_wb *rwb)
  7597. +{
  7598. +   return rwb && rwb->wb_normal != 0;
  7599. +}
  7600. +
  7601. +/*
  7602. + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
  7603. + * false if 'v' + 1 would be bigger than 'below'.
  7604. + */
  7605. +static bool atomic_inc_below(atomic_t *v, int below)
  7606. +{
  7607. +   int cur = atomic_read(v);
  7608. +
  7609. +   for (;;) {
  7610. +       int old;
  7611. +
  7612. +       if (cur >= below)
  7613. +           return false;
  7614. +       old = atomic_cmpxchg(v, cur, cur + 1);
  7615. +       if (old == cur)
  7616. +           break;
  7617. +       cur = old;
  7618. +   }
  7619. +
  7620. +   return true;
  7621. +}
  7622. +
  7623. +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
  7624. +{
  7625. +   if (rwb_enabled(rwb)) {
  7626. +       const unsigned long cur = jiffies;
  7627. +
  7628. +       if (cur != *var)
  7629. +           *var = cur;
  7630. +   }
  7631. +}
  7632. +
  7633. +void __wbt_done(struct rq_wb *rwb)
  7634. +{
  7635. +   int inflight, limit = rwb->wb_normal;
  7636. +
  7637. +   /*
  7638. +    * If the device does write back caching, drop further down
  7639. +    * before we wake people up.
  7640. +    */
  7641. +   if (rwb->wc && !atomic_read(&rwb->bdi->wb.dirty_sleeping))
  7642. +       limit = 0;
  7643. +   else
  7644. +       limit = rwb->wb_normal;
  7645. +
  7646. +   /*
  7647. +    * Don't wake anyone up if we are above the normal limit. If
  7648. +    * throttling got disabled (limit == 0) with waiters, ensure
  7649. +    * that we wake them up.
  7650. +    */
  7651. +   inflight = atomic_dec_return(&rwb->inflight);
  7652. +   if (limit && inflight >= limit) {
  7653. +       if (!rwb->wb_max)
  7654. +           wake_up_all(&rwb->wait);
  7655. +       return;
  7656. +   }
  7657. +
  7658. +   if (waitqueue_active(&rwb->wait)) {
  7659. +       int diff = limit - inflight;
  7660. +
  7661. +       if (!inflight || diff >= rwb->wb_background / 2)
  7662. +           wake_up_nr(&rwb->wait, 1);
  7663. +   }
  7664. +}
  7665. +
  7666. +/*
  7667. + * Called on completion of a request. Note that it's also called when
  7668. + * a request is merged, when the request gets freed.
  7669. + */
  7670. +void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
  7671. +{
  7672. +   if (!rwb)
  7673. +       return;
  7674. +
  7675. +   if (!wbt_tracked(stat)) {
  7676. +       if (rwb->sync_cookie == stat) {
  7677. +           rwb->sync_issue = 0;
  7678. +           rwb->sync_cookie = NULL;
  7679. +       }
  7680. +
  7681. +       wb_timestamp(rwb, &rwb->last_comp);
  7682. +   } else {
  7683. +       WARN_ON_ONCE(stat == rwb->sync_cookie);
  7684. +       __wbt_done(rwb);
  7685. +       wbt_clear_tracked(stat);
  7686. +   }
  7687. +}
  7688. +
  7689. +static void calc_wb_limits(struct rq_wb *rwb)
  7690. +{
  7691. +   unsigned int depth;
  7692. +
  7693. +   if (!rwb->min_lat_nsec) {
  7694. +       rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
  7695. +       return;
  7696. +   }
  7697. +
  7698. +   depth = min_t(unsigned int, RWB_MAX_DEPTH, rwb->queue_depth);
  7699. +
  7700. +   /*
  7701. +    * Reduce max depth by 50%, and re-calculate normal/bg based on that
  7702. +    */
  7703. +   rwb->wb_max = 1 + ((depth - 1) >> min(31U, rwb->scale_step));
  7704. +   rwb->wb_normal = (rwb->wb_max + 1) / 2;
  7705. +   rwb->wb_background = (rwb->wb_max + 3) / 4;
  7706. +}
  7707. +
  7708. +static bool inline stat_sample_valid(struct blk_rq_stat *stat)
  7709. +{
  7710. +   /*
  7711. +    * We need at least one read sample, and a minimum of
  7712. +    * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
  7713. +    * that it's writes impacting us, and not just some sole read on
  7714. +    * a device that is in a lower power state.
  7715. +    */
  7716. +   return stat[0].nr_samples >= 1 &&
  7717. +       stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
  7718. +}
  7719. +
  7720. +static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
  7721. +{
  7722. +   u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
  7723. +
  7724. +   if (!issue || !rwb->sync_cookie)
  7725. +       return 0;
  7726. +
  7727. +   now = ktime_to_ns(ktime_get());
  7728. +   return now - issue;
  7729. +}
  7730. +
  7731. +enum {
  7732. +   LAT_OK,
  7733. +   LAT_UNKNOWN,
  7734. +   LAT_EXCEEDED,
  7735. +};
  7736. +
  7737. +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
  7738. +{
  7739. +   u64 thislat;
  7740. +
  7741. +   /*
  7742. +    * If our stored sync issue exceeds the window size, or it
  7743. +    * exceeds our min target AND we haven't logged any entries,
  7744. +    * flag the latency as exceeded.
  7745. +    */
  7746. +   thislat = rwb_sync_issue_lat(rwb);
  7747. +   if (thislat > rwb->cur_win_nsec ||
  7748. +       (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
  7749. +       trace_wbt_lat(rwb->bdi, thislat);
  7750. +       return LAT_EXCEEDED;
  7751. +   }
  7752. +
  7753. +   if (!stat_sample_valid(stat))
  7754. +       return LAT_UNKNOWN;
  7755. +
  7756. +   /*
  7757. +    * If the 'min' latency exceeds our target, step down.
  7758. +    */
  7759. +   if (stat[0].min > rwb->min_lat_nsec) {
  7760. +       trace_wbt_lat(rwb->bdi, stat[0].min);
  7761. +       trace_wbt_stat(rwb->bdi, stat);
  7762. +       return LAT_EXCEEDED;
  7763. +   }
  7764. +
  7765. +   if (rwb->scale_step)
  7766. +       trace_wbt_stat(rwb->bdi, stat);
  7767. +
  7768. +   return LAT_OK;
  7769. +}
  7770. +
  7771. +static int latency_exceeded(struct rq_wb *rwb)
  7772. +{
  7773. +   struct blk_rq_stat stat[2];
  7774. +
  7775. +   rwb->stat_ops->get(rwb->ops_data, stat);
  7776. +   return __latency_exceeded(rwb, stat);
  7777. +}
  7778. +
  7779. +static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
  7780. +{
  7781. +   trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
  7782. +           rwb->wb_background, rwb->wb_normal, rwb->wb_max);
  7783. +}
  7784. +
  7785. +static void scale_up(struct rq_wb *rwb)
  7786. +{
  7787. +   /*
  7788. +    * If we're at 0, we can't go lower.
  7789. +    */
  7790. +   if (!rwb->scale_step)
  7791. +       return;
  7792. +
  7793. +   rwb->scale_step--;
  7794. +   rwb->unknown_cnt = 0;
  7795. +   rwb->stat_ops->clear(rwb->ops_data);
  7796. +   calc_wb_limits(rwb);
  7797. +
  7798. +   if (waitqueue_active(&rwb->wait))
  7799. +       wake_up_all(&rwb->wait);
  7800. +
  7801. +   rwb_trace_step(rwb, "step up");
  7802. +}
  7803. +
  7804. +static void scale_down(struct rq_wb *rwb)
  7805. +{
  7806. +   /*
  7807. +    * Stop scaling down when we've hit the limit. This also prevents
  7808. +    * ->scale_step from going to crazy values, if the device can't
  7809. +    * keep up.
  7810. +    */
  7811. +   if (rwb->wb_max == 1)
  7812. +       return;
  7813. +
  7814. +   rwb->scale_step++;
  7815. +   rwb->unknown_cnt = 0;
  7816. +   rwb->stat_ops->clear(rwb->ops_data);
  7817. +   calc_wb_limits(rwb);
  7818. +   rwb_trace_step(rwb, "step down");
  7819. +}
  7820. +
  7821. +static void rwb_arm_timer(struct rq_wb *rwb)
  7822. +{
  7823. +   unsigned long expires;
  7824. +
  7825. +   /*
  7826. +    * We should speed this up, using some variant of a fast integer
  7827. +    * inverse square root calculation. Since we only do this for
  7828. +    * every window expiration, it's not a huge deal, though.
  7829. +    */
  7830. +   rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
  7831. +                   int_sqrt((rwb->scale_step + 1) << 8));
  7832. +   expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
  7833. +   mod_timer(&rwb->window_timer, expires);
  7834. +}
  7835. +
  7836. +static void wb_timer_fn(unsigned long data)
  7837. +{
  7838. +   struct rq_wb *rwb = (struct rq_wb *) data;
  7839. +   int status;
  7840. +
  7841. +   /*
  7842. +    * If we exceeded the latency target, step down. If we did not,
  7843. +    * step one level up. If we don't know enough to say either exceeded
  7844. +    * or ok, then don't do anything.
  7845. +    */
  7846. +   status = latency_exceeded(rwb);
  7847. +   switch (status) {
  7848. +   case LAT_EXCEEDED:
  7849. +       scale_down(rwb);
  7850. +       break;
  7851. +   case LAT_OK:
  7852. +       scale_up(rwb);
  7853. +       break;
  7854. +   case LAT_UNKNOWN:
  7855. +       /*
  7856. +        * We had no read samples, start bumping up the write
  7857. +        * depth slowly
  7858. +        */
  7859. +       if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP)
  7860. +           scale_up(rwb);
  7861. +       break;
  7862. +   default:
  7863. +       break;
  7864. +   }
  7865. +
  7866. +   /*
  7867. +    * Re-arm timer, if we have IO in flight
  7868. +    */
  7869. +   if (rwb->scale_step || atomic_read(&rwb->inflight))
  7870. +       rwb_arm_timer(rwb);
  7871. +}
  7872. +
  7873. +void wbt_update_limits(struct rq_wb *rwb)
  7874. +{
  7875. +   rwb->scale_step = 0;
  7876. +   calc_wb_limits(rwb);
  7877. +
  7878. +   if (waitqueue_active(&rwb->wait))
  7879. +       wake_up_all(&rwb->wait);
  7880. +}
  7881. +
  7882. +static bool close_io(struct rq_wb *rwb)
  7883. +{
  7884. +   const unsigned long now = jiffies;
  7885. +
  7886. +   return time_before(now, rwb->last_issue + HZ / 10) ||
  7887. +       time_before(now, rwb->last_comp + HZ / 10);
  7888. +}
  7889. +
  7890. +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
  7891. +
  7892. +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
  7893. +{
  7894. +   unsigned int limit;
  7895. +
  7896. +   /*
  7897. +    * At this point we know it's a buffered write. If REQ_SYNC is
  7898. +    * set, then it's WB_SYNC_ALL writeback, and we'll use the max
  7899. +    * limit for that. If the write is marked as a background write,
  7900. +    * then use the idle limit, or go to normal if we haven't had
  7901. +    * competing IO for a bit.
  7902. +    */
  7903. +   if ((rw & REQ_HIPRIO) || atomic_read(&rwb->bdi->wb.dirty_sleeping))
  7904. +       limit = rwb->wb_max;
  7905. +   else if ((rw & REQ_BG) || close_io(rwb)) {
  7906. +       /*
  7907. +        * If less than 100ms since we completed unrelated IO,
  7908. +        * limit us to half the depth for background writeback.
  7909. +        */
  7910. +       limit = rwb->wb_background;
  7911. +   } else
  7912. +       limit = rwb->wb_normal;
  7913. +
  7914. +   return limit;
  7915. +}
  7916. +
  7917. +static inline bool may_queue(struct rq_wb *rwb, unsigned long rw)
  7918. +{
  7919. +   /*
  7920. +    * inc it here even if disabled, since we'll dec it at completion.
  7921. +    * this only happens if the task was sleeping in __wbt_wait(),
  7922. +    * and someone turned it off at the same time.
  7923. +    */
  7924. +   if (!rwb_enabled(rwb)) {
  7925. +       atomic_inc(&rwb->inflight);
  7926. +       return true;
  7927. +   }
  7928. +
  7929. +   return atomic_inc_below(&rwb->inflight, get_limit(rwb, rw));
  7930. +}
  7931. +
  7932. +/*
  7933. + * Block if we will exceed our limit, or if we are currently waiting for
  7934. + * the timer to kick off queuing again.
  7935. + */
  7936. +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
  7937. +{
  7938. +   DEFINE_WAIT(wait);
  7939. +
  7940. +   if (may_queue(rwb, rw))
  7941. +       return;
  7942. +
  7943. +   do {
  7944. +       prepare_to_wait_exclusive(&rwb->wait, &wait,
  7945. +                       TASK_UNINTERRUPTIBLE);
  7946. +
  7947. +       if (may_queue(rwb, rw))
  7948. +           break;
  7949. +
  7950. +       if (lock)
  7951. +           spin_unlock_irq(lock);
  7952. +
  7953. +       io_schedule();
  7954. +
  7955. +       if (lock)
  7956. +           spin_lock_irq(lock);
  7957. +   } while (1);
  7958. +
  7959. +   finish_wait(&rwb->wait, &wait);
  7960. +}
  7961. +
  7962. +static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
  7963. +{
  7964. +   /*
  7965. +    * If not a WRITE (or a discard), do nothing
  7966. +    */
  7967. +   if (!(rw & REQ_WRITE) || (rw & REQ_DISCARD))
  7968. +       return false;
  7969. +
  7970. +   /*
  7971. +    * Don't throttle WRITE_ODIRECT
  7972. +    */
  7973. +   if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
  7974. +       return false;
  7975. +
  7976. +   return true;
  7977. +}
  7978. +
  7979. +/*
  7980. + * Returns true if the IO request should be accounted, false if not.
  7981. + * May sleep, if we have exceeded the writeback limits. Caller can pass
  7982. + * in an irq held spinlock, if it holds one when calling this function.
  7983. + * If we do sleep, we'll release and re-grab it.
  7984. + */
  7985. +bool wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
  7986. +{
  7987. +   if (!rwb_enabled(rwb))
  7988. +       return false;
  7989. +
  7990. +   if (!wbt_should_throttle(rwb, rw)) {
  7991. +       wb_timestamp(rwb, &rwb->last_issue);
  7992. +       return false;
  7993. +   }
  7994. +
  7995. +   __wbt_wait(rwb, rw, lock);
  7996. +
  7997. +   if (!timer_pending(&rwb->window_timer))
  7998. +       rwb_arm_timer(rwb);
  7999. +
  8000. +   return true;
  8001. +}
  8002. +
  8003. +void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
  8004. +{
  8005. +   if (!rwb_enabled(rwb))
  8006. +       return;
  8007. +
  8008. +   wbt_issue_stat_set_time(stat);
  8009. +
  8010. +   if (!wbt_tracked(stat) && !rwb->sync_issue) {
  8011. +       rwb->sync_cookie = stat;
  8012. +       rwb->sync_issue = wbt_issue_stat_get_time(stat);
  8013. +   }
  8014. +}
  8015. +
  8016. +void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
  8017. +{
  8018. +   if (!rwb_enabled(rwb))
  8019. +       return;
  8020. +   if (stat == rwb->sync_cookie) {
  8021. +       rwb->sync_issue = 0;
  8022. +       rwb->sync_cookie = NULL;
  8023. +   }
  8024. +}
  8025. +
  8026. +void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
  8027. +{
  8028. +   if (rwb) {
  8029. +       rwb->queue_depth = depth;
  8030. +       wbt_update_limits(rwb);
  8031. +   }
  8032. +}
  8033. +
  8034. +void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
  8035. +{
  8036. +   if (rwb)
  8037. +       rwb->wc = write_cache_on;
  8038. +}
  8039. +
  8040. +struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
  8041. +              void *ops_data)
  8042. +{
  8043. +   struct rq_wb *rwb;
  8044. +
  8045. +   rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
  8046. +   if (!rwb)
  8047. +       return ERR_PTR(-ENOMEM);
  8048. +
  8049. +   atomic_set(&rwb->inflight, 0);
  8050. +   init_waitqueue_head(&rwb->wait);
  8051. +   setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
  8052. +   rwb->wc = 1;
  8053. +   rwb->queue_depth = RWB_MAX_DEPTH;
  8054. +   rwb->last_comp = rwb->last_issue = jiffies;
  8055. +   rwb->bdi = bdi;
  8056. +   rwb->win_nsec = RWB_WINDOW_NSEC;
  8057. +   rwb->stat_ops = ops,
  8058. +   rwb->ops_data = ops_data;
  8059. +   wbt_update_limits(rwb);
  8060. +   return rwb;
  8061. +}
  8062. +
  8063. +void wbt_exit(struct rq_wb *rwb)
  8064. +{
  8065. +   if (rwb) {
  8066. +       del_timer_sync(&rwb->window_timer);
  8067. +       kfree(rwb);
  8068. +   }
  8069. +}
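Editor's note: the header comment of lib/wbt.c above describes the CoDel-like control loop; the two pieces of arithmetic that implement it are calc_wb_limits(), which halves the allowed writeback depth for every scale step, and rwb_arm_timer(), which shrinks the monitoring window to roughly win_nsec / sqrt(scale_step + 1) via a fixed-point integer square root. A standalone userspace program, for illustration only (isqrt() stands in for the kernel's int_sqrt(); the constants are copied from the enum above), that prints the limits and window for the first few steps:

#include <stdint.h>
#include <stdio.h>

#define RWB_MAX_DEPTH    64u
#define RWB_WINDOW_NSEC  (100 * 1000 * 1000ULL)   /* 100 ms */

/* Plain binary integer square root, standing in for the kernel's int_sqrt(). */
static uint64_t isqrt(uint64_t x)
{
        uint64_t r = 0, bit = 1ULL << 62;

        while (bit > x)
                bit >>= 2;
        while (bit) {
                if (x >= r + bit) {
                        x -= r + bit;
                        r = (r >> 1) + bit;
                } else {
                        r >>= 1;
                }
                bit >>= 2;
        }
        return r;
}

int main(void)
{
        unsigned int step;

        for (step = 0; step <= 5; step++) {
                /* calc_wb_limits(): halve the depth once per scale step. */
                unsigned int depth = RWB_MAX_DEPTH;
                unsigned int shift = step < 31 ? step : 31;
                unsigned int wb_max = 1 + ((depth - 1) >> shift);
                unsigned int wb_normal = (wb_max + 1) / 2;
                unsigned int wb_background = (wb_max + 3) / 4;
                /* rwb_arm_timer(): window ~= win_nsec / sqrt(step + 1). */
                uint64_t win = (RWB_WINDOW_NSEC << 4) /
                               isqrt((uint64_t)(step + 1) << 8);

                printf("step=%u max=%-3u normal=%-3u background=%-3u window=%llu ms\n",
                       step, wb_max, wb_normal, wb_background,
                       (unsigned long long)(win / 1000000));
        }
        return 0;
}

For step 0 this prints max=64, normal=32, background=16 and a 100 ms window, matching the "100 / sqrt(scaling step + 1)" shrink described in the file's header comment.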
  8070. diff -Naur linux-4.4.6-gentoo-orig/mm/backing-dev.c linux-4.4.6-gentoo-patched/mm/backing-dev.c
  8071. --- linux-4.4.6-gentoo-orig/mm/backing-dev.c    2016-05-04 11:19:37.620649827 +0300
  8072. +++ linux-4.4.6-gentoo-patched/mm/backing-dev.c 2016-05-04 11:03:27.412730745 +0300
  8073. @@ -310,6 +310,7 @@
  8074.     spin_lock_init(&wb->work_lock);
  8075.     INIT_LIST_HEAD(&wb->work_list);
  8076.     INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
  8077. +   atomic_set(&wb->dirty_sleeping, 0);
  8078.  
  8079.     wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
  8080.     if (!wb->congested)
  8081. diff -Naur linux-4.4.6-gentoo-orig/mm/page-writeback.c linux-4.4.6-gentoo-patched/mm/page-writeback.c
  8082. --- linux-4.4.6-gentoo-orig/mm/page-writeback.c 2016-05-04 11:19:37.621649827 +0300
  8083. +++ linux-4.4.6-gentoo-patched/mm/page-writeback.c  2016-05-04 11:03:27.412730745 +0300
  8084. @@ -1735,7 +1735,9 @@
  8085.                       pause,
  8086.                       start_time);
  8087.         __set_current_state(TASK_KILLABLE);
  8088. +       atomic_inc(&wb->dirty_sleeping);
  8089.         io_schedule_timeout(pause);
  8090. +       atomic_dec(&wb->dirty_sleeping);
  8091.  
  8092.         current->dirty_paused_when = now + pause;
  8093.         current->nr_dirtied = 0;
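Editor's note: the last two hunks are two halves of one feedback signal. mm/backing-dev.c initializes the new dirty_sleeping counter, and mm/page-writeback.c raises it for the duration of the io_schedule_timeout() sleep in balance_dirty_pages(), so get_limit() and __wbt_done() in lib/wbt.c above can tell that dirtiers are actually blocked and switch to the most generous inflight limit. A minimal userspace sketch of that decision, with made-up names and limit values (not the kernel's types or REQ_* flags):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int dirty_sleeping;          /* analogue of wb->dirty_sleeping */

struct limits { unsigned int background, normal, max; };

/* Roughly what get_limit() in lib/wbt.c decides, minus the REQ_* flag checks. */
static unsigned int pick_limit(const struct limits *l, bool background_write)
{
        if (atomic_load(&dirty_sleeping) > 0)
                return l->max;             /* dirtiers are blocked: allow full depth */
        if (background_write)
                return l->background;      /* background/kupdate writeback: be gentle */
        return l->normal;
}

int main(void)
{
        struct limits l = { .background = 16, .normal = 32, .max = 64 };

        printf("background writeback, nobody blocked: limit=%u\n",
               pick_limit(&l, true));

        atomic_fetch_add(&dirty_sleeping, 1);   /* balance_dirty_pages() goes to sleep */
        printf("background writeback, dirtier blocked: limit=%u\n",
               pick_limit(&l, true));
        atomic_fetch_sub(&dirty_sleeping, 1);   /* ...and wakes up again */

        return 0;
}

The first line prints limit=16, the second limit=64: while a task is stuck throttled in balance_dirty_pages(), writeback is allowed to run at full depth so the dirty pages drain as fast as possible.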