BLD_fixed.3.3.7.patch
Pasted May 23rd, 2012. A patch against Linux 3.3.7 adding BLD, the Barbershop Load Distribution algorithm, an alternate CPU load distribution technique by Rakib Mullick.
  1. diff -Naur linux-3.3.7.orig/init/Kconfig linux-3.3.7/init/Kconfig
  2. --- linux-3.3.7.orig/init/Kconfig   2012-05-21 21:42:51.000000000 +0300
  3. +++ linux-3.3.7/init/Kconfig    2012-05-23 21:52:22.000000000 +0300
  4. @@ -68,6 +68,14 @@
  5.     depends on BROKEN || !SMP
  6.     default y
  7.  
  8. +config BLD
  9. +   bool "An alternate CPU load distributor"
  10. +   depends on EXPERIMENTAL && SMP
  11. +   default y
  12. +   help
  13. +     This is an alternate CPU load distribution technique based
  14. +     on The Barbershop Load Distribution algorithm.
  15. +
  16.  config INIT_ENV_ARG_LIMIT
  17.     int
  18.     default 32 if !UML
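
The Kconfig hunk above is what makes the rest of the patch selectable. As a reading aid (not part of the patch), a kernel built with BLD would carry roughly the following .config fragment, assuming the usual Kconfig-to-.config mapping and the stated dependency on EXPERIMENTAL && SMP:

CONFIG_EXPERIMENTAL=y
CONFIG_SMP=y
CONFIG_BLD=y
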
  19. diff -Naur linux-3.3.7.orig/kernel/sched/bld.h linux-3.3.7/kernel/sched/bld.h
  20. --- linux-3.3.7.orig/kernel/sched/bld.h 1970-01-01 03:00:00.000000000 +0300
  21. +++ linux-3.3.7/kernel/sched/bld.h  2012-05-23 21:52:22.000000000 +0300
  22. @@ -0,0 +1,112 @@
  23. +#ifdef CONFIG_BLD
  24. +
  25. +static DEFINE_RWLOCK(disp_list_lock);
  26. +static LIST_HEAD(rq_head);
  27. +
  28. +static inline int list_is_first(const struct list_head *list,
  29. +               const struct list_head *head)
  30. +{
  31. +   return list == head->next;
  32. +}
  33. +
  34. +static inline int select_cpu_for_wakeup(struct task_struct *p, int sd_flags, int wake_flags)
  35. +{
  36. +   int cpu = smp_processor_id(), prev_cpu = task_cpu(p), i;
  37. +   /*bool sync = wake_flags & WF_SYNC; */
  38. +   unsigned long load, min_load = ULONG_MAX;
  39. +   struct cpumask *mask;
  40. +
  41. +   if (wake_flags & WF_SYNC) {
  42. +       if (cpu == prev_cpu)
  43. +           return cpu;
  44. +       mask = sched_group_cpus(cpu_rq(prev_cpu)->sd->groups);
  45. +   } else
  46. +       mask = sched_domain_span(cpu_rq(prev_cpu)->sd);
  47. +  
  48. +   for_each_cpu(i, mask) {
  49. +       load = cpu_rq(i)->load.weight;
  50. +       if (load < min_load) {
  51. +           min_load = load;
  52. +           cpu = i;
  53. +       }
  54. +   }
  55. +   return cpu;
  56. +}
  57. +
  58. +static int bld_select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
  59. +{
  60. +   struct rq *tmp;
  61. +   unsigned long flag;
  62. +   unsigned int cpu = smp_processor_id();
  63. +
  64. +   if (&p->cpus_allowed) {
  65. +       struct cpumask *taskmask;
  66. +       unsigned long min_load = ULONG_MAX, load, i;
  67. +       taskmask = tsk_cpus_allowed(p);
  68. +       for_each_cpu(i, taskmask) {
  69. +           load = cpu_rq(i)->load.weight;
  70. +           if (load < min_load) {
  71. +               min_load = load;
  72. +               cpu = i;
  73. +           }
  74. +       }
  75. +   } else  if (sd_flags & SD_BALANCE_WAKE) {
  76. +       cpu = select_cpu_for_wakeup(p, sd_flags, wake_flags);
  77. +       return cpu;
  78. +   } else {
  79. +       read_lock_irqsave(&disp_list_lock, flag);
  80. +       list_for_each_entry(tmp, &rq_head, disp_load_balance) {
  81. +           cpu = cpu_of(tmp);
  82. +           if (cpu_online(cpu))
  83. +               break;
  84. +       }
  85. +       read_unlock_irqrestore(&disp_list_lock, flag);
  86. +   }
  87. +   return cpu;
  88. +}
  89. +
  90. +static void bld_track_load_activate(struct rq *rq)
  91. +{
  92. +   unsigned long  flag;
  93. +   rq->this_cpu_load = rq->load.weight;
  94. +
  95. +   if (rq->pos != 2) { /* if rq isn't the last one */
  96. +       struct rq *last;
  97. +       write_lock_irqsave(&disp_list_lock, flag);
  98. +       last = list_entry(rq_head.prev, struct rq, disp_load_balance);
  99. +       if (rq->this_cpu_load > last->this_cpu_load) {
  100. +           list_del(&rq->disp_load_balance);
  101. +           list_add_tail(&rq->disp_load_balance, &rq_head);
  102. +           rq->pos = 2; last->pos = 1;
  103. +       }
  104. +       write_unlock_irqrestore(&disp_list_lock, flag);
  105. +   }
  106. +}
  107. +
  108. +static void bld_track_load_deactivate(struct rq *rq)
  109. +{
  110. +   unsigned long flag;
  111. +  
  112. +   rq->this_cpu_load = rq->load.weight;
  113. +
  114. +   if (rq->pos != 0) { /* If rq isn't first one */
  115. +       struct rq *first;
  116. +       first = list_entry(rq_head.prev, struct rq, disp_load_balance);
  117. +       write_lock_irqsave(&disp_list_lock, flag);
  118. +       if (rq->this_cpu_load <= first->this_cpu_load) {
  119. +           list_del(&rq->disp_load_balance);
  120. +           list_add_tail(&rq->disp_load_balance, &rq_head);
  121. +           rq->pos = 0; first->pos = 1;
  122. +       }
  123. +       write_unlock_irqrestore(&disp_list_lock, flag);
  124. +   }
  125. +}
  126. +#else
  127. +static inline void bld_track_load_activate(struct rq *rq)
  128. +{
  129. +}
  130. +
  131. +static inline void bld_track_load_deactivate(struct rq *rq)
  132. +{
  133. +}
  134. +#endif /* CONFIG_BLD */
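
The bld.h file above is the heart of the patch: every runqueue sits on a global list (rq_head, guarded by disp_list_lock), the activate/deactivate hooks record the runqueue's load.weight into this_cpu_load and re-position the runqueue on that list, and bld_select_task_rq() places a waking or newly created task on the least loaded CPU it is allowed to run on. Below is a minimal userspace sketch of that selection step only; it is not kernel code, and toy_rq and pick_least_loaded() are made-up names standing in for struct rq and the for_each_cpu() scans in select_cpu_for_wakeup() and bld_select_task_rq():

#include <stdio.h>

/* Toy stand-in for struct rq: just a CPU id and its load.weight. */
struct toy_rq {
	int cpu;
	unsigned long load;
};

/* Linear scan for the least loaded CPU, mirroring the for_each_cpu()
 * loops in the patch. */
static int pick_least_loaded(const struct toy_rq *rqs, int nr)
{
	unsigned long min_load = (unsigned long)-1;	/* ULONG_MAX */
	int i, best = 0;

	for (i = 0; i < nr; i++) {
		if (rqs[i].load < min_load) {
			min_load = rqs[i].load;
			best = rqs[i].cpu;
		}
	}
	return best;
}

int main(void)
{
	struct toy_rq rqs[] = { {0, 2048}, {1, 1024}, {2, 3072}, {3, 512} };

	printf("wake the task on cpu %d\n", pick_least_loaded(rqs, 4));
	return 0;
}
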
  135. diff -Naur linux-3.3.7.orig/kernel/sched/core.c linux-3.3.7/kernel/sched/core.c
  136. --- linux-3.3.7.orig/kernel/sched/core.c    2012-05-21 21:42:51.000000000 +0300
  137. +++ linux-3.3.7/kernel/sched/core.c 2012-05-23 21:52:22.000000000 +0300
  138. @@ -24,6 +24,8 @@
  139.   *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  140.   *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  141.   *              Thomas Gleixner, Mike Kravetz
  142. + *  2012-Feb   The Barbershop Load Distribution (BLD) algorithm, an alternate
  143. + *             load distribution algorithm by Rakib Mullick.
  144.   */
  145.  
  146.  #include <linux/mm.h>
  147. @@ -81,6 +83,7 @@
  148.  
  149.  #include "sched.h"
  150.  #include "../workqueue_sched.h"
  151. +#include "bld.h"
  152.  
  153.  #define CREATE_TRACE_POINTS
  154.  #include <trace/events/sched.h>
  155. @@ -578,6 +581,7 @@
  156.   */
  157.  void wake_up_idle_cpu(int cpu)
  158.  {
  159. +#ifndef CONFIG_BLD
  160.     struct rq *rq = cpu_rq(cpu);
  161.  
  162.     if (cpu == smp_processor_id())
  163. @@ -604,6 +608,7 @@
  164.     smp_mb();
  165.     if (!tsk_is_polling(rq->idle))
  166.         smp_send_reschedule(cpu);
  167. +#endif
  168.  }
  169.  
  170.  static inline bool got_nohz_idle_kick(void)
  171. @@ -730,6 +735,7 @@
  172.         rq->nr_uninterruptible--;
  173.  
  174.     enqueue_task(rq, p, flags);
  175. +   bld_track_load_activate(rq);
  176.  }
  177.  
  178.  void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  179. @@ -738,6 +744,7 @@
  180.         rq->nr_uninterruptible++;
  181.  
  182.     dequeue_task(rq, p, flags);
  183. +   bld_track_load_deactivate(rq);
  184.  }
  185.  
  186.  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
  187. @@ -1297,7 +1304,12 @@
  188.  static inline
  189.  int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
  190.  {
  191. -   int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
  192. +   int cpu;
  193. +#ifdef CONFIG_BLD
  194. +   cpu = bld_select_task_rq(p, sd_flags, wake_flags);
  195. +#else
  196. +   cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
  197. +#endif
  198.  
  199.     /*
  200.      * In order not to call set_task_cpu() on a blocking task we need
  201. @@ -1453,7 +1465,11 @@
  202.  
  203.  void scheduler_ipi(void)
  204.  {
  205. +#ifndef CONFIG_BLD
  206.     if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
  207. +#else
  208. +   if (llist_empty(&this_rq()->wake_list))
  209. +#endif
  210.         return;
  211.  
  212.     /*
  213. @@ -1475,10 +1491,12 @@
  214.     /*
  215.      * Check if someone kicked us for doing the nohz idle load balance.
  216.      */
  217. +#ifndef CONFIG_BLD
  218.     if (unlikely(got_nohz_idle_kick() && !need_resched())) {
  219.         this_rq()->idle_balance = 1;
  220.         raise_softirq_irqoff(SCHED_SOFTIRQ);
  221.     }
  222. +#endif
  223.     irq_exit();
  224.  }
  225.  
  226. @@ -1518,12 +1536,14 @@
  227.     struct rq *rq = cpu_rq(cpu);
  228.  
  229.  #if defined(CONFIG_SMP)
  230. +#ifndef CONFIG_BLD
  231.     if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
  232.         sched_clock_cpu(cpu); /* sync clocks x-cpu */
  233.         ttwu_queue_remote(p, cpu);
  234.         return;
  235.     }
  236.  #endif
  237. +#endif
  238.  
  239.     raw_spin_lock(&rq->lock);
  240.     ttwu_do_activate(rq, p, 0);
  241. @@ -2268,6 +2288,7 @@
  242.   */
  243.  static void calc_global_nohz(void)
  244.  {
  245. +#ifndef CONFIG_BLD
  246.     long delta, active, n;
  247.  
  248.     /*
  249. @@ -2300,6 +2321,7 @@
  250.     avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
  251.  
  252.     calc_load_update += n * LOAD_FREQ;
  253. +#endif
  254.  }
  255.  #else
  256.  void calc_load_account_idle(struct rq *this_rq)
  257. @@ -3001,8 +3023,10 @@
  258.  
  259.  #ifdef CONFIG_SMP
  260.     rq->idle_balance = idle_cpu(cpu);
  261. +#ifndef CONFIG_BLD
  262.     trigger_load_balance(rq, cpu);
  263.  #endif
  264. +#endif
  265.  }
  266.  
  267.  notrace unsigned long get_parent_ip(unsigned long addr)
  268. @@ -3191,10 +3215,10 @@
  269.     }
  270.  
  271.     pre_schedule(rq, prev);
  272. -
  273. +#ifndef CONFIG_BLD
  274.     if (unlikely(!rq->nr_running))
  275.         idle_balance(cpu, rq);
  276. -
  277. +#endif
  278.     put_prev_task(rq, prev);
  279.     next = pick_next_task(rq);
  280.     clear_tsk_need_resched(prev);
  281. @@ -6946,6 +6970,11 @@
  282.  #endif
  283.         init_rq_hrtick(rq);
  284.         atomic_set(&rq->nr_iowait, 0);
  285. +#ifdef CONFIG_BLD
  286. +       INIT_LIST_HEAD(&rq->disp_load_balance);
  287. +       list_add_tail(&rq->disp_load_balance, &rq_head);
  288. +       rq->pos = 0;
  289. +#endif
  290.     }
  291.  
  292.     set_load_weight(&init_task);
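
The core.c hunks wire BLD in and switch the stock balancing machinery off: activate_task()/deactivate_task() call the bld_track_load_*() hooks, select_task_rq() is routed to bld_select_task_rq(), each runqueue gets its disp_load_balance node and pos field initialised in sched_init(), and trigger_load_balance(), idle_balance(), calc_global_nohz() and the nohz idle kick are compiled out under CONFIG_BLD. A condensed sketch of that compile-time switch, in plain C rather than kernel code (pick_cpu_bld() and pick_cpu_sched_class() are illustrative stand-ins):

#include <stdio.h>

#define CONFIG_BLD 1	/* as selected by the Kconfig hunk above */

int pick_cpu_bld(void)         { return 0; }	/* stand-in for bld_select_task_rq()   */
int pick_cpu_sched_class(void) { return 1; }	/* stand-in for ->select_task_rq()     */

static int select_task_rq(void)
{
#ifdef CONFIG_BLD
	return pick_cpu_bld();		/* one global least-loaded policy */
#else
	return pick_cpu_sched_class();	/* stock per-class policy */
#endif
}

static void scheduler_tick(void)
{
#ifndef CONFIG_BLD
	puts("trigger_load_balance()");	/* periodic balancing only without BLD */
#endif
}

int main(void)
{
	printf("selected cpu %d\n", select_task_rq());
	scheduler_tick();
	return 0;
}
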
  293. diff -Naur linux-3.3.7.orig/kernel/sched/fair.c linux-3.3.7/kernel/sched/fair.c
  294. --- linux-3.3.7.orig/kernel/sched/fair.c    2012-05-21 21:42:51.000000000 +0300
  295. +++ linux-3.3.7/kernel/sched/fair.c 2012-05-23 21:52:34.000000000 +0300
  296. @@ -5611,7 +5611,9 @@
  297.  __init void init_sched_fair_class(void)
  298.  {
  299.  #ifdef CONFIG_SMP
  300. +#ifndef CONFIG_BLD
  301.     open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
  302. +#endif /* BLD */
  303.  
  304.  #ifdef CONFIG_NO_HZ
  305.     zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
  306. diff -Naur linux-3.3.7.orig/kernel/sched/fair.c.orig linux-3.3.7/kernel/sched/fair.c.orig
  307. --- linux-3.3.7.orig/kernel/sched/fair.c.orig   1970-01-01 03:00:00.000000000 +0300
  308. +++ linux-3.3.7/kernel/sched/fair.c.orig    2012-05-23 21:52:34.000000000 +0300
  309. @@ -0,0 +1,5622 @@
  310. +/*
  311. + * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
  312. + *
  313. + *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  314. + *
  315. + *  Interactivity improvements by Mike Galbraith
  316. + *  (C) 2007 Mike Galbraith <efault@gmx.de>
  317. + *
  318. + *  Various enhancements by Dmitry Adamushko.
  319. + *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  320. + *
  321. + *  Group scheduling enhancements by Srivatsa Vaddagiri
  322. + *  Copyright IBM Corporation, 2007
  323. + *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  324. + *
  325. + *  Scaled math optimizations by Thomas Gleixner
  326. + *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  327. + *
  328. + *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  329. + *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  330. + */
  331. +
  332. +#include <linux/latencytop.h>
  333. +#include <linux/sched.h>
  334. +#include <linux/cpumask.h>
  335. +#include <linux/slab.h>
  336. +#include <linux/profile.h>
  337. +#include <linux/interrupt.h>
  338. +
  339. +#include <trace/events/sched.h>
  340. +
  341. +#include "sched.h"
  342. +
  343. +/*
  344. + * Targeted preemption latency for CPU-bound tasks:
  345. + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  346. + *
  347. + * NOTE: this latency value is not the same as the concept of
  348. + * 'timeslice length' - timeslices in CFS are of variable length
  349. + * and have no persistent notion like in traditional, time-slice
  350. + * based scheduling concepts.
  351. + *
  352. + * (to see the precise effective timeslice length of your workload,
  353. + *  run vmstat and monitor the context-switches (cs) field)
  354. + */
  355. +unsigned int sysctl_sched_latency = 6000000ULL;
  356. +unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  357. +
  358. +/*
  359. + * The initial- and re-scaling of tunables is configurable
  360. + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  361. + *
  362. + * Options are:
  363. + * SCHED_TUNABLESCALING_NONE - unscaled, always *1
  364. + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  365. + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  366. + */
  367. +enum sched_tunable_scaling sysctl_sched_tunable_scaling
  368. +   = SCHED_TUNABLESCALING_LOG;
  369. +
  370. +/*
  371. + * Minimal preemption granularity for CPU-bound tasks:
  372. + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  373. + */
  374. +unsigned int sysctl_sched_min_granularity = 750000ULL;
  375. +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  376. +
  377. +/*
  378. + * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  379. + */
  380. +static unsigned int sched_nr_latency = 8;
  381. +
  382. +/*
  383. + * After fork, child runs first. If set to 0 (default) then
  384. + * parent will (try to) run first.
  385. + */
  386. +unsigned int sysctl_sched_child_runs_first __read_mostly;
  387. +
  388. +/*
  389. + * SCHED_OTHER wake-up granularity.
  390. + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  391. + *
  392. + * This option delays the preemption effects of decoupled workloads
  393. + * and reduces their over-scheduling. Synchronous workloads will still
  394. + * have immediate wakeup/sleep latencies.
  395. + */
  396. +unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
  397. +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  398. +
  399. +const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  400. +
  401. +/*
  402. + * The exponential sliding  window over which load is averaged for shares
  403. + * distribution.
  404. + * (default: 10msec)
  405. + */
  406. +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  407. +
  408. +#ifdef CONFIG_CFS_BANDWIDTH
  409. +/*
  410. + * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
  411. + * each time a cfs_rq requests quota.
  412. + *
  413. + * Note: in the case that the slice exceeds the runtime remaining (either due
  414. + * to consumption or the quota being specified to be smaller than the slice)
  415. + * we will always only issue the remaining available time.
  416. + *
  417. + * default: 5 msec, units: microseconds
  418. +  */
  419. +unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
  420. +#endif
  421. +
  422. +/*
  423. + * Increase the granularity value when there are more CPUs,
  424. + * because with more CPUs the 'effective latency' as visible
  425. + * to users decreases. But the relationship is not linear,
  426. + * so pick a second-best guess by going with the log2 of the
  427. + * number of CPUs.
  428. + *
  429. + * This idea comes from the SD scheduler of Con Kolivas:
  430. + */
  431. +static int get_update_sysctl_factor(void)
  432. +{
  433. +   unsigned int cpus = min_t(int, num_online_cpus(), 8);
  434. +   unsigned int factor;
  435. +
  436. +   switch (sysctl_sched_tunable_scaling) {
  437. +   case SCHED_TUNABLESCALING_NONE:
  438. +       factor = 1;
  439. +       break;
  440. +   case SCHED_TUNABLESCALING_LINEAR:
  441. +       factor = cpus;
  442. +       break;
  443. +   case SCHED_TUNABLESCALING_LOG:
  444. +   default:
  445. +       factor = 1 + ilog2(cpus);
  446. +       break;
  447. +   }
  448. +
  449. +   return factor;
  450. +}
  451. +
  452. +static void update_sysctl(void)
  453. +{
  454. +   unsigned int factor = get_update_sysctl_factor();
  455. +
  456. +#define SET_SYSCTL(name) \
  457. +   (sysctl_##name = (factor) * normalized_sysctl_##name)
  458. +   SET_SYSCTL(sched_min_granularity);
  459. +   SET_SYSCTL(sched_latency);
  460. +   SET_SYSCTL(sched_wakeup_granularity);
  461. +#undef SET_SYSCTL
  462. +}
  463. +
  464. +void sched_init_granularity(void)
  465. +{
  466. +   update_sysctl();
  467. +}
  468. +
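/*
 * Worked example (illustrative aside, not part of the original fair.c):
 * with the default SCHED_TUNABLESCALING_LOG and, say, 8 online CPUs,
 * get_update_sysctl_factor() above returns 1 + ilog2(8) = 4, so
 * update_sysctl() sets sysctl_sched_latency = 4 * 6ms = 24ms,
 * sysctl_sched_min_granularity = 4 * 0.75ms = 3ms and
 * sysctl_sched_wakeup_granularity = 4 * 1ms = 4ms.  Beyond 8 CPUs the
 * factor stays at 4, because cpus is clamped by min_t(int, num_online_cpus(), 8).
 */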
  469. +#if BITS_PER_LONG == 32
  470. +# define WMULT_CONST   (~0UL)
  471. +#else
  472. +# define WMULT_CONST   (1UL << 32)
  473. +#endif
  474. +
  475. +#define WMULT_SHIFT    32
  476. +
  477. +/*
  478. + * Shift right and round:
  479. + */
  480. +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
  481. +
  482. +/*
  483. + * delta *= weight / lw
  484. + */
  485. +static unsigned long
  486. +calc_delta_mine(unsigned long delta_exec, unsigned long weight,
  487. +       struct load_weight *lw)
  488. +{
  489. +   u64 tmp;
  490. +
  491. +   /*
  492. +    * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
  493. +    * entities since MIN_SHARES = 2. Treat weight as 1 if less than
  494. +    * 2^SCHED_LOAD_RESOLUTION.
  495. +    */
  496. +   if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
  497. +       tmp = (u64)delta_exec * scale_load_down(weight);
  498. +   else
  499. +       tmp = (u64)delta_exec;
  500. +
  501. +   if (!lw->inv_weight) {
  502. +       unsigned long w = scale_load_down(lw->weight);
  503. +
  504. +       if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
  505. +           lw->inv_weight = 1;
  506. +       else if (unlikely(!w))
  507. +           lw->inv_weight = WMULT_CONST;
  508. +       else
  509. +           lw->inv_weight = WMULT_CONST / w;
  510. +   }
  511. +
  512. +   /*
  513. +    * Check whether we'd overflow the 64-bit multiplication:
  514. +    */
  515. +   if (unlikely(tmp > WMULT_CONST))
  516. +       tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
  517. +           WMULT_SHIFT/2);
  518. +   else
  519. +       tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
  520. +
  521. +   return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
  522. +}
  523. +
  524. +
  525. +const struct sched_class fair_sched_class;
  526. +
  527. +/**************************************************************
  528. + * CFS operations on generic schedulable entities:
  529. + */
  530. +
  531. +#ifdef CONFIG_FAIR_GROUP_SCHED
  532. +
  533. +/* cpu runqueue to which this cfs_rq is attached */
  534. +static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  535. +{
  536. +   return cfs_rq->rq;
  537. +}
  538. +
  539. +/* An entity is a task if it doesn't "own" a runqueue */
  540. +#define entity_is_task(se) (!se->my_q)
  541. +
  542. +static inline struct task_struct *task_of(struct sched_entity *se)
  543. +{
  544. +#ifdef CONFIG_SCHED_DEBUG
  545. +   WARN_ON_ONCE(!entity_is_task(se));
  546. +#endif
  547. +   return container_of(se, struct task_struct, se);
  548. +}
  549. +
  550. +/* Walk up scheduling entities hierarchy */
  551. +#define for_each_sched_entity(se) \
  552. +       for (; se; se = se->parent)
  553. +
  554. +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  555. +{
  556. +   return p->se.cfs_rq;
  557. +}
  558. +
  559. +/* runqueue on which this entity is (to be) queued */
  560. +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  561. +{
  562. +   return se->cfs_rq;
  563. +}
  564. +
  565. +/* runqueue "owned" by this group */
  566. +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  567. +{
  568. +   return grp->my_q;
  569. +}
  570. +
  571. +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  572. +{
  573. +   if (!cfs_rq->on_list) {
  574. +       /*
  575. +        * Ensure we either appear before our parent (if already
  576. +        * enqueued) or force our parent to appear after us when it is
  577. +        * enqueued.  The fact that we always enqueue bottom-up
  578. +        * reduces this to two cases.
  579. +        */
  580. +       if (cfs_rq->tg->parent &&
  581. +           cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
  582. +           list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
  583. +               &rq_of(cfs_rq)->leaf_cfs_rq_list);
  584. +       } else {
  585. +           list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
  586. +               &rq_of(cfs_rq)->leaf_cfs_rq_list);
  587. +       }
  588. +
  589. +       cfs_rq->on_list = 1;
  590. +   }
  591. +}
  592. +
  593. +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  594. +{
  595. +   if (cfs_rq->on_list) {
  596. +       list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
  597. +       cfs_rq->on_list = 0;
  598. +   }
  599. +}
  600. +
  601. +/* Iterate thr' all leaf cfs_rq's on a runqueue */
  602. +#define for_each_leaf_cfs_rq(rq, cfs_rq) \
  603. +   list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
  604. +
  605. +/* Do the two (enqueued) entities belong to the same group ? */
  606. +static inline int
  607. +is_same_group(struct sched_entity *se, struct sched_entity *pse)
  608. +{
  609. +   if (se->cfs_rq == pse->cfs_rq)
  610. +       return 1;
  611. +
  612. +   return 0;
  613. +}
  614. +
  615. +static inline struct sched_entity *parent_entity(struct sched_entity *se)
  616. +{
  617. +   return se->parent;
  618. +}
  619. +
  620. +/* return depth at which a sched entity is present in the hierarchy */
  621. +static inline int depth_se(struct sched_entity *se)
  622. +{
  623. +   int depth = 0;
  624. +
  625. +   for_each_sched_entity(se)
  626. +       depth++;
  627. +
  628. +   return depth;
  629. +}
  630. +
  631. +static void
  632. +find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  633. +{
  634. +   int se_depth, pse_depth;
  635. +
  636. +   /*
  637. +    * preemption test can be made between sibling entities who are in the
  638. +    * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
  639. +    * both tasks until we find their ancestors who are siblings of common
  640. +    * parent.
  641. +    */
  642. +
  643. +   /* First walk up until both entities are at same depth */
  644. +   se_depth = depth_se(*se);
  645. +   pse_depth = depth_se(*pse);
  646. +
  647. +   while (se_depth > pse_depth) {
  648. +       se_depth--;
  649. +       *se = parent_entity(*se);
  650. +   }
  651. +
  652. +   while (pse_depth > se_depth) {
  653. +       pse_depth--;
  654. +       *pse = parent_entity(*pse);
  655. +   }
  656. +
  657. +   while (!is_same_group(*se, *pse)) {
  658. +       *se = parent_entity(*se);
  659. +       *pse = parent_entity(*pse);
  660. +   }
  661. +}
  662. +
  663. +#else  /* !CONFIG_FAIR_GROUP_SCHED */
  664. +
  665. +static inline struct task_struct *task_of(struct sched_entity *se)
  666. +{
  667. +   return container_of(se, struct task_struct, se);
  668. +}
  669. +
  670. +static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  671. +{
  672. +   return container_of(cfs_rq, struct rq, cfs);
  673. +}
  674. +
  675. +#define entity_is_task(se) 1
  676. +
  677. +#define for_each_sched_entity(se) \
  678. +       for (; se; se = NULL)
  679. +
  680. +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  681. +{
  682. +   return &task_rq(p)->cfs;
  683. +}
  684. +
  685. +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  686. +{
  687. +   struct task_struct *p = task_of(se);
  688. +   struct rq *rq = task_rq(p);
  689. +
  690. +   return &rq->cfs;
  691. +}
  692. +
  693. +/* runqueue "owned" by this group */
  694. +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  695. +{
  696. +   return NULL;
  697. +}
  698. +
  699. +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  700. +{
  701. +}
  702. +
  703. +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  704. +{
  705. +}
  706. +
  707. +#define for_each_leaf_cfs_rq(rq, cfs_rq) \
  708. +       for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
  709. +
  710. +static inline int
  711. +is_same_group(struct sched_entity *se, struct sched_entity *pse)
  712. +{
  713. +   return 1;
  714. +}
  715. +
  716. +static inline struct sched_entity *parent_entity(struct sched_entity *se)
  717. +{
  718. +   return NULL;
  719. +}
  720. +
  721. +static inline void
  722. +find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  723. +{
  724. +}
  725. +
  726. +#endif /* CONFIG_FAIR_GROUP_SCHED */
  727. +
  728. +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
  729. +                  unsigned long delta_exec);
  730. +
  731. +/**************************************************************
  732. + * Scheduling class tree data structure manipulation methods:
  733. + */
  734. +
  735. +static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
  736. +{
  737. +   s64 delta = (s64)(vruntime - min_vruntime);
  738. +   if (delta > 0)
  739. +       min_vruntime = vruntime;
  740. +
  741. +   return min_vruntime;
  742. +}
  743. +
  744. +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
  745. +{
  746. +   s64 delta = (s64)(vruntime - min_vruntime);
  747. +   if (delta < 0)
  748. +       min_vruntime = vruntime;
  749. +
  750. +   return min_vruntime;
  751. +}
  752. +
  753. +static inline int entity_before(struct sched_entity *a,
  754. +               struct sched_entity *b)
  755. +{
  756. +   return (s64)(a->vruntime - b->vruntime) < 0;
  757. +}
  758. +
  759. +static void update_min_vruntime(struct cfs_rq *cfs_rq)
  760. +{
  761. +   u64 vruntime = cfs_rq->min_vruntime;
  762. +
  763. +   if (cfs_rq->curr)
  764. +       vruntime = cfs_rq->curr->vruntime;
  765. +
  766. +   if (cfs_rq->rb_leftmost) {
  767. +       struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
  768. +                          struct sched_entity,
  769. +                          run_node);
  770. +
  771. +       if (!cfs_rq->curr)
  772. +           vruntime = se->vruntime;
  773. +       else
  774. +           vruntime = min_vruntime(vruntime, se->vruntime);
  775. +   }
  776. +
  777. +   cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
  778. +#ifndef CONFIG_64BIT
  779. +   smp_wmb();
  780. +   cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  781. +#endif
  782. +}
  783. +
  784. +/*
  785. + * Enqueue an entity into the rb-tree:
  786. + */
  787. +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  788. +{
  789. +   struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
  790. +   struct rb_node *parent = NULL;
  791. +   struct sched_entity *entry;
  792. +   int leftmost = 1;
  793. +
  794. +   /*
  795. +    * Find the right place in the rbtree:
  796. +    */
  797. +   while (*link) {
  798. +       parent = *link;
  799. +       entry = rb_entry(parent, struct sched_entity, run_node);
  800. +       /*
  801. +        * We dont care about collisions. Nodes with
  802. +        * the same key stay together.
  803. +        */
  804. +       if (entity_before(se, entry)) {
  805. +           link = &parent->rb_left;
  806. +       } else {
  807. +           link = &parent->rb_right;
  808. +           leftmost = 0;
  809. +       }
  810. +   }
  811. +
  812. +   /*
  813. +    * Maintain a cache of leftmost tree entries (it is frequently
  814. +    * used):
  815. +    */
  816. +   if (leftmost)
  817. +       cfs_rq->rb_leftmost = &se->run_node;
  818. +
  819. +   rb_link_node(&se->run_node, parent, link);
  820. +   rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
  821. +}
  822. +
  823. +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  824. +{
  825. +   if (cfs_rq->rb_leftmost == &se->run_node) {
  826. +       struct rb_node *next_node;
  827. +
  828. +       next_node = rb_next(&se->run_node);
  829. +       cfs_rq->rb_leftmost = next_node;
  830. +   }
  831. +
  832. +   rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
  833. +}
  834. +
  835. +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  836. +{
  837. +   struct rb_node *left = cfs_rq->rb_leftmost;
  838. +
  839. +   if (!left)
  840. +       return NULL;
  841. +
  842. +   return rb_entry(left, struct sched_entity, run_node);
  843. +}
  844. +
  845. +static struct sched_entity *__pick_next_entity(struct sched_entity *se)
  846. +{
  847. +   struct rb_node *next = rb_next(&se->run_node);
  848. +
  849. +   if (!next)
  850. +       return NULL;
  851. +
  852. +   return rb_entry(next, struct sched_entity, run_node);
  853. +}
  854. +
  855. +#ifdef CONFIG_SCHED_DEBUG
  856. +struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  857. +{
  858. +   struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
  859. +
  860. +   if (!last)
  861. +       return NULL;
  862. +
  863. +   return rb_entry(last, struct sched_entity, run_node);
  864. +}
  865. +
  866. +/**************************************************************
  867. + * Scheduling class statistics methods:
  868. + */
  869. +
  870. +int sched_proc_update_handler(struct ctl_table *table, int write,
  871. +       void __user *buffer, size_t *lenp,
  872. +       loff_t *ppos)
  873. +{
  874. +   int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  875. +   int factor = get_update_sysctl_factor();
  876. +
  877. +   if (ret || !write)
  878. +       return ret;
  879. +
  880. +   sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
  881. +                   sysctl_sched_min_granularity);
  882. +
  883. +#define WRT_SYSCTL(name) \
  884. +   (normalized_sysctl_##name = sysctl_##name / (factor))
  885. +   WRT_SYSCTL(sched_min_granularity);
  886. +   WRT_SYSCTL(sched_latency);
  887. +   WRT_SYSCTL(sched_wakeup_granularity);
  888. +#undef WRT_SYSCTL
  889. +
  890. +   return 0;
  891. +}
  892. +#endif
  893. +
  894. +/*
  895. + * delta /= w
  896. + */
  897. +static inline unsigned long
  898. +calc_delta_fair(unsigned long delta, struct sched_entity *se)
  899. +{
  900. +   if (unlikely(se->load.weight != NICE_0_LOAD))
  901. +       delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
  902. +
  903. +   return delta;
  904. +}
  905. +
  906. +/*
  907. + * The idea is to set a period in which each task runs once.
  908. + *
  909. + * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
  910. + * this period because otherwise the slices get too small.
  911. + *
  912. + * p = (nr <= nl) ? l : l*nr/nl
  913. + */
  914. +static u64 __sched_period(unsigned long nr_running)
  915. +{
  916. +   u64 period = sysctl_sched_latency;
  917. +   unsigned long nr_latency = sched_nr_latency;
  918. +
  919. +   if (unlikely(nr_running > nr_latency)) {
  920. +       period = sysctl_sched_min_granularity;
  921. +       period *= nr_running;
  922. +   }
  923. +
  924. +   return period;
  925. +}
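/*
 * Worked example (illustrative aside, not part of the original fair.c):
 * assuming the factor-of-4 scaling worked out earlier (24ms latency, 3ms
 * minimum granularity), the default sched_nr_latency of 8 matches
 * 24ms / 3ms.  For 6 runnable tasks the period is simply 24ms; for 12
 * tasks it stretches to 12 * 3ms = 36ms, so each task can still receive
 * at least the minimum granularity.
 */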
  926. +
  927. +/*
  928. + * We calculate the wall-time slice from the period by taking a part
  929. + * proportional to the weight.
  930. + *
  931. + * s = p*P[w/rw]
  932. + */
  933. +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  934. +{
  935. +   u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
  936. +
  937. +   for_each_sched_entity(se) {
  938. +       struct load_weight *load;
  939. +       struct load_weight lw;
  940. +
  941. +       cfs_rq = cfs_rq_of(se);
  942. +       load = &cfs_rq->load;
  943. +
  944. +       if (unlikely(!se->on_rq)) {
  945. +           lw = cfs_rq->load;
  946. +
  947. +           update_load_add(&lw, se->load.weight);
  948. +           load = &lw;
  949. +       }
  950. +       slice = calc_delta_mine(slice, se->load.weight, load);
  951. +   }
  952. +   return slice;
  953. +}
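/*
 * Worked example (illustrative aside, not part of the original fair.c):
 * without group scheduling, for_each_sched_entity() runs once and the
 * slice reduces to period * se->load.weight / cfs_rq->load.weight.  With
 * a 24ms period and three runnable nice-0 tasks (weight 1024 each, total
 * load.weight 3072), each task's wall-time slice is
 * 24ms * 1024 / 3072 = 8ms.
 */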
  954. +
  955. +/*
  956. + * We calculate the vruntime slice of a to be inserted task
  957. + *
  958. + * vs = s/w
  959. + */
  960. +static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  961. +{
  962. +   return calc_delta_fair(sched_slice(cfs_rq, se), se);
  963. +}
  964. +
  965. +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
  966. +static void update_cfs_shares(struct cfs_rq *cfs_rq);
  967. +
  968. +/*
  969. + * Update the current task's runtime statistics. Skip current tasks that
  970. + * are not in our scheduling class.
  971. + */
  972. +static inline void
  973. +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  974. +         unsigned long delta_exec)
  975. +{
  976. +   unsigned long delta_exec_weighted;
  977. +
  978. +   schedstat_set(curr->statistics.exec_max,
  979. +             max((u64)delta_exec, curr->statistics.exec_max));
  980. +
  981. +   curr->sum_exec_runtime += delta_exec;
  982. +   schedstat_add(cfs_rq, exec_clock, delta_exec);
  983. +   delta_exec_weighted = calc_delta_fair(delta_exec, curr);
  984. +
  985. +   curr->vruntime += delta_exec_weighted;
  986. +   update_min_vruntime(cfs_rq);
  987. +
  988. +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
  989. +   cfs_rq->load_unacc_exec_time += delta_exec;
  990. +#endif
  991. +}
  992. +
  993. +static void update_curr(struct cfs_rq *cfs_rq)
  994. +{
  995. +   struct sched_entity *curr = cfs_rq->curr;
  996. +   u64 now = rq_of(cfs_rq)->clock_task;
  997. +   unsigned long delta_exec;
  998. +
  999. +   if (unlikely(!curr))
  1000. +       return;
  1001. +
  1002. +   /*
  1003. +    * Get the amount of time the current task was running
  1004. +    * since the last time we changed load (this cannot
  1005. +    * overflow on 32 bits):
  1006. +    */
  1007. +   delta_exec = (unsigned long)(now - curr->exec_start);
  1008. +   if (!delta_exec)
  1009. +       return;
  1010. +
  1011. +   __update_curr(cfs_rq, curr, delta_exec);
  1012. +   curr->exec_start = now;
  1013. +
  1014. +   if (entity_is_task(curr)) {
  1015. +       struct task_struct *curtask = task_of(curr);
  1016. +
  1017. +       trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
  1018. +       cpuacct_charge(curtask, delta_exec);
  1019. +       account_group_exec_runtime(curtask, delta_exec);
  1020. +   }
  1021. +
  1022. +   account_cfs_rq_runtime(cfs_rq, delta_exec);
  1023. +}
  1024. +
  1025. +static inline void
  1026. +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1027. +{
  1028. +   schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
  1029. +}
  1030. +
  1031. +/*
  1032. + * Task is being enqueued - update stats:
  1033. + */
  1034. +static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1035. +{
  1036. +   /*
  1037. +    * Are we enqueueing a waiting task? (for current tasks
  1038. +    * a dequeue/enqueue event is a NOP)
  1039. +    */
  1040. +   if (se != cfs_rq->curr)
  1041. +       update_stats_wait_start(cfs_rq, se);
  1042. +}
  1043. +
  1044. +static void
  1045. +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1046. +{
  1047. +   schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
  1048. +           rq_of(cfs_rq)->clock - se->statistics.wait_start));
  1049. +   schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
  1050. +   schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
  1051. +           rq_of(cfs_rq)->clock - se->statistics.wait_start);
  1052. +#ifdef CONFIG_SCHEDSTATS
  1053. +   if (entity_is_task(se)) {
  1054. +       trace_sched_stat_wait(task_of(se),
  1055. +           rq_of(cfs_rq)->clock - se->statistics.wait_start);
  1056. +   }
  1057. +#endif
  1058. +   schedstat_set(se->statistics.wait_start, 0);
  1059. +}
  1060. +
  1061. +static inline void
  1062. +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1063. +{
  1064. +   /*
  1065. +    * Mark the end of the wait period if dequeueing a
  1066. +    * waiting task:
  1067. +    */
  1068. +   if (se != cfs_rq->curr)
  1069. +       update_stats_wait_end(cfs_rq, se);
  1070. +}
  1071. +
  1072. +/*
  1073. + * We are picking a new current task - update its stats:
  1074. + */
  1075. +static inline void
  1076. +update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1077. +{
  1078. +   /*
  1079. +    * We are starting a new run period:
  1080. +    */
  1081. +   se->exec_start = rq_of(cfs_rq)->clock_task;
  1082. +}
  1083. +
  1084. +/**************************************************
  1085. + * Scheduling class queueing methods:
  1086. + */
  1087. +
  1088. +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
  1089. +static void
  1090. +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
  1091. +{
  1092. +   cfs_rq->task_weight += weight;
  1093. +}
  1094. +#else
  1095. +static inline void
  1096. +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
  1097. +{
  1098. +}
  1099. +#endif
  1100. +
  1101. +static void
  1102. +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1103. +{
  1104. +   update_load_add(&cfs_rq->load, se->load.weight);
  1105. +   if (!parent_entity(se))
  1106. +       update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
  1107. +   if (entity_is_task(se)) {
  1108. +       add_cfs_task_weight(cfs_rq, se->load.weight);
  1109. +       list_add(&se->group_node, &cfs_rq->tasks);
  1110. +   }
  1111. +   cfs_rq->nr_running++;
  1112. +}
  1113. +
  1114. +static void
  1115. +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1116. +{
  1117. +   update_load_sub(&cfs_rq->load, se->load.weight);
  1118. +   if (!parent_entity(se))
  1119. +       update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
  1120. +   if (entity_is_task(se)) {
  1121. +       add_cfs_task_weight(cfs_rq, -se->load.weight);
  1122. +       list_del_init(&se->group_node);
  1123. +   }
  1124. +   cfs_rq->nr_running--;
  1125. +}
  1126. +
  1127. +#ifdef CONFIG_FAIR_GROUP_SCHED
  1128. +/* we need this in update_cfs_load and load-balance functions below */
  1129. +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
  1130. +# ifdef CONFIG_SMP
  1131. +static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
  1132. +                       int global_update)
  1133. +{
  1134. +   struct task_group *tg = cfs_rq->tg;
  1135. +   long load_avg;
  1136. +
  1137. +   load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
  1138. +   load_avg -= cfs_rq->load_contribution;
  1139. +
  1140. +   if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
  1141. +       atomic_add(load_avg, &tg->load_weight);
  1142. +       cfs_rq->load_contribution += load_avg;
  1143. +   }
  1144. +}
  1145. +
  1146. +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
  1147. +{
  1148. +   u64 period = sysctl_sched_shares_window;
  1149. +   u64 now, delta;
  1150. +   unsigned long load = cfs_rq->load.weight;
  1151. +
  1152. +   if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
  1153. +       return;
  1154. +
  1155. +   now = rq_of(cfs_rq)->clock_task;
  1156. +   delta = now - cfs_rq->load_stamp;
  1157. +
  1158. +   /* truncate load history at 4 idle periods */
  1159. +   if (cfs_rq->load_stamp > cfs_rq->load_last &&
  1160. +       now - cfs_rq->load_last > 4 * period) {
  1161. +       cfs_rq->load_period = 0;
  1162. +       cfs_rq->load_avg = 0;
  1163. +       delta = period - 1;
  1164. +   }
  1165. +
  1166. +   cfs_rq->load_stamp = now;
  1167. +   cfs_rq->load_unacc_exec_time = 0;
  1168. +   cfs_rq->load_period += delta;
  1169. +   if (load) {
  1170. +       cfs_rq->load_last = now;
  1171. +       cfs_rq->load_avg += delta * load;
  1172. +   }
  1173. +
  1174. +   /* consider updating load contribution on each fold or truncate */
  1175. +   if (global_update || cfs_rq->load_period > period
  1176. +       || !cfs_rq->load_period)
  1177. +       update_cfs_rq_load_contribution(cfs_rq, global_update);
  1178. +
  1179. +   while (cfs_rq->load_period > period) {
  1180. +       /*
  1181. +        * Inline assembly required to prevent the compiler
  1182. +        * optimising this loop into a divmod call.
  1183. +        * See __iter_div_u64_rem() for another example of this.
  1184. +        */
  1185. +       asm("" : "+rm" (cfs_rq->load_period));
  1186. +       cfs_rq->load_period /= 2;
  1187. +       cfs_rq->load_avg /= 2;
  1188. +   }
  1189. +
  1190. +   if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
  1191. +       list_del_leaf_cfs_rq(cfs_rq);
  1192. +}
  1193. +
  1194. +static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
  1195. +{
  1196. +   long tg_weight;
  1197. +
  1198. +   /*
  1199. +    * Use this CPU's actual weight instead of the last load_contribution
  1200. +    * to gain a more accurate current total weight. See
  1201. +    * update_cfs_rq_load_contribution().
  1202. +    */
  1203. +   tg_weight = atomic_read(&tg->load_weight);
  1204. +   tg_weight -= cfs_rq->load_contribution;
  1205. +   tg_weight += cfs_rq->load.weight;
  1206. +
  1207. +   return tg_weight;
  1208. +}
  1209. +
  1210. +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  1211. +{
  1212. +   long tg_weight, load, shares;
  1213. +
  1214. +   tg_weight = calc_tg_weight(tg, cfs_rq);
  1215. +   load = cfs_rq->load.weight;
  1216. +
  1217. +   shares = (tg->shares * load);
  1218. +   if (tg_weight)
  1219. +       shares /= tg_weight;
  1220. +
  1221. +   if (shares < MIN_SHARES)
  1222. +       shares = MIN_SHARES;
  1223. +   if (shares > tg->shares)
  1224. +       shares = tg->shares;
  1225. +
  1226. +   return shares;
  1227. +}
  1228. +
  1229. +static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
  1230. +{
  1231. +   if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
  1232. +       update_cfs_load(cfs_rq, 0);
  1233. +       update_cfs_shares(cfs_rq);
  1234. +   }
  1235. +}
  1236. +# else /* CONFIG_SMP */
  1237. +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
  1238. +{
  1239. +}
  1240. +
  1241. +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  1242. +{
  1243. +   return tg->shares;
  1244. +}
  1245. +
  1246. +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
  1247. +{
  1248. +}
  1249. +# endif /* CONFIG_SMP */
  1250. +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
  1251. +               unsigned long weight)
  1252. +{
  1253. +   if (se->on_rq) {
  1254. +       /* commit outstanding execution time */
  1255. +       if (cfs_rq->curr == se)
  1256. +           update_curr(cfs_rq);
  1257. +       account_entity_dequeue(cfs_rq, se);
  1258. +   }
  1259. +
  1260. +   update_load_set(&se->load, weight);
  1261. +
  1262. +   if (se->on_rq)
  1263. +       account_entity_enqueue(cfs_rq, se);
  1264. +}
  1265. +
  1266. +static void update_cfs_shares(struct cfs_rq *cfs_rq)
  1267. +{
  1268. +   struct task_group *tg;
  1269. +   struct sched_entity *se;
  1270. +   long shares;
  1271. +
  1272. +   tg = cfs_rq->tg;
  1273. +   se = tg->se[cpu_of(rq_of(cfs_rq))];
  1274. +   if (!se || throttled_hierarchy(cfs_rq))
  1275. +       return;
  1276. +#ifndef CONFIG_SMP
  1277. +   if (likely(se->load.weight == tg->shares))
  1278. +       return;
  1279. +#endif
  1280. +   shares = calc_cfs_shares(cfs_rq, tg);
  1281. +
  1282. +   reweight_entity(cfs_rq_of(se), se, shares);
  1283. +}
  1284. +#else /* CONFIG_FAIR_GROUP_SCHED */
  1285. +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
  1286. +{
  1287. +}
  1288. +
  1289. +static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
  1290. +{
  1291. +}
  1292. +
  1293. +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
  1294. +{
  1295. +}
  1296. +#endif /* CONFIG_FAIR_GROUP_SCHED */
  1297. +
  1298. +static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1299. +{
  1300. +#ifdef CONFIG_SCHEDSTATS
  1301. +   struct task_struct *tsk = NULL;
  1302. +
  1303. +   if (entity_is_task(se))
  1304. +       tsk = task_of(se);
  1305. +
  1306. +   if (se->statistics.sleep_start) {
  1307. +       u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
  1308. +
  1309. +       if ((s64)delta < 0)
  1310. +           delta = 0;
  1311. +
  1312. +       if (unlikely(delta > se->statistics.sleep_max))
  1313. +           se->statistics.sleep_max = delta;
  1314. +
  1315. +       se->statistics.sleep_start = 0;
  1316. +       se->statistics.sum_sleep_runtime += delta;
  1317. +
  1318. +       if (tsk) {
  1319. +           account_scheduler_latency(tsk, delta >> 10, 1);
  1320. +           trace_sched_stat_sleep(tsk, delta);
  1321. +       }
  1322. +   }
  1323. +   if (se->statistics.block_start) {
  1324. +       u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
  1325. +
  1326. +       if ((s64)delta < 0)
  1327. +           delta = 0;
  1328. +
  1329. +       if (unlikely(delta > se->statistics.block_max))
  1330. +           se->statistics.block_max = delta;
  1331. +
  1332. +       se->statistics.block_start = 0;
  1333. +       se->statistics.sum_sleep_runtime += delta;
  1334. +
  1335. +       if (tsk) {
  1336. +           if (tsk->in_iowait) {
  1337. +               se->statistics.iowait_sum += delta;
  1338. +               se->statistics.iowait_count++;
  1339. +               trace_sched_stat_iowait(tsk, delta);
  1340. +           }
  1341. +
  1342. +           trace_sched_stat_blocked(tsk, delta);
  1343. +
  1344. +           /*
  1345. +            * Blocking time is in units of nanosecs, so shift by
  1346. +            * 20 to get a milliseconds-range estimation of the
  1347. +            * amount of time that the task spent sleeping:
  1348. +            */
  1349. +           if (unlikely(prof_on == SLEEP_PROFILING)) {
  1350. +               profile_hits(SLEEP_PROFILING,
  1351. +                       (void *)get_wchan(tsk),
  1352. +                       delta >> 20);
  1353. +           }
  1354. +           account_scheduler_latency(tsk, delta >> 10, 0);
  1355. +       }
  1356. +   }
  1357. +#endif
  1358. +}
  1359. +
  1360. +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1361. +{
  1362. +#ifdef CONFIG_SCHED_DEBUG
  1363. +   s64 d = se->vruntime - cfs_rq->min_vruntime;
  1364. +
  1365. +   if (d < 0)
  1366. +       d = -d;
  1367. +
  1368. +   if (d > 3*sysctl_sched_latency)
  1369. +       schedstat_inc(cfs_rq, nr_spread_over);
  1370. +#endif
  1371. +}
  1372. +
  1373. +static void
  1374. +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
  1375. +{
  1376. +   u64 vruntime = cfs_rq->min_vruntime;
  1377. +
  1378. +   /*
  1379. +    * The 'current' period is already promised to the current tasks,
  1380. +    * however the extra weight of the new task will slow them down a
  1381. +    * little, place the new task so that it fits in the slot that
  1382. +    * stays open at the end.
  1383. +    */
  1384. +   if (initial && sched_feat(START_DEBIT))
  1385. +       vruntime += sched_vslice(cfs_rq, se);
  1386. +
  1387. +   /* sleeps up to a single latency don't count. */
  1388. +   if (!initial) {
  1389. +       unsigned long thresh = sysctl_sched_latency;
  1390. +
  1391. +       /*
  1392. +        * Halve their sleep time's effect, to allow
  1393. +        * for a gentler effect of sleepers:
  1394. +        */
  1395. +       if (sched_feat(GENTLE_FAIR_SLEEPERS))
  1396. +           thresh >>= 1;
  1397. +
  1398. +       vruntime -= thresh;
  1399. +   }
  1400. +
  1401. +   /* ensure we never gain time by being placed backwards. */
  1402. +   vruntime = max_vruntime(se->vruntime, vruntime);
  1403. +
  1404. +   se->vruntime = vruntime;
  1405. +}
  1406. +
  1407. +static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
  1408. +
  1409. +static void
  1410. +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  1411. +{
  1412. +   /*
  1413. +    * Update the normalized vruntime before updating min_vruntime
  1414. +    * through callig update_curr().
  1415. +    */
  1416. +   if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
  1417. +       se->vruntime += cfs_rq->min_vruntime;
  1418. +
  1419. +   /*
  1420. +    * Update run-time statistics of the 'current'.
  1421. +    */
  1422. +   update_curr(cfs_rq);
  1423. +   update_cfs_load(cfs_rq, 0);
  1424. +   account_entity_enqueue(cfs_rq, se);
  1425. +   update_cfs_shares(cfs_rq);
  1426. +
  1427. +   if (flags & ENQUEUE_WAKEUP) {
  1428. +       place_entity(cfs_rq, se, 0);
  1429. +       enqueue_sleeper(cfs_rq, se);
  1430. +   }
  1431. +
  1432. +   update_stats_enqueue(cfs_rq, se);
  1433. +   check_spread(cfs_rq, se);
  1434. +   if (se != cfs_rq->curr)
  1435. +       __enqueue_entity(cfs_rq, se);
  1436. +   se->on_rq = 1;
  1437. +
  1438. +   if (cfs_rq->nr_running == 1) {
  1439. +       list_add_leaf_cfs_rq(cfs_rq);
  1440. +       check_enqueue_throttle(cfs_rq);
  1441. +   }
  1442. +}
  1443. +
  1444. +static void __clear_buddies_last(struct sched_entity *se)
  1445. +{
  1446. +   for_each_sched_entity(se) {
  1447. +       struct cfs_rq *cfs_rq = cfs_rq_of(se);
  1448. +       if (cfs_rq->last == se)
  1449. +           cfs_rq->last = NULL;
  1450. +       else
  1451. +           break;
  1452. +   }
  1453. +}
  1454. +
  1455. +static void __clear_buddies_next(struct sched_entity *se)
  1456. +{
  1457. +   for_each_sched_entity(se) {
  1458. +       struct cfs_rq *cfs_rq = cfs_rq_of(se);
  1459. +       if (cfs_rq->next == se)
  1460. +           cfs_rq->next = NULL;
  1461. +       else
  1462. +           break;
  1463. +   }
  1464. +}
  1465. +
  1466. +static void __clear_buddies_skip(struct sched_entity *se)
  1467. +{
  1468. +   for_each_sched_entity(se) {
  1469. +       struct cfs_rq *cfs_rq = cfs_rq_of(se);
  1470. +       if (cfs_rq->skip == se)
  1471. +           cfs_rq->skip = NULL;
  1472. +       else
  1473. +           break;
  1474. +   }
  1475. +}
  1476. +
  1477. +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1478. +{
  1479. +   if (cfs_rq->last == se)
  1480. +       __clear_buddies_last(se);
  1481. +
  1482. +   if (cfs_rq->next == se)
  1483. +       __clear_buddies_next(se);
  1484. +
  1485. +   if (cfs_rq->skip == se)
  1486. +       __clear_buddies_skip(se);
  1487. +}
  1488. +
  1489. +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
  1490. +
  1491. +static void
  1492. +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  1493. +{
  1494. +   /*
  1495. +    * Update run-time statistics of the 'current'.
  1496. +    */
  1497. +   update_curr(cfs_rq);
  1498. +
  1499. +   update_stats_dequeue(cfs_rq, se);
  1500. +   if (flags & DEQUEUE_SLEEP) {
  1501. +#ifdef CONFIG_SCHEDSTATS
  1502. +       if (entity_is_task(se)) {
  1503. +           struct task_struct *tsk = task_of(se);
  1504. +
  1505. +           if (tsk->state & TASK_INTERRUPTIBLE)
  1506. +               se->statistics.sleep_start = rq_of(cfs_rq)->clock;
  1507. +           if (tsk->state & TASK_UNINTERRUPTIBLE)
  1508. +               se->statistics.block_start = rq_of(cfs_rq)->clock;
  1509. +       }
  1510. +#endif
  1511. +   }
  1512. +
  1513. +   clear_buddies(cfs_rq, se);
  1514. +
  1515. +   if (se != cfs_rq->curr)
  1516. +       __dequeue_entity(cfs_rq, se);
  1517. +   se->on_rq = 0;
  1518. +   update_cfs_load(cfs_rq, 0);
  1519. +   account_entity_dequeue(cfs_rq, se);
  1520. +
  1521. +   /*
  1522. +    * Normalize the entity after updating the min_vruntime because the
  1523. +    * update can refer to the ->curr item and we need to reflect this
  1524. +    * movement in our normalized position.
  1525. +    */
  1526. +   if (!(flags & DEQUEUE_SLEEP))
  1527. +       se->vruntime -= cfs_rq->min_vruntime;
  1528. +
  1529. +   /* return excess runtime on last dequeue */
  1530. +   return_cfs_rq_runtime(cfs_rq);
  1531. +
  1532. +   update_min_vruntime(cfs_rq);
  1533. +   update_cfs_shares(cfs_rq);
  1534. +}
  1535. +
  1536. +/*
  1537. + * Preempt the current task with a newly woken task if needed:
  1538. + */
  1539. +static void
  1540. +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
  1541. +{
  1542. +   unsigned long ideal_runtime, delta_exec;
  1543. +   struct sched_entity *se;
  1544. +   s64 delta;
  1545. +
  1546. +   ideal_runtime = sched_slice(cfs_rq, curr);
  1547. +   delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
  1548. +   if (delta_exec > ideal_runtime) {
  1549. +       resched_task(rq_of(cfs_rq)->curr);
  1550. +       /*
  1551. +        * The current task ran long enough, ensure it doesn't get
  1552. +        * re-elected due to buddy favours.
  1553. +        */
  1554. +       clear_buddies(cfs_rq, curr);
  1555. +       return;
  1556. +   }
  1557. +
  1558. +   /*
  1559. +    * Ensure that a task that missed wakeup preemption by a
  1560. +    * narrow margin doesn't have to wait for a full slice.
  1561. +    * This also mitigates buddy induced latencies under load.
  1562. +    */
  1563. +   if (delta_exec < sysctl_sched_min_granularity)
  1564. +       return;
  1565. +
  1566. +   se = __pick_first_entity(cfs_rq);
  1567. +   delta = curr->vruntime - se->vruntime;
  1568. +
  1569. +   if (delta < 0)
  1570. +       return;
  1571. +
  1572. +   if (delta > ideal_runtime)
  1573. +       resched_task(rq_of(cfs_rq)->curr);
  1574. +}
  1575. +
  1576. +static void
  1577. +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  1578. +{
  1579. +   /* 'current' is not kept within the tree. */
  1580. +   if (se->on_rq) {
  1581. +       /*
  1582. +        * Any task has to be enqueued before it get to execute on
  1583. +        * a CPU. So account for the time it spent waiting on the
  1584. +        * runqueue.
  1585. +        */
  1586. +       update_stats_wait_end(cfs_rq, se);
  1587. +       __dequeue_entity(cfs_rq, se);
  1588. +   }
  1589. +
  1590. +   update_stats_curr_start(cfs_rq, se);
  1591. +   cfs_rq->curr = se;
  1592. +#ifdef CONFIG_SCHEDSTATS
  1593. +   /*
  1594. +    * Track our maximum slice length, if the CPU's load is at
  1595. +    * least twice that of our own weight (i.e. dont track it
  1596. +    * when there are only lesser-weight tasks around):
  1597. +    */
  1598. +   if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
  1599. +       se->statistics.slice_max = max(se->statistics.slice_max,
  1600. +           se->sum_exec_runtime - se->prev_sum_exec_runtime);
  1601. +   }
  1602. +#endif
  1603. +   se->prev_sum_exec_runtime = se->sum_exec_runtime;
  1604. +}
  1605. +
  1606. +static int
  1607. +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  1608. +
  1609. +/*
  1610. + * Pick the next process, keeping these things in mind, in this order:
  1611. + * 1) keep things fair between processes/task groups
  1612. + * 2) pick the "next" process, since someone really wants that to run
  1613. + * 3) pick the "last" process, for cache locality
  1614. + * 4) do not run the "skip" process, if something else is available
  1615. + */
  1616. +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
  1617. +{
  1618. +   struct sched_entity *se = __pick_first_entity(cfs_rq);
  1619. +   struct sched_entity *left = se;
  1620. +
  1621. +   /*
  1622. +    * Avoid running the skip buddy, if running something else can
  1623. +    * be done without getting too unfair.
  1624. +    */
  1625. +   if (cfs_rq->skip == se) {
  1626. +       struct sched_entity *second = __pick_next_entity(se);
  1627. +       if (second && wakeup_preempt_entity(second, left) < 1)
  1628. +           se = second;
  1629. +   }
  1630. +
  1631. +   /*
  1632. +    * Prefer last buddy, try to return the CPU to a preempted task.
  1633. +    */
  1634. +   if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
  1635. +       se = cfs_rq->last;
  1636. +
  1637. +   /*
  1638. +    * Someone really wants this to run. If it's not unfair, run it.
  1639. +    */
  1640. +   if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
  1641. +       se = cfs_rq->next;
  1642. +
  1643. +   clear_buddies(cfs_rq, se);
  1644. +
  1645. +   return se;
  1646. +}
  1647. +
  1648. +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
  1649. +
  1650. +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
  1651. +{
  1652. +   /*
  1653. +    * If still on the runqueue then deactivate_task()
  1654. +    * was not called and update_curr() has to be done:
  1655. +    */
  1656. +   if (prev->on_rq)
  1657. +       update_curr(cfs_rq);
  1658. +
  1659. +   /* throttle cfs_rqs exceeding runtime */
  1660. +   check_cfs_rq_runtime(cfs_rq);
  1661. +
  1662. +   check_spread(cfs_rq, prev);
  1663. +   if (prev->on_rq) {
  1664. +       update_stats_wait_start(cfs_rq, prev);
  1665. +       /* Put 'current' back into the tree. */
  1666. +       __enqueue_entity(cfs_rq, prev);
  1667. +   }
  1668. +   cfs_rq->curr = NULL;
  1669. +}
  1670. +
  1671. +static void
  1672. +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
  1673. +{
  1674. +   /*
  1675. +    * Update run-time statistics of the 'current'.
  1676. +    */
  1677. +   update_curr(cfs_rq);
  1678. +
  1679. +   /*
  1680. +    * Update share accounting for long-running entities.
  1681. +    */
  1682. +   update_entity_shares_tick(cfs_rq);
  1683. +
  1684. +#ifdef CONFIG_SCHED_HRTICK
  1685. +   /*
  1686. +    * queued ticks are scheduled to match the slice, so don't bother
  1687. +    * validating it and just reschedule.
  1688. +    */
  1689. +   if (queued) {
  1690. +       resched_task(rq_of(cfs_rq)->curr);
  1691. +       return;
  1692. +   }
  1693. +   /*
  1694. +    * don't let the period tick interfere with the hrtick preemption
  1695. +    */
  1696. +   if (!sched_feat(DOUBLE_TICK) &&
  1697. +           hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
  1698. +       return;
  1699. +#endif
  1700. +
  1701. +   if (cfs_rq->nr_running > 1)
  1702. +       check_preempt_tick(cfs_rq, curr);
  1703. +}
  1704. +
  1705. +
  1706. +/**************************************************
  1707. + * CFS bandwidth control machinery
  1708. + */
  1709. +
  1710. +#ifdef CONFIG_CFS_BANDWIDTH
  1711. +
  1712. +#ifdef HAVE_JUMP_LABEL
  1713. +static struct jump_label_key __cfs_bandwidth_used;
  1714. +
  1715. +static inline bool cfs_bandwidth_used(void)
  1716. +{
  1717. +   return static_branch(&__cfs_bandwidth_used);
  1718. +}
  1719. +
  1720. +void account_cfs_bandwidth_used(int enabled, int was_enabled)
  1721. +{
  1722. +   /* only need to count groups transitioning between enabled/!enabled */
  1723. +   if (enabled && !was_enabled)
  1724. +       jump_label_inc(&__cfs_bandwidth_used);
  1725. +   else if (!enabled && was_enabled)
  1726. +       jump_label_dec(&__cfs_bandwidth_used);
  1727. +}
  1728. +#else /* HAVE_JUMP_LABEL */
  1729. +static bool cfs_bandwidth_used(void)
  1730. +{
  1731. +   return true;
  1732. +}
  1733. +
  1734. +void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
  1735. +#endif /* HAVE_JUMP_LABEL */
  1736. +
  1737. +/*
  1738. + * default period for cfs group bandwidth.
  1739. + * default: 0.1s, units: nanoseconds
  1740. + */
  1741. +static inline u64 default_cfs_period(void)
  1742. +{
  1743. +   return 100000000ULL;
  1744. +}
  1745. +
  1746. +static inline u64 sched_cfs_bandwidth_slice(void)
  1747. +{
  1748. +   return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
  1749. +}
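default_cfs_period() above pins the refill period at 100 ms, and sched_cfs_bandwidth_slice() converts the slice sysctl from microseconds into the nanosecond-sized chunk that assign_cfs_rq_runtime() later pulls from the global pool. A back-of-the-envelope sketch of how period, quota and slice relate; the 50 ms quota is hypothetical and the 5 ms slice is an assumed sysctl default, neither appears in this hunk:

/*
 * Illustrative sketch, not part of the patch: how period, quota and slice
 * relate. With a 100 ms period and a 50 ms quota the group may use at most
 * half of one CPU per period, handed to per-CPU cfs_rqs in slice-sized
 * chunks. The 50 ms quota is hypothetical; 5 ms is the assumed slice sysctl.
 */
#include <stdio.h>

int main(void)
{
    unsigned long long period_ns = 100000000ULL; /* default_cfs_period()               */
    unsigned long long quota_ns  =  50000000ULL; /* hypothetical group quota            */
    unsigned long long slice_ns  =   5000000ULL; /* assumed sched_cfs_bandwidth_slice() */

    printf("cpu share per period: %llu%%\n", quota_ns * 100 / period_ns);
    printf("slices per refill   : %llu\n", quota_ns / slice_ns);
    return 0;
}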
  1750. +
  1751. +/*
  1752. + * Replenish runtime according to assigned quota and update expiration time.
  1753. + * We use sched_clock_cpu directly instead of rq->clock to avoid adding
  1754. + * additional synchronization around rq->lock.
  1755. + *
  1756. + * requires cfs_b->lock
  1757. + */
  1758. +void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
  1759. +{
  1760. +   u64 now;
  1761. +
  1762. +   if (cfs_b->quota == RUNTIME_INF)
  1763. +       return;
  1764. +
  1765. +   now = sched_clock_cpu(smp_processor_id());
  1766. +   cfs_b->runtime = cfs_b->quota;
  1767. +   cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
  1768. +}
  1769. +
  1770. +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  1771. +{
  1772. +   return &tg->cfs_bandwidth;
  1773. +}
  1774. +
  1775. +/* returns 0 on failure to allocate runtime */
  1776. +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  1777. +{
  1778. +   struct task_group *tg = cfs_rq->tg;
  1779. +   struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
  1780. +   u64 amount = 0, min_amount, expires;
  1781. +
  1782. +   /* note: this is a positive sum as runtime_remaining <= 0 */
  1783. +   min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
  1784. +
  1785. +   raw_spin_lock(&cfs_b->lock);
  1786. +   if (cfs_b->quota == RUNTIME_INF)
  1787. +       amount = min_amount;
  1788. +   else {
  1789. +       /*
  1790. +        * If the bandwidth pool has become inactive, then at least one
  1791. +        * period must have elapsed since the last consumption.
  1792. +        * Refresh the global state and ensure bandwidth timer becomes
  1793. +        * active.
  1794. +        */
  1795. +       if (!cfs_b->timer_active) {
  1796. +           __refill_cfs_bandwidth_runtime(cfs_b);
  1797. +           __start_cfs_bandwidth(cfs_b);
  1798. +       }
  1799. +
  1800. +       if (cfs_b->runtime > 0) {
  1801. +           amount = min(cfs_b->runtime, min_amount);
  1802. +           cfs_b->runtime -= amount;
  1803. +           cfs_b->idle = 0;
  1804. +       }
  1805. +   }
  1806. +   expires = cfs_b->runtime_expires;
  1807. +   raw_spin_unlock(&cfs_b->lock);
  1808. +
  1809. +   cfs_rq->runtime_remaining += amount;
  1810. +   /*
  1811. +    * we may have advanced our local expiration to account for allowed
  1812. +    * spread between our sched_clock and the one on which runtime was
  1813. +    * issued.
  1814. +    */
  1815. +   if ((s64)(expires - cfs_rq->runtime_expires) > 0)
  1816. +       cfs_rq->runtime_expires = expires;
  1817. +
  1818. +   return cfs_rq->runtime_remaining > 0;
  1819. +}
  1820. +
  1821. +/*
  1822. + * Note: This depends on the synchronization provided by sched_clock and the
  1823. + * fact that rq->clock snapshots this value.
  1824. + */
  1825. +static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  1826. +{
  1827. +   struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  1828. +   struct rq *rq = rq_of(cfs_rq);
  1829. +
  1830. +   /* if the deadline is ahead of our clock, nothing to do */
  1831. +   if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
  1832. +       return;
  1833. +
  1834. +   if (cfs_rq->runtime_remaining < 0)
  1835. +       return;
  1836. +
  1837. +   /*
  1838. +    * If the local deadline has passed we have to consider the
  1839. +    * possibility that our sched_clock is 'fast' and the global deadline
  1840. +    * has not truly expired.
  1841. +    *
  1842. +    * Fortunately we can determine whether this is the case by checking
  1843. +    * whether the global deadline has advanced.
  1844. +    */
  1845. +
  1846. +   if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
  1847. +       /* extend local deadline, drift is bounded above by 2 ticks */
  1848. +       cfs_rq->runtime_expires += TICK_NSEC;
  1849. +   } else {
  1850. +       /* global deadline is ahead, expiration has passed */
  1851. +       cfs_rq->runtime_remaining = 0;
  1852. +   }
  1853. +}
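Both tests in expire_cfs_rq_runtime() compare clocks by subtracting two u64 values and inspecting the sign of the s64 result, which stays correct even if the clock wraps. A minimal sketch of that idiom, using a hypothetical clock_after() helper rather than any kernel API:

/*
 * Illustrative sketch, not part of the patch: the wrap-safe clock comparison
 * used by expire_cfs_rq_runtime() above. Casting the u64 difference to s64
 * keeps the test correct even across wraparound; clock_after() is a local
 * helper, not a kernel API.
 */
#include <stdio.h>

typedef unsigned long long u64;
typedef long long s64;

static int clock_after(u64 a, u64 b)
{
    return (s64)(a - b) > 0; /* same pattern as (s64)(rq->clock - expires) */
}

int main(void)
{
    u64 expires = 1000;

    printf("%d\n", clock_after(900,  expires)); /* 0: deadline still ahead */
    printf("%d\n", clock_after(1100, expires)); /* 1: deadline has passed  */
    return 0;
}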
  1854. +
  1855. +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
  1856. +                    unsigned long delta_exec)
  1857. +{
  1858. +   /* dock delta_exec before expiring quota (as it could span periods) */
  1859. +   cfs_rq->runtime_remaining -= delta_exec;
  1860. +   expire_cfs_rq_runtime(cfs_rq);
  1861. +
  1862. +   if (likely(cfs_rq->runtime_remaining > 0))
  1863. +       return;
  1864. +
  1865. +   /*
  1866. +    * if we're unable to extend our runtime we resched so that the active
  1867. +    * hierarchy can be throttled
  1868. +    */
  1869. +   if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
  1870. +       resched_task(rq_of(cfs_rq)->curr);
  1871. +}
  1872. +
  1873. +static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
  1874. +                          unsigned long delta_exec)
  1875. +{
  1876. +   if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
  1877. +       return;
  1878. +
  1879. +   __account_cfs_rq_runtime(cfs_rq, delta_exec);
  1880. +}
  1881. +
  1882. +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
  1883. +{
  1884. +   return cfs_bandwidth_used() && cfs_rq->throttled;
  1885. +}
  1886. +
  1887. +/* check whether cfs_rq, or any parent, is throttled */
  1888. +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
  1889. +{
  1890. +   return cfs_bandwidth_used() && cfs_rq->throttle_count;
  1891. +}
  1892. +
  1893. +/*
  1894. + * Ensure that neither of the group entities corresponding to src_cpu or
  1895. + * dest_cpu are members of a throttled hierarchy when performing group
  1896. + * load-balance operations.
  1897. + */
  1898. +static inline int throttled_lb_pair(struct task_group *tg,
  1899. +                   int src_cpu, int dest_cpu)
  1900. +{
  1901. +   struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
  1902. +
  1903. +   src_cfs_rq = tg->cfs_rq[src_cpu];
  1904. +   dest_cfs_rq = tg->cfs_rq[dest_cpu];
  1905. +
  1906. +   return throttled_hierarchy(src_cfs_rq) ||
  1907. +          throttled_hierarchy(dest_cfs_rq);
  1908. +}
  1909. +
  1910. +/* updated child weight may affect parent so we have to do this bottom up */
  1911. +static int tg_unthrottle_up(struct task_group *tg, void *data)
  1912. +{
  1913. +   struct rq *rq = data;
  1914. +   struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  1915. +
  1916. +   cfs_rq->throttle_count--;
  1917. +#ifdef CONFIG_SMP
  1918. +   if (!cfs_rq->throttle_count) {
  1919. +       u64 delta = rq->clock_task - cfs_rq->load_stamp;
  1920. +
  1921. +       /* leaving throttled state, advance shares averaging windows */
  1922. +       cfs_rq->load_stamp += delta;
  1923. +       cfs_rq->load_last += delta;
  1924. +
  1925. +       /* update entity weight now that we are on_rq again */
  1926. +       update_cfs_shares(cfs_rq);
  1927. +   }
  1928. +#endif
  1929. +
  1930. +   return 0;
  1931. +}
  1932. +
  1933. +static int tg_throttle_down(struct task_group *tg, void *data)
  1934. +{
  1935. +   struct rq *rq = data;
  1936. +   struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  1937. +
  1938. +   /* group is entering throttled state, record last load */
  1939. +   if (!cfs_rq->throttle_count)
  1940. +       update_cfs_load(cfs_rq, 0);
  1941. +   cfs_rq->throttle_count++;
  1942. +
  1943. +   return 0;
  1944. +}
  1945. +
  1946. +static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
  1947. +{
  1948. +   struct rq *rq = rq_of(cfs_rq);
  1949. +   struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  1950. +   struct sched_entity *se;
  1951. +   long task_delta, dequeue = 1;
  1952. +
  1953. +   se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
  1954. +
  1955. +   /* account load preceding throttle */
  1956. +   rcu_read_lock();
  1957. +   walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
  1958. +   rcu_read_unlock();
  1959. +
  1960. +   task_delta = cfs_rq->h_nr_running;
  1961. +   for_each_sched_entity(se) {
  1962. +       struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  1963. +       /* throttled entity or throttle-on-deactivate */
  1964. +       if (!se->on_rq)
  1965. +           break;
  1966. +
  1967. +       if (dequeue)
  1968. +           dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
  1969. +       qcfs_rq->h_nr_running -= task_delta;
  1970. +
  1971. +       if (qcfs_rq->load.weight)
  1972. +           dequeue = 0;
  1973. +   }
  1974. +
  1975. +   if (!se)
  1976. +       rq->nr_running -= task_delta;
  1977. +
  1978. +   cfs_rq->throttled = 1;
  1979. +   cfs_rq->throttled_timestamp = rq->clock;
  1980. +   raw_spin_lock(&cfs_b->lock);
  1981. +   list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
  1982. +   raw_spin_unlock(&cfs_b->lock);
  1983. +}
  1984. +
  1985. +void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
  1986. +{
  1987. +   struct rq *rq = rq_of(cfs_rq);
  1988. +   struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  1989. +   struct sched_entity *se;
  1990. +   int enqueue = 1;
  1991. +   long task_delta;
  1992. +
  1993. +   se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
  1994. +
  1995. +   cfs_rq->throttled = 0;
  1996. +   raw_spin_lock(&cfs_b->lock);
  1997. +   cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
  1998. +   list_del_rcu(&cfs_rq->throttled_list);
  1999. +   raw_spin_unlock(&cfs_b->lock);
  2000. +   cfs_rq->throttled_timestamp = 0;
  2001. +
  2002. +   update_rq_clock(rq);
  2003. +   /* update hierarchical throttle state */
  2004. +   walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
  2005. +
  2006. +   if (!cfs_rq->load.weight)
  2007. +       return;
  2008. +
  2009. +   task_delta = cfs_rq->h_nr_running;
  2010. +   for_each_sched_entity(se) {
  2011. +       if (se->on_rq)
  2012. +           enqueue = 0;
  2013. +
  2014. +       cfs_rq = cfs_rq_of(se);
  2015. +       if (enqueue)
  2016. +           enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
  2017. +       cfs_rq->h_nr_running += task_delta;
  2018. +
  2019. +       if (cfs_rq_throttled(cfs_rq))
  2020. +           break;
  2021. +   }
  2022. +
  2023. +   if (!se)
  2024. +       rq->nr_running += task_delta;
  2025. +
  2026. +   /* determine whether we need to wake up potentially idle cpu */
  2027. +   if (rq->curr == rq->idle && rq->cfs.nr_running)
  2028. +       resched_task(rq->curr);
  2029. +}
  2030. +
  2031. +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
  2032. +       u64 remaining, u64 expires)
  2033. +{
  2034. +   struct cfs_rq *cfs_rq;
  2035. +   u64 runtime = remaining;
  2036. +
  2037. +   rcu_read_lock();
  2038. +   list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
  2039. +               throttled_list) {
  2040. +       struct rq *rq = rq_of(cfs_rq);
  2041. +
  2042. +       raw_spin_lock(&rq->lock);
  2043. +       if (!cfs_rq_throttled(cfs_rq))
  2044. +           goto next;
  2045. +
  2046. +       runtime = -cfs_rq->runtime_remaining + 1;
  2047. +       if (runtime > remaining)
  2048. +           runtime = remaining;
  2049. +       remaining -= runtime;
  2050. +
  2051. +       cfs_rq->runtime_remaining += runtime;
  2052. +       cfs_rq->runtime_expires = expires;
  2053. +
  2054. +       /* we check whether we're throttled above */
  2055. +       if (cfs_rq->runtime_remaining > 0)
  2056. +           unthrottle_cfs_rq(cfs_rq);
  2057. +
  2058. +next:
  2059. +       raw_spin_unlock(&rq->lock);
  2060. +
  2061. +       if (!remaining)
  2062. +           break;
  2063. +   }
  2064. +   rcu_read_unlock();
  2065. +
  2066. +   return remaining;
  2067. +}
  2068. +
  2069. +/*
  2070. + * Responsible for refilling a task_group's bandwidth and unthrottling its
  2071. + * cfs_rqs as appropriate. If there has been no activity within the last
  2072. + * period the timer is deactivated until scheduling resumes; cfs_b->idle is
  2073. + * used to track this state.
  2074. + */
  2075. +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
  2076. +{
  2077. +   u64 runtime, runtime_expires;
  2078. +   int idle = 1, throttled;
  2079. +
  2080. +   raw_spin_lock(&cfs_b->lock);
  2081. +   /* no need to continue the timer with no bandwidth constraint */
  2082. +   if (cfs_b->quota == RUNTIME_INF)
  2083. +       goto out_unlock;
  2084. +
  2085. +   throttled = !list_empty(&cfs_b->throttled_cfs_rq);
  2086. +   /* idle depends on !throttled (for the case of a large deficit) */
  2087. +   idle = cfs_b->idle && !throttled;
  2088. +   cfs_b->nr_periods += overrun;
  2089. +
  2090. +   /* if we're going inactive then everything else can be deferred */
  2091. +   if (idle)
  2092. +       goto out_unlock;
  2093. +
  2094. +   __refill_cfs_bandwidth_runtime(cfs_b);
  2095. +
  2096. +   if (!throttled) {
  2097. +       /* mark as potentially idle for the upcoming period */
  2098. +       cfs_b->idle = 1;
  2099. +       goto out_unlock;
  2100. +   }
  2101. +
  2102. +   /* account preceding periods in which throttling occurred */
  2103. +   cfs_b->nr_throttled += overrun;
  2104. +
  2105. +   /*
  2106. +    * There are throttled entities so we must first use the new bandwidth
  2107. +    * to unthrottle them before making it generally available.  This
  2108. +    * ensures that all existing debts will be paid before a new cfs_rq is
  2109. +    * allowed to run.
  2110. +    */
  2111. +   runtime = cfs_b->runtime;
  2112. +   runtime_expires = cfs_b->runtime_expires;
  2113. +   cfs_b->runtime = 0;
  2114. +
  2115. +   /*
  2116. +    * This check is repeated as we are holding onto the new bandwidth
  2117. +    * while we unthrottle.  This can potentially race with an unthrottled
  2118. +    * group trying to acquire new bandwidth from the global pool.
  2119. +    */
  2120. +   while (throttled && runtime > 0) {
  2121. +       raw_spin_unlock(&cfs_b->lock);
  2122. +       /* we can't nest cfs_b->lock while distributing bandwidth */
  2123. +       runtime = distribute_cfs_runtime(cfs_b, runtime,
  2124. +                        runtime_expires);
  2125. +       raw_spin_lock(&cfs_b->lock);
  2126. +
  2127. +       throttled = !list_empty(&cfs_b->throttled_cfs_rq);
  2128. +   }
  2129. +
  2130. +   /* return (any) remaining runtime */
  2131. +   cfs_b->runtime = runtime;
  2132. +   /*
  2133. +    * While we are ensured activity in the period following an
  2134. +    * unthrottle, this also covers the case in which the new bandwidth is
  2135. +    * insufficient to cover the existing bandwidth deficit.  (Forcing the
  2136. +    * timer to remain active while there are any throttled entities.)
  2137. +    */
  2138. +   cfs_b->idle = 0;
  2139. +out_unlock:
  2140. +   if (idle)
  2141. +       cfs_b->timer_active = 0;
  2142. +   raw_spin_unlock(&cfs_b->lock);
  2143. +
  2144. +   return idle;
  2145. +}
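The counters maintained here (cfs_b->nr_periods, cfs_b->nr_throttled), together with the throttled_time accumulated in unthrottle_cfs_rq(), are what the cpu cgroup exposes through its cpu.stat file. A hedged userspace sketch for watching them while exercising this patch; the mount point and group name are assumptions about the test machine:

/*
 * Illustrative sketch, not part of the patch: dump the bandwidth statistics
 * maintained by do_sched_cfs_period_timer() and unthrottle_cfs_rq(), as
 * exported through the cpu cgroup's cpu.stat file. The mount point and the
 * group name below are assumptions about the test setup.
 */
#include <stdio.h>

int main(void)
{
    const char *path = "/sys/fs/cgroup/cpu/testgroup/cpu.stat"; /* hypothetical */
    char line[128];
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return 1;
    }
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout); /* nr_periods, nr_throttled, throttled_time */
    fclose(f);
    return 0;
}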
  2146. +
  2147. +/* a cfs_rq won't donate quota below this amount */
  2148. +static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
  2149. +/* minimum remaining period time to redistribute slack quota */
  2150. +static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
  2151. +/* how long we wait to gather additional slack before distributing */
  2152. +static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
  2153. +
  2154. +/* are we near the end of the current quota period? */
  2155. +static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
  2156. +{
  2157. +   struct hrtimer *refresh_timer = &cfs_b->period_timer;
  2158. +   u64 remaining;
  2159. +
  2160. +   /* if the callback is running, a quota refresh is already occurring */
  2161. +   if (hrtimer_callback_running(refresh_timer))
  2162. +       return 1;
  2163. +
  2164. +   /* is a quota refresh about to occur? */
  2165. +   remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
  2166. +   if (remaining < min_expire)
  2167. +       return 1;
  2168. +
  2169. +   return 0;
  2170. +}
  2171. +
  2172. +static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
  2173. +{
  2174. +   u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
  2175. +
  2176. +   /* if there's a quota refresh soon don't bother with slack */
  2177. +   if (runtime_refresh_within(cfs_b, min_left))
  2178. +       return;
  2179. +
  2180. +   start_bandwidth_timer(&cfs_b->slack_timer,
  2181. +               ns_to_ktime(cfs_bandwidth_slack_period));
  2182. +}
  2183. +
  2184. +/* we know any runtime found here is valid as update_curr() precedes return */
  2185. +static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  2186. +{
  2187. +   struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  2188. +   s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
  2189. +
  2190. +   if (slack_runtime <= 0)
  2191. +       return;
  2192. +
  2193. +   raw_spin_lock(&cfs_b->lock);
  2194. +   if (cfs_b->quota != RUNTIME_INF &&
  2195. +       cfs_rq->runtime_expires == cfs_b->runtime_expires) {
  2196. +       cfs_b->runtime += slack_runtime;
  2197. +
  2198. +       /* we are under rq->lock, defer unthrottling using a timer */
  2199. +       if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
  2200. +           !list_empty(&cfs_b->throttled_cfs_rq))
  2201. +           start_cfs_slack_bandwidth(cfs_b);
  2202. +   }
  2203. +   raw_spin_unlock(&cfs_b->lock);
  2204. +
  2205. +   /* even if it's not valid for return we don't want to try again */
  2206. +   cfs_rq->runtime_remaining -= slack_runtime;
  2207. +}
  2208. +
  2209. +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  2210. +{
  2211. +   if (!cfs_bandwidth_used())
  2212. +       return;
  2213. +
  2214. +   if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
  2215. +       return;
  2216. +
  2217. +   __return_cfs_rq_runtime(cfs_rq);
  2218. +}
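The slack path above lets a cfs_rq keep min_cfs_rq_runtime (1 ms) for itself and return only the excess, and it arms the slack timer (5 ms) only when no regular refresh is due within min_bandwidth_expiration (2 ms). A small sketch of just the "how much goes back" arithmetic from __return_cfs_rq_runtime(), not part of the patch:

/*
 * Illustrative sketch, not part of the patch: the donation arithmetic from
 * __return_cfs_rq_runtime() above. A cfs_rq gives back everything above the
 * 1 ms it is allowed to keep; at or below that floor nothing is returned.
 */
#include <stdio.h>

#define MIN_CFS_RQ_RUNTIME 1000000LL /* 1 ms, matches min_cfs_rq_runtime */

static long long slack_to_return(long long runtime_remaining)
{
    long long slack = runtime_remaining - MIN_CFS_RQ_RUNTIME;

    return slack > 0 ? slack : 0;
}

int main(void)
{
    printf("%lld\n", slack_to_return(3000000)); /* 2 ms handed back        */
    printf("%lld\n", slack_to_return(800000));  /* 0: below the 1 ms floor */
    return 0;
}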
  2219. +
  2220. +/*
  2221. + * This is done with a timer (instead of inline with bandwidth return) since
  2222. + * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
  2223. + */
  2224. +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  2225. +{
  2226. +   u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
  2227. +   u64 expires;
  2228. +
  2229. +   /* confirm we're still not at a refresh boundary */
  2230. +   if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
  2231. +       return;
  2232. +
  2233. +   raw_spin_lock(&cfs_b->lock);
  2234. +   if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
  2235. +       runtime = cfs_b->runtime;
  2236. +       cfs_b->runtime = 0;
  2237. +   }
  2238. +   expires = cfs_b->runtime_expires;
  2239. +   raw_spin_unlock(&cfs_b->lock);
  2240. +
  2241. +   if (!runtime)
  2242. +       return;
  2243. +
  2244. +   runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
  2245. +
  2246. +   raw_spin_lock(&cfs_b->lock);
  2247. +   if (expires == cfs_b->runtime_expires)
  2248. +       cfs_b->runtime = runtime;
  2249. +   raw_spin_unlock(&cfs_b->lock);
  2250. +}
  2251. +
  2252. +/*
  2253. + * When a group wakes up we want to make sure that its quota is not already
  2254. + * expired/exceeded, otherwise it may be allowed to steal additional ticks of
  2255. + * runtime as update_curr() throttling cannot trigger until it's on-rq.
  2256. + */
  2257. +static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
  2258. +{
  2259. +   if (!cfs_bandwidth_used())
  2260. +       return;
  2261. +
  2262. +   /* an active group must be handled by the update_curr()->put() path */
  2263. +   if (!cfs_rq->runtime_enabled || cfs_rq->curr)
  2264. +       return;
  2265. +
  2266. +   /* ensure the group is not already throttled */
  2267. +   if (cfs_rq_throttled(cfs_rq))
  2268. +       return;
  2269. +
  2270. +   /* update runtime allocation */
  2271. +   account_cfs_rq_runtime(cfs_rq, 0);
  2272. +   if (cfs_rq->runtime_remaining <= 0)
  2273. +       throttle_cfs_rq(cfs_rq);
  2274. +}
  2275. +
  2276. +/* conditionally throttle active cfs_rq's from put_prev_entity() */
  2277. +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  2278. +{
  2279. +   if (!cfs_bandwidth_used())
  2280. +       return;
  2281. +
  2282. +   if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
  2283. +       return;
  2284. +
  2285. +   /*
  2286. +    * it's possible for a throttled entity to be forced into a running
  2287. +    * state (e.g. set_curr_task); in this case we're finished.
  2288. +    */
  2289. +   if (cfs_rq_throttled(cfs_rq))
  2290. +       return;
  2291. +
  2292. +   throttle_cfs_rq(cfs_rq);
  2293. +}
  2294. +
  2295. +static inline u64 default_cfs_period(void);
  2296. +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
  2297. +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
  2298. +
  2299. +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
  2300. +{
  2301. +   struct cfs_bandwidth *cfs_b =
  2302. +       container_of(timer, struct cfs_bandwidth, slack_timer);
  2303. +   do_sched_cfs_slack_timer(cfs_b);
  2304. +
  2305. +   return HRTIMER_NORESTART;
  2306. +}
  2307. +
  2308. +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
  2309. +{
  2310. +   struct cfs_bandwidth *cfs_b =
  2311. +       container_of(timer, struct cfs_bandwidth, period_timer);
  2312. +   ktime_t now;
  2313. +   int overrun;
  2314. +   int idle = 0;
  2315. +
  2316. +   for (;;) {
  2317. +       now = hrtimer_cb_get_time(timer);
  2318. +       overrun = hrtimer_forward(timer, now, cfs_b->period);
  2319. +
  2320. +       if (!overrun)
  2321. +           break;
  2322. +
  2323. +       idle = do_sched_cfs_period_timer(cfs_b, overrun);
  2324. +   }
  2325. +
  2326. +   return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  2327. +}
  2328. +
  2329. +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  2330. +{
  2331. +   raw_spin_lock_init(&cfs_b->lock);
  2332. +   cfs_b->runtime = 0;
  2333. +   cfs_b->quota = RUNTIME_INF;
  2334. +   cfs_b->period = ns_to_ktime(default_cfs_period());
  2335. +
  2336. +   INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
  2337. +   hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  2338. +   cfs_b->period_timer.function = sched_cfs_period_timer;
  2339. +   hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  2340. +   cfs_b->slack_timer.function = sched_cfs_slack_timer;
  2341. +}
  2342. +
  2343. +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  2344. +{
  2345. +   cfs_rq->runtime_enabled = 0;
  2346. +   INIT_LIST_HEAD(&cfs_rq->throttled_list);
  2347. +}
  2348. +
  2349. +/* requires cfs_b->lock, may release to reprogram timer */
  2350. +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  2351. +{
  2352. +   /*
  2353. +    * The timer may be active because we're trying to set a new bandwidth
  2354. +    * period or because we're racing with the tear-down path
  2355. +    * (timer_active==0 becomes visible before the hrtimer call-back
  2356. +    * terminates).  In either case we ensure that it's re-programmed
  2357. +    */
  2358. +   while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
  2359. +       raw_spin_unlock(&cfs_b->lock);
  2360. +       /* ensure cfs_b->lock is available while we wait */
  2361. +       hrtimer_cancel(&cfs_b->period_timer);
  2362. +
  2363. +       raw_spin_lock(&cfs_b->lock);
  2364. +       /* if someone else restarted the timer then we're done */
  2365. +       if (cfs_b->timer_active)
  2366. +           return;
  2367. +   }
  2368. +
  2369. +   cfs_b->timer_active = 1;
  2370. +   start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
  2371. +}
  2372. +
  2373. +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  2374. +{
  2375. +   hrtimer_cancel(&cfs_b->period_timer);
  2376. +   hrtimer_cancel(&cfs_b->slack_timer);
  2377. +}
  2378. +
  2379. +void unthrottle_offline_cfs_rqs(struct rq *rq)
  2380. +{
  2381. +   struct cfs_rq *cfs_rq;
  2382. +
  2383. +   for_each_leaf_cfs_rq(rq, cfs_rq) {
  2384. +       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  2385. +
  2386. +       if (!cfs_rq->runtime_enabled)
  2387. +           continue;
  2388. +
  2389. +       /*
  2390. +        * clock_task is not advancing so we just need to make sure
  2391. +        * there's some valid quota amount
  2392. +        */
  2393. +       cfs_rq->runtime_remaining = cfs_b->quota;
  2394. +       if (cfs_rq_throttled(cfs_rq))
  2395. +           unthrottle_cfs_rq(cfs_rq);
  2396. +   }
  2397. +}
  2398. +
  2399. +#else /* CONFIG_CFS_BANDWIDTH */
  2400. +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
  2401. +                    unsigned long delta_exec) {}
  2402. +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  2403. +static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
  2404. +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  2405. +
  2406. +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
  2407. +{
  2408. +   return 0;
  2409. +}
  2410. +
  2411. +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
  2412. +{
  2413. +   return 0;
  2414. +}
  2415. +
  2416. +static inline int throttled_lb_pair(struct task_group *tg,
  2417. +                   int src_cpu, int dest_cpu)
  2418. +{
  2419. +   return 0;
  2420. +}
  2421. +
  2422. +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
  2423. +
  2424. +#ifdef CONFIG_FAIR_GROUP_SCHED
  2425. +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  2426. +#endif
  2427. +
  2428. +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  2429. +{
  2430. +   return NULL;
  2431. +}
  2432. +static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
  2433. +void unthrottle_offline_cfs_rqs(struct rq *rq) {}
  2434. +
  2435. +#endif /* CONFIG_CFS_BANDWIDTH */
  2436. +
  2437. +/**************************************************
  2438. + * CFS operations on tasks:
  2439. + */
  2440. +
  2441. +#ifdef CONFIG_SCHED_HRTICK
  2442. +static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  2443. +{
  2444. +   struct sched_entity *se = &p->se;
  2445. +   struct cfs_rq *cfs_rq = cfs_rq_of(se);
  2446. +
  2447. +   WARN_ON(task_rq(p) != rq);
  2448. +
  2449. +   if (cfs_rq->nr_running > 1) {
  2450. +       u64 slice = sched_slice(cfs_rq, se);
  2451. +       u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
  2452. +       s64 delta = slice - ran;
  2453. +
  2454. +       if (delta < 0) {
  2455. +           if (rq->curr == p)
  2456. +               resched_task(p);
  2457. +           return;
  2458. +       }
  2459. +
  2460. +       /*
  2461. +        * Don't schedule slices shorter than 10000ns; that just
  2462. +        * doesn't make sense. Rely on vruntime for fairness.
  2463. +        */
  2464. +       if (rq->curr != p)
  2465. +           delta = max_t(s64, 10000LL, delta);
  2466. +
  2467. +       hrtick_start(rq, delta);
  2468. +   }
  2469. +}
  2470. +
  2471. +/*
  2472. + * called from enqueue/dequeue and updates the hrtick when the
  2473. + * current task is from our class and nr_running is low enough
  2474. + * to matter.
  2475. + */
  2476. +static void hrtick_update(struct rq *rq)
  2477. +{
  2478. +   struct task_struct *curr = rq->curr;
  2479. +
  2480. +   if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
  2481. +       return;
  2482. +
  2483. +   if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
  2484. +       hrtick_start_fair(rq, curr);
  2485. +}
  2486. +#else /* !CONFIG_SCHED_HRTICK */
  2487. +static inline void
  2488. +hrtick_start_fair(struct rq *rq, struct task_struct *p)
  2489. +{
  2490. +}
  2491. +
  2492. +static inline void hrtick_update(struct rq *rq)
  2493. +{
  2494. +}
  2495. +#endif
  2496. +
  2497. +/*
  2498. + * The enqueue_task method is called before nr_running is
  2499. + * increased. Here we update the fair scheduling stats and
  2500. + * then put the task into the rbtree:
  2501. + */
  2502. +static void
  2503. +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  2504. +{
  2505. +   struct cfs_rq *cfs_rq;
  2506. +   struct sched_entity *se = &p->se;
  2507. +
  2508. +   for_each_sched_entity(se) {
  2509. +       if (se->on_rq)
  2510. +           break;
  2511. +       cfs_rq = cfs_rq_of(se);
  2512. +       enqueue_entity(cfs_rq, se, flags);
  2513. +
  2514. +       /*
  2515. +        * end evaluation on encountering a throttled cfs_rq
  2516. +        *
  2517. +        * note: in the case of encountering a throttled cfs_rq we will
  2518. +        * post the final h_nr_running increment below.
  2519. +       */
  2520. +       if (cfs_rq_throttled(cfs_rq))
  2521. +           break;
  2522. +       cfs_rq->h_nr_running++;
  2523. +
  2524. +       flags = ENQUEUE_WAKEUP;
  2525. +   }
  2526. +
  2527. +   for_each_sched_entity(se) {
  2528. +       cfs_rq = cfs_rq_of(se);
  2529. +       cfs_rq->h_nr_running++;
  2530. +
  2531. +       if (cfs_rq_throttled(cfs_rq))
  2532. +           break;
  2533. +
  2534. +       update_cfs_load(cfs_rq, 0);
  2535. +       update_cfs_shares(cfs_rq);
  2536. +   }
  2537. +
  2538. +   if (!se)
  2539. +       inc_nr_running(rq);
  2540. +   hrtick_update(rq);
  2541. +}
  2542. +
  2543. +static void set_next_buddy(struct sched_entity *se);
  2544. +
  2545. +/*
  2546. + * The dequeue_task method is called before nr_running is
  2547. + * decreased. We remove the task from the rbtree and
  2548. + * update the fair scheduling stats:
  2549. + */
  2550. +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  2551. +{
  2552. +   struct cfs_rq *cfs_rq;
  2553. +   struct sched_entity *se = &p->se;
  2554. +   int task_sleep = flags & DEQUEUE_SLEEP;
  2555. +
  2556. +   for_each_sched_entity(se) {
  2557. +       cfs_rq = cfs_rq_of(se);
  2558. +       dequeue_entity(cfs_rq, se, flags);
  2559. +
  2560. +       /*
  2561. +        * end evaluation on encountering a throttled cfs_rq
  2562. +        *
  2563. +        * note: in the case of encountering a throttled cfs_rq we will
  2564. +        * post the final h_nr_running decrement below.
  2565. +       */
  2566. +       if (cfs_rq_throttled(cfs_rq))
  2567. +           break;
  2568. +       cfs_rq->h_nr_running--;
  2569. +
  2570. +       /* Don't dequeue parent if it has other entities besides us */
  2571. +       if (cfs_rq->load.weight) {
  2572. +           /*
  2573. +            * Bias pick_next to pick a task from this cfs_rq, as
  2574. +            * p is sleeping when it is within its sched_slice.
  2575. +            */
  2576. +           if (task_sleep && parent_entity(se))
  2577. +               set_next_buddy(parent_entity(se));
  2578. +
  2579. +           /* avoid re-evaluating load for this entity */
  2580. +           se = parent_entity(se);
  2581. +           break;
  2582. +       }
  2583. +       flags |= DEQUEUE_SLEEP;
  2584. +   }
  2585. +
  2586. +   for_each_sched_entity(se) {
  2587. +       cfs_rq = cfs_rq_of(se);
  2588. +       cfs_rq->h_nr_running--;
  2589. +
  2590. +       if (cfs_rq_throttled(cfs_rq))
  2591. +           break;
  2592. +
  2593. +       update_cfs_load(cfs_rq, 0);
  2594. +       update_cfs_shares(cfs_rq);
  2595. +   }
  2596. +
  2597. +   if (!se)
  2598. +       dec_nr_running(rq);
  2599. +   hrtick_update(rq);
  2600. +}
  2601. +
  2602. +#ifdef CONFIG_SMP
  2603. +/* Used instead of source_load when we know the type == 0 */
  2604. +static unsigned long weighted_cpuload(const int cpu)
  2605. +{
  2606. +   return cpu_rq(cpu)->load.weight;
  2607. +}
  2608. +
  2609. +/*
  2610. + * Return a low guess at the load of a migration-source cpu weighted
  2611. + * according to the scheduling class and "nice" value.
  2612. + *
  2613. + * We want to under-estimate the load of migration sources, to
  2614. + * balance conservatively.
  2615. + */
  2616. +static unsigned long source_load(int cpu, int type)
  2617. +{
  2618. +   struct rq *rq = cpu_rq(cpu);
  2619. +   unsigned long total = weighted_cpuload(cpu);
  2620. +
  2621. +   if (type == 0 || !sched_feat(LB_BIAS))
  2622. +       return total;
  2623. +
  2624. +   return min(rq->cpu_load[type-1], total);
  2625. +}
  2626. +
  2627. +/*
  2628. + * Return a high guess at the load of a migration-target cpu weighted
  2629. + * according to the scheduling class and "nice" value.
  2630. + */
  2631. +static unsigned long target_load(int cpu, int type)
  2632. +{
  2633. +   struct rq *rq = cpu_rq(cpu);
  2634. +   unsigned long total = weighted_cpuload(cpu);
  2635. +
  2636. +   if (type == 0 || !sched_feat(LB_BIAS))
  2637. +       return total;
  2638. +
  2639. +   return max(rq->cpu_load[type-1], total);
  2640. +}
  2641. +
  2642. +static unsigned long power_of(int cpu)
  2643. +{
  2644. +   return cpu_rq(cpu)->cpu_power;
  2645. +}
  2646. +
  2647. +static unsigned long cpu_avg_load_per_task(int cpu)
  2648. +{
  2649. +   struct rq *rq = cpu_rq(cpu);
  2650. +   unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
  2651. +
  2652. +   if (nr_running)
  2653. +       return rq->load.weight / nr_running;
  2654. +
  2655. +   return 0;
  2656. +}
  2657. +
  2658. +
  2659. +static void task_waking_fair(struct task_struct *p)
  2660. +{
  2661. +   struct sched_entity *se = &p->se;
  2662. +   struct cfs_rq *cfs_rq = cfs_rq_of(se);
  2663. +   u64 min_vruntime;
  2664. +
  2665. +#ifndef CONFIG_64BIT
  2666. +   u64 min_vruntime_copy;
  2667. +
  2668. +   do {
  2669. +       min_vruntime_copy = cfs_rq->min_vruntime_copy;
  2670. +       smp_rmb();
  2671. +       min_vruntime = cfs_rq->min_vruntime;
  2672. +   } while (min_vruntime != min_vruntime_copy);
  2673. +#else
  2674. +   min_vruntime = cfs_rq->min_vruntime;
  2675. +#endif
  2676. +
  2677. +   se->vruntime -= min_vruntime;
  2678. +}
  2679. +
  2680. +#ifdef CONFIG_FAIR_GROUP_SCHED
  2681. +/*
  2682. + * effective_load() calculates the load change as seen from the root_task_group
  2683. + *
  2684. + * Adding load to a group doesn't make a group heavier, but can cause movement
  2685. + * of group shares between cpus. Assuming the shares were perfectly aligned one
  2686. + * can calculate the shift in shares.
  2687. + *
  2688. + * Calculate the effective load difference if @wl is added (subtracted) to @tg
  2689. + * on this @cpu and results in a total addition (subtraction) of @wg to the
  2690. + * total group weight.
  2691. + *
  2692. + * Given a runqueue weight distribution (rw_i) we can compute a shares
  2693. + * distribution (s_i) using:
  2694. + *
  2695. + *   s_i = rw_i / \Sum rw_j                        (1)
  2696. + *
  2697. + * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
  2698. + * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
  2699. + * shares distribution (s_i):
  2700. + *
  2701. + *   rw_i = {   2,   4,   1,   0 }
  2702. + *   s_i  = { 2/7, 4/7, 1/7,   0 }
  2703. + *
  2704. + * As per wake_affine() we're interested in the load of two CPUs (the CPU the
  2705. + * task used to run on and the CPU the waker is running on), we need to
  2706. + * compute the effect of waking a task on either CPU and, in case of a sync
  2707. + * wakeup, compute the effect of the current task going to sleep.
  2708. + *
  2709. + * So for a change of @wl to the local @cpu with an overall group weight change
  2710. + * of @wg we can compute the new shares distribution (s'_i) using:
  2711. + *
  2712. + *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)               (2)
  2713. + *
  2714. + * Suppose we're interested in CPUs 0 and 1, and want to compute the load
  2715. + * differences in waking a task to CPU 0. The additional task changes the
  2716. + * weight and shares distributions like:
  2717. + *
  2718. + *   rw'_i = {   3,   4,   1,   0 }
  2719. + *   s'_i  = { 3/8, 4/8, 1/8,   0 }
  2720. + *
  2721. + * We can then compute the difference in effective weight by using:
  2722. + *
  2723. + *   dw_i = S * (s'_i - s_i)                       (3)
  2724. + *
  2725. + * Where 'S' is the group weight as seen by its parent.
  2726. + *
  2727. + * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
  2728. + * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
  2729. + * 4/7) times the weight of the group.
  2730. + */
  2731. +static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  2732. +{
  2733. +   struct sched_entity *se = tg->se[cpu];
  2734. +
  2735. +   if (!tg->parent)    /* the trivial, non-cgroup case */
  2736. +       return wl;
  2737. +
  2738. +   for_each_sched_entity(se) {
  2739. +       long w, W;
  2740. +
  2741. +       tg = se->my_q->tg;
  2742. +
  2743. +       /*
  2744. +        * W = @wg + \Sum rw_j
  2745. +        */
  2746. +       W = wg + calc_tg_weight(tg, se->my_q);
  2747. +
  2748. +       /*
  2749. +        * w = rw_i + @wl
  2750. +        */
  2751. +       w = se->my_q->load.weight + wl;
  2752. +
  2753. +       /*
  2754. +        * wl = S * s'_i; see (2)
  2755. +        */
  2756. +       if (W > 0 && w < W)
  2757. +           wl = (w * tg->shares) / W;
  2758. +       else
  2759. +           wl = tg->shares;
  2760. +
  2761. +       /*
  2762. +        * Per the above, wl is the new se->load.weight value; since
  2763. +        * those are clipped to [MIN_SHARES, ...) do so now. See
  2764. +        * calc_cfs_shares().
  2765. +        */
  2766. +       if (wl < MIN_SHARES)
  2767. +           wl = MIN_SHARES;
  2768. +
  2769. +       /*
  2770. +        * wl = dw_i = S * (s'_i - s_i); see (3)
  2771. +        */
  2772. +       wl -= se->load.weight;
  2773. +
  2774. +       /*
  2775. +        * Recursively apply this logic to all parent groups to compute
  2776. +        * the final effective load change on the root group. Since
  2777. +        * only the @tg group gets extra weight, all parent groups can
  2778. +        * only redistribute existing shares. @wl is the shift in shares
  2779. +        * resulting from this level per the above.
  2780. +        */
  2781. +       wg = 0;
  2782. +   }
  2783. +
  2784. +   return wl;
  2785. +}
  2786. +#else
  2787. +
  2788. +static inline unsigned long effective_load(struct task_group *tg, int cpu,
  2789. +       unsigned long wl, unsigned long wg)
  2790. +{
  2791. +   return wl;
  2792. +}
  2793. +
  2794. +#endif
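The worked example in the effective_load() comment (rw_i = {2, 4, 1, 0}, one extra unit of weight on CPU 0) can be checked numerically. The sketch below simply evaluates equations (1)-(3) for that case with the group weight S left symbolic; it is an illustration, not part of the patch:

/*
 * Illustrative sketch, not part of the patch: evaluate the worked example
 * from the effective_load() comment. The group weight S is left symbolic
 * (S = 1), so the printed deltas are the 5/56 and -4/56 fractions quoted
 * in the comment.
 */
#include <stdio.h>

int main(void)
{
    double rw[4] = { 2, 4, 1, 0 }; /* per-cpu runqueue weights (rw_i)   */
    double wl = 1, wg = 1;         /* one extra unit of weight on CPU 0 */
    double sum = rw[0] + rw[1] + rw[2] + rw[3];
    int i;

    for (i = 0; i < 2; i++) {
        double s_old = rw[i] / sum;                              /* eq. (1) */
        double s_new = (rw[i] + (i == 0 ? wl : 0)) / (sum + wg); /* eq. (2) */

        printf("cpu%d: dw = %+.4f (expected %+.4f)\n",
               i, s_new - s_old, i == 0 ? 5.0 / 56 : -4.0 / 56); /* eq. (3), S = 1 */
    }
    return 0;
}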
  2795. +
  2796. +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  2797. +{
  2798. +   s64 this_load, load;
  2799. +   int idx, this_cpu, prev_cpu;
  2800. +   unsigned long tl_per_task;
  2801. +   struct task_group *tg;
  2802. +   unsigned long weight;
  2803. +   int balanced;
  2804. +
  2805. +   idx   = sd->wake_idx;
  2806. +   this_cpu  = smp_processor_id();
  2807. +   prev_cpu  = task_cpu(p);
  2808. +   load      = source_load(prev_cpu, idx);
  2809. +   this_load = target_load(this_cpu, idx);
  2810. +
  2811. +   /*
  2812. +    * If sync wakeup then subtract the (maximum possible)
  2813. +    * effect of the currently running task from the load
  2814. +    * of the current CPU:
  2815. +    */
  2816. +   if (sync) {
  2817. +       tg = task_group(current);
  2818. +       weight = current->se.load.weight;
  2819. +
  2820. +       this_load += effective_load(tg, this_cpu, -weight, -weight);
  2821. +       load += effective_load(tg, prev_cpu, 0, -weight);
  2822. +   }
  2823. +
  2824. +   tg = task_group(p);
  2825. +   weight = p->se.load.weight;
  2826. +
  2827. +   /*
  2828. +    * In low-load situations, where prev_cpu is idle and this_cpu is idle
  2829. +    * due to the sync cause above having dropped this_load to 0, we'll
  2830. +    * always have an imbalance, but there's really nothing you can do
  2831. +    * about that, so that's good too.
  2832. +    *
  2833. +    * Otherwise check whether the two cpus are close enough in load to allow this
  2834. +    * task to be woken on this_cpu.
  2835. +    */
  2836. +   if (this_load > 0) {
  2837. +       s64 this_eff_load, prev_eff_load;
  2838. +
  2839. +       this_eff_load = 100;
  2840. +       this_eff_load *= power_of(prev_cpu);
  2841. +       this_eff_load *= this_load +
  2842. +           effective_load(tg, this_cpu, weight, weight);
  2843. +
  2844. +       prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
  2845. +       prev_eff_load *= power_of(this_cpu);
  2846. +       prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
  2847. +
  2848. +       balanced = this_eff_load <= prev_eff_load;
  2849. +   } else
  2850. +       balanced = true;
  2851. +
  2852. +   /*
  2853. +    * If the currently running task will sleep within
  2854. +    * a reasonable amount of time then attract this newly
  2855. +    * woken task:
  2856. +    */
  2857. +   if (sync && balanced)
  2858. +       return 1;
  2859. +
  2860. +   schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
  2861. +   tl_per_task = cpu_avg_load_per_task(this_cpu);
  2862. +
  2863. +   if (balanced ||
  2864. +       (this_load <= load &&
  2865. +        this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
  2866. +       /*
  2867. +        * This domain has SD_WAKE_AFFINE and
  2868. +        * p is cache cold in this domain, and
  2869. +        * there is no bad imbalance.
  2870. +        */
  2871. +       schedstat_inc(sd, ttwu_move_affine);
  2872. +       schedstat_inc(p, se.statistics.nr_wakeups_affine);
  2873. +
  2874. +       return 1;
  2875. +   }
  2876. +   return 0;
  2877. +}
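wake_affine() declares the wakeup "balanced" when this_eff_load, scaled by prev_cpu's power, does not exceed prev_eff_load, scaled by this_cpu's power plus half of the imbalance_pct margin. A simplified numeric sketch of that comparison; the loads are hypothetical, the effective_load() terms are folded into them, and 125 is only a commonly used imbalance_pct:

/*
 * Illustrative sketch, not part of the patch: the balanced test from
 * wake_affine() above with hypothetical loads, both CPUs at the default
 * power of 1024, and the effective_load() terms folded into the loads.
 * imbalance_pct = 125 is only a commonly used sched_domain default.
 */
#include <stdio.h>

int main(void)
{
    long long this_load = 2048, prev_load = 1024; /* hypothetical weights */
    long long this_power = 1024, prev_power = 1024;
    int imbalance_pct = 125;

    long long this_eff = 100 * prev_power * this_load;
    long long prev_eff = (100 + (imbalance_pct - 100) / 2) * this_power * prev_load;

    /* 0 here: this_cpu already carries twice the load, so the affine pull
     * is not considered balanced and the task is not attracted to it. */
    printf("balanced = %d\n", this_eff <= prev_eff);
    return 0;
}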
  2878. +
  2879. +/*
  2880. + * find_idlest_group finds and returns the least busy CPU group within the
  2881. + * domain.
  2882. + */
  2883. +static struct sched_group *
  2884. +find_idlest_group(struct sched_domain *sd, struct task_struct *p,
  2885. +         int this_cpu, int load_idx)
  2886. +{
  2887. +   struct sched_group *idlest = NULL, *group = sd->groups;
  2888. +   unsigned long min_load = ULONG_MAX, this_load = 0;
  2889. +   int imbalance = 100 + (sd->imbalance_pct-100)/2;
  2890. +
  2891. +   do {
  2892. +       unsigned long load, avg_load;
  2893. +       int local_group;
  2894. +       int i;
  2895. +
  2896. +       /* Skip over this group if it has no CPUs allowed */
  2897. +       if (!cpumask_intersects(sched_group_cpus(group),
  2898. +                   tsk_cpus_allowed(p)))
  2899. +           continue;
  2900. +
  2901. +       local_group = cpumask_test_cpu(this_cpu,
  2902. +                          sched_group_cpus(group));
  2903. +
  2904. +       /* Tally up the load of all CPUs in the group */
  2905. +       avg_load = 0;
  2906. +
  2907. +       for_each_cpu(i, sched_group_cpus(group)) {
  2908. +           /* Bias balancing toward cpus of our domain */
  2909. +           if (local_group)
  2910. +               load = source_load(i, load_idx);
  2911. +           else
  2912. +               load = target_load(i, load_idx);
  2913. +
  2914. +           avg_load += load;
  2915. +       }
  2916. +
  2917. +       /* Adjust by relative CPU power of the group */
  2918. +       avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
  2919. +
  2920. +       if (local_group) {
  2921. +           this_load = avg_load;
  2922. +       } else if (avg_load < min_load) {
  2923. +           min_load = avg_load;
  2924. +           idlest = group;
  2925. +       }
  2926. +   } while (group = group->next, group != sd->groups);
  2927. +
  2928. +   if (!idlest || 100*this_load < imbalance*min_load)
  2929. +       return NULL;
  2930. +   return idlest;
  2931. +}
  2932. +
  2933. +/*
  2934. + * find_idlest_cpu - find the idlest cpu among the cpus in group.
  2935. + */
  2936. +static int
  2937. +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  2938. +{
  2939. +   unsigned long load, min_load = ULONG_MAX;
  2940. +   int idlest = -1;
  2941. +   int i;
  2942. +
  2943. +   /* Traverse only the allowed CPUs */
  2944. +   for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
  2945. +       load = weighted_cpuload(i);
  2946. +
  2947. +       if (load < min_load || (load == min_load && i == this_cpu)) {
  2948. +           min_load = load;
  2949. +           idlest = i;
  2950. +       }
  2951. +   }
  2952. +
  2953. +   return idlest;
  2954. +}
  2955. +
  2956. +/*
  2957. + * Try and locate an idle CPU in the sched_domain.
  2958. + */
  2959. +static int select_idle_sibling(struct task_struct *p, int target)
  2960. +{
  2961. +   int cpu = smp_processor_id();
  2962. +   int prev_cpu = task_cpu(p);
  2963. +   struct sched_domain *sd;
  2964. +   struct sched_group *sg;
  2965. +   int i;
  2966. +
  2967. +   /*
  2968. +    * If the task is going to be woken-up on this cpu and if it is
  2969. +    * already idle, then it is the right target.
  2970. +    */
  2971. +   if (target == cpu && idle_cpu(cpu))
  2972. +       return cpu;
  2973. +
  2974. +   /*
  2975. +    * If the task is going to be woken-up on the cpu where it previously
  2976. +    * ran and if it is currently idle, then it the right target.
  2977. +    */
  2978. +   if (target == prev_cpu && idle_cpu(prev_cpu))
  2979. +       return prev_cpu;
  2980. +
  2981. +   /*
  2982. +    * Otherwise, iterate the domains and find an eligible idle cpu.
  2983. +    */
  2984. +   rcu_read_lock();
  2985. +
  2986. +   sd = rcu_dereference(per_cpu(sd_llc, target));
  2987. +   for_each_lower_domain(sd) {
  2988. +       sg = sd->groups;
  2989. +       do {
  2990. +           if (!cpumask_intersects(sched_group_cpus(sg),
  2991. +                       tsk_cpus_allowed(p)))
  2992. +               goto next;
  2993. +
  2994. +           for_each_cpu(i, sched_group_cpus(sg)) {
  2995. +               if (!idle_cpu(i))
  2996. +                   goto next;
  2997. +           }
  2998. +
  2999. +           target = cpumask_first_and(sched_group_cpus(sg),
  3000. +                   tsk_cpus_allowed(p));
  3001. +           goto done;
  3002. +next:
  3003. +           sg = sg->next;
  3004. +       } while (sg != sd->groups);
  3005. +   }
  3006. +done:
  3007. +   rcu_read_unlock();
  3008. +
  3009. +   return target;
  3010. +}
  3011. +
  3012. +/*
  3013. + * select_task_rq_fair: balance the current task (running on cpu) in domains
  3014. + * that have the given sd_flag set. In practice, this is SD_BALANCE_FORK and
  3015. + * SD_BALANCE_EXEC.
  3016. + *
  3017. + * Balance, ie. select the least loaded group.
  3018. + *
  3019. + * Returns the target CPU number, or the same CPU if no balancing is needed.
  3020. + *
  3021. + * preempt must be disabled.
  3022. + */
  3023. +static int
  3024. +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
  3025. +{
  3026. +   struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
  3027. +   int cpu = smp_processor_id();
  3028. +   int prev_cpu = task_cpu(p);
  3029. +   int new_cpu = cpu;
  3030. +   int want_affine = 0;
  3031. +   int want_sd = 1;
  3032. +   int sync = wake_flags & WF_SYNC;
  3033. +
  3034. +   if (p->rt.nr_cpus_allowed == 1)
  3035. +       return prev_cpu;
  3036. +
  3037. +   if (sd_flag & SD_BALANCE_WAKE) {
  3038. +       if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
  3039. +           want_affine = 1;
  3040. +       new_cpu = prev_cpu;
  3041. +   }
  3042. +
  3043. +   rcu_read_lock();
  3044. +   for_each_domain(cpu, tmp) {
  3045. +       if (!(tmp->flags & SD_LOAD_BALANCE))
  3046. +           continue;
  3047. +
  3048. +       /*
  3049. +        * If power savings logic is enabled for a domain, see if we
  3050. +        * are not overloaded, if so, don't balance wider.
  3051. +        */
  3052. +       if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
  3053. +           unsigned long power = 0;
  3054. +           unsigned long nr_running = 0;
  3055. +           unsigned long capacity;
  3056. +           int i;
  3057. +
  3058. +           for_each_cpu(i, sched_domain_span(tmp)) {
  3059. +               power += power_of(i);
  3060. +               nr_running += cpu_rq(i)->cfs.nr_running;
  3061. +           }
  3062. +
  3063. +           capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
  3064. +
  3065. +           if (tmp->flags & SD_POWERSAVINGS_BALANCE)
  3066. +               nr_running /= 2;
  3067. +
  3068. +           if (nr_running < capacity)
  3069. +               want_sd = 0;
  3070. +       }
  3071. +
  3072. +       /*
  3073. +        * If both cpu and prev_cpu are part of this domain,
  3074. +        * cpu is a valid SD_WAKE_AFFINE target.
  3075. +        */
  3076. +       if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
  3077. +           cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
  3078. +           affine_sd = tmp;
  3079. +           want_affine = 0;
  3080. +       }
  3081. +
  3082. +       if (!want_sd && !want_affine)
  3083. +           break;
  3084. +
  3085. +       if (!(tmp->flags & sd_flag))
  3086. +           continue;
  3087. +
  3088. +       if (want_sd)
  3089. +           sd = tmp;
  3090. +   }
  3091. +
  3092. +   if (affine_sd) {
  3093. +       if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
  3094. +           prev_cpu = cpu;
  3095. +
  3096. +       new_cpu = select_idle_sibling(p, prev_cpu);
  3097. +       goto unlock;
  3098. +   }
  3099. +
  3100. +   while (sd) {
  3101. +       int load_idx = sd->forkexec_idx;
  3102. +       struct sched_group *group;
  3103. +       int weight;
  3104. +
  3105. +       if (!(sd->flags & sd_flag)) {
  3106. +           sd = sd->child;
  3107. +           continue;
  3108. +       }
  3109. +
  3110. +       if (sd_flag & SD_BALANCE_WAKE)
  3111. +           load_idx = sd->wake_idx;
  3112. +
  3113. +       group = find_idlest_group(sd, p, cpu, load_idx);
  3114. +       if (!group) {
  3115. +           sd = sd->child;
  3116. +           continue;
  3117. +       }
  3118. +
  3119. +       new_cpu = find_idlest_cpu(group, p, cpu);
  3120. +       if (new_cpu == -1 || new_cpu == cpu) {
  3121. +           /* Now try balancing at a lower domain level of cpu */
  3122. +           sd = sd->child;
  3123. +           continue;
  3124. +       }
  3125. +
  3126. +       /* Now try balancing at a lower domain level of new_cpu */
  3127. +       cpu = new_cpu;
  3128. +       weight = sd->span_weight;
  3129. +       sd = NULL;
  3130. +       for_each_domain(cpu, tmp) {
  3131. +           if (weight <= tmp->span_weight)
  3132. +               break;
  3133. +           if (tmp->flags & sd_flag)
  3134. +               sd = tmp;
  3135. +       }
  3136. +       /* while loop will break here if sd == NULL */
  3137. +   }
  3138. +unlock:
  3139. +   rcu_read_unlock();
  3140. +
  3141. +   return new_cpu;
  3142. +}
  3143. +#endif /* CONFIG_SMP */
  3144. +
  3145. +static unsigned long
  3146. +wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
  3147. +{
  3148. +   unsigned long gran = sysctl_sched_wakeup_granularity;
  3149. +
  3150. +   /*
  3151. +    * Since it's curr that is running now, convert the gran from real-time
  3152. +    * to virtual-time in its units.
  3153. +    *
  3154. +    * By using 'se' instead of 'curr' we penalize light tasks, so
  3155. +    * they get preempted more easily. That is, if 'se' < 'curr' then
  3156. +    * the resulting gran will be larger, therefore penalizing the
  3157. +    * lighter task; if, on the other hand, 'se' > 'curr' then the
  3158. +    * resulting gran will be smaller, again penalizing the lighter task.
  3159. +    *
  3160. +    * This is especially important for buddies when the leftmost
  3161. +    * task is higher priority than the buddy.
  3162. +    */
  3163. +   return calc_delta_fair(gran, se);
  3164. +}
  3165. +
  3166. +/*
  3167. + * Should 'se' preempt 'curr'.
  3168. + *
  3169. + *             |s1
  3170. + *        |s2
  3171. + *   |s3
  3172. + *         g
  3173. + *      |<--->|c
  3174. + *
  3175. + *  w(c, s1) = -1
  3176. + *  w(c, s2) =  0
  3177. + *  w(c, s3) =  1
  3178. + *
  3179. + */
  3180. +static int
  3181. +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
  3182. +{
  3183. +   s64 gran, vdiff = curr->vruntime - se->vruntime;
  3184. +
  3185. +   if (vdiff <= 0)
  3186. +       return -1;
  3187. +
  3188. +   gran = wakeup_gran(curr, se);
  3189. +   if (vdiff > gran)
  3190. +       return 1;
  3191. +
  3192. +   return 0;
  3193. +}
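
For reference, the -1/0/1 result of wakeup_preempt_entity() can be reproduced outside the kernel. The user-space sketch below is not part of the patch; it assumes a nice-0 task so that calc_delta_fair() is effectively the identity, and the 1 ms figure is only an illustrative stand-in for sysctl_sched_wakeup_granularity. The helper name is invented.

/* gcc -o wakeup_demo wakeup_demo.c */
#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for sysctl_sched_wakeup_granularity (ns). */
static const int64_t wakeup_gran_ns = 1000000;  /* 1 ms, assumed nice-0 */

/* Mirrors wakeup_preempt_entity(): -1, 0 or 1 depending on how far the
 * waking entity's vruntime lags behind the current one. */
static int wakeup_preempt_decision(int64_t curr_vruntime, int64_t se_vruntime)
{
        int64_t vdiff = curr_vruntime - se_vruntime;

        if (vdiff <= 0)
                return -1;              /* waker is not behind: no preemption */
        if (vdiff > wakeup_gran_ns)
                return 1;               /* far enough behind: preempt curr */
        return 0;                       /* within the granularity: leave curr alone */
}

int main(void)
{
        printf("%d\n", wakeup_preempt_decision(5000000, 6000000)); /* -1 */
        printf("%d\n", wakeup_preempt_decision(5000000, 4500000)); /*  0 */
        printf("%d\n", wakeup_preempt_decision(5000000, 2000000)); /*  1 */
        return 0;
}
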
  3194. +
  3195. +static void set_last_buddy(struct sched_entity *se)
  3196. +{
  3197. +   if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
  3198. +       return;
  3199. +
  3200. +   for_each_sched_entity(se)
  3201. +       cfs_rq_of(se)->last = se;
  3202. +}
  3203. +
  3204. +static void set_next_buddy(struct sched_entity *se)
  3205. +{
  3206. +   if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
  3207. +       return;
  3208. +
  3209. +   for_each_sched_entity(se)
  3210. +       cfs_rq_of(se)->next = se;
  3211. +}
  3212. +
  3213. +static void set_skip_buddy(struct sched_entity *se)
  3214. +{
  3215. +   for_each_sched_entity(se)
  3216. +       cfs_rq_of(se)->skip = se;
  3217. +}
  3218. +
  3219. +/*
  3220. + * Preempt the current task with a newly woken task if needed:
  3221. + */
  3222. +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  3223. +{
  3224. +   struct task_struct *curr = rq->curr;
  3225. +   struct sched_entity *se = &curr->se, *pse = &p->se;
  3226. +   struct cfs_rq *cfs_rq = task_cfs_rq(curr);
  3227. +   int scale = cfs_rq->nr_running >= sched_nr_latency;
  3228. +   int next_buddy_marked = 0;
  3229. +
  3230. +   if (unlikely(se == pse))
  3231. +       return;
  3232. +
  3233. +   /*
  3234. +    * This is possible from callers such as pull_task(), in which we
  3235. +    * unconditionally check_preempt_curr() after an enqueue (which may have
  3236. +    * led to a throttle).  This both saves work and prevents false
  3237. +    * next-buddy nomination below.
  3238. +    */
  3239. +   if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
  3240. +       return;
  3241. +
  3242. +   if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
  3243. +       set_next_buddy(pse);
  3244. +       next_buddy_marked = 1;
  3245. +   }
  3246. +
  3247. +   /*
  3248. +    * We can come here with TIF_NEED_RESCHED already set from new task
  3249. +    * wake up path.
  3250. +    *
  3251. +    * Note: this also catches the edge-case of curr being in a throttled
  3252. +    * group (e.g. via set_curr_task), since update_curr() (in the
  3253. +    * enqueue of curr) will have resulted in resched being set.  This
  3254. +    * prevents us from potentially nominating it as a false LAST_BUDDY
  3255. +    * below.
  3256. +    */
  3257. +   if (test_tsk_need_resched(curr))
  3258. +       return;
  3259. +
  3260. +   /* Idle tasks are by definition preempted by non-idle tasks. */
  3261. +   if (unlikely(curr->policy == SCHED_IDLE) &&
  3262. +       likely(p->policy != SCHED_IDLE))
  3263. +       goto preempt;
  3264. +
  3265. +   /*
  3266. +    * Batch and idle tasks do not preempt non-idle tasks (their preemption
  3267. +    * is driven by the tick):
  3268. +    */
  3269. +   if (unlikely(p->policy != SCHED_NORMAL))
  3270. +       return;
  3271. +
  3272. +   find_matching_se(&se, &pse);
  3273. +   update_curr(cfs_rq_of(se));
  3274. +   BUG_ON(!pse);
  3275. +   if (wakeup_preempt_entity(se, pse) == 1) {
  3276. +       /*
  3277. +        * Bias pick_next to pick the sched entity that is
  3278. +        * triggering this preemption.
  3279. +        */
  3280. +       if (!next_buddy_marked)
  3281. +           set_next_buddy(pse);
  3282. +       goto preempt;
  3283. +   }
  3284. +
  3285. +   return;
  3286. +
  3287. +preempt:
  3288. +   resched_task(curr);
  3289. +   /*
  3290. +    * Only set the backward buddy when the current task is still
  3291. +    * on the rq. This can happen when a wakeup gets interleaved
  3292. +    * with schedule on the ->pre_schedule() or idle_balance()
  3293. +    * point, either of which can drop the rq lock.
  3294. +    *
  3295. +    * Also, during early boot the idle thread is in the fair class,
  3296. +    * for obvious reasons it's a bad idea to schedule back to it.
  3297. +    */
  3298. +   if (unlikely(!se->on_rq || curr == rq->idle))
  3299. +       return;
  3300. +
  3301. +   if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
  3302. +       set_last_buddy(se);
  3303. +}
  3304. +
  3305. +static struct task_struct *pick_next_task_fair(struct rq *rq)
  3306. +{
  3307. +   struct task_struct *p;
  3308. +   struct cfs_rq *cfs_rq = &rq->cfs;
  3309. +   struct sched_entity *se;
  3310. +
  3311. +   if (!cfs_rq->nr_running)
  3312. +       return NULL;
  3313. +
  3314. +   do {
  3315. +       se = pick_next_entity(cfs_rq);
  3316. +       set_next_entity(cfs_rq, se);
  3317. +       cfs_rq = group_cfs_rq(se);
  3318. +   } while (cfs_rq);
  3319. +
  3320. +   p = task_of(se);
  3321. +   if (hrtick_enabled(rq))
  3322. +       hrtick_start_fair(rq, p);
  3323. +
  3324. +   return p;
  3325. +}
  3326. +
  3327. +/*
  3328. + * Account for a descheduled task:
  3329. + */
  3330. +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
  3331. +{
  3332. +   struct sched_entity *se = &prev->se;
  3333. +   struct cfs_rq *cfs_rq;
  3334. +
  3335. +   for_each_sched_entity(se) {
  3336. +       cfs_rq = cfs_rq_of(se);
  3337. +       put_prev_entity(cfs_rq, se);
  3338. +   }
  3339. +}
  3340. +
  3341. +/*
  3342. + * sched_yield() is very simple
  3343. + *
  3344. + * The magic of dealing with the ->skip buddy is in pick_next_entity.
  3345. + */
  3346. +static void yield_task_fair(struct rq *rq)
  3347. +{
  3348. +   struct task_struct *curr = rq->curr;
  3349. +   struct cfs_rq *cfs_rq = task_cfs_rq(curr);
  3350. +   struct sched_entity *se = &curr->se;
  3351. +
  3352. +   /*
  3353. +    * Are we the only task in the tree?
  3354. +    */
  3355. +   if (unlikely(rq->nr_running == 1))
  3356. +       return;
  3357. +
  3358. +   clear_buddies(cfs_rq, se);
  3359. +
  3360. +   if (curr->policy != SCHED_BATCH) {
  3361. +       update_rq_clock(rq);
  3362. +       /*
  3363. +        * Update run-time statistics of the 'current'.
  3364. +        */
  3365. +       update_curr(cfs_rq);
  3366. +       /*
  3367. +        * Tell update_rq_clock() that we've just updated,
  3368. +        * so we don't do microscopic update in schedule()
  3369. +        * and double the fastpath cost.
  3370. +        */
  3371. +       rq->skip_clock_update = 1;
  3372. +   }
  3373. +
  3374. +   set_skip_buddy(se);
  3375. +}
  3376. +
  3377. +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
  3378. +{
  3379. +   struct sched_entity *se = &p->se;
  3380. +
  3381. +   /* throttled hierarchies are not runnable */
  3382. +   if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
  3383. +       return false;
  3384. +
  3385. +   /* Tell the scheduler that we'd really like pse to run next. */
  3386. +   set_next_buddy(se);
  3387. +
  3388. +   yield_task_fair(rq);
  3389. +
  3390. +   return true;
  3391. +}
  3392. +
  3393. +#ifdef CONFIG_SMP
  3394. +/**************************************************
  3395. + * Fair scheduling class load-balancing methods:
  3396. + */
  3397. +
  3398. +/*
  3399. + * pull_task - move a task from a remote runqueue to the local runqueue.
  3400. + * Both runqueues must be locked.
  3401. + */
  3402. +static void pull_task(struct rq *src_rq, struct task_struct *p,
  3403. +             struct rq *this_rq, int this_cpu)
  3404. +{
  3405. +   deactivate_task(src_rq, p, 0);
  3406. +   set_task_cpu(p, this_cpu);
  3407. +   activate_task(this_rq, p, 0);
  3408. +   check_preempt_curr(this_rq, p, 0);
  3409. +}
  3410. +
  3411. +/*
  3412. + * Is this task likely cache-hot:
  3413. + */
  3414. +static int
  3415. +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
  3416. +{
  3417. +   s64 delta;
  3418. +
  3419. +   if (p->sched_class != &fair_sched_class)
  3420. +       return 0;
  3421. +
  3422. +   if (unlikely(p->policy == SCHED_IDLE))
  3423. +       return 0;
  3424. +
  3425. +   /*
  3426. +    * Buddy candidates are cache hot:
  3427. +    */
  3428. +   if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
  3429. +           (&p->se == cfs_rq_of(&p->se)->next ||
  3430. +            &p->se == cfs_rq_of(&p->se)->last))
  3431. +       return 1;
  3432. +
  3433. +   if (sysctl_sched_migration_cost == -1)
  3434. +       return 1;
  3435. +   if (sysctl_sched_migration_cost == 0)
  3436. +       return 0;
  3437. +
  3438. +   delta = now - p->se.exec_start;
  3439. +
  3440. +   return delta < (s64)sysctl_sched_migration_cost;
  3441. +}
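
A minimal user-space sketch of the cache-hot test in task_hot(), ignoring the buddy and sched-class special cases; the 0.5 ms figure stands in for sysctl_sched_migration_cost and is an assumption for illustration, not taken from the patch.

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for sysctl_sched_migration_cost (ns). */
static const int64_t migration_cost_ns = 500000;       /* 0.5 ms, assumed */

/* Mirrors the tail of task_hot(): a task is considered cache hot if it
 * last ran within migration_cost_ns of "now". */
static int task_is_cache_hot(uint64_t now_ns, uint64_t exec_start_ns)
{
        int64_t delta = (int64_t)(now_ns - exec_start_ns);

        if (migration_cost_ns == -1)
                return 1;
        if (migration_cost_ns == 0)
                return 0;
        return delta < migration_cost_ns;
}

int main(void)
{
        printf("%d\n", task_is_cache_hot(10000000, 9800000)); /* 200us ago -> hot (1)  */
        printf("%d\n", task_is_cache_hot(10000000, 9000000)); /* 1ms ago   -> cold (0) */
        return 0;
}
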
  3442. +
  3443. +#define LBF_ALL_PINNED 0x01
  3444. +#define LBF_NEED_BREAK 0x02    /* clears into HAD_BREAK */
  3445. +#define LBF_HAD_BREAK  0x04
  3446. +#define LBF_HAD_BREAKS 0x0C    /* count HAD_BREAKs overflows into ABORT */
  3447. +#define LBF_ABORT  0x10
  3448. +
  3449. +/*
  3450. + * can_migrate_task - may task p, currently on runqueue rq, be migrated to this_cpu?
  3451. + */
  3452. +static
  3453. +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
  3454. +            struct sched_domain *sd, enum cpu_idle_type idle,
  3455. +            int *lb_flags)
  3456. +{
  3457. +   int tsk_cache_hot = 0;
  3458. +   /*
  3459. +    * We do not migrate tasks that are:
  3460. +    * 1) running (obviously), or
  3461. +    * 2) cannot be migrated to this CPU due to cpus_allowed, or
  3462. +    * 3) are cache-hot on their current CPU.
  3463. +    */
  3464. +   if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
  3465. +       schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
  3466. +       return 0;
  3467. +   }
  3468. +   *lb_flags &= ~LBF_ALL_PINNED;
  3469. +
  3470. +   if (task_running(rq, p)) {
  3471. +       schedstat_inc(p, se.statistics.nr_failed_migrations_running);
  3472. +       return 0;
  3473. +   }
  3474. +
  3475. +   /*
  3476. +    * Aggressive migration if:
  3477. +    * 1) task is cache cold, or
  3478. +    * 2) too many balance attempts have failed.
  3479. +    */
  3480. +
  3481. +   tsk_cache_hot = task_hot(p, rq->clock_task, sd);
  3482. +   if (!tsk_cache_hot ||
  3483. +       sd->nr_balance_failed > sd->cache_nice_tries) {
  3484. +#ifdef CONFIG_SCHEDSTATS
  3485. +       if (tsk_cache_hot) {
  3486. +           schedstat_inc(sd, lb_hot_gained[idle]);
  3487. +           schedstat_inc(p, se.statistics.nr_forced_migrations);
  3488. +       }
  3489. +#endif
  3490. +       return 1;
  3491. +   }
  3492. +
  3493. +   if (tsk_cache_hot) {
  3494. +       schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
  3495. +       return 0;
  3496. +   }
  3497. +   return 1;
  3498. +}
  3499. +
  3500. +/*
  3501. + * move_one_task tries to move exactly one task from busiest to this_rq, as
  3502. + * part of active balancing operations within "domain".
  3503. + * Returns 1 if successful and 0 otherwise.
  3504. + *
  3505. + * Called with both runqueues locked.
  3506. + */
  3507. +static int
  3508. +move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
  3509. +         struct sched_domain *sd, enum cpu_idle_type idle)
  3510. +{
  3511. +   struct task_struct *p, *n;
  3512. +   struct cfs_rq *cfs_rq;
  3513. +   int pinned = 0;
  3514. +
  3515. +   for_each_leaf_cfs_rq(busiest, cfs_rq) {
  3516. +       list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
  3517. +           if (throttled_lb_pair(task_group(p),
  3518. +                         busiest->cpu, this_cpu))
  3519. +               break;
  3520. +
  3521. +           if (!can_migrate_task(p, busiest, this_cpu,
  3522. +                       sd, idle, &pinned))
  3523. +               continue;
  3524. +
  3525. +           pull_task(busiest, p, this_rq, this_cpu);
  3526. +           /*
  3527. +            * Right now, this is only the second place pull_task()
  3528. +            * is called, so we can safely collect pull_task()
  3529. +            * stats here rather than inside pull_task().
  3530. +            */
  3531. +           schedstat_inc(sd, lb_gained[idle]);
  3532. +           return 1;
  3533. +       }
  3534. +   }
  3535. +
  3536. +   return 0;
  3537. +}
  3538. +
  3539. +static unsigned long
  3540. +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
  3541. +         unsigned long max_load_move, struct sched_domain *sd,
  3542. +         enum cpu_idle_type idle, int *lb_flags,
  3543. +         struct cfs_rq *busiest_cfs_rq)
  3544. +{
  3545. +   int loops = 0, pulled = 0;
  3546. +   long rem_load_move = max_load_move;
  3547. +   struct task_struct *p, *n;
  3548. +
  3549. +   if (max_load_move == 0)
  3550. +       goto out;
  3551. +
  3552. +   list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
  3553. +       if (loops++ > sysctl_sched_nr_migrate) {
  3554. +           *lb_flags |= LBF_NEED_BREAK;
  3555. +           break;
  3556. +       }
  3557. +
  3558. +       if ((p->se.load.weight >> 1) > rem_load_move ||
  3559. +           !can_migrate_task(p, busiest, this_cpu, sd, idle,
  3560. +                     lb_flags))
  3561. +           continue;
  3562. +
  3563. +       pull_task(busiest, p, this_rq, this_cpu);
  3564. +       pulled++;
  3565. +       rem_load_move -= p->se.load.weight;
  3566. +
  3567. +#ifdef CONFIG_PREEMPT
  3568. +       /*
  3569. +        * NEWIDLE balancing is a source of latency, so preemptible
  3570. +        * kernels will stop after the first task is pulled to minimize
  3571. +        * the critical section.
  3572. +        */
  3573. +       if (idle == CPU_NEWLY_IDLE) {
  3574. +           *lb_flags |= LBF_ABORT;
  3575. +           break;
  3576. +       }
  3577. +#endif
  3578. +
  3579. +       /*
  3580. +        * We only want to steal up to the prescribed amount of
  3581. +        * weighted load.
  3582. +        */
  3583. +       if (rem_load_move <= 0)
  3584. +           break;
  3585. +   }
  3586. +out:
  3587. +   /*
  3588. +    * Right now, this is one of only two places pull_task() is called,
  3589. +    * so we can safely collect pull_task() stats here rather than
  3590. +    * inside pull_task().
  3591. +    */
  3592. +   schedstat_add(sd, lb_gained[idle], pulled);
  3593. +
  3594. +   return max_load_move - rem_load_move;
  3595. +}
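
The per-task filter in balance_tasks() above skips any task whose weight, halved, already exceeds the remaining load budget. A small sketch of just that check follows; the helper name and the weights are made up.

#include <stdio.h>

/* Mirrors the per-task filter in balance_tasks(): a task is skipped when
 * half of its load weight already exceeds what is left to move. */
static int worth_pulling(unsigned long se_weight, long rem_load_move)
{
        return !((long)(se_weight >> 1) > rem_load_move);
}

int main(void)
{
        printf("%d\n", worth_pulling(1024, 2048));  /* 512 <= 2048: pull (1) */
        printf("%d\n", worth_pulling(1024, 300));   /* 512 >  300:  skip (0) */
        return 0;
}
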
  3596. +
  3597. +#ifdef CONFIG_FAIR_GROUP_SCHED
  3598. +/*
  3599. + * update tg->load_weight by folding this cpu's load_avg
  3600. + */
  3601. +static int update_shares_cpu(struct task_group *tg, int cpu)
  3602. +{
  3603. +   struct cfs_rq *cfs_rq;
  3604. +   unsigned long flags;
  3605. +   struct rq *rq;
  3606. +
  3607. +   if (!tg->se[cpu])
  3608. +       return 0;
  3609. +
  3610. +   rq = cpu_rq(cpu);
  3611. +   cfs_rq = tg->cfs_rq[cpu];
  3612. +
  3613. +   raw_spin_lock_irqsave(&rq->lock, flags);
  3614. +
  3615. +   update_rq_clock(rq);
  3616. +   update_cfs_load(cfs_rq, 1);
  3617. +
  3618. +   /*
  3619. +    * We need to update shares after updating tg->load_weight in
  3620. +    * order to adjust the weight of groups with long running tasks.
  3621. +    */
  3622. +   update_cfs_shares(cfs_rq);
  3623. +
  3624. +   raw_spin_unlock_irqrestore(&rq->lock, flags);
  3625. +
  3626. +   return 0;
  3627. +}
  3628. +
  3629. +static void update_shares(int cpu)
  3630. +{
  3631. +   struct cfs_rq *cfs_rq;
  3632. +   struct rq *rq = cpu_rq(cpu);
  3633. +
  3634. +   rcu_read_lock();
  3635. +   /*
  3636. +    * Iterates the task_group tree in a bottom up fashion, see
  3637. +    * list_add_leaf_cfs_rq() for details.
  3638. +    */
  3639. +   for_each_leaf_cfs_rq(rq, cfs_rq) {
  3640. +       /* throttled entities do not contribute to load */
  3641. +       if (throttled_hierarchy(cfs_rq))
  3642. +           continue;
  3643. +
  3644. +       update_shares_cpu(cfs_rq->tg, cpu);
  3645. +   }
  3646. +   rcu_read_unlock();
  3647. +}
  3648. +
  3649. +/*
  3650. + * Compute the cpu's hierarchical load factor for each task group.
  3651. + * This needs to be done in a top-down fashion because the load of a child
  3652. + * group is a fraction of its parents load.
  3653. + */
  3654. +static int tg_load_down(struct task_group *tg, void *data)
  3655. +{
  3656. +   unsigned long load;
  3657. +   long cpu = (long)data;
  3658. +
  3659. +   if (!tg->parent) {
  3660. +       load = cpu_rq(cpu)->load.weight;
  3661. +   } else {
  3662. +       load = tg->parent->cfs_rq[cpu]->h_load;
  3663. +       load *= tg->se[cpu]->load.weight;
  3664. +       load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
  3665. +   }
  3666. +
  3667. +   tg->cfs_rq[cpu]->h_load = load;
  3668. +
  3669. +   return 0;
  3670. +}
  3671. +
  3672. +static void update_h_load(long cpu)
  3673. +{
  3674. +   walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  3675. +}
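
tg_load_down() computes each group's hierarchical load top-down: a child's h_load is the parent's h_load scaled by the child entity's share of the parent runqueue weight, with the +1 guarding against a divide by zero. The sketch below just replays that arithmetic with invented weights.

#include <stdio.h>

/* Mirrors the non-root branch of tg_load_down(). */
static unsigned long h_load(unsigned long parent_h_load,
                            unsigned long se_weight,
                            unsigned long parent_cfs_weight)
{
        return parent_h_load * se_weight / (parent_cfs_weight + 1);
}

int main(void)
{
        /* Root rq weight 3072; a group entity weighing 1024 on a parent
         * runqueue of total weight 3072 carries roughly a third of it. */
        unsigned long root = 3072;
        printf("%lu\n", h_load(root, 1024, 3072));   /* -> 1023 */
        return 0;
}
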
  3676. +
  3677. +static unsigned long
  3678. +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
  3679. +         unsigned long max_load_move,
  3680. +         struct sched_domain *sd, enum cpu_idle_type idle,
  3681. +         int *lb_flags)
  3682. +{
  3683. +   long rem_load_move = max_load_move;
  3684. +   struct cfs_rq *busiest_cfs_rq;
  3685. +
  3686. +   rcu_read_lock();
  3687. +   update_h_load(cpu_of(busiest));
  3688. +
  3689. +   for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
  3690. +       unsigned long busiest_h_load = busiest_cfs_rq->h_load;
  3691. +       unsigned long busiest_weight = busiest_cfs_rq->load.weight;
  3692. +       u64 rem_load, moved_load;
  3693. +
  3694. +       if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
  3695. +           break;
  3696. +
  3697. +       /*
  3698. +        * empty group or part of a throttled hierarchy
  3699. +        */
  3700. +       if (!busiest_cfs_rq->task_weight ||
  3701. +           throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
  3702. +           continue;
  3703. +
  3704. +       rem_load = (u64)rem_load_move * busiest_weight;
  3705. +       rem_load = div_u64(rem_load, busiest_h_load + 1);
  3706. +
  3707. +       moved_load = balance_tasks(this_rq, this_cpu, busiest,
  3708. +               rem_load, sd, idle, lb_flags,
  3709. +               busiest_cfs_rq);
  3710. +
  3711. +       if (!moved_load)
  3712. +           continue;
  3713. +
  3714. +       moved_load *= busiest_h_load;
  3715. +       moved_load = div_u64(moved_load, busiest_weight + 1);
  3716. +
  3717. +       rem_load_move -= moved_load;
  3718. +       if (rem_load_move < 0)
  3719. +           break;
  3720. +   }
  3721. +   rcu_read_unlock();
  3722. +
  3723. +   return max_load_move - rem_load_move;
  3724. +}
  3725. +#else
  3726. +static inline void update_shares(int cpu)
  3727. +{
  3728. +}
  3729. +
  3730. +static unsigned long
  3731. +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
  3732. +         unsigned long max_load_move,
  3733. +         struct sched_domain *sd, enum cpu_idle_type idle,
  3734. +         int *lb_flags)
  3735. +{
  3736. +   return balance_tasks(this_rq, this_cpu, busiest,
  3737. +           max_load_move, sd, idle, lb_flags,
  3738. +           &busiest->cfs);
  3739. +}
  3740. +#endif
  3741. +
  3742. +/*
  3743. + * move_tasks tries to move up to max_load_move weighted load from busiest to
  3744. + * this_rq, as part of a balancing operation within domain "sd".
  3745. + * Returns 1 if successful and 0 otherwise.
  3746. + *
  3747. + * Called with both runqueues locked.
  3748. + */
  3749. +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
  3750. +             unsigned long max_load_move,
  3751. +             struct sched_domain *sd, enum cpu_idle_type idle,
  3752. +             int *lb_flags)
  3753. +{
  3754. +   unsigned long total_load_moved = 0, load_moved;
  3755. +
  3756. +   do {
  3757. +       load_moved = load_balance_fair(this_rq, this_cpu, busiest,
  3758. +               max_load_move - total_load_moved,
  3759. +               sd, idle, lb_flags);
  3760. +
  3761. +       total_load_moved += load_moved;
  3762. +
  3763. +       if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
  3764. +           break;
  3765. +
  3766. +#ifdef CONFIG_PREEMPT
  3767. +       /*
  3768. +        * NEWIDLE balancing is a source of latency, so preemptible
  3769. +        * kernels will stop after the first task is pulled to minimize
  3770. +        * the critical section.
  3771. +        */
  3772. +       if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
  3773. +           *lb_flags |= LBF_ABORT;
  3774. +           break;
  3775. +       }
  3776. +#endif
  3777. +   } while (load_moved && max_load_move > total_load_moved);
  3778. +
  3779. +   return total_load_moved > 0;
  3780. +}
  3781. +
  3782. +/********** Helpers for find_busiest_group ************************/
  3783. +/*
  3784. + * sd_lb_stats - Structure to store the statistics of a sched_domain
  3785. + *         during load balancing.
  3786. + */
  3787. +struct sd_lb_stats {
  3788. +   struct sched_group *busiest; /* Busiest group in this sd */
  3789. +   struct sched_group *this;  /* Local group in this sd */
  3790. +   unsigned long total_load;  /* Total load of all groups in sd */
  3791. +   unsigned long total_pwr;   /*   Total power of all groups in sd */
  3792. +   unsigned long avg_load;    /* Average load across all groups in sd */
  3793. +
  3794. +   /** Statistics of this group */
  3795. +   unsigned long this_load;
  3796. +   unsigned long this_load_per_task;
  3797. +   unsigned long this_nr_running;
  3798. +   unsigned long this_has_capacity;
  3799. +   unsigned int  this_idle_cpus;
  3800. +
  3801. +   /* Statistics of the busiest group */
  3802. +   unsigned int  busiest_idle_cpus;
  3803. +   unsigned long max_load;
  3804. +   unsigned long busiest_load_per_task;
  3805. +   unsigned long busiest_nr_running;
  3806. +   unsigned long busiest_group_capacity;
  3807. +   unsigned long busiest_has_capacity;
  3808. +   unsigned int  busiest_group_weight;
  3809. +
  3810. +   int group_imb; /* Is there imbalance in this sd */
  3811. +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  3812. +   int power_savings_balance; /* Is powersave balance needed for this sd */
  3813. +   struct sched_group *group_min; /* Least loaded group in sd */
  3814. +   struct sched_group *group_leader; /* Group which relieves group_min */
  3815. +   unsigned long min_load_per_task; /* load_per_task in group_min */
  3816. +   unsigned long leader_nr_running; /* Nr running of group_leader */
  3817. +   unsigned long min_nr_running; /* Nr running of group_min */
  3818. +#endif
  3819. +};
  3820. +
  3821. +/*
  3822. + * sg_lb_stats - stats of a sched_group required for load_balancing
  3823. + */
  3824. +struct sg_lb_stats {
  3825. +   unsigned long avg_load; /*Avg load across the CPUs of the group */
  3826. +   unsigned long group_load; /* Total load over the CPUs of the group */
  3827. +   unsigned long sum_nr_running; /* Nr tasks running in the group */
  3828. +   unsigned long sum_weighted_load; /* Weighted load of group's tasks */
  3829. +   unsigned long group_capacity;
  3830. +   unsigned long idle_cpus;
  3831. +   unsigned long group_weight;
  3832. +   int group_imb; /* Is there an imbalance in the group ? */
  3833. +   int group_has_capacity; /* Is there extra capacity in the group? */
  3834. +};
  3835. +
  3836. +/**
  3837. + * get_sd_load_idx - Obtain the load index for a given sched domain.
  3838. + * @sd: The sched_domain whose load_idx is to be obtained.
  3839. + * @idle: The idle status of the CPU whose sd's load_idx is obtained.
  3840. + */
  3841. +static inline int get_sd_load_idx(struct sched_domain *sd,
  3842. +                   enum cpu_idle_type idle)
  3843. +{
  3844. +   int load_idx;
  3845. +
  3846. +   switch (idle) {
  3847. +   case CPU_NOT_IDLE:
  3848. +       load_idx = sd->busy_idx;
  3849. +       break;
  3850. +
  3851. +   case CPU_NEWLY_IDLE:
  3852. +       load_idx = sd->newidle_idx;
  3853. +       break;
  3854. +   default:
  3855. +       load_idx = sd->idle_idx;
  3856. +       break;
  3857. +   }
  3858. +
  3859. +   return load_idx;
  3860. +}
  3861. +
  3862. +
  3863. +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  3864. +/**
  3865. + * init_sd_power_savings_stats - Initialize power savings statistics for
  3866. + * the given sched_domain, during load balancing.
  3867. + *
  3868. + * @sd: Sched domain whose power-savings statistics are to be initialized.
  3869. + * @sds: Variable containing the statistics for sd.
  3870. + * @idle: Idle status of the CPU at which we're performing load-balancing.
  3871. + */
  3872. +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
  3873. +   struct sd_lb_stats *sds, enum cpu_idle_type idle)
  3874. +{
  3875. +   /*
  3876. +    * Busy processors will not participate in power savings
  3877. +    * balance.
  3878. +    */
  3879. +   if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
  3880. +       sds->power_savings_balance = 0;
  3881. +   else {
  3882. +       sds->power_savings_balance = 1;
  3883. +       sds->min_nr_running = ULONG_MAX;
  3884. +       sds->leader_nr_running = 0;
  3885. +   }
  3886. +}
  3887. +
  3888. +/**
  3889. + * update_sd_power_savings_stats - Update the power saving stats for a
  3890. + * sched_domain while performing load balancing.
  3891. + *
  3892. + * @group: sched_group belonging to the sched_domain under consideration.
  3893. + * @sds: Variable containing the statistics of the sched_domain
  3894. + * @local_group: Does group contain the CPU for which we're performing
  3895. + *         load balancing ?
  3896. + * @sgs: Variable containing the statistics of the group.
  3897. + */
  3898. +static inline void update_sd_power_savings_stats(struct sched_group *group,
  3899. +   struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
  3900. +{
  3901. +
  3902. +   if (!sds->power_savings_balance)
  3903. +       return;
  3904. +
  3905. +   /*
  3906. +    * If the local group is idle or completely loaded
  3907. +    * no need to do power savings balance at this domain
  3908. +    */
  3909. +   if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
  3910. +               !sds->this_nr_running))
  3911. +       sds->power_savings_balance = 0;
  3912. +
  3913. +   /*
  3914. +    * If a group is already running at full capacity or idle,
  3915. +    * don't include that group in power savings calculations
  3916. +    */
  3917. +   if (!sds->power_savings_balance ||
  3918. +       sgs->sum_nr_running >= sgs->group_capacity ||
  3919. +       !sgs->sum_nr_running)
  3920. +       return;
  3921. +
  3922. +   /*
  3923. +    * Calculate the group which has the least non-idle load.
  3924. +    * This is the group from where we need to pick up the load
  3925. +    * for saving power
  3926. +    */
  3927. +   if ((sgs->sum_nr_running < sds->min_nr_running) ||
  3928. +       (sgs->sum_nr_running == sds->min_nr_running &&
  3929. +        group_first_cpu(group) > group_first_cpu(sds->group_min))) {
  3930. +       sds->group_min = group;
  3931. +       sds->min_nr_running = sgs->sum_nr_running;
  3932. +       sds->min_load_per_task = sgs->sum_weighted_load /
  3933. +                       sgs->sum_nr_running;
  3934. +   }
  3935. +
  3936. +   /*
  3937. +    * Calculate the group which is almost near its
  3938. +    * capacity but still has some space to pick up some load
  3939. +    * from other group and save more power
  3940. +    */
  3941. +   if (sgs->sum_nr_running + 1 > sgs->group_capacity)
  3942. +       return;
  3943. +
  3944. +   if (sgs->sum_nr_running > sds->leader_nr_running ||
  3945. +       (sgs->sum_nr_running == sds->leader_nr_running &&
  3946. +        group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
  3947. +       sds->group_leader = group;
  3948. +       sds->leader_nr_running = sgs->sum_nr_running;
  3949. +   }
  3950. +}
  3951. +
  3952. +/**
  3953. + * check_power_save_busiest_group - see if there is potential for some power-savings balance
  3954. + * @sds: Variable containing the statistics of the sched_domain
  3955. + * under consideration.
  3956. + * @this_cpu: Cpu at which we're currently performing load-balancing.
  3957. + * @imbalance: Variable to store the imbalance.
  3958. + *
  3959. + * Description:
  3960. + * Check if we have potential to perform some power-savings balance.
  3961. + * If yes, set the busiest group to be the least loaded group in the
  3962. + * sched_domain, so that its CPUs can be put to idle.
  3963. + *
  3964. + * Returns 1 if there is potential to perform power-savings balance.
  3965. + * Else returns 0.
  3966. + */
  3967. +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
  3968. +                   int this_cpu, unsigned long *imbalance)
  3969. +{
  3970. +   if (!sds->power_savings_balance)
  3971. +       return 0;
  3972. +
  3973. +   if (sds->this != sds->group_leader ||
  3974. +           sds->group_leader == sds->group_min)
  3975. +       return 0;
  3976. +
  3977. +   *imbalance = sds->min_load_per_task;
  3978. +   sds->busiest = sds->group_min;
  3979. +
  3980. +   return 1;
  3981. +
  3982. +}
  3983. +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  3984. +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
  3985. +   struct sd_lb_stats *sds, enum cpu_idle_type idle)
  3986. +{
  3987. +   return;
  3988. +}
  3989. +
  3990. +static inline void update_sd_power_savings_stats(struct sched_group *group,
  3991. +   struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
  3992. +{
  3993. +   return;
  3994. +}
  3995. +
  3996. +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
  3997. +                   int this_cpu, unsigned long *imbalance)
  3998. +{
  3999. +   return 0;
  4000. +}
  4001. +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  4002. +
  4003. +
  4004. +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
  4005. +{
  4006. +   return SCHED_POWER_SCALE;
  4007. +}
  4008. +
  4009. +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
  4010. +{
  4011. +   return default_scale_freq_power(sd, cpu);
  4012. +}
  4013. +
  4014. +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
  4015. +{
  4016. +   unsigned long weight = sd->span_weight;
  4017. +   unsigned long smt_gain = sd->smt_gain;
  4018. +
  4019. +   smt_gain /= weight;
  4020. +
  4021. +   return smt_gain;
  4022. +}
  4023. +
  4024. +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
  4025. +{
  4026. +   return default_scale_smt_power(sd, cpu);
  4027. +}
  4028. +
  4029. +unsigned long scale_rt_power(int cpu)
  4030. +{
  4031. +   struct rq *rq = cpu_rq(cpu);
  4032. +   u64 total, available;
  4033. +
  4034. +   total = sched_avg_period() + (rq->clock - rq->age_stamp);
  4035. +
  4036. +   if (unlikely(total < rq->rt_avg)) {
  4037. +       /* Ensures that power won't end up being negative */
  4038. +       available = 0;
  4039. +   } else {
  4040. +       available = total - rq->rt_avg;
  4041. +   }
  4042. +
  4043. +   if (unlikely((s64)total < SCHED_POWER_SCALE))
  4044. +       total = SCHED_POWER_SCALE;
  4045. +
  4046. +   total >>= SCHED_POWER_SHIFT;
  4047. +
  4048. +   return div_u64(available, total);
  4049. +}
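
scale_rt_power() returns, on the 0..SCHED_POWER_SCALE range, the fraction of the averaging window left over after real-time activity. The sketch below mirrors that arithmetic with an assumed 1 s window in which RT tasks consumed a quarter; the function name is invented.

#include <stdio.h>
#include <stdint.h>

#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

/* Mirrors scale_rt_power(): the share of the window not eaten by RT,
 * expressed on a 0..1024 scale. */
static uint64_t scale_rt_power_demo(uint64_t total_ns, uint64_t rt_avg_ns)
{
        uint64_t available = (total_ns < rt_avg_ns) ? 0 : total_ns - rt_avg_ns;

        if ((int64_t)total_ns < (int64_t)SCHED_POWER_SCALE)
                total_ns = SCHED_POWER_SCALE;
        total_ns >>= SCHED_POWER_SHIFT;

        return available / total_ns;            /* div_u64() in the kernel */
}

int main(void)
{
        /* RT used a quarter of a 1 s window: ~768 of 1024 remains. */
        printf("%llu\n",
               (unsigned long long)scale_rt_power_demo(1000000000ULL, 250000000ULL));
        return 0;
}
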
  4050. +
  4051. +static void update_cpu_power(struct sched_domain *sd, int cpu)
  4052. +{
  4053. +   unsigned long weight = sd->span_weight;
  4054. +   unsigned long power = SCHED_POWER_SCALE;
  4055. +   struct sched_group *sdg = sd->groups;
  4056. +
  4057. +   if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
  4058. +       if (sched_feat(ARCH_POWER))
  4059. +           power *= arch_scale_smt_power(sd, cpu);
  4060. +       else
  4061. +           power *= default_scale_smt_power(sd, cpu);
  4062. +
  4063. +       power >>= SCHED_POWER_SHIFT;
  4064. +   }
  4065. +
  4066. +   sdg->sgp->power_orig = power;
  4067. +
  4068. +   if (sched_feat(ARCH_POWER))
  4069. +       power *= arch_scale_freq_power(sd, cpu);
  4070. +   else
  4071. +       power *= default_scale_freq_power(sd, cpu);
  4072. +
  4073. +   power >>= SCHED_POWER_SHIFT;
  4074. +
  4075. +   power *= scale_rt_power(cpu);
  4076. +   power >>= SCHED_POWER_SHIFT;
  4077. +
  4078. +   if (!power)
  4079. +       power = 1;
  4080. +
  4081. +   cpu_rq(cpu)->cpu_power = power;
  4082. +   sdg->sgp->power = power;
  4083. +}
  4084. +
  4085. +void update_group_power(struct sched_domain *sd, int cpu)
  4086. +{
  4087. +   struct sched_domain *child = sd->child;
  4088. +   struct sched_group *group, *sdg = sd->groups;
  4089. +   unsigned long power;
  4090. +
  4091. +   if (!child) {
  4092. +       update_cpu_power(sd, cpu);
  4093. +       return;
  4094. +   }
  4095. +
  4096. +   power = 0;
  4097. +
  4098. +   group = child->groups;
  4099. +   do {
  4100. +       power += group->sgp->power;
  4101. +       group = group->next;
  4102. +   } while (group != child->groups);
  4103. +
  4104. +   sdg->sgp->power = power;
  4105. +}
  4106. +
  4107. +/*
  4108. + * Try and fix up capacity for tiny siblings; this is needed when
  4109. + * things like SD_ASYM_PACKING need f_b_g to select another sibling
  4110. + * which on its own isn't powerful enough.
  4111. + *
  4112. + * See update_sd_pick_busiest() and check_asym_packing().
  4113. + */
  4114. +static inline int
  4115. +fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  4116. +{
  4117. +   /*
  4118. +    * Only siblings can have significantly less than SCHED_POWER_SCALE
  4119. +    */
  4120. +   if (!(sd->flags & SD_SHARE_CPUPOWER))
  4121. +       return 0;
  4122. +
  4123. +   /*
  4124. +    * If ~90% of the cpu_power is still there, we're good.
  4125. +    */
  4126. +   if (group->sgp->power * 32 > group->sgp->power_orig * 29)
  4127. +       return 1;
  4128. +
  4129. +   return 0;
  4130. +}
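
The "~90%" comment above corresponds to the ratio 29/32, roughly 90.6%: a sibling whose cpu_power has decayed below that fraction of power_orig no longer gets the fixed-up capacity of one. A tiny sketch with made-up power values:

#include <stdio.h>

/* Mirrors the ratio test in fix_small_capacity(): report capacity 1 as
 * long as at least 29/32 (~90.6%) of the original cpu_power survives. */
static int small_capacity_ok(unsigned long power, unsigned long power_orig)
{
        return power * 32 > power_orig * 29;
}

int main(void)
{
        printf("%d\n", small_capacity_ok(930, 1024));  /* ~90.8%: still 1 */
        printf("%d\n", small_capacity_ok(900, 1024));  /* ~87.9%: now 0   */
        return 0;
}
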
  4131. +
  4132. +/**
  4133. + * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  4134. + * @sd: The sched_domain whose statistics are to be updated.
  4135. + * @group: sched_group whose statistics are to be updated.
  4136. + * @this_cpu: Cpu for which load balance is currently performed.
  4137. + * @idle: Idle status of this_cpu
  4138. + * @load_idx: Load index of sched_domain of this_cpu for load calc.
  4139. + * @local_group: Does group contain this_cpu.
  4140. + * @cpus: Set of cpus considered for load balancing.
  4141. + * @balance: Should we balance.
  4142. + * @sgs: variable to hold the statistics for this group.
  4143. + */
  4144. +static inline void update_sg_lb_stats(struct sched_domain *sd,
  4145. +           struct sched_group *group, int this_cpu,
  4146. +           enum cpu_idle_type idle, int load_idx,
  4147. +           int local_group, const struct cpumask *cpus,
  4148. +           int *balance, struct sg_lb_stats *sgs)
  4149. +{
  4150. +   unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
  4151. +   int i;
  4152. +   unsigned int balance_cpu = -1, first_idle_cpu = 0;
  4153. +   unsigned long avg_load_per_task = 0;
  4154. +
  4155. +   if (local_group)
  4156. +       balance_cpu = group_first_cpu(group);
  4157. +
  4158. +   /* Tally up the load of all CPUs in the group */
  4159. +   max_cpu_load = 0;
  4160. +   min_cpu_load = ~0UL;
  4161. +   max_nr_running = 0;
  4162. +
  4163. +   for_each_cpu_and(i, sched_group_cpus(group), cpus) {
  4164. +       struct rq *rq = cpu_rq(i);
  4165. +
  4166. +       /* Bias balancing toward cpus of our domain */
  4167. +       if (local_group) {
  4168. +           if (idle_cpu(i) && !first_idle_cpu) {
  4169. +               first_idle_cpu = 1;
  4170. +               balance_cpu = i;
  4171. +           }
  4172. +
  4173. +           load = target_load(i, load_idx);
  4174. +       } else {
  4175. +           load = source_load(i, load_idx);
  4176. +           if (load > max_cpu_load) {
  4177. +               max_cpu_load = load;
  4178. +               max_nr_running = rq->nr_running;
  4179. +           }
  4180. +           if (min_cpu_load > load)
  4181. +               min_cpu_load = load;
  4182. +       }
  4183. +
  4184. +       sgs->group_load += load;
  4185. +       sgs->sum_nr_running += rq->nr_running;
  4186. +       sgs->sum_weighted_load += weighted_cpuload(i);
  4187. +       if (idle_cpu(i))
  4188. +           sgs->idle_cpus++;
  4189. +   }
  4190. +
  4191. +   /*
  4192. +    * The first idle cpu, or the first (busiest) cpu in this sched group,
  4193. +    * is eligible for doing load balancing at this and above
  4194. +    * domains. In the newly idle case, we allow all the cpus
  4195. +    * to do the newly idle load balance.
  4196. +    */
  4197. +   if (idle != CPU_NEWLY_IDLE && local_group) {
  4198. +       if (balance_cpu != this_cpu) {
  4199. +           *balance = 0;
  4200. +           return;
  4201. +       }
  4202. +       update_group_power(sd, this_cpu);
  4203. +   }
  4204. +
  4205. +   /* Adjust by relative CPU power of the group */
  4206. +   sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
  4207. +
  4208. +   /*
  4209. +    * Consider the group unbalanced when the imbalance is larger
  4210. +    * than the average weight of a task.
  4211. +    *
  4212. +    * APZ: with cgroup the avg task weight can vary wildly and
  4213. +    *      might not be a suitable number - should we keep a
  4214. +    *      normalized nr_running number somewhere that negates
  4215. +    *      the hierarchy?
  4216. +    */
  4217. +   if (sgs->sum_nr_running)
  4218. +       avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
  4219. +
  4220. +   if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
  4221. +       sgs->group_imb = 1;
  4222. +
  4223. +   sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
  4224. +                       SCHED_POWER_SCALE);
  4225. +   if (!sgs->group_capacity)
  4226. +       sgs->group_capacity = fix_small_capacity(sd, group);
  4227. +   sgs->group_weight = group->group_weight;
  4228. +
  4229. +   if (sgs->group_capacity > sgs->sum_nr_running)
  4230. +       sgs->group_has_capacity = 1;
  4231. +}
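
group_capacity above is simply the group's total cpu_power rounded to the nearest whole SCHED_POWER_SCALE, so an SMT sibling whose power is scaled down can round to a capacity of one, or even zero, which then triggers fix_small_capacity(). A sketch of that rounding with assumed power values:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

/* Mirrors the group_capacity computation in update_sg_lb_stats():
 * total group power rounded to the nearest multiple of one cpu's power. */
static unsigned long group_capacity(unsigned long group_power)
{
        return DIV_ROUND_CLOSEST(group_power, SCHED_POWER_SCALE);
}

int main(void)
{
        printf("%lu\n", group_capacity(2048));   /* two full cpus -> 2 */
        printf("%lu\n", group_capacity(1600));   /* ~1.56 cpus    -> 2 */
        printf("%lu\n", group_capacity(900));    /* SMT sibling   -> 1 */
        return 0;
}
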
  4232. +
  4233. +/**
  4234. + * update_sd_pick_busiest - return 1 on busiest group
  4235. + * @sd: sched_domain whose statistics are to be checked
  4236. + * @sds: sched_domain statistics
  4237. + * @sg: sched_group candidate to be checked for being the busiest
  4238. + * @sgs: sched_group statistics
  4239. + * @this_cpu: the current cpu
  4240. + *
  4241. + * Determine if @sg is a busier group than the previously selected
  4242. + * busiest group.
  4243. + */
  4244. +static bool update_sd_pick_busiest(struct sched_domain *sd,
  4245. +                  struct sd_lb_stats *sds,
  4246. +                  struct sched_group *sg,
  4247. +                  struct sg_lb_stats *sgs,
  4248. +                  int this_cpu)
  4249. +{
  4250. +   if (sgs->avg_load <= sds->max_load)
  4251. +       return false;
  4252. +
  4253. +   if (sgs->sum_nr_running > sgs->group_capacity)
  4254. +       return true;
  4255. +
  4256. +   if (sgs->group_imb)
  4257. +       return true;
  4258. +
  4259. +   /*
  4260. +    * ASYM_PACKING needs to move all the work to the lowest
  4261. +    * numbered CPUs in the group, therefore mark all groups
  4262. +    * higher than ourself as busy.
  4263. +    */
  4264. +   if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
  4265. +       this_cpu < group_first_cpu(sg)) {
  4266. +       if (!sds->busiest)
  4267. +           return true;
  4268. +
  4269. +       if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
  4270. +           return true;
  4271. +   }
  4272. +
  4273. +   return false;
  4274. +}
  4275. +
  4276. +/**
  4277. + * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  4278. + * @sd: sched_domain whose statistics are to be updated.
  4279. + * @this_cpu: Cpu for which load balance is currently performed.
  4280. + * @idle: Idle status of this_cpu
  4281. + * @cpus: Set of cpus considered for load balancing.
  4282. + * @balance: Should we balance.
  4283. + * @sds: variable to hold the statistics for this sched_domain.
  4284. + */
  4285. +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
  4286. +           enum cpu_idle_type idle, const struct cpumask *cpus,
  4287. +           int *balance, struct sd_lb_stats *sds)
  4288. +{
  4289. +   struct sched_domain *child = sd->child;
  4290. +   struct sched_group *sg = sd->groups;
  4291. +   struct sg_lb_stats sgs;
  4292. +   int load_idx, prefer_sibling = 0;
  4293. +
  4294. +   if (child && child->flags & SD_PREFER_SIBLING)
  4295. +       prefer_sibling = 1;
  4296. +
  4297. +   init_sd_power_savings_stats(sd, sds, idle);
  4298. +   load_idx = get_sd_load_idx(sd, idle);
  4299. +
  4300. +   do {
  4301. +       int local_group;
  4302. +
  4303. +       local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
  4304. +       memset(&sgs, 0, sizeof(sgs));
  4305. +       update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
  4306. +               local_group, cpus, balance, &sgs);
  4307. +
  4308. +       if (local_group && !(*balance))
  4309. +           return;
  4310. +
  4311. +       sds->total_load += sgs.group_load;
  4312. +       sds->total_pwr += sg->sgp->power;
  4313. +
  4314. +       /*
  4315. +        * In case the child domain prefers tasks go to siblings
  4316. +        * first, lower the sg capacity to one so that we'll try
  4317. +        * and move all the excess tasks away. We lower the capacity
  4318. +        * of a group only if the local group has the capacity to fit
  4319. +        * these excess tasks, i.e. nr_running < group_capacity. The
  4320. +        * extra check prevents the case where you always pull from the
  4321. +        * heaviest group when it is already under-utilized (possible when
  4322. +        * a large weight task outweighs the tasks on the system).
  4323. +        */
  4324. +       if (prefer_sibling && !local_group && sds->this_has_capacity)
  4325. +           sgs.group_capacity = min(sgs.group_capacity, 1UL);
  4326. +
  4327. +       if (local_group) {
  4328. +           sds->this_load = sgs.avg_load;
  4329. +           sds->this = sg;
  4330. +           sds->this_nr_running = sgs.sum_nr_running;
  4331. +           sds->this_load_per_task = sgs.sum_weighted_load;
  4332. +           sds->this_has_capacity = sgs.group_has_capacity;
  4333. +           sds->this_idle_cpus = sgs.idle_cpus;
  4334. +       } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
  4335. +           sds->max_load = sgs.avg_load;
  4336. +           sds->busiest = sg;
  4337. +           sds->busiest_nr_running = sgs.sum_nr_running;
  4338. +           sds->busiest_idle_cpus = sgs.idle_cpus;
  4339. +           sds->busiest_group_capacity = sgs.group_capacity;
  4340. +           sds->busiest_load_per_task = sgs.sum_weighted_load;
  4341. +           sds->busiest_has_capacity = sgs.group_has_capacity;
  4342. +           sds->busiest_group_weight = sgs.group_weight;
  4343. +           sds->group_imb = sgs.group_imb;
  4344. +       }
  4345. +
  4346. +       update_sd_power_savings_stats(sg, sds, local_group, &sgs);
  4347. +       sg = sg->next;
  4348. +   } while (sg != sd->groups);
  4349. +}
  4350. +
  4351. +/**
  4352. + * check_asym_packing - Check to see if the group is packed into the
  4353. + *         sched domain.
  4354. + *
  4355. + * This is primarily intended to be used at the sibling level.  Some
  4356. + * cores like POWER7 prefer to use lower numbered SMT threads.  In the
  4357. + * case of POWER7, it can move to lower SMT modes only when higher
  4358. + * threads are idle.  When in lower SMT modes, the threads will
  4359. + * perform better since they share less core resources.  Hence when we
  4360. + * have idle threads, we want them to be the higher ones.
  4361. + *
  4362. + * This packing function is run on idle threads.  It checks to see if
  4363. + * the busiest CPU in this domain (core in the P7 case) has a higher
  4364. + * CPU number than the packing function is being run on.  Here we are
  4365. + * assuming a lower CPU number is equivalent to a lower SMT thread
  4366. + * number.
  4367. + *
  4368. + * Returns 1 when packing is required and a task should be moved to
  4369. + * this CPU.  The amount of the imbalance is returned in *imbalance.
  4370. + *
  4371. + * @sd: The sched_domain whose packing is to be checked.
  4372. + * @sds: Statistics of the sched_domain which is to be packed
  4373. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
  4374. + * @imbalance: returns the amount of imbalance due to packing.
  4375. + */
  4376. +static int check_asym_packing(struct sched_domain *sd,
  4377. +                 struct sd_lb_stats *sds,
  4378. +                 int this_cpu, unsigned long *imbalance)
  4379. +{
  4380. +   int busiest_cpu;
  4381. +
  4382. +   if (!(sd->flags & SD_ASYM_PACKING))
  4383. +       return 0;
  4384. +
  4385. +   if (!sds->busiest)
  4386. +       return 0;
  4387. +
  4388. +   busiest_cpu = group_first_cpu(sds->busiest);
  4389. +   if (this_cpu > busiest_cpu)
  4390. +       return 0;
  4391. +
  4392. +   *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
  4393. +                      SCHED_POWER_SCALE);
  4394. +   return 1;
  4395. +}
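
The imbalance reported for ASYM_PACKING converts the busiest group's average load back into weighted-load units through its cpu_power, using the same DIV_ROUND_CLOSEST rounding. A sketch with assumed numbers:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

/* Mirrors the *imbalance computation in check_asym_packing(). */
static unsigned long asym_imbalance(unsigned long max_load,
                                    unsigned long busiest_power)
{
        return DIV_ROUND_CLOSEST(max_load * busiest_power, SCHED_POWER_SCALE);
}

int main(void)
{
        /* avg load 2048 on a group with cpu_power 512 -> 1024 of weight. */
        printf("%lu\n", asym_imbalance(2048, 512));
        return 0;
}
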
  4396. +
  4397. +/**
  4398. + * fix_small_imbalance - Calculate the minor imbalance that exists
  4399. + *         amongst the groups of a sched_domain, during
  4400. + *         load balancing.
  4401. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
  4402. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
  4403. + * @imbalance: Variable to store the imbalance.
  4404. + */
  4405. +static inline void fix_small_imbalance(struct sd_lb_stats *sds,
  4406. +               int this_cpu, unsigned long *imbalance)
  4407. +{
  4408. +   unsigned long tmp, pwr_now = 0, pwr_move = 0;
  4409. +   unsigned int imbn = 2;
  4410. +   unsigned long scaled_busy_load_per_task;
  4411. +
  4412. +   if (sds->this_nr_running) {
  4413. +       sds->this_load_per_task /= sds->this_nr_running;
  4414. +       if (sds->busiest_load_per_task >
  4415. +               sds->this_load_per_task)
  4416. +           imbn = 1;
  4417. +   } else
  4418. +       sds->this_load_per_task =
  4419. +           cpu_avg_load_per_task(this_cpu);
  4420. +
  4421. +   scaled_busy_load_per_task = sds->busiest_load_per_task
  4422. +                    * SCHED_POWER_SCALE;
  4423. +   scaled_busy_load_per_task /= sds->busiest->sgp->power;
  4424. +
  4425. +   if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
  4426. +           (scaled_busy_load_per_task * imbn)) {
  4427. +       *imbalance = sds->busiest_load_per_task;
  4428. +       return;
  4429. +   }
  4430. +
  4431. +   /*
  4432. +    * OK, we don't have enough imbalance to justify moving tasks,
  4433. +    * however we may be able to increase total CPU power used by
  4434. +    * moving them.
  4435. +    */
  4436. +
  4437. +   pwr_now += sds->busiest->sgp->power *
  4438. +           min(sds->busiest_load_per_task, sds->max_load);
  4439. +   pwr_now += sds->this->sgp->power *
  4440. +           min(sds->this_load_per_task, sds->this_load);
  4441. +   pwr_now /= SCHED_POWER_SCALE;
  4442. +
  4443. +   /* Amount of load we'd subtract */
  4444. +   tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
  4445. +       sds->busiest->sgp->power;
  4446. +   if (sds->max_load > tmp)
  4447. +       pwr_move += sds->busiest->sgp->power *
  4448. +           min(sds->busiest_load_per_task, sds->max_load - tmp);
  4449. +
  4450. +   /* Amount of load we'd add */
  4451. +   if (sds->max_load * sds->busiest->sgp->power <
  4452. +       sds->busiest_load_per_task * SCHED_POWER_SCALE)
  4453. +       tmp = (sds->max_load * sds->busiest->sgp->power) /
  4454. +           sds->this->sgp->power;
  4455. +   else
  4456. +       tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
  4457. +           sds->this->sgp->power;
  4458. +   pwr_move += sds->this->sgp->power *
  4459. +           min(sds->this_load_per_task, sds->this_load + tmp);
  4460. +   pwr_move /= SCHED_POWER_SCALE;
  4461. +
  4462. +   /* Move if we gain throughput */
  4463. +   if (pwr_move > pwr_now)
  4464. +       *imbalance = sds->busiest_load_per_task;
  4465. +}
  4466. +
  4467. +/**
  4468. + * calculate_imbalance - Calculate the amount of imbalance present within the
  4469. + *          groups of a given sched_domain during load balance.
  4470. + * @sds: statistics of the sched_domain whose imbalance is to be calculated.
  4471. + * @this_cpu: Cpu for which currently load balance is being performed.
  4472. + * @imbalance: The variable to store the imbalance.
  4473. + */
  4474. +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  4475. +       unsigned long *imbalance)
  4476. +{
  4477. +   unsigned long max_pull, load_above_capacity = ~0UL;
  4478. +
  4479. +   sds->busiest_load_per_task /= sds->busiest_nr_running;
  4480. +   if (sds->group_imb) {
  4481. +       sds->busiest_load_per_task =
  4482. +           min(sds->busiest_load_per_task, sds->avg_load);
  4483. +   }
  4484. +
  4485. +   /*
  4486. +    * In the presence of smp nice balancing, certain scenarios can have
  4487. +    * max load less than avg load(as we skip the groups at or below
  4488. +    * its cpu_power, while calculating max_load..)
  4489. +    */
  4490. +   if (sds->max_load < sds->avg_load) {
  4491. +       *imbalance = 0;
  4492. +       return fix_small_imbalance(sds, this_cpu, imbalance);
  4493. +   }
  4494. +
  4495. +   if (!sds->group_imb) {
  4496. +       /*
  4497. +        * Don't want to pull so many tasks that a group would go idle.
  4498. +        */
  4499. +       load_above_capacity = (sds->busiest_nr_running -
  4500. +                       sds->busiest_group_capacity);
  4501. +
  4502. +       load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
  4503. +
  4504. +       load_above_capacity /= sds->busiest->sgp->power;
  4505. +   }
  4506. +
  4507. +   /*
  4508. +    * We're trying to get all the cpus to the average_load, so we don't
  4509. +    * want to push ourselves above the average load, nor do we wish to
  4510. +    * reduce the max loaded cpu below the average load. At the same time,
  4511. +    * we also don't want to reduce the group load below the group capacity
  4512. +    * (so that we can implement power-savings policies etc). Thus we look
  4513. +    * for the minimum possible imbalance.
  4514. +    * Be careful of negative numbers as they'll appear as very large values
  4515. +    * with unsigned longs.
  4516. +    */
  4517. +   max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
  4518. +
  4519. +   /* How much load to actually move to equalise the imbalance */
  4520. +   *imbalance = min(max_pull * sds->busiest->sgp->power,
  4521. +       (sds->avg_load - sds->this_load) * sds->this->sgp->power)
  4522. +           / SCHED_POWER_SCALE;
  4523. +
  4524. +   /*
  4525. +    * if *imbalance is less than the average load per runnable task
  4526. +    * there is no guarantee that any tasks will be moved so we'll have
  4527. +    * a think about bumping its value to force at least one task to be
  4528. +    * moved
  4529. +    */
  4530. +   if (*imbalance < sds->busiest_load_per_task)
  4531. +       return fix_small_imbalance(sds, this_cpu, imbalance);
  4532. +
  4533. +}
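
Ignoring the fix_small_imbalance() fallbacks, calculate_imbalance() boils down to min(load above the domain average, load above the busiest group's capacity), converted into weighted-load units. The sketch below replays that arithmetic for one invented scenario (non-group_imb case only); the helper name and values are made up.

#include <stdio.h>

#define SCHED_LOAD_SCALE  1024UL
#define SCHED_POWER_SCALE 1024UL

/* Mirrors the max_pull/imbalance arithmetic in calculate_imbalance(). */
static unsigned long imbalance(unsigned long max_load, unsigned long avg_load,
                               unsigned long this_load,
                               unsigned long busiest_nr_running,
                               unsigned long busiest_capacity,
                               unsigned long busiest_power,
                               unsigned long this_power)
{
        unsigned long load_above_capacity =
                (busiest_nr_running - busiest_capacity) *
                (SCHED_LOAD_SCALE * SCHED_POWER_SCALE) / busiest_power;
        unsigned long max_pull = max_load - avg_load;
        unsigned long a, b;

        if (load_above_capacity < max_pull)
                max_pull = load_above_capacity;

        a = max_pull * busiest_power;
        b = (avg_load - this_load) * this_power;

        return (a < b ? a : b) / SCHED_POWER_SCALE;
}

int main(void)
{
        /* Busiest group: load 3072, 3 tasks, capacity 2, power 1024;
         * local group: load 1024, power 1024; domain average 2048. */
        printf("%lu\n", imbalance(3072, 2048, 1024, 3, 2, 1024, 1024)); /* 1024 */
        return 0;
}
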
  4534. +
  4535. +/******* find_busiest_group() helpers end here *********************/
  4536. +
  4537. +/**
  4538. + * find_busiest_group - Returns the busiest group within the sched_domain
  4539. + * if there is an imbalance. If there isn't an imbalance, and
  4540. + * the user has opted for power-savings, it returns a group whose
  4541. + * CPUs can be put to idle by rebalancing those tasks elsewhere, if
  4542. + * such a group exists.
  4543. + *
  4544. + * Also calculates the amount of weighted load which should be moved
  4545. + * to restore balance.
  4546. + *
  4547. + * @sd: The sched_domain whose busiest group is to be returned.
  4548. + * @this_cpu: The cpu for which load balancing is currently being performed.
  4549. + * @imbalance: Variable which stores amount of weighted load which should
  4550. + *     be moved to restore balance/put a group to idle.
  4551. + * @idle: The idle status of this_cpu.
  4552. + * @cpus: The set of CPUs under consideration for load-balancing.
  4553. + * @balance: Pointer to a variable indicating if this_cpu
  4554. + * is the appropriate cpu to perform load balancing at this_level.
  4555. + *
  4556. + * Returns:    - the busiest group if imbalance exists.
  4557. + *     - If no imbalance and user has opted for power-savings balance,
  4558. + *        return the least loaded group whose CPUs can be
  4559. + *        put to idle by rebalancing its tasks onto our group.
  4560. + */
  4561. +static struct sched_group *
  4562. +find_busiest_group(struct sched_domain *sd, int this_cpu,
  4563. +          unsigned long *imbalance, enum cpu_idle_type idle,
  4564. +          const struct cpumask *cpus, int *balance)
  4565. +{
  4566. +   struct sd_lb_stats sds;
  4567. +
  4568. +   memset(&sds, 0, sizeof(sds));
  4569. +
  4570. +   /*
  4571. +    * Compute the various statistics relevant for load balancing at
  4572. +    * this level.
  4573. +    */
  4574. +   update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
  4575. +
  4576. +   /*
  4577. +    * this_cpu is not the appropriate cpu to perform load balancing at
  4578. +    * this level.
  4579. +    */
  4580. +   if (!(*balance))
  4581. +       goto ret;
  4582. +
  4583. +   if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
  4584. +       check_asym_packing(sd, &sds, this_cpu, imbalance))
  4585. +       return sds.busiest;
  4586. +
  4587. +   /* There is no busy sibling group to pull tasks from */
  4588. +   if (!sds.busiest || sds.busiest_nr_running == 0)
  4589. +       goto out_balanced;
  4590. +
  4591. +   sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
  4592. +
  4593. +   /*
  4594. +    * If the busiest group is imbalanced the below checks don't
  4595. +    * work because they assume all things are equal, which typically
  4596. +    * isn't true due to cpus_allowed constraints and the like.
  4597. +    */
  4598. +   if (sds.group_imb)
  4599. +       goto force_balance;
  4600. +
  4601. +   /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
  4602. +   if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
  4603. +           !sds.busiest_has_capacity)
  4604. +       goto force_balance;
  4605. +
  4606. +   /*
  4607. +    * If the local group is more busy than the selected busiest group
  4608. +    * don't try and pull any tasks.
  4609. +    */
  4610. +   if (sds.this_load >= sds.max_load)
  4611. +       goto out_balanced;
  4612. +
  4613. +   /*
  4614. +    * Don't pull any tasks if this group is already above the domain
  4615. +    * average load.
  4616. +    */
  4617. +   if (sds.this_load >= sds.avg_load)
  4618. +       goto out_balanced;
  4619. +
  4620. +   if (idle == CPU_IDLE) {
  4621. +       /*
  4622. +        * This cpu is idle. If the busiest group doesn't
  4623. +        * have more tasks than the number of available cpus and
  4624. +        * there is no imbalance between this and the busiest group
  4625. +        * wrt idle cpus, it is balanced.
  4626. +        */
  4627. +       if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
  4628. +           sds.busiest_nr_running <= sds.busiest_group_weight)
  4629. +           goto out_balanced;
  4630. +   } else {
  4631. +       /*
  4632. +        * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
  4633. +        * imbalance_pct to be conservative.
  4634. +        */
  4635. +       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
  4636. +           goto out_balanced;
  4637. +   }
  4638. +
  4639. +force_balance:
  4640. +   /* Looks like there is an imbalance. Compute it */
  4641. +   calculate_imbalance(&sds, this_cpu, imbalance);
  4642. +   return sds.busiest;
  4643. +
  4644. +out_balanced:
  4645. +   /*
  4646. +    * There is no obvious imbalance. But check if we can do some balancing
  4647. +    * to save power.
  4648. +    */
  4649. +   if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
  4650. +       return sds.busiest;
  4651. +ret:
  4652. +   *imbalance = 0;
  4653. +   return NULL;
  4654. +}
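Most of the decision flow above is plain integer arithmetic on the sd_lb_stats fields: the local group is compared against the domain average and, in the CPU_NEWLY_IDLE / CPU_NOT_IDLE cases, against the busiest group scaled by imbalance_pct. A rough standalone (userspace) sketch of those thresholds follows; the helper name worth_balancing() and all numeric inputs are invented for illustration, only the formulas mirror the code above.

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL

    /* Rough model of the non-idle "is this domain worth balancing?" test. */
    static int worth_balancing(unsigned long this_load, unsigned long max_load,
                               unsigned long total_load, unsigned long total_pwr,
                               unsigned int imbalance_pct)
    {
        unsigned long avg_load = (SCHED_POWER_SCALE * total_load) / total_pwr;

        if (this_load >= max_load)      /* local group already the busiest */
            return 0;
        if (this_load >= avg_load)      /* already above the domain average */
            return 0;
        /* be conservative: busiest must exceed local by imbalance_pct/100 */
        if (100 * max_load <= imbalance_pct * this_load)
            return 0;
        return 1;
    }

    int main(void)
    {
        /* Invented figures: local 800, busiest 1000, two cpus, pct 125. */
        printf("%d\n", worth_balancing(800, 1000, 3600,
                                       2 * SCHED_POWER_SCALE, 125));
        /* 100*1000 <= 125*800 -> difference too small, prints 0 */
        return 0;
    }

With imbalance_pct at a typical 125, the busiest group has to be at least 25% busier than the local group before a non-idle cpu bothers pulling.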
  4655. +
  4656. +/*
  4657. + * find_busiest_queue - find the busiest runqueue among the cpus in group.
  4658. + */
  4659. +static struct rq *
  4660. +find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
  4661. +          enum cpu_idle_type idle, unsigned long imbalance,
  4662. +          const struct cpumask *cpus)
  4663. +{
  4664. +   struct rq *busiest = NULL, *rq;
  4665. +   unsigned long max_load = 0;
  4666. +   int i;
  4667. +
  4668. +   for_each_cpu(i, sched_group_cpus(group)) {
  4669. +       unsigned long power = power_of(i);
  4670. +       unsigned long capacity = DIV_ROUND_CLOSEST(power,
  4671. +                              SCHED_POWER_SCALE);
  4672. +       unsigned long wl;
  4673. +
  4674. +       if (!capacity)
  4675. +           capacity = fix_small_capacity(sd, group);
  4676. +
  4677. +       if (!cpumask_test_cpu(i, cpus))
  4678. +           continue;
  4679. +
  4680. +       rq = cpu_rq(i);
  4681. +       wl = weighted_cpuload(i);
  4682. +
  4683. +       /*
  4684. +        * When comparing with imbalance, use weighted_cpuload()
  4685. +        * which is not scaled with the cpu power.
  4686. +        */
  4687. +       if (capacity && rq->nr_running == 1 && wl > imbalance)
  4688. +           continue;
  4689. +
  4690. +       /*
  4691. +        * For the load comparisons with the other cpu's, consider
  4692. +        * the weighted_cpuload() scaled with the cpu power, so that
  4693. +        * the load can be moved away from the cpu that is potentially
  4694. +        * running at a lower capacity.
  4695. +        */
  4696. +       wl = (wl * SCHED_POWER_SCALE) / power;
  4697. +
  4698. +       if (wl > max_load) {
  4699. +           max_load = wl;
  4700. +           busiest = rq;
  4701. +       }
  4702. +   }
  4703. +
  4704. +   return busiest;
  4705. +}
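The scaling step above is what lets a lower-capacity cpu win the "busiest" comparison even when raw weights are equal. A minimal standalone model of that comparison (plain arrays instead of runqueues; the numbers are made up):

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL

    int main(void)
    {
        /* Invented example: two cpus, equal raw load, unequal power. */
        unsigned long load[2]  = { 900, 900 };
        unsigned long power[2] = { 1024, 512 };   /* cpu1 at half capacity */
        unsigned long max_load = 0;
        int busiest = -1, i;

        for (i = 0; i < 2; i++) {
            unsigned long wl = (load[i] * SCHED_POWER_SCALE) / power[i];

            if (wl > max_load) {
                max_load = wl;
                busiest = i;
            }
        }
        /* cpu1 scales to 1800 vs cpu0's 900, so it is picked as busiest. */
        printf("busiest cpu: %d (scaled load %lu)\n", busiest, max_load);
        return 0;
    }

Scaled this way, the half-power cpu reports twice the load, so tasks are pulled away from the cpu least able to serve them.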
  4706. +
  4707. +/*
   4708. + * Max backoff if we encounter pinned tasks. The exact value is fairly
   4709. + * arbitrary; it just needs to be large enough.
  4710. + */
  4711. +#define MAX_PINNED_INTERVAL    512
  4712. +
  4713. +/* Working cpumask for load_balance and load_balance_newidle. */
  4714. +DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
  4715. +
  4716. +static int need_active_balance(struct sched_domain *sd, int idle,
  4717. +                  int busiest_cpu, int this_cpu)
  4718. +{
  4719. +   if (idle == CPU_NEWLY_IDLE) {
  4720. +
  4721. +       /*
  4722. +        * ASYM_PACKING needs to force migrate tasks from busy but
  4723. +        * higher numbered CPUs in order to pack all tasks in the
  4724. +        * lowest numbered CPUs.
  4725. +        */
  4726. +       if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
  4727. +           return 1;
  4728. +
  4729. +       /*
   4730. +        * The only task running on a non-idle cpu can be moved to this
   4731. +        * cpu in an attempt to completely free up the other CPU
  4732. +        * package.
  4733. +        *
  4734. +        * The package power saving logic comes from
   4735. +        * find_busiest_group(). If there is no imbalance, then
   4736. +        * f_b_g() will return NULL. However, when sched_mc={1,2},
   4737. +        * f_b_g() will select a group from which a running task may be
   4738. +        * pulled to this cpu in order to make the other package idle.
   4739. +        * If there is no opportunity to make a package idle and
   4740. +        * there is no imbalance, then f_b_g() will return NULL and no
  4741. +        * action will be taken in load_balance_newidle().
  4742. +        *
  4743. +        * Under normal task pull operation due to imbalance, there
  4744. +        * will be more than one task in the source run queue and
  4745. +        * move_tasks() will succeed.  ld_moved will be true and this
  4746. +        * active balance code will not be triggered.
  4747. +        */
  4748. +       if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
  4749. +           return 0;
  4750. +   }
  4751. +
  4752. +   return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
  4753. +}
  4754. +
  4755. +static int active_load_balance_cpu_stop(void *data);
  4756. +
  4757. +/*
  4758. + * Check this_cpu to ensure it is balanced within domain. Attempt to move
  4759. + * tasks if there is an imbalance.
  4760. + */
  4761. +static int load_balance(int this_cpu, struct rq *this_rq,
  4762. +           struct sched_domain *sd, enum cpu_idle_type idle,
  4763. +           int *balance)
  4764. +{
  4765. +   int ld_moved, lb_flags = 0, active_balance = 0;
  4766. +   struct sched_group *group;
  4767. +   unsigned long imbalance;
  4768. +   struct rq *busiest;
  4769. +   unsigned long flags;
  4770. +   struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
  4771. +
  4772. +   cpumask_copy(cpus, cpu_active_mask);
  4773. +
  4774. +   schedstat_inc(sd, lb_count[idle]);
  4775. +
  4776. +redo:
  4777. +   group = find_busiest_group(sd, this_cpu, &imbalance, idle,
  4778. +                  cpus, balance);
  4779. +
  4780. +   if (*balance == 0)
  4781. +       goto out_balanced;
  4782. +
  4783. +   if (!group) {
  4784. +       schedstat_inc(sd, lb_nobusyg[idle]);
  4785. +       goto out_balanced;
  4786. +   }
  4787. +
  4788. +   busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
  4789. +   if (!busiest) {
  4790. +       schedstat_inc(sd, lb_nobusyq[idle]);
  4791. +       goto out_balanced;
  4792. +   }
  4793. +
  4794. +   BUG_ON(busiest == this_rq);
  4795. +
  4796. +   schedstat_add(sd, lb_imbalance[idle], imbalance);
  4797. +
  4798. +   ld_moved = 0;
  4799. +   if (busiest->nr_running > 1) {
  4800. +       /*
  4801. +        * Attempt to move tasks. If find_busiest_group has found
  4802. +        * an imbalance but busiest->nr_running <= 1, the group is
  4803. +        * still unbalanced. ld_moved simply stays zero, so it is
  4804. +        * correctly treated as an imbalance.
  4805. +        */
  4806. +       lb_flags |= LBF_ALL_PINNED;
  4807. +       local_irq_save(flags);
  4808. +       double_rq_lock(this_rq, busiest);
  4809. +       ld_moved = move_tasks(this_rq, this_cpu, busiest,
  4810. +                     imbalance, sd, idle, &lb_flags);
  4811. +       double_rq_unlock(this_rq, busiest);
  4812. +       local_irq_restore(flags);
  4813. +
  4814. +       /*
  4815. +        * some other cpu did the load balance for us.
  4816. +        */
  4817. +       if (ld_moved && this_cpu != smp_processor_id())
  4818. +           resched_cpu(this_cpu);
  4819. +
  4820. +       if (lb_flags & LBF_ABORT)
  4821. +           goto out_balanced;
  4822. +
  4823. +       if (lb_flags & LBF_NEED_BREAK) {
  4824. +           lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
  4825. +           if (lb_flags & LBF_ABORT)
  4826. +               goto out_balanced;
  4827. +           goto redo;
  4828. +       }
  4829. +
  4830. +       /* All tasks on this runqueue were pinned by CPU affinity */
  4831. +       if (unlikely(lb_flags & LBF_ALL_PINNED)) {
  4832. +           cpumask_clear_cpu(cpu_of(busiest), cpus);
  4833. +           if (!cpumask_empty(cpus))
  4834. +               goto redo;
  4835. +           goto out_balanced;
  4836. +       }
  4837. +   }
  4838. +
  4839. +   if (!ld_moved) {
  4840. +       schedstat_inc(sd, lb_failed[idle]);
  4841. +       /*
  4842. +        * Increment the failure counter only on periodic balance.
  4843. +        * We do not want newidle balance, which can be very
   4844. +        * frequent, to pollute the failure counter, causing
  4845. +        * excessive cache_hot migrations and active balances.
  4846. +        */
  4847. +       if (idle != CPU_NEWLY_IDLE)
  4848. +           sd->nr_balance_failed++;
  4849. +
  4850. +       if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
  4851. +           raw_spin_lock_irqsave(&busiest->lock, flags);
  4852. +
   4853. +           /* Don't kick the active_load_balance_cpu_stop
   4854. +            * if the curr task on the busiest cpu can't be
   4855. +            * moved to this_cpu.
  4856. +            */
  4857. +           if (!cpumask_test_cpu(this_cpu,
  4858. +                   tsk_cpus_allowed(busiest->curr))) {
  4859. +               raw_spin_unlock_irqrestore(&busiest->lock,
  4860. +                               flags);
  4861. +               lb_flags |= LBF_ALL_PINNED;
  4862. +               goto out_one_pinned;
  4863. +           }
  4864. +
  4865. +           /*
  4866. +            * ->active_balance synchronizes accesses to
  4867. +            * ->active_balance_work.  Once set, it's cleared
  4868. +            * only after active load balance is finished.
  4869. +            */
  4870. +           if (!busiest->active_balance) {
  4871. +               busiest->active_balance = 1;
  4872. +               busiest->push_cpu = this_cpu;
  4873. +               active_balance = 1;
  4874. +           }
  4875. +           raw_spin_unlock_irqrestore(&busiest->lock, flags);
  4876. +
  4877. +           if (active_balance)
  4878. +               stop_one_cpu_nowait(cpu_of(busiest),
  4879. +                   active_load_balance_cpu_stop, busiest,
  4880. +                   &busiest->active_balance_work);
  4881. +
  4882. +           /*
  4883. +            * We've kicked active balancing, reset the failure
  4884. +            * counter.
  4885. +            */
  4886. +           sd->nr_balance_failed = sd->cache_nice_tries+1;
  4887. +       }
  4888. +   } else
  4889. +       sd->nr_balance_failed = 0;
  4890. +
  4891. +   if (likely(!active_balance)) {
  4892. +       /* We were unbalanced, so reset the balancing interval */
  4893. +       sd->balance_interval = sd->min_interval;
  4894. +   } else {
  4895. +       /*
  4896. +        * If we've begun active balancing, start to back off. This
  4897. +        * case may not be covered by the all_pinned logic if there
  4898. +        * is only 1 task on the busy runqueue (because we don't call
  4899. +        * move_tasks).
  4900. +        */
  4901. +       if (sd->balance_interval < sd->max_interval)
  4902. +           sd->balance_interval *= 2;
  4903. +   }
  4904. +
  4905. +   goto out;
  4906. +
  4907. +out_balanced:
  4908. +   schedstat_inc(sd, lb_balanced[idle]);
  4909. +
  4910. +   sd->nr_balance_failed = 0;
  4911. +
  4912. +out_one_pinned:
  4913. +   /* tune up the balancing interval */
  4914. +   if (((lb_flags & LBF_ALL_PINNED) &&
  4915. +           sd->balance_interval < MAX_PINNED_INTERVAL) ||
  4916. +           (sd->balance_interval < sd->max_interval))
  4917. +       sd->balance_interval *= 2;
  4918. +
  4919. +   ld_moved = 0;
  4920. +out:
  4921. +   return ld_moved;
  4922. +}
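The tail of load_balance() only adjusts sd->balance_interval: reset it to min_interval on success, otherwise double it up to max_interval, or up to MAX_PINNED_INTERVAL while everything stays pinned. A toy standalone model of that backoff (the starting interval and the cap are invented):

    #include <stdio.h>

    #define MAX_PINNED_INTERVAL 512

    /* Mimic the "tune up the balancing interval" step for the pinned case. */
    static unsigned long back_off(unsigned long interval, int all_pinned,
                                  unsigned long max_interval)
    {
        if ((all_pinned && interval < MAX_PINNED_INTERVAL) ||
            interval < max_interval)
            interval *= 2;
        return interval;
    }

    int main(void)
    {
        unsigned long interval = 8;   /* e.g. sd->min_interval in jiffies */
        int i;

        for (i = 0; i < 8; i++) {
            printf("%lu ", interval);
            interval = back_off(interval, 1, 128);
        }
        printf("\n");   /* 8 16 32 64 128 256 512 512 */
        return 0;
    }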
  4923. +
  4924. +/*
  4925. + * idle_balance is called by schedule() if this_cpu is about to become
  4926. + * idle. Attempts to pull tasks from other CPUs.
  4927. + */
  4928. +void idle_balance(int this_cpu, struct rq *this_rq)
  4929. +{
  4930. +   struct sched_domain *sd;
  4931. +   int pulled_task = 0;
  4932. +   unsigned long next_balance = jiffies + HZ;
  4933. +
  4934. +   this_rq->idle_stamp = this_rq->clock;
  4935. +
  4936. +   if (this_rq->avg_idle < sysctl_sched_migration_cost)
  4937. +       return;
  4938. +
  4939. +   /*
  4940. +    * Drop the rq->lock, but keep IRQ/preempt disabled.
  4941. +    */
  4942. +   raw_spin_unlock(&this_rq->lock);
  4943. +
  4944. +   update_shares(this_cpu);
  4945. +   rcu_read_lock();
  4946. +   for_each_domain(this_cpu, sd) {
  4947. +       unsigned long interval;
  4948. +       int balance = 1;
  4949. +
  4950. +       if (!(sd->flags & SD_LOAD_BALANCE))
  4951. +           continue;
  4952. +
  4953. +       if (sd->flags & SD_BALANCE_NEWIDLE) {
  4954. +           /* If we've pulled tasks over stop searching: */
  4955. +           pulled_task = load_balance(this_cpu, this_rq,
  4956. +                          sd, CPU_NEWLY_IDLE, &balance);
  4957. +       }
  4958. +
  4959. +       interval = msecs_to_jiffies(sd->balance_interval);
  4960. +       if (time_after(next_balance, sd->last_balance + interval))
  4961. +           next_balance = sd->last_balance + interval;
  4962. +       if (pulled_task) {
  4963. +           this_rq->idle_stamp = 0;
  4964. +           break;
  4965. +       }
  4966. +   }
  4967. +   rcu_read_unlock();
  4968. +
  4969. +   raw_spin_lock(&this_rq->lock);
  4970. +
  4971. +   if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
  4972. +       /*
  4973. +        * We are going idle. next_balance may be set based on
  4974. +        * a busy processor. So reset next_balance.
  4975. +        */
  4976. +       this_rq->next_balance = next_balance;
  4977. +   }
  4978. +}
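idle_balance() keeps the earliest per-domain deadline (last_balance + interval) as the runqueue's next_balance. A compact standalone model of that "earliest deadline wins" loop, using a simplified signed-difference time_after() and made-up per-domain values:

    #include <stdio.h>

    /* Simplified jiffies-style comparison: true if a is after b. */
    static int time_after(unsigned long a, unsigned long b)
    {
        return (long)(b - a) < 0;
    }

    int main(void)
    {
        unsigned long jiffies = 1000, next_balance = jiffies + 250 /* HZ */;
        /* Invented per-domain state: last_balance and interval in jiffies. */
        unsigned long last_balance[3] = { 980, 900, 995 };
        unsigned long interval[3]     = {  64, 200,  16 };
        int i;

        for (i = 0; i < 3; i++) {
            unsigned long candidate = last_balance[i] + interval[i];

            if (time_after(next_balance, candidate))
                next_balance = candidate;
        }
        /* Earliest of 1044, 1100, 1011 -> 1011. */
        printf("next_balance = %lu\n", next_balance);
        return 0;
    }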
  4979. +
  4980. +/*
  4981. + * active_load_balance_cpu_stop is run by cpu stopper. It pushes
  4982. + * running tasks off the busiest CPU onto idle CPUs. It requires at
  4983. + * least 1 task to be running on each physical CPU where possible, and
  4984. + * avoids physical / logical imbalances.
  4985. + */
  4986. +static int active_load_balance_cpu_stop(void *data)
  4987. +{
  4988. +   struct rq *busiest_rq = data;
  4989. +   int busiest_cpu = cpu_of(busiest_rq);
  4990. +   int target_cpu = busiest_rq->push_cpu;
  4991. +   struct rq *target_rq = cpu_rq(target_cpu);
  4992. +   struct sched_domain *sd;
  4993. +
  4994. +   raw_spin_lock_irq(&busiest_rq->lock);
  4995. +
  4996. +   /* make sure the requested cpu hasn't gone down in the meantime */
  4997. +   if (unlikely(busiest_cpu != smp_processor_id() ||
  4998. +            !busiest_rq->active_balance))
  4999. +       goto out_unlock;
  5000. +
  5001. +   /* Is there any task to move? */
  5002. +   if (busiest_rq->nr_running <= 1)
  5003. +       goto out_unlock;
  5004. +
  5005. +   /*
   5006. +    * This condition is "impossible"; if it occurs
  5007. +    * we need to fix it. Originally reported by
  5008. +    * Bjorn Helgaas on a 128-cpu setup.
  5009. +    */
  5010. +   BUG_ON(busiest_rq == target_rq);
  5011. +
  5012. +   /* move a task from busiest_rq to target_rq */
  5013. +   double_lock_balance(busiest_rq, target_rq);
  5014. +
  5015. +   /* Search for an sd spanning us and the target CPU. */
  5016. +   rcu_read_lock();
  5017. +   for_each_domain(target_cpu, sd) {
  5018. +       if ((sd->flags & SD_LOAD_BALANCE) &&
  5019. +           cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
  5020. +               break;
  5021. +   }
  5022. +
  5023. +   if (likely(sd)) {
  5024. +       schedstat_inc(sd, alb_count);
  5025. +
  5026. +       if (move_one_task(target_rq, target_cpu, busiest_rq,
  5027. +                 sd, CPU_IDLE))
  5028. +           schedstat_inc(sd, alb_pushed);
  5029. +       else
  5030. +           schedstat_inc(sd, alb_failed);
  5031. +   }
  5032. +   rcu_read_unlock();
  5033. +   double_unlock_balance(busiest_rq, target_rq);
  5034. +out_unlock:
  5035. +   busiest_rq->active_balance = 0;
  5036. +   raw_spin_unlock_irq(&busiest_rq->lock);
  5037. +   return 0;
  5038. +}
  5039. +
  5040. +#ifdef CONFIG_NO_HZ
  5041. +/*
  5042. + * idle load balancing details
   5043. + * - When one of the busy CPUs notices that idle rebalancing may be
   5044. + *   needed, it kicks the idle load balancer, which then does idle
  5045. + *   load balancing for all the idle CPUs.
  5046. + */
  5047. +static struct {
  5048. +   cpumask_var_t idle_cpus_mask;
  5049. +   atomic_t nr_cpus;
  5050. +   unsigned long next_balance;     /* in jiffy units */
  5051. +} nohz ____cacheline_aligned;
  5052. +
  5053. +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  5054. +/**
  5055. + * lowest_flag_domain - Return lowest sched_domain containing flag.
  5056. + * @cpu:   The cpu whose lowest level of sched domain is to
  5057. + *     be returned.
  5058. + * @flag:  The flag to check for the lowest sched_domain
  5059. + *     for the given cpu.
  5060. + *
  5061. + * Returns the lowest sched_domain of a cpu which contains the given flag.
  5062. + */
  5063. +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  5064. +{
  5065. +   struct sched_domain *sd;
  5066. +
  5067. +   for_each_domain(cpu, sd)
  5068. +       if (sd->flags & flag)
  5069. +           break;
  5070. +
  5071. +   return sd;
  5072. +}
  5073. +
  5074. +/**
  5075. + * for_each_flag_domain - Iterates over sched_domains containing the flag.
  5076. + * @cpu:   The cpu whose domains we're iterating over.
  5077. + * @sd:        variable holding the value of the power_savings_sd
  5078. + *     for cpu.
  5079. + * @flag:  The flag to filter the sched_domains to be iterated.
  5080. + *
  5081. + * Iterates over all the scheduler domains for a given cpu that has the 'flag'
  5082. + * set, starting from the lowest sched_domain to the highest.
  5083. + */
  5084. +#define for_each_flag_domain(cpu, sd, flag) \
  5085. +   for (sd = lowest_flag_domain(cpu, flag); \
  5086. +       (sd && (sd->flags & flag)); sd = sd->parent)
  5087. +
  5088. +/**
  5089. + * find_new_ilb - Finds the optimum idle load balancer for nomination.
  5090. + * @cpu:   The cpu which is nominating a new idle_load_balancer.
  5091. + *
   5092. + * Returns:    the id of the idle load balancer if it exists,
   5093. + *     otherwise a value >= nr_cpu_ids.
  5094. + *
  5095. + * This algorithm picks the idle load balancer such that it belongs to a
   5096. + * semi-idle powersavings sched_domain. The idea is to try to avoid
   5097. + * completely idle packages/cores just for the purpose of idle load balancing
   5098. + * when there are other idle CPUs which are better suited for that job.
  5099. + */
  5100. +static int find_new_ilb(int cpu)
  5101. +{
  5102. +   int ilb = cpumask_first(nohz.idle_cpus_mask);
  5103. +   struct sched_group *ilbg;
  5104. +   struct sched_domain *sd;
  5105. +
  5106. +   /*
  5107. +    * Have idle load balancer selection from semi-idle packages only
  5108. +    * when power-aware load balancing is enabled
  5109. +    */
  5110. +   if (!(sched_smt_power_savings || sched_mc_power_savings))
  5111. +       goto out_done;
  5112. +
  5113. +   /*
  5114. +    * Optimize for the case when we have no idle CPUs or only one
  5115. +    * idle CPU. Don't walk the sched_domain hierarchy in such cases
  5116. +    */
  5117. +   if (cpumask_weight(nohz.idle_cpus_mask) < 2)
  5118. +       goto out_done;
  5119. +
  5120. +   rcu_read_lock();
  5121. +   for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
  5122. +       ilbg = sd->groups;
  5123. +
  5124. +       do {
  5125. +           if (ilbg->group_weight !=
  5126. +               atomic_read(&ilbg->sgp->nr_busy_cpus)) {
  5127. +               ilb = cpumask_first_and(nohz.idle_cpus_mask,
  5128. +                           sched_group_cpus(ilbg));
  5129. +               goto unlock;
  5130. +           }
  5131. +
  5132. +           ilbg = ilbg->next;
  5133. +
  5134. +       } while (ilbg != sd->groups);
  5135. +   }
  5136. +unlock:
  5137. +   rcu_read_unlock();
  5138. +
  5139. +out_done:
  5140. +   if (ilb < nr_cpu_ids && idle_cpu(ilb))
  5141. +       return ilb;
  5142. +
  5143. +   return nr_cpu_ids;
  5144. +}
  5145. +#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
  5146. +static inline int find_new_ilb(int call_cpu)
  5147. +{
  5148. +   return nr_cpu_ids;
  5149. +}
  5150. +#endif
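Whichever branch of find_new_ilb() runs, the result boils down to "first cpu that is both idle and inside the candidate group", i.e. cpumask_first_and(). For a machine small enough to fit in one word, that intersection can be modelled with plain bitmasks; a standalone sketch with invented masks:

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    /* First set bit common to both masks, or -1 if the intersection is empty. */
    static int first_and(unsigned int idle_mask, unsigned int group_mask)
    {
        unsigned int both = idle_mask & group_mask;

        return both ? ffs(both) - 1 : -1;
    }

    int main(void)
    {
        /* Invented 8-cpu box: cpus 2, 5, 6 idle; the group spans cpus 4-7. */
        unsigned int idle_cpus = (1u << 2) | (1u << 5) | (1u << 6);
        unsigned int group     = 0xF0;

        printf("ilb candidate: %d\n", first_and(idle_cpus, group)); /* 5 */
        return 0;
    }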
  5151. +
  5152. +/*
  5153. + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
  5154. + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  5155. + * CPU (if there is one).
  5156. + */
  5157. +static void nohz_balancer_kick(int cpu)
  5158. +{
  5159. +   int ilb_cpu;
  5160. +
  5161. +   nohz.next_balance++;
  5162. +
  5163. +   ilb_cpu = find_new_ilb(cpu);
  5164. +
  5165. +   if (ilb_cpu >= nr_cpu_ids)
  5166. +       return;
  5167. +
  5168. +   if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
  5169. +       return;
  5170. +   /*
  5171. +    * Use smp_send_reschedule() instead of resched_cpu().
  5172. +    * This way we generate a sched IPI on the target cpu which
  5173. +    * is idle. And the softirq performing nohz idle load balance
  5174. +    * will be run before returning from the IPI.
  5175. +    */
  5176. +   smp_send_reschedule(ilb_cpu);
  5177. +   return;
  5178. +}
  5179. +
  5180. +static inline void clear_nohz_tick_stopped(int cpu)
  5181. +{
  5182. +   if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
  5183. +       cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
  5184. +       atomic_dec(&nohz.nr_cpus);
  5185. +       clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
  5186. +   }
  5187. +}
  5188. +
  5189. +static inline void set_cpu_sd_state_busy(void)
  5190. +{
  5191. +   struct sched_domain *sd;
  5192. +   int cpu = smp_processor_id();
  5193. +
  5194. +   if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
  5195. +       return;
  5196. +   clear_bit(NOHZ_IDLE, nohz_flags(cpu));
  5197. +
  5198. +   rcu_read_lock();
  5199. +   for_each_domain(cpu, sd)
  5200. +       atomic_inc(&sd->groups->sgp->nr_busy_cpus);
  5201. +   rcu_read_unlock();
  5202. +}
  5203. +
  5204. +void set_cpu_sd_state_idle(void)
  5205. +{
  5206. +   struct sched_domain *sd;
  5207. +   int cpu = smp_processor_id();
  5208. +
  5209. +   if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
  5210. +       return;
  5211. +   set_bit(NOHZ_IDLE, nohz_flags(cpu));
  5212. +
  5213. +   rcu_read_lock();
  5214. +   for_each_domain(cpu, sd)
  5215. +       atomic_dec(&sd->groups->sgp->nr_busy_cpus);
  5216. +   rcu_read_unlock();
  5217. +}
  5218. +
  5219. +/*
  5220. + * This routine will record that this cpu is going idle with tick stopped.
  5221. + * This info will be used in performing idle load balancing in the future.
  5222. + */
  5223. +void select_nohz_load_balancer(int stop_tick)
  5224. +{
  5225. +   int cpu = smp_processor_id();
  5226. +
  5227. +   /*
  5228. +    * If this cpu is going down, then nothing needs to be done.
  5229. +    */
  5230. +   if (!cpu_active(cpu))
  5231. +       return;
  5232. +
  5233. +   if (stop_tick) {
  5234. +       if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
  5235. +           return;
  5236. +
  5237. +       cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
  5238. +       atomic_inc(&nohz.nr_cpus);
  5239. +       set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
  5240. +   }
  5241. +   return;
  5242. +}
  5243. +
  5244. +static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
  5245. +                   unsigned long action, void *hcpu)
  5246. +{
  5247. +   switch (action & ~CPU_TASKS_FROZEN) {
  5248. +   case CPU_DYING:
  5249. +       clear_nohz_tick_stopped(smp_processor_id());
  5250. +       return NOTIFY_OK;
  5251. +   default:
  5252. +       return NOTIFY_DONE;
  5253. +   }
  5254. +}
  5255. +#endif
  5256. +
  5257. +static DEFINE_SPINLOCK(balancing);
  5258. +
  5259. +static unsigned long __read_mostly max_load_balance_interval = HZ/10;
  5260. +
  5261. +/*
  5262. + * Scale the max load_balance interval with the number of CPUs in the system.
  5263. + * This trades load-balance latency on larger machines for less cross talk.
  5264. + */
  5265. +void update_max_interval(void)
  5266. +{
  5267. +   max_load_balance_interval = HZ*num_online_cpus()/10;
  5268. +}
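update_max_interval() scales the global cap linearly with the online cpu count (HZ/10 jiffies per cpu), and rebalance_domains() below clamps every domain interval to it. A trivial standalone illustration, assuming HZ=250 (the real value depends on the kernel configuration):

    #include <stdio.h>

    #define HZ 250UL    /* assumed tick rate for the example */

    static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    int main(void)
    {
        unsigned long cpus;

        for (cpus = 1; cpus <= 8; cpus *= 2) {
            unsigned long max_interval = HZ * cpus / 10;

            /* a domain asking for a 500-jiffy interval gets clamped */
            printf("%lu cpus: cap %lu, 500 -> %lu\n",
                   cpus, max_interval, clamp_ul(500, 1, max_interval));
        }
        return 0;
    }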
  5269. +
  5270. +/*
  5271. + * It checks each scheduling domain to see if it is due to be balanced,
  5272. + * and initiates a balancing operation if so.
  5273. + *
  5274. + * Balancing parameters are set up in arch_init_sched_domains.
  5275. + */
  5276. +static void rebalance_domains(int cpu, enum cpu_idle_type idle)
  5277. +{
  5278. +   int balance = 1;
  5279. +   struct rq *rq = cpu_rq(cpu);
  5280. +   unsigned long interval;
  5281. +   struct sched_domain *sd;
  5282. +   /* Earliest time when we have to do rebalance again */
  5283. +   unsigned long next_balance = jiffies + 60*HZ;
  5284. +   int update_next_balance = 0;
  5285. +   int need_serialize;
  5286. +
  5287. +   update_shares(cpu);
  5288. +
  5289. +   rcu_read_lock();
  5290. +   for_each_domain(cpu, sd) {
  5291. +       if (!(sd->flags & SD_LOAD_BALANCE))
  5292. +           continue;
  5293. +
  5294. +       interval = sd->balance_interval;
  5295. +       if (idle != CPU_IDLE)
  5296. +           interval *= sd->busy_factor;
  5297. +
  5298. +       /* scale ms to jiffies */
  5299. +       interval = msecs_to_jiffies(interval);
  5300. +       interval = clamp(interval, 1UL, max_load_balance_interval);
  5301. +
  5302. +       need_serialize = sd->flags & SD_SERIALIZE;
  5303. +
  5304. +       if (need_serialize) {
  5305. +           if (!spin_trylock(&balancing))
  5306. +               goto out;
  5307. +       }
  5308. +
  5309. +       if (time_after_eq(jiffies, sd->last_balance + interval)) {
  5310. +           if (load_balance(cpu, rq, sd, idle, &balance)) {
  5311. +               /*
   5312. +                * We've pulled tasks over, so we're no
   5313. +                * longer idle.
  5314. +                */
  5315. +               idle = CPU_NOT_IDLE;
  5316. +           }
  5317. +           sd->last_balance = jiffies;
  5318. +       }
  5319. +       if (need_serialize)
  5320. +           spin_unlock(&balancing);
  5321. +out:
  5322. +       if (time_after(next_balance, sd->last_balance + interval)) {
  5323. +           next_balance = sd->last_balance + interval;
  5324. +           update_next_balance = 1;
  5325. +       }
  5326. +
  5327. +       /*
  5328. +        * Stop the load balance at this level. There is another
  5329. +        * CPU in our sched group which is doing load balancing more
  5330. +        * actively.
  5331. +        */
  5332. +       if (!balance)
  5333. +           break;
  5334. +   }
  5335. +   rcu_read_unlock();
  5336. +
  5337. +   /*
  5338. +    * next_balance will be updated only when there is a need.
   5339. +    * When the cpu is attached to a null domain, for example, it will not be
  5340. +    * updated.
  5341. +    */
  5342. +   if (likely(update_next_balance))
  5343. +       rq->next_balance = next_balance;
  5344. +}
  5345. +
  5346. +#ifdef CONFIG_NO_HZ
  5347. +/*
  5348. + * In CONFIG_NO_HZ case, the idle balance kickee will do the
  5349. + * rebalancing for all the cpus for whom scheduler ticks are stopped.
  5350. + */
  5351. +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
  5352. +{
  5353. +   struct rq *this_rq = cpu_rq(this_cpu);
  5354. +   struct rq *rq;
  5355. +   int balance_cpu;
  5356. +
  5357. +   if (idle != CPU_IDLE ||
  5358. +       !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
  5359. +       goto end;
  5360. +
  5361. +   for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
  5362. +       if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
  5363. +           continue;
  5364. +
  5365. +       /*
  5366. +        * If this cpu gets work to do, stop the load balancing
  5367. +        * work being done for other cpus. Next load
  5368. +        * balancing owner will pick it up.
  5369. +        */
  5370. +       if (need_resched())
  5371. +           break;
  5372. +
  5373. +       raw_spin_lock_irq(&this_rq->lock);
  5374. +       update_rq_clock(this_rq);
  5375. +       update_cpu_load(this_rq);
  5376. +       raw_spin_unlock_irq(&this_rq->lock);
  5377. +
  5378. +       rebalance_domains(balance_cpu, CPU_IDLE);
  5379. +
  5380. +       rq = cpu_rq(balance_cpu);
  5381. +       if (time_after(this_rq->next_balance, rq->next_balance))
  5382. +           this_rq->next_balance = rq->next_balance;
  5383. +   }
  5384. +   nohz.next_balance = this_rq->next_balance;
  5385. +end:
  5386. +   clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
  5387. +}
  5388. +
  5389. +/*
  5390. + * Current heuristic for kicking the idle load balancer in the presence
   5391. + * of an idle cpu in the system.
  5392. + *   - This rq has more than one task.
  5393. + *   - At any scheduler domain level, this cpu's scheduler group has multiple
  5394. + *     busy cpu's exceeding the group's power.
  5395. + *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  5396. + *     domain span are idle.
  5397. + */
  5398. +static inline int nohz_kick_needed(struct rq *rq, int cpu)
  5399. +{
  5400. +   unsigned long now = jiffies;
  5401. +   struct sched_domain *sd;
  5402. +
  5403. +   if (unlikely(idle_cpu(cpu)))
  5404. +       return 0;
  5405. +
   5406. +   /*
   5407. +    * We may recently have been in ticked or tickless idle mode. At the first
   5408. +    * busy tick after returning from idle, we will update the busy stats.
   5409. +    */
  5410. +   set_cpu_sd_state_busy();
  5411. +   clear_nohz_tick_stopped(cpu);
  5412. +
  5413. +   /*
  5414. +    * None are in tickless mode and hence no need for NOHZ idle load
  5415. +    * balancing.
  5416. +    */
  5417. +   if (likely(!atomic_read(&nohz.nr_cpus)))
  5418. +       return 0;
  5419. +
  5420. +   if (time_before(now, nohz.next_balance))
  5421. +       return 0;
  5422. +
  5423. +   if (rq->nr_running >= 2)
  5424. +       goto need_kick;
  5425. +
  5426. +   rcu_read_lock();
  5427. +   for_each_domain(cpu, sd) {
  5428. +       struct sched_group *sg = sd->groups;
  5429. +       struct sched_group_power *sgp = sg->sgp;
  5430. +       int nr_busy = atomic_read(&sgp->nr_busy_cpus);
  5431. +
  5432. +       if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
  5433. +           goto need_kick_unlock;
  5434. +
  5435. +       if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
  5436. +           && (cpumask_first_and(nohz.idle_cpus_mask,
  5437. +                     sched_domain_span(sd)) < cpu))
  5438. +           goto need_kick_unlock;
  5439. +
  5440. +       if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
  5441. +           break;
  5442. +   }
  5443. +   rcu_read_unlock();
  5444. +   return 0;
  5445. +
  5446. +need_kick_unlock:
  5447. +   rcu_read_unlock();
  5448. +need_kick:
  5449. +   return 1;
  5450. +}
  5451. +#else
  5452. +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
  5453. +#endif
  5454. +
  5455. +/*
  5456. + * run_rebalance_domains is triggered when needed from the scheduler tick.
  5457. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
  5458. + */
  5459. +static void run_rebalance_domains(struct softirq_action *h)
  5460. +{
  5461. +   int this_cpu = smp_processor_id();
  5462. +   struct rq *this_rq = cpu_rq(this_cpu);
  5463. +   enum cpu_idle_type idle = this_rq->idle_balance ?
  5464. +                       CPU_IDLE : CPU_NOT_IDLE;
  5465. +
  5466. +   rebalance_domains(this_cpu, idle);
  5467. +
  5468. +   /*
  5469. +    * If this cpu has a pending nohz_balance_kick, then do the
  5470. +    * balancing on behalf of the other idle cpus whose ticks are
  5471. +    * stopped.
  5472. +    */
  5473. +   nohz_idle_balance(this_cpu, idle);
  5474. +}
  5475. +
  5476. +static inline int on_null_domain(int cpu)
  5477. +{
  5478. +   return !rcu_dereference_sched(cpu_rq(cpu)->sd);
  5479. +}
  5480. +
  5481. +/*
  5482. + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  5483. + */
  5484. +void trigger_load_balance(struct rq *rq, int cpu)
  5485. +{
  5486. +   /* Don't need to rebalance while attached to NULL domain */
  5487. +   if (time_after_eq(jiffies, rq->next_balance) &&
  5488. +       likely(!on_null_domain(cpu)))
  5489. +       raise_softirq(SCHED_SOFTIRQ);
  5490. +#ifdef CONFIG_NO_HZ
  5491. +   if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
  5492. +       nohz_balancer_kick(cpu);
  5493. +#endif
  5494. +}
  5495. +
  5496. +static void rq_online_fair(struct rq *rq)
  5497. +{
  5498. +   update_sysctl();
  5499. +}
  5500. +
  5501. +static void rq_offline_fair(struct rq *rq)
  5502. +{
  5503. +   update_sysctl();
  5504. +}
  5505. +
  5506. +#endif /* CONFIG_SMP */
  5507. +
  5508. +/*
  5509. + * scheduler tick hitting a task of our scheduling class:
  5510. + */
  5511. +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
  5512. +{
  5513. +   struct cfs_rq *cfs_rq;
  5514. +   struct sched_entity *se = &curr->se;
  5515. +
  5516. +   for_each_sched_entity(se) {
  5517. +       cfs_rq = cfs_rq_of(se);
  5518. +       entity_tick(cfs_rq, se, queued);
  5519. +   }
  5520. +}
  5521. +
  5522. +/*
  5523. + * called on fork with the child task as argument from the parent's context
  5524. + *  - child not yet on the tasklist
  5525. + *  - preemption disabled
  5526. + */
  5527. +static void task_fork_fair(struct task_struct *p)
  5528. +{
  5529. +   struct cfs_rq *cfs_rq;
  5530. +   struct sched_entity *se = &p->se, *curr;
  5531. +   int this_cpu = smp_processor_id();
  5532. +   struct rq *rq = this_rq();
  5533. +   unsigned long flags;
  5534. +
  5535. +   raw_spin_lock_irqsave(&rq->lock, flags);
  5536. +
  5537. +   update_rq_clock(rq);
  5538. +
  5539. +   cfs_rq = task_cfs_rq(current);
  5540. +   curr = cfs_rq->curr;
  5541. +
  5542. +   if (unlikely(task_cpu(p) != this_cpu)) {
  5543. +       rcu_read_lock();
  5544. +       __set_task_cpu(p, this_cpu);
  5545. +       rcu_read_unlock();
  5546. +   }
  5547. +
  5548. +   update_curr(cfs_rq);
  5549. +
  5550. +   if (curr)
  5551. +       se->vruntime = curr->vruntime;
  5552. +   place_entity(cfs_rq, se, 1);
  5553. +
  5554. +   if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
  5555. +       /*
  5556. +        * Upon rescheduling, sched_class::put_prev_task() will place
  5557. +        * 'current' within the tree based on its new key value.
  5558. +        */
  5559. +       swap(curr->vruntime, se->vruntime);
  5560. +       resched_task(rq->curr);
  5561. +   }
  5562. +
  5563. +   se->vruntime -= cfs_rq->min_vruntime;
  5564. +
  5565. +   raw_spin_unlock_irqrestore(&rq->lock, flags);
  5566. +}
  5567. +
  5568. +/*
  5569. + * Priority of the task has changed. Check to see if we preempt
  5570. + * the current task.
  5571. + */
  5572. +static void
  5573. +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
  5574. +{
  5575. +   if (!p->se.on_rq)
  5576. +       return;
  5577. +
  5578. +   /*
  5579. +    * Reschedule if we are currently running on this runqueue and
  5580. +    * our priority decreased, or if we are not currently running on
  5581. +    * this runqueue and our priority is higher than the current's
  5582. +    */
  5583. +   if (rq->curr == p) {
  5584. +       if (p->prio > oldprio)
  5585. +           resched_task(rq->curr);
  5586. +   } else
  5587. +       check_preempt_curr(rq, p, 0);
  5588. +}
  5589. +
  5590. +static void switched_from_fair(struct rq *rq, struct task_struct *p)
  5591. +{
  5592. +   struct sched_entity *se = &p->se;
  5593. +   struct cfs_rq *cfs_rq = cfs_rq_of(se);
  5594. +
  5595. +   /*
   5596. +    * Ensure the task's vruntime is normalized, so that when it is
  5597. +    * switched back to the fair class the enqueue_entity(.flags=0) will
  5598. +    * do the right thing.
  5599. +    *
  5600. +    * If it was on_rq, then the dequeue_entity(.flags=0) will already
   5601. +    * have normalized the vruntime; if it was !on_rq, then only when
  5602. +    * the task is sleeping will it still have non-normalized vruntime.
  5603. +    */
  5604. +   if (!se->on_rq && p->state != TASK_RUNNING) {
  5605. +       /*
  5606. +        * Fix up our vruntime so that the current sleep doesn't
  5607. +        * cause 'unlimited' sleep bonus.
  5608. +        */
  5609. +       place_entity(cfs_rq, se, 0);
  5610. +       se->vruntime -= cfs_rq->min_vruntime;
  5611. +   }
  5612. +}
  5613. +
  5614. +/*
  5615. + * We switched to the sched_fair class.
  5616. + */
  5617. +static void switched_to_fair(struct rq *rq, struct task_struct *p)
  5618. +{
  5619. +   if (!p->se.on_rq)
  5620. +       return;
  5621. +
  5622. +   /*
  5623. +    * We were most likely switched from sched_rt, so
  5624. +    * kick off the schedule if running, otherwise just see
  5625. +    * if we can still preempt the current task.
  5626. +    */
  5627. +   if (rq->curr == p)
  5628. +       resched_task(rq->curr);
  5629. +   else
  5630. +       check_preempt_curr(rq, p, 0);
  5631. +}
  5632. +
  5633. +/* Account for a task changing its policy or group.
  5634. + *
  5635. + * This routine is mostly called to set cfs_rq->curr field when a task
  5636. + * migrates between groups/classes.
  5637. + */
  5638. +static void set_curr_task_fair(struct rq *rq)
  5639. +{
  5640. +   struct sched_entity *se = &rq->curr->se;
  5641. +
  5642. +   for_each_sched_entity(se) {
  5643. +       struct cfs_rq *cfs_rq = cfs_rq_of(se);
  5644. +
  5645. +       set_next_entity(cfs_rq, se);
  5646. +       /* ensure bandwidth has been allocated on our new cfs_rq */
  5647. +       account_cfs_rq_runtime(cfs_rq, 0);
  5648. +   }
  5649. +}
  5650. +
  5651. +void init_cfs_rq(struct cfs_rq *cfs_rq)
  5652. +{
  5653. +   cfs_rq->tasks_timeline = RB_ROOT;
  5654. +   INIT_LIST_HEAD(&cfs_rq->tasks);
  5655. +   cfs_rq->min_vruntime = (u64)(-(1LL << 20));
  5656. +#ifndef CONFIG_64BIT
  5657. +   cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  5658. +#endif
  5659. +}
  5660. +
  5661. +#ifdef CONFIG_FAIR_GROUP_SCHED
  5662. +static void task_move_group_fair(struct task_struct *p, int on_rq)
  5663. +{
  5664. +   /*
  5665. +    * If the task was not on the rq at the time of this cgroup movement
  5666. +    * it must have been asleep, sleeping tasks keep their ->vruntime
  5667. +    * absolute on their old rq until wakeup (needed for the fair sleeper
  5668. +    * bonus in place_entity()).
  5669. +    *
  5670. +    * If it was on the rq, we've just 'preempted' it, which does convert
  5671. +    * ->vruntime to a relative base.
  5672. +    *
  5673. +    * Make sure both cases convert their relative position when migrating
  5674. +    * to another cgroup's rq. This does somewhat interfere with the
  5675. +    * fair sleeper stuff for the first placement, but who cares.
  5676. +    */
  5677. +   /*
  5678. +    * When !on_rq, vruntime of the task has usually NOT been normalized.
  5679. +    * But there are some cases where it has already been normalized:
  5680. +    *
  5681. +    * - Moving a forked child which is waiting for being woken up by
  5682. +    *   wake_up_new_task().
  5683. +    * - Moving a task which has been woken up by try_to_wake_up() and
  5684. +    *   waiting for actually being woken up by sched_ttwu_pending().
  5685. +    *
  5686. +    * To prevent boost or penalty in the new cfs_rq caused by delta
  5687. +    * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
  5688. +    */
  5689. +   if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
  5690. +       on_rq = 1;
  5691. +
  5692. +   if (!on_rq)
  5693. +       p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
  5694. +   set_task_rq(p, task_cpu(p));
  5695. +   if (!on_rq)
  5696. +       p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
  5697. +}
  5698. +
  5699. +void free_fair_sched_group(struct task_group *tg)
  5700. +{
  5701. +   int i;
  5702. +
  5703. +   destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
  5704. +
  5705. +   for_each_possible_cpu(i) {
  5706. +       if (tg->cfs_rq)
  5707. +           kfree(tg->cfs_rq[i]);
  5708. +       if (tg->se)
  5709. +           kfree(tg->se[i]);
  5710. +   }
  5711. +
  5712. +   kfree(tg->cfs_rq);
  5713. +   kfree(tg->se);
  5714. +}
  5715. +
  5716. +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  5717. +{
  5718. +   struct cfs_rq *cfs_rq;
  5719. +   struct sched_entity *se;
  5720. +   int i;
  5721. +
  5722. +   tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
  5723. +   if (!tg->cfs_rq)
  5724. +       goto err;
  5725. +   tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
  5726. +   if (!tg->se)
  5727. +       goto err;
  5728. +
  5729. +   tg->shares = NICE_0_LOAD;
  5730. +
  5731. +   init_cfs_bandwidth(tg_cfs_bandwidth(tg));
  5732. +
  5733. +   for_each_possible_cpu(i) {
  5734. +       cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
  5735. +                     GFP_KERNEL, cpu_to_node(i));
  5736. +       if (!cfs_rq)
  5737. +           goto err;
  5738. +
  5739. +       se = kzalloc_node(sizeof(struct sched_entity),
  5740. +                 GFP_KERNEL, cpu_to_node(i));
  5741. +       if (!se)
  5742. +           goto err_free_rq;
  5743. +
  5744. +       init_cfs_rq(cfs_rq);
  5745. +       init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
  5746. +   }
  5747. +
  5748. +   return 1;
  5749. +
  5750. +err_free_rq:
  5751. +   kfree(cfs_rq);
  5752. +err:
  5753. +   return 0;
  5754. +}
  5755. +
  5756. +void unregister_fair_sched_group(struct task_group *tg, int cpu)
  5757. +{
  5758. +   struct rq *rq = cpu_rq(cpu);
  5759. +   unsigned long flags;
  5760. +
  5761. +   /*
   5762. +    * Only empty task groups can be destroyed; so we can speculatively
   5763. +    * check on_list without danger of it being re-added.
   5764. +    */
  5765. +   if (!tg->cfs_rq[cpu]->on_list)
  5766. +       return;
  5767. +
  5768. +   raw_spin_lock_irqsave(&rq->lock, flags);
  5769. +   list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
  5770. +   raw_spin_unlock_irqrestore(&rq->lock, flags);
  5771. +}
  5772. +
  5773. +void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
  5774. +           struct sched_entity *se, int cpu,
  5775. +           struct sched_entity *parent)
  5776. +{
  5777. +   struct rq *rq = cpu_rq(cpu);
  5778. +
  5779. +   cfs_rq->tg = tg;
  5780. +   cfs_rq->rq = rq;
  5781. +#ifdef CONFIG_SMP
  5782. +   /* allow initial update_cfs_load() to truncate */
  5783. +   cfs_rq->load_stamp = 1;
  5784. +#endif
  5785. +   init_cfs_rq_runtime(cfs_rq);
  5786. +
  5787. +   tg->cfs_rq[cpu] = cfs_rq;
  5788. +   tg->se[cpu] = se;
  5789. +
  5790. +   /* se could be NULL for root_task_group */
  5791. +   if (!se)
  5792. +       return;
  5793. +
  5794. +   if (!parent)
  5795. +       se->cfs_rq = &rq->cfs;
  5796. +   else
  5797. +       se->cfs_rq = parent->my_q;
  5798. +
  5799. +   se->my_q = cfs_rq;
  5800. +   update_load_set(&se->load, 0);
  5801. +   se->parent = parent;
  5802. +}
  5803. +
  5804. +static DEFINE_MUTEX(shares_mutex);
  5805. +
  5806. +int sched_group_set_shares(struct task_group *tg, unsigned long shares)
  5807. +{
  5808. +   int i;
  5809. +   unsigned long flags;
  5810. +
  5811. +   /*
  5812. +    * We can't change the weight of the root cgroup.
  5813. +    */
  5814. +   if (!tg->se[0])
  5815. +       return -EINVAL;
  5816. +
  5817. +   shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
  5818. +
  5819. +   mutex_lock(&shares_mutex);
  5820. +   if (tg->shares == shares)
  5821. +       goto done;
  5822. +
  5823. +   tg->shares = shares;
  5824. +   for_each_possible_cpu(i) {
  5825. +       struct rq *rq = cpu_rq(i);
  5826. +       struct sched_entity *se;
  5827. +
  5828. +       se = tg->se[i];
  5829. +       /* Propagate contribution to hierarchy */
  5830. +       raw_spin_lock_irqsave(&rq->lock, flags);
  5831. +       for_each_sched_entity(se)
  5832. +           update_cfs_shares(group_cfs_rq(se));
  5833. +       raw_spin_unlock_irqrestore(&rq->lock, flags);
  5834. +   }
  5835. +
  5836. +done:
  5837. +   mutex_unlock(&shares_mutex);
  5838. +   return 0;
  5839. +}
  5840. +#else /* CONFIG_FAIR_GROUP_SCHED */
  5841. +
  5842. +void free_fair_sched_group(struct task_group *tg) { }
  5843. +
  5844. +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  5845. +{
  5846. +   return 1;
  5847. +}
  5848. +
  5849. +void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
  5850. +
  5851. +#endif /* CONFIG_FAIR_GROUP_SCHED */
  5852. +
  5853. +
  5854. +static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
  5855. +{
  5856. +   struct sched_entity *se = &task->se;
  5857. +   unsigned int rr_interval = 0;
  5858. +
  5859. +   /*
  5860. +    * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
  5861. +    * idle runqueue:
  5862. +    */
  5863. +   if (rq->cfs.load.weight)
  5864. +       rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
  5865. +
  5866. +   return rr_interval;
  5867. +}
  5868. +
  5869. +/*
  5870. + * All the scheduling class methods:
  5871. + */
  5872. +const struct sched_class fair_sched_class = {
  5873. +   .next           = &idle_sched_class,
  5874. +   .enqueue_task       = enqueue_task_fair,
  5875. +   .dequeue_task       = dequeue_task_fair,
  5876. +   .yield_task     = yield_task_fair,
  5877. +   .yield_to_task      = yield_to_task_fair,
  5878. +
  5879. +   .check_preempt_curr = check_preempt_wakeup,
  5880. +
  5881. +   .pick_next_task     = pick_next_task_fair,
  5882. +   .put_prev_task      = put_prev_task_fair,
  5883. +
  5884. +#ifdef CONFIG_SMP
  5885. +   .select_task_rq     = select_task_rq_fair,
  5886. +
  5887. +   .rq_online      = rq_online_fair,
  5888. +   .rq_offline     = rq_offline_fair,
  5889. +
  5890. +   .task_waking        = task_waking_fair,
  5891. +#endif
  5892. +
  5893. +   .set_curr_task          = set_curr_task_fair,
  5894. +   .task_tick      = task_tick_fair,
  5895. +   .task_fork      = task_fork_fair,
  5896. +
  5897. +   .prio_changed       = prio_changed_fair,
  5898. +   .switched_from      = switched_from_fair,
  5899. +   .switched_to        = switched_to_fair,
  5900. +
  5901. +   .get_rr_interval    = get_rr_interval_fair,
  5902. +
  5903. +#ifdef CONFIG_FAIR_GROUP_SCHED
  5904. +   .task_move_group    = task_move_group_fair,
  5905. +#endif
  5906. +};
  5907. +
  5908. +#ifdef CONFIG_SCHED_DEBUG
  5909. +void print_cfs_stats(struct seq_file *m, int cpu)
  5910. +{
  5911. +   struct cfs_rq *cfs_rq;
  5912. +
  5913. +   rcu_read_lock();
  5914. +   for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
  5915. +       print_cfs_rq(m, cpu, cfs_rq);
  5916. +   rcu_read_unlock();
  5917. +}
  5918. +#endif
  5919. +
  5920. +__init void init_sched_fair_class(void)
  5921. +{
  5922. +#ifdef CONFIG_SMP
  5923. +   open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
  5924. +
  5925. +#ifdef CONFIG_NO_HZ
  5926. +   zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
  5927. +   cpu_notifier(sched_ilb_notifier, 0);
  5928. +#endif
  5929. +#endif /* SMP */
  5930. +
  5931. +}
  5932. diff -Naur linux-3.3.7.orig/kernel/sched/sched.h linux-3.3.7/kernel/sched/sched.h
  5933. --- linux-3.3.7.orig/kernel/sched/sched.h   2012-05-21 21:42:51.000000000 +0300
  5934. +++ linux-3.3.7/kernel/sched/sched.h    2012-05-23 21:52:34.000000000 +0300
  5935. @@ -474,6 +474,17 @@
  5936.  #ifdef CONFIG_SMP
  5937.     struct llist_head wake_list;
  5938.  #endif
  5939. +#ifdef CONFIG_BLD
  5940. +   unsigned long this_cpu_load;
  5941. +   struct list_head disp_load_balance;
   5942. +   /* Indicates whether this rq is the first, the last,
   5943. +    * or in the middle of rq_head, ordered by load:
   5944. +    * 0 - first rq
   5945. +    * 1 - rq is in the middle
   5946. +    * 2 - last rq
  5947. +    */
  5948. +   char pos;
  5949. +#endif
  5950.  };
  5951.  
  5952.  static inline int cpu_of(struct rq *rq)
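For reference, the new rq->pos byte only records where the runqueue sits in the load-ordered rq_head list. A plain-C illustration of that 0/1/2 encoding, written outside the kernel with an invented classify() helper (the patch itself assigns pos while relinking the list, not via such a helper):

    #include <stdio.h>

    enum rq_pos { RQ_FIRST = 0, RQ_MIDDLE = 1, RQ_LAST = 2 };

    /* Classify index i within an n-entry load-ordered list. */
    static enum rq_pos classify(int i, int n)
    {
        if (i == 0)
            return RQ_FIRST;
        if (i == n - 1)
            return RQ_LAST;
        return RQ_MIDDLE;
    }

    int main(void)
    {
        int i, n = 4;

        for (i = 0; i < n; i++)
            printf("rq %d -> pos %d\n", i, classify(i, n));
        /* prints 0, 1, 1, 2: first, middle, middle, last */
        return 0;
    }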