marlin vs. exns sched

Aug 26th, 2018
  1. diff -Nur /home/ninez/android/marlin/kernel/sched/auto_group.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/auto_group.c
  2. --- /home/ninez/android/marlin/kernel/sched/auto_group.c    2018-08-10 01:54:08.563395055 -0400
  3. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/auto_group.c    2018-08-11 23:57:17.128607487 -0400
  4. @@ -214,7 +214,7 @@
  5.     ag = autogroup_task_get(p);
  6.  
  7.     down_write(&ag->lock);
  8. -   err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
  9. +   err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
  10.     if (!err)
  11.         ag->nice = nice;
  12.     up_write(&ag->lock);
  13. diff -Nur /home/ninez/android/marlin/kernel/sched/boost.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/boost.c
  14. --- /home/ninez/android/marlin/kernel/sched/boost.c 1969-12-31 19:00:00.000000000 -0500
  15. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/boost.c 2018-08-14 15:53:43.604124856 -0400
  16. @@ -0,0 +1,68 @@
  17. +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
  18. + *
  19. + * This program is free software; you can redistribute it and/or modify
  20. + * it under the terms of the GNU General Public License version 2 and
  21. + * only version 2 as published by the Free Software Foundation.
  22. + *
  23. + * This program is distributed in the hope that it will be useful,
  24. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  25. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  26. + * GNU General Public License for more details.
  27. + */
  28. +
  29. +#include "sched.h"
  30. +#include <linux/sched.h>
  31. +
  32. +/*
  33. + * Scheduler boost is a mechanism to temporarily place tasks on CPUs
  34. + * with higher capacity than those where a task would have normally
  35. + * ended up with their load characteristics. Any entity enabling
  36. + * boost is responsible for disabling it as well.
  37. + */
  38. +
  39. +unsigned int sysctl_sched_boost;
  40. +
  41. +static bool verify_boost_params(int old_val, int new_val)
  42. +{
  43. +   /*
  44. +    * Boost can only be turned on or off. There is no possiblity of
  45. +    * switching from one boost type to another or to set the same
  46. +    * kind of boost several times.
  47. +    */
  48. +   return !(!!old_val == !!new_val);
  49. +}
  50. +
  51. +int sched_boost_handler(struct ctl_table *table, int write,
  52. +       void __user *buffer, size_t *lenp,
  53. +       loff_t *ppos)
  54. +{
  55. +   int ret;
  56. +   unsigned int *data = (unsigned int *)table->data;
  57. +   unsigned int old_val;
  58. +   unsigned int dsb_top_app_boost = 30;
  59. +   unsigned int dsb_top_app_floor = 0;
  60. +
  61. +   // Backup current sysctl_sched_boost value
  62. +   old_val = *data;
  63. +
  64. +   // Set new sysctl_sched_boost value
  65. +   ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  66. +
  67. +   if (ret || !write)
  68. +       goto done;
  69. +
  70. +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
  71. +   if (verify_boost_params(old_val, *data)) {
  72. +       if (*data > 0)
  73. +           do_stune_boost("top-app", dsb_top_app_boost);
  74. +       else
  75. +           do_stune_unboost("top-app", dsb_top_app_floor);
  76. +   } else {
  77. +       *data = old_val;
  78. +       ret = -EINVAL;
  79. +   }
  80. +#endif // CONFIG_DYNAMIC_STUNE_BOOST
  81. +
  82. +done:
  83. +   return ret;
  84. +}
  85. diff -Nur /home/ninez/android/marlin/kernel/sched/completion.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/completion.c
  86. --- /home/ninez/android/marlin/kernel/sched/completion.c    2018-08-10 01:54:08.563395055 -0400
  87. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/completion.c    2018-08-12 21:13:57.906629665 -0400
  88. @@ -30,10 +30,10 @@
  89.  {
  90.     unsigned long flags;
  91.  
  92. -   spin_lock_irqsave(&x->wait.lock, flags);
  93. +   raw_spin_lock_irqsave(&x->wait.lock, flags);
  94.     x->done++;
  95. -   __wake_up_locked(&x->wait, TASK_NORMAL, 1);
  96. -   spin_unlock_irqrestore(&x->wait.lock, flags);
  97. +   swake_up_locked(&x->wait);
  98. +   raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  99.  }
  100.  EXPORT_SYMBOL(complete);
  101.  
  102. @@ -50,10 +50,10 @@
  103.  {
  104.     unsigned long flags;
  105.  
  106. -   spin_lock_irqsave(&x->wait.lock, flags);
  107. +   raw_spin_lock_irqsave(&x->wait.lock, flags);
  108.     x->done += UINT_MAX/2;
  109. -   __wake_up_locked(&x->wait, TASK_NORMAL, 0);
  110. -   spin_unlock_irqrestore(&x->wait.lock, flags);
  111. +   swake_up_all_locked(&x->wait);
  112. +   raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  113.  }
  114.  EXPORT_SYMBOL(complete_all);
  115.  
  116. @@ -62,20 +62,20 @@
  117.            long (*action)(long), long timeout, int state)
  118.  {
  119.     if (!x->done) {
  120. -       DECLARE_WAITQUEUE(wait, current);
  121. +       DECLARE_SWAITQUEUE(wait);
  122.  
  123. -       __add_wait_queue_tail_exclusive(&x->wait, &wait);
  124. +       __prepare_to_swait(&x->wait, &wait);
  125.         do {
  126.             if (signal_pending_state(state, current)) {
  127.                 timeout = -ERESTARTSYS;
  128.                 break;
  129.             }
  130.             __set_current_state(state);
  131. -           spin_unlock_irq(&x->wait.lock);
  132. +           raw_spin_unlock_irq(&x->wait.lock);
  133.             timeout = action(timeout);
  134. -           spin_lock_irq(&x->wait.lock);
  135. +           raw_spin_lock_irq(&x->wait.lock);
  136.         } while (!x->done && timeout);
  137. -       __remove_wait_queue(&x->wait, &wait);
  138. +       __finish_swait(&x->wait, &wait);
  139.         if (!x->done)
  140.             return timeout;
  141.     }
  142. @@ -89,9 +89,9 @@
  143.  {
  144.     might_sleep();
  145.  
  146. -   spin_lock_irq(&x->wait.lock);
  147. +   raw_spin_lock_irq(&x->wait.lock);
  148.     timeout = do_wait_for_common(x, action, timeout, state);
  149. -   spin_unlock_irq(&x->wait.lock);
  150. +   raw_spin_unlock_irq(&x->wait.lock);
  151.     return timeout;
  152.  }
  153.  
  154. @@ -267,12 +267,21 @@
  155.     unsigned long flags;
  156.     int ret = 1;
  157.  
  158. -   spin_lock_irqsave(&x->wait.lock, flags);
  159. +   /*
  160. +    * Since x->done will need to be locked only
  161. +    * in the non-blocking case, we check x->done
  162. +    * first without taking the lock so we can
  163. +    * return early in the blocking case.
  164. +    */
  165. +   if (!READ_ONCE(x->done))
  166. +       return 0;
  167. +
  168. +   raw_spin_lock_irqsave(&x->wait.lock, flags);
  169.     if (!x->done)
  170.         ret = 0;
  171.     else
  172.         x->done--;
  173. -   spin_unlock_irqrestore(&x->wait.lock, flags);
  174. +   raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  175.     return ret;
  176.  }
  177.  EXPORT_SYMBOL(try_wait_for_completion);
  178. @@ -287,13 +296,21 @@
  179.   */
  180.  bool completion_done(struct completion *x)
  181.  {
  182. -   unsigned long flags;
  183. -   int ret = 1;
  184. +   if (!READ_ONCE(x->done))
  185. +       return false;
  186.  
  187. -   spin_lock_irqsave(&x->wait.lock, flags);
  188. -   if (!x->done)
  189. -       ret = 0;
  190. -   spin_unlock_irqrestore(&x->wait.lock, flags);
  191. -   return ret;
  192. +   /*
  193. +    * If ->done, we need to wait for complete() to release ->wait.lock
  194. +    * otherwise we can end up freeing the completion before complete()
  195. +    * is done referencing it.
  196. +    *
  197. +    * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
  198. +    * the loads of ->done and ->wait.lock such that we cannot observe
  199. +    * the lock before complete() acquires it while observing the ->done
  200. +    * after it's acquired the lock.
  201. +    */
  202. +   smp_rmb();
  203. +   raw_spin_unlock_wait(&x->wait.lock);
  204. +   return true;
  205.  }
  206.  EXPORT_SYMBOL(completion_done);
  207. diff -Nur /home/ninez/android/marlin/kernel/sched/core.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/core.c
  208. --- /home/ninez/android/marlin/kernel/sched/core.c  2018-08-10 01:54:08.563395055 -0400
  209. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/core.c  2018-08-26 16:43:11.647206295 -0400
  210. @@ -94,7 +94,6 @@
  211.  #define CREATE_TRACE_POINTS
  212.  #include <trace/events/sched.h>
  213.  #include "walt.h"
  214. -#include "tune.h"
  215.  
  216.  DEFINE_MUTEX(sched_domains_mutex);
  217.  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  218. @@ -105,7 +104,9 @@
  219.  {
  220.     s64 delta;
  221.  
  222. -   if (rq->skip_clock_update > 0)
  223. +   lockdep_assert_held(&rq->lock);
  224. +
  225. +   if (rq->clock_skip_update & RQCF_ACT_SKIP)
  226.         return;
  227.  
  228.     delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
  229. @@ -168,14 +169,12 @@
  230.  
  231.  static void sched_feat_disable(int i)
  232.  {
  233. -   if (static_key_enabled(&sched_feat_keys[i]))
  234. -       static_key_slow_dec(&sched_feat_keys[i]);
  235. +   static_key_disable(&sched_feat_keys[i]);
  236.  }
  237.  
  238.  static void sched_feat_enable(int i)
  239.  {
  240. -   if (!static_key_enabled(&sched_feat_keys[i]))
  241. -       static_key_slow_inc(&sched_feat_keys[i]);
  242. +   static_key_enable(&sched_feat_keys[i]);
  243.  }
  244.  #else
  245.  static void sched_feat_disable(int i) { };
  246. @@ -290,10 +289,40 @@
  247.   */
  248.  int sysctl_sched_rt_runtime = 950000;
  249.  
  250. +/* cpus with isolated domains */
  251. +cpumask_var_t cpu_isolated_map;
  252. +
  253. +struct rq *
  254. +lock_rq_of(struct task_struct *p, struct rq_flags *rf)
  255. +{
  256. +   return task_rq_lock(p, rf);
  257. +}
  258. +
  259. +void
  260. +unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  261. +{
  262. +   task_rq_unlock(rq, p, rf);
  263. +}
  264. +
  265. +/*
  266. + * this_rq_lock - lock this runqueue and disable interrupts.
  267. + */
  268. +static struct rq *this_rq_lock(void)
  269. +   __acquires(rq->lock)
  270. +{
  271. +   struct rq *rq;
  272. +
  273. +   local_irq_disable();
  274. +   rq = this_rq();
  275. +   raw_spin_lock(&rq->lock);
  276. +
  277. +   return rq;
  278. +}
  279. +
  280.  /*
  281.   * __task_rq_lock - lock the rq @p resides on.
  282.   */
  283. -static inline struct rq *__task_rq_lock(struct task_struct *p)
  284. +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
  285.     __acquires(rq->lock)
  286.  {
  287.     struct rq *rq;
  288. @@ -303,8 +332,10 @@
  289.     for (;;) {
  290.         rq = task_rq(p);
  291.         raw_spin_lock(&rq->lock);
  292. -       if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
  293. +       if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
  294. +           rf->cookie = lockdep_pin_lock(&rq->lock);
  295.             return rq;
  296. +       }
  297.         raw_spin_unlock(&rq->lock);
  298.  
  299.         while (unlikely(task_on_rq_migrating(p)))
  300. @@ -315,68 +346,44 @@
  301.  /*
  302.   * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
  303.   */
  304. -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
  305. +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
  306.     __acquires(p->pi_lock)
  307.     __acquires(rq->lock)
  308.  {
  309.     struct rq *rq;
  310.  
  311.     for (;;) {
  312. -       raw_spin_lock_irqsave(&p->pi_lock, *flags);
  313. +       raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
  314.         rq = task_rq(p);
  315.         raw_spin_lock(&rq->lock);
  316. -       if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
  317. +       /*
  318. +        *  move_queued_task()      task_rq_lock()
  319. +        *
  320. +        *  ACQUIRE (rq->lock)
  321. +        *  [S] ->on_rq = MIGRATING     [L] rq = task_rq()
  322. +        *  WMB (__set_task_cpu())      ACQUIRE (rq->lock);
  323. +        *  [S] ->cpu = new_cpu     [L] task_rq()
  324. +        *                  [L] ->on_rq
  325. +        *  RELEASE (rq->lock)
  326. +        *
  327. +        * If we observe the old cpu in task_rq_lock, the acquire of
  328. +        * the old rq->lock will fully serialize against the stores.
  329. +        *
  330. +        * If we observe the new cpu in task_rq_lock, the acquire will
  331. +        * pair with the WMB to ensure we must then also see migrating.
  332. +        */
  333. +       if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
  334. +           rf->cookie = lockdep_pin_lock(&rq->lock);
  335.             return rq;
  336. +       }
  337.         raw_spin_unlock(&rq->lock);
  338. -       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
  339. +       raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
  340.  
  341.         while (unlikely(task_on_rq_migrating(p)))
  342.             cpu_relax();
  343.     }
  344.  }
  345.  
  346. -struct rq *
  347. -lock_rq_of(struct task_struct *p, unsigned long *flags)
  348. -{
  349. -   return task_rq_lock(p, flags);
  350. -}
  351. -
  352. -static void __task_rq_unlock(struct rq *rq)
  353. -   __releases(rq->lock)
  354. -{
  355. -   raw_spin_unlock(&rq->lock);
  356. -}
  357. -
  358. -static inline void
  359. -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
  360. -   __releases(rq->lock)
  361. -   __releases(p->pi_lock)
  362. -{
  363. -   raw_spin_unlock(&rq->lock);
  364. -   raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
  365. -}
  366. -
  367. -void
  368. -unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
  369. -{
  370. -   task_rq_unlock(rq, p, flags);
  371. -}
  372. -
  373. -/*
  374. - * this_rq_lock - lock this runqueue and disable interrupts.
  375. - */
  376. -static struct rq *this_rq_lock(void)
  377. -   __acquires(rq->lock)
  378. -{
  379. -   struct rq *rq;
  380. -
  381. -   local_irq_disable();
  382. -   rq = this_rq();
  383. -   raw_spin_lock(&rq->lock);
  384. -
  385. -   return rq;
  386. -}
  387. -
  388.  #ifdef CONFIG_SCHED_HRTICK
  389.  /*
  390.   * Use HR-timers to deliver accurate preemption points.
  391. @@ -531,15 +538,19 @@
  392.  /*
  393.   * cmpxchg based fetch_or, macro so it works for different integer types
  394.   */
  395. -#define fetch_or(ptr, val)                     \
  396. -({ typeof(*(ptr)) __old, __val = *(ptr);               \
  397. -   for (;;) {                          \
  398. -       __old = cmpxchg((ptr), __val, __val | (val));       \
  399. -       if (__old == __val)                 \
  400. -           break;                      \
  401. -       __val = __old;                      \
  402. -   }                               \
  403. -   __old;                              \
  404. +#define fetch_or(ptr, mask)                        \
  405. +   ({                              \
  406. +       typeof(ptr) _ptr = (ptr);               \
  407. +       typeof(mask) _mask = (mask);                \
  408. +       typeof(*_ptr) _old, _val = *_ptr;           \
  409. +                                   \
  410. +       for (;;) {                      \
  411. +           _old = cmpxchg(_ptr, _val, _val | _mask);   \
  412. +           if (_old == _val)               \
  413. +               break;                  \
  414. +           _val = _old;                    \
  415. +       }                           \
  416. +   _old;                               \
  417.  })
  418.  
  419.  #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
  420. @@ -593,6 +604,58 @@
  421.  #endif
  422.  #endif
  423.  
  424. +void wake_q_add(struct wake_q_head *head, struct task_struct *task)
  425. +{
  426. +   struct wake_q_node *node = &task->wake_q;
  427. +
  428. +   /*
  429. +    * Atomically grab the task, if ->wake_q is !nil already it means
  430. +    * its already queued (either by us or someone else) and will get the
  431. +    * wakeup due to that.
  432. +    *
  433. +    * This cmpxchg() implies a full barrier, which pairs with the write
  434. +    * barrier implied by the wakeup in wake_up_list().
  435. +    */
  436. +   if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
  437. +       return;
  438. +
  439. +   head->count++;
  440. +
  441. +   get_task_struct(task);
  442. +
  443. +   /*
  444. +    * The head is context local, there can be no concurrency.
  445. +    */
  446. +   *head->lastp = node;
  447. +   head->lastp = &node->next;
  448. +}
  449. +
  450. +static int
  451. +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
  452. +          int sibling_count_hint);
  453. +
  454. +void wake_up_q(struct wake_q_head *head)
  455. +{
  456. +   struct wake_q_node *node = head->first;
  457. +
  458. +   while (node != WAKE_Q_TAIL) {
  459. +       struct task_struct *task;
  460. +
  461. +       task = container_of(node, struct task_struct, wake_q);
  462. +       BUG_ON(!task);
  463. +       /* task can safely be re-inserted now */
  464. +       node = node->next;
  465. +       task->wake_q.next = NULL;
  466. +
  467. +       /*
  468. +        * try_to_wake_up() implies a wmb() to pair with the queueing
  469. +        * in wake_q_add() so as not to miss wakeups.
  470. +        */
  471. +       try_to_wake_up(task, TASK_NORMAL, 0, head->count);
  472. +       put_task_struct(task);
  473. +   }
  474. +}
  475. +
  476.  /*
  477.   * resched_curr - mark rq's current task 'to be rescheduled now'.
  478.   *
  479. @@ -629,9 +692,9 @@
  480.     struct rq *rq = cpu_rq(cpu);
  481.     unsigned long flags;
  482.  
  483. -   raw_spin_lock_irqsave(&rq->lock, flags);
  484. -   if (cpu_online(cpu) || cpu == smp_processor_id())
  485. -       resched_curr(rq);
  486. +   if (!raw_spin_trylock_irqsave(&rq->lock, flags))
  487. +       return;
  488. +   resched_curr(rq);
  489.     raw_spin_unlock_irqrestore(&rq->lock, flags);
  490.  }
  491.  
  492. @@ -745,6 +808,23 @@
  493.  bool sched_can_stop_tick(void)
  494.  {
  495.     /*
  496. +    * FIFO realtime policy runs the highest priority task. Other runnable
  497. +    * tasks are of a lower priority. The scheduler tick does nothing.
  498. +    */
  499. +   if (current->policy == SCHED_FIFO)
  500. +       return true;
  501. +
  502. +   /*
  503. +    * Round-robin realtime tasks time slice with other tasks at the same
  504. +    * realtime priority. Is this task the only one at this priority?
  505. +    */
  506. +   if (current->policy == SCHED_RR) {
  507. +       struct sched_rt_entity *rt_se = &current->rt;
  508. +
  509. +       return rt_se->run_list.prev == rt_se->run_list.next;
  510. +   }
  511. +
  512. +   /*
  513.      * More than one running task need preemption.
  514.      * nr_running update is assumed to be visible
  515.      * after IPI is sent from wakers.
  516. @@ -844,27 +924,29 @@
  517.     /*
  518.      * SCHED_IDLE tasks get minimal weight:
  519.      */
  520. -   if (p->policy == SCHED_IDLE) {
  521. +   if (idle_policy(p->policy)) {
  522.         load->weight = scale_load(WEIGHT_IDLEPRIO);
  523.         load->inv_weight = WMULT_IDLEPRIO;
  524.         return;
  525.     }
  526.  
  527. -   load->weight = scale_load(prio_to_weight[prio]);
  528. -   load->inv_weight = prio_to_wmult[prio];
  529. +   load->weight = scale_load(sched_prio_to_weight[prio]);
  530. +   load->inv_weight = sched_prio_to_wmult[prio];
  531.  }
  532.  
  533. -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  534. +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  535.  {
  536.     update_rq_clock(rq);
  537. -   sched_info_queued(rq, p);
  538. +   if (!(flags & ENQUEUE_RESTORE))
  539. +       sched_info_queued(rq, p);
  540.     p->sched_class->enqueue_task(rq, p, flags);
  541.  }
  542.  
  543. -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  544. +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  545.  {
  546.     update_rq_clock(rq);
  547. -   sched_info_dequeued(rq, p);
  548. +   if (!(flags & DEQUEUE_SAVE))
  549. +       sched_info_dequeued(rq, p);
  550.     p->sched_class->dequeue_task(rq, p, flags);
  551.  }
  552.  
  553. @@ -1069,10 +1151,37 @@
  554.      * this case, we can save a useless back to back clock update.
  555.      */
  556.     if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
  557. -       rq->skip_clock_update = 1;
  558. +       rq_clock_skip_update(rq, true);
  559.  }
  560.  
  561.  #ifdef CONFIG_SMP
  562. +
  563. +static inline bool is_per_cpu_kthread(struct task_struct *p)
  564. +{
  565. +   if (!(p->flags & PF_KTHREAD))
  566. +       return false;
  567. +
  568. +   if (p->nr_cpus_allowed != 1)
  569. +       return false;
  570. +
  571. +   return true;
  572. +}
  573. +
  574. +/*
  575. + * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
  576. + * __set_cpus_allowed_ptr() and select_fallback_rq().
  577. + */
  578. +static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
  579. +{
  580. +   if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
  581. +       return false;
  582. +
  583. +   if (is_per_cpu_kthread(p))
  584. +       return cpu_online(cpu);
  585. +
  586. +   return cpu_active(cpu);
  587. +}
  588. +
  589.  /*
  590.   * This is how migration works:
  591.   *
  592. @@ -1092,14 +1201,12 @@
  593.   *
  594.   * Returns (locked) new rq. Old rq's lock is released.
  595.   */
  596. -static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
  597. +static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
  598.  {
  599. -   struct rq *rq = task_rq(p);
  600. -
  601.     lockdep_assert_held(&rq->lock);
  602.  
  603. -   dequeue_task(rq, p, 0);
  604.     p->on_rq = TASK_ON_RQ_MIGRATING;
  605. +   dequeue_task(rq, p, 0);
  606.     double_lock_balance(rq, cpu_rq(new_cpu));
  607.     set_task_cpu(p, new_cpu);
  608.     double_unlock_balance(rq, cpu_rq(new_cpu));
  609. @@ -1109,8 +1216,8 @@
  610.  
  611.     raw_spin_lock(&rq->lock);
  612.     BUG_ON(task_cpu(p) != new_cpu);
  613. -   p->on_rq = TASK_ON_RQ_QUEUED;
  614.     enqueue_task(rq, p, 0);
  615. +   p->on_rq = TASK_ON_RQ_QUEUED;
  616.     check_preempt_curr(rq, p, 0);
  617.  
  618.     return rq;
  619. @@ -1129,41 +1236,16 @@
  620.   *
  621.   * So we race with normal scheduler movements, but that's OK, as long
  622.   * as the task is no longer on this CPU.
  623. - *
  624. - * Returns non-zero if task was successfully migrated.
  625.   */
  626. -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
  627. +static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
  628.  {
  629. -   struct rq *rq;
  630. -   int ret = 0;
  631. -
  632. -   if (unlikely(!cpu_active(dest_cpu)))
  633. -       return ret;
  634. -
  635. -   rq = cpu_rq(src_cpu);
  636. -
  637. -   raw_spin_lock(&p->pi_lock);
  638. -   raw_spin_lock(&rq->lock);
  639. -   /* Already moved. */
  640. -   if (task_cpu(p) != src_cpu)
  641. -       goto done;
  642. -
  643.     /* Affinity changed (again). */
  644. -   if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
  645. -       goto fail;
  646. +   if (!is_cpu_allowed(p, dest_cpu))
  647. +       return rq;
  648.  
  649. -   /*
  650. -    * If we're not on a rq, the next wake-up will ensure we're
  651. -    * placed properly.
  652. -    */
  653. -   if (task_on_rq_queued(p))
  654. -       rq = move_queued_task(p, dest_cpu);
  655. -done:
  656. -   ret = 1;
  657. -fail:
  658. -   raw_spin_unlock(&rq->lock);
  659. -   raw_spin_unlock(&p->pi_lock);
  660. -   return ret;
  661. +   rq = move_queued_task(rq, p, dest_cpu);
  662. +
  663. +   return rq;
  664.  }
  665.  
  666.  /*
  667. @@ -1174,6 +1256,8 @@
  668.  static int migration_cpu_stop(void *data)
  669.  {
  670.     struct migration_arg *arg = data;
  671. +   struct task_struct *p = arg->task;
  672. +   struct rq *rq = this_rq();
  673.  
  674.     /*
  675.      * The original target cpu might have gone down and we might
  676. @@ -1186,20 +1270,77 @@
  677.      * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
  678.      */
  679.     sched_ttwu_pending();
  680. -   __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
  681. +
  682. +   raw_spin_lock(&p->pi_lock);
  683. +   raw_spin_lock(&rq->lock);
  684. +   /*
  685. +    * If task_rq(p) != rq, it cannot be migrated here, because we're
  686. +    * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
  687. +    * we're holding p->pi_lock.
  688. +    */
  689. +   if (task_rq(p) == rq && task_on_rq_queued(p))
  690. +       rq = __migrate_task(rq, p, arg->dest_cpu);
  691. +   raw_spin_unlock(&rq->lock);
  692. +   raw_spin_unlock(&p->pi_lock);
  693. +
  694.     local_irq_enable();
  695.     return 0;
  696.  }
  697.  
  698. +/*
  699. + * sched_class::set_cpus_allowed must do the below, but is not required to
  700. + * actually call this function.
  701. + */
  702. +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
  703. +{
  704. +   cpumask_copy(&p->cpus_allowed, new_mask);
  705. +   p->nr_cpus_allowed = cpumask_weight(new_mask);
  706. +}
  707. +
  708. +static const struct cpumask *get_adjusted_cpumask(const struct task_struct *p,
  709. +   const struct cpumask *req_mask)
  710. +{
  711. +   /* Force all performance-critical kthreads onto the big cluster */
  712. +   if (p->flags & PF_PERF_CRITICAL)
  713. +       return cpu_perf_mask;
  714. +
  715. +   /* Force all trivial, unbound kthreads onto the little cluster */
  716. +   if (p->flags & PF_KTHREAD && p->pid != 1 &&
  717. +       cpumask_equal(req_mask, cpu_all_mask))
  718. +       return cpu_lp_mask;
  719. +
  720. +   return req_mask;
  721. +}
  722. +
  723.  void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  724.  {
  725. +   struct rq *rq = task_rq(p);
  726. +   bool queued, running;
  727. +
  728. +   new_mask = get_adjusted_cpumask(p, new_mask);
  729. +
  730.     lockdep_assert_held(&p->pi_lock);
  731.  
  732. -   if (p->sched_class->set_cpus_allowed)
  733. -       p->sched_class->set_cpus_allowed(p, new_mask);
  734. +   queued = task_on_rq_queued(p);
  735. +   running = task_current(rq, p);
  736.  
  737. -   cpumask_copy(&p->cpus_allowed, new_mask);
  738. -   p->nr_cpus_allowed = cpumask_weight(new_mask);
  739. +   if (queued) {
  740. +       /*
  741. +        * Because __kthread_bind() calls this on blocked tasks without
  742. +        * holding rq->lock.
  743. +        */
  744. +       lockdep_assert_held(&rq->lock);
  745. +       dequeue_task(rq, p, DEQUEUE_SAVE);
  746. +   }
  747. +   if (running)
  748. +       put_prev_task(rq, p);
  749. +
  750. +   p->sched_class->set_cpus_allowed(p, new_mask);
  751. +
  752. +   if (queued)
  753. +       enqueue_task(rq, p, ENQUEUE_RESTORE);
  754. +   if (running)
  755. +       set_curr_task(rq, p);
  756.  }
  757.  
  758.  /*
  759. @@ -1214,12 +1355,23 @@
  760.  static int __set_cpus_allowed_ptr(struct task_struct *p,
  761.                   const struct cpumask *new_mask, bool check)
  762.  {
  763. -   unsigned long flags;
  764. -   struct rq *rq;
  765. +   const struct cpumask *cpu_valid_mask = cpu_active_mask;
  766.     unsigned int dest_cpu;
  767. +   struct rq_flags rf;
  768. +   struct rq *rq;
  769.     int ret = 0;
  770.  
  771. -   rq = task_rq_lock(p, &flags);
  772. +   new_mask = get_adjusted_cpumask(p, new_mask);
  773. +
  774. +   rq = task_rq_lock(p, &rf);
  775. +   update_rq_clock(rq);
  776. +
  777. +   if (p->flags & PF_KTHREAD) {
  778. +       /*
  779. +        * Kernel threads are allowed on online && !active CPUs
  780. +        */
  781. +       cpu_valid_mask = cpu_online_mask;
  782. +   }
  783.  
  784.     /*
  785.      * Must re-check here, to close a race against __kthread_bind(),
  786. @@ -1233,29 +1385,46 @@
  787.     if (cpumask_equal(&p->cpus_allowed, new_mask))
  788.         goto out;
  789.  
  790. -   if (!cpumask_intersects(new_mask, cpu_active_mask)) {
  791. +   if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
  792.         ret = -EINVAL;
  793.         goto out;
  794.     }
  795.  
  796.     do_set_cpus_allowed(p, new_mask);
  797.  
  798. +   if (p->flags & PF_KTHREAD) {
  799. +       /*
  800. +        * For kernel threads that do indeed end up on online &&
  801. +        * !active we want to ensure they are strict per-cpu threads.
  802. +        */
  803. +       WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
  804. +           !cpumask_intersects(new_mask, cpu_active_mask) &&
  805. +           p->nr_cpus_allowed != 1);
  806. +   }
  807. +
  808.     /* Can the task run on the task's current CPU? If so, we're done */
  809.     if (cpumask_test_cpu(task_cpu(p), new_mask))
  810.         goto out;
  811.  
  812. -   dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
  813. +   dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
  814.     if (task_running(rq, p) || p->state == TASK_WAKING) {
  815.         struct migration_arg arg = { p, dest_cpu };
  816.         /* Need help from migration thread: drop lock and wait. */
  817. -       task_rq_unlock(rq, p, &flags);
  818. +       task_rq_unlock(rq, p, &rf);
  819.         stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
  820.         tlb_migrate_finish(p->mm);
  821.         return 0;
  822. -   } else if (task_on_rq_queued(p))
  823. -       rq = move_queued_task(p, dest_cpu);
  824. +   } else if (task_on_rq_queued(p)) {
  825. +       /*
  826. +        * OK, since we're going to drop the lock immediately
  827. +        * afterwards anyway.
  828. +        */
  829. +       lockdep_unpin_lock(&rq->lock, rf.cookie);
  830. +       rq = move_queued_task(rq, p, dest_cpu);
  831. +       lockdep_repin_lock(&rq->lock, rf.cookie);
  832. +   }
  833.  out:
  834. -   task_rq_unlock(rq, p, &flags);
  835. +   task_rq_unlock(rq, p, &rf);
  836.  
  837.     return ret;
  838.  }
  839. @@ -1274,7 +1443,16 @@
  840.      * ttwu() will sort out the placement.
  841.      */
  842.     WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
  843. -           !(task_preempt_count(p) & PREEMPT_ACTIVE));
  844. +           !p->on_rq);
  845. +
  846. +   /*
  847. +    * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
  848. +    * because schedstat_wait_{start,end} rebase migrating task's wait_start
  849. +    * time relying on p->on_rq.
  850. +    */
  851. +   WARN_ON_ONCE(p->state == TASK_RUNNING &&
  852. +            p->sched_class == &fair_sched_class &&
  853. +            (p->on_rq && !task_on_rq_migrating(p)));
  854.  
  855.  #ifdef CONFIG_LOCKDEP
  856.     /*
  857. @@ -1296,7 +1474,7 @@
  858.  
  859.     if (task_cpu(p) != new_cpu) {
  860.         if (p->sched_class->migrate_task_rq)
  861. -           p->sched_class->migrate_task_rq(p, new_cpu);
  862. +           p->sched_class->migrate_task_rq(p);
  863.         p->se.nr_migrations++;
  864.         perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
  865.  
  866. @@ -1314,9 +1492,13 @@
  867.         src_rq = task_rq(p);
  868.         dst_rq = cpu_rq(cpu);
  869.  
  870. +       p->on_rq = TASK_ON_RQ_MIGRATING;
  871.         deactivate_task(src_rq, p, 0);
  872. +       p->on_rq = TASK_ON_RQ_MIGRATING;
  873.         set_task_cpu(p, cpu);
  874. +       p->on_rq = TASK_ON_RQ_QUEUED;
  875.         activate_task(dst_rq, p, 0);
  876. +       p->on_rq = TASK_ON_RQ_QUEUED;
  877.         check_preempt_curr(dst_rq, p, 0);
  878.     } else {
  879.         /*
  880. @@ -1339,12 +1521,16 @@
  881.     struct rq *src_rq, *dst_rq;
  882.     int ret = -EAGAIN;
  883.  
  884. +   if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
  885. +       return -EAGAIN;
  886. +
  887.     src_rq = cpu_rq(arg->src_cpu);
  888.     dst_rq = cpu_rq(arg->dst_cpu);
  889.  
  890.     double_raw_lock(&arg->src_task->pi_lock,
  891.             &arg->dst_task->pi_lock);
  892.     double_rq_lock(src_rq, dst_rq);
  893. +
  894.     if (task_cpu(arg->dst_task) != arg->dst_cpu)
  895.         goto unlock;
  896.  
  897. @@ -1426,8 +1612,8 @@
  898.   */
  899.  unsigned long wait_task_inactive(struct task_struct *p, long match_state)
  900.  {
  901. -   unsigned long flags;
  902.     int running, queued;
  903. +   struct rq_flags rf;
  904.     unsigned long ncsw;
  905.     struct rq *rq;
  906.  
  907. @@ -1462,14 +1648,14 @@
  908.          * lock now, to be *sure*. If we're wrong, we'll
  909.          * just go back and repeat.
  910.          */
  911. -       rq = task_rq_lock(p, &flags);
  912. +       rq = task_rq_lock(p, &rf);
  913.         trace_sched_wait_task(p);
  914.         running = task_running(rq, p);
  915.         queued = task_on_rq_queued(p);
  916.         ncsw = 0;
  917.         if (!match_state || p->state == match_state)
  918.             ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
  919. -       task_rq_unlock(rq, p, &flags);
  920. +       task_rq_unlock(rq, p, &rf);
  921.  
  922.         /*
  923.          * If it changed from the expected state, bail out now.
  924. @@ -1543,6 +1729,25 @@
  925.  
  926.  /*
  927.   * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  928. + *
  929. + * A few notes on cpu_active vs cpu_online:
  930. + *
  931. + *  - cpu_active must be a subset of cpu_online
  932. + *
  933. + *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
  934. + *    see __set_cpus_allowed_ptr(). At this point the newly online
  935. + *    cpu isn't yet part of the sched domains, and balancing will not
  936. + *    see it.
  937. + *
  938. + *  - on cpu-down we clear cpu_active() to mask the sched domains and
  939. + *    avoid the load balancer to place new tasks on the to be removed
  940. + *    cpu. Existing tasks will remain running there and will be taken
  941. + *    off.
  942. + *
  943. + * This means that fallback selection must not select !active CPUs.
  944. + * And can assume that any active CPU must be online. Conversely
  945. + * select_task_rq() below may allow selection of !active CPUs in order
  946. + * to satisfy the above rules.
  947.   */
  948.  static int select_fallback_rq(int cpu, struct task_struct *p)
  949.  {
  950. @@ -1561,8 +1766,6 @@
  951.  
  952.         /* Look for allowed, online CPU in same node. */
  953.         for_each_cpu(dest_cpu, nodemask) {
  954. -           if (!cpu_online(dest_cpu))
  955. -               continue;
  956.             if (!cpu_active(dest_cpu))
  957.                 continue;
  958.             if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
  959. @@ -1573,20 +1776,21 @@
  960.     for (;;) {
  961.         /* Any allowed, online CPU? */
  962.         for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
  963. -           if (!cpu_online(dest_cpu))
  964. -               continue;
  965. -           if (!cpu_active(dest_cpu))
  966. +           if (!is_cpu_allowed(p, dest_cpu))
  967.                 continue;
  968. +
  969.             goto out;
  970.         }
  971.  
  972. +       /* No more Mr. Nice Guy. */
  973.         switch (state) {
  974.         case cpuset:
  975. -           /* No more Mr. Nice Guy. */
  976. -           cpuset_cpus_allowed_fallback(p);
  977. -           state = possible;
  978. -           break;
  979. -
  980. +           if (IS_ENABLED(CONFIG_CPUSETS)) {
  981. +               cpuset_cpus_allowed_fallback(p);
  982. +               state = possible;
  983. +               break;
  984. +           }
  985. +           /* fall-through */
  986.         case possible:
  987.             do_set_cpus_allowed(p, cpu_possible_mask);
  988.             state = fail;
  989. @@ -1618,9 +1822,14 @@
  990.   * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  991.   */
  992.  static inline
  993. -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
  994. +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
  995. +          int sibling_count_hint)
  996.  {
  997. -   cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
  998. +   lockdep_assert_held(&p->pi_lock);
  999. +
  1000. +   if (tsk_nr_cpus_allowed(p) > 1)
  1001. +       cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
  1002. +                            sibling_count_hint);
  1003.  
  1004.     /*
  1005.      * In order not to call set_task_cpu() on a blocking task we need
  1006. @@ -1658,23 +1867,25 @@
  1007.  static void
  1008.  ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
  1009.  {
  1010. -#ifdef CONFIG_SCHEDSTATS
  1011. -   struct rq *rq = this_rq();
  1012. +   struct rq *rq;
  1013.  
  1014. -#ifdef CONFIG_SMP
  1015. -   int this_cpu = smp_processor_id();
  1016. +   if (!schedstat_enabled())
  1017. +       return;
  1018. +
  1019. +   rq = this_rq();
  1020.  
  1021. -   if (cpu == this_cpu) {
  1022. -       schedstat_inc(rq, ttwu_local);
  1023. -       schedstat_inc(p, se.statistics.nr_wakeups_local);
  1024. +#ifdef CONFIG_SMP
  1025. +   if (cpu == rq->cpu) {
  1026. +       schedstat_inc(rq->ttwu_local);
  1027. +       schedstat_inc(p->se.statistics.nr_wakeups_local);
  1028.     } else {
  1029.         struct sched_domain *sd;
  1030.  
  1031. -       schedstat_inc(p, se.statistics.nr_wakeups_remote);
  1032. +       schedstat_inc(p->se.statistics.nr_wakeups_remote);
  1033.         rcu_read_lock();
  1034. -       for_each_domain(this_cpu, sd) {
  1035. +       for_each_domain(rq->cpu, sd) {
  1036.             if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
  1037. -               schedstat_inc(sd, ttwu_wake_remote);
  1038. +               schedstat_inc(sd->ttwu_wake_remote);
  1039.                 break;
  1040.             }
  1041.         }
  1042. @@ -1682,34 +1893,27 @@
  1043.     }
  1044.  
  1045.     if (wake_flags & WF_MIGRATED)
  1046. -       schedstat_inc(p, se.statistics.nr_wakeups_migrate);
  1047. -
  1048. +       schedstat_inc(p->se.statistics.nr_wakeups_migrate);
  1049.  #endif /* CONFIG_SMP */
  1050.  
  1051. -   schedstat_inc(rq, ttwu_count);
  1052. -   schedstat_inc(p, se.statistics.nr_wakeups);
  1053. +   schedstat_inc(rq->ttwu_count);
  1054. +   schedstat_inc(p->se.statistics.nr_wakeups);
  1055.  
  1056.     if (wake_flags & WF_SYNC)
  1057. -       schedstat_inc(p, se.statistics.nr_wakeups_sync);
  1058. -
  1059. -#endif /* CONFIG_SCHEDSTATS */
  1060. +       schedstat_inc(p->se.statistics.nr_wakeups_sync);
  1061.  }
  1062.  
  1063.  static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
  1064.  {
  1065.     activate_task(rq, p, en_flags);
  1066.     p->on_rq = TASK_ON_RQ_QUEUED;
  1067. -
  1068. -   /* if a worker is waking up, notify workqueue */
  1069. -   if (p->flags & PF_WQ_WORKER)
  1070. -       wq_worker_waking_up(p, cpu_of(rq));
  1071.  }
  1072.  
  1073.  /*
  1074.   * Mark the task runnable and perform wakeup-preemption.
  1075.   */
  1076. -static void
  1077. -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  1078. +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
  1079. +              struct pin_cookie cookie)
  1080.  {
  1081.     check_preempt_curr(rq, p, wake_flags);
  1082.     p->state = TASK_RUNNING;
  1083. @@ -1718,9 +1922,12 @@
  1084.  #ifdef CONFIG_SMP
  1085.     if (p->sched_class->task_woken) {
  1086.         /*
  1087. -        * XXX can drop rq->lock; most likely ok.
  1088. +        * Our task @p is fully woken up and running; so its safe to
  1089. +        * drop the rq->lock, hereafter rq is only used for statistics.
  1090.          */
  1091. +       lockdep_unpin_lock(&rq->lock, cookie);
  1092.         p->sched_class->task_woken(rq, p);
  1093. +       lockdep_repin_lock(&rq->lock, cookie);
  1094.     }
  1095.  
  1096.     if (rq->idle_stamp) {
  1097. @@ -1738,15 +1945,18 @@
  1098.  }
  1099.  
  1100.  static void
  1101. -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
  1102. +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
  1103. +        struct pin_cookie cookie)
  1104.  {
  1105. +   lockdep_assert_held(&rq->lock);
  1106. +
  1107.  #ifdef CONFIG_SMP
  1108.     if (p->sched_contributes_to_load)
  1109.         rq->nr_uninterruptible--;
  1110.  #endif
  1111.  
  1112.     ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
  1113. -   ttwu_do_wakeup(rq, p, wake_flags);
  1114. +   ttwu_do_wakeup(rq, p, wake_flags, cookie);
  1115.  }
  1116.  
  1117.  /*
  1118. @@ -1757,17 +1967,18 @@
  1119.   */
  1120.  static int ttwu_remote(struct task_struct *p, int wake_flags)
  1121.  {
  1122. +   struct rq_flags rf;
  1123.     struct rq *rq;
  1124.     int ret = 0;
  1125.  
  1126. -   rq = __task_rq_lock(p);
  1127. +   rq = __task_rq_lock(p, &rf);
  1128.     if (task_on_rq_queued(p)) {
  1129.         /* check_preempt_curr() may use rq clock */
  1130.         update_rq_clock(rq);
  1131. -       ttwu_do_wakeup(rq, p, wake_flags);
  1132. +       ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
  1133.         ret = 1;
  1134.     }
  1135. -   __task_rq_unlock(rq);
  1136. +   __task_rq_unlock(rq, &rf);
  1137.  
  1138.     return ret;
  1139.  }
  1140. @@ -1777,6 +1988,7 @@
  1141.  {
  1142.     struct rq *rq = this_rq();
  1143.     struct llist_node *llist = llist_del_all(&rq->wake_list);
  1144. +   struct pin_cookie cookie;
  1145.     struct task_struct *p;
  1146.     unsigned long flags;
  1147.  
  1148. @@ -1784,13 +1996,15 @@
  1149.         return;
  1150.  
  1151.     raw_spin_lock_irqsave(&rq->lock, flags);
  1152. +   cookie = lockdep_pin_lock(&rq->lock);
  1153.  
  1154.     while (llist) {
  1155.         p = llist_entry(llist, struct task_struct, wake_entry);
  1156.         llist = llist_next(llist);
  1157. -       ttwu_do_activate(rq, p, 0);
  1158. +       ttwu_do_activate(rq, p, 0, cookie);
  1159.     }
  1160.  
  1161. +   lockdep_unpin_lock(&rq->lock, cookie);
  1162.     raw_spin_unlock_irqrestore(&rq->lock, flags);
  1163.  }
  1164.  
  1165. @@ -1877,6 +2091,7 @@
  1166.  static void ttwu_queue(struct task_struct *p, int cpu)
  1167.  {
  1168.     struct rq *rq = cpu_rq(cpu);
  1169. +   struct pin_cookie cookie;
  1170.  
  1171.  #if defined(CONFIG_SMP)
  1172.     if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
  1173. @@ -1887,15 +2102,110 @@
  1174.  #endif
  1175.  
  1176.     raw_spin_lock(&rq->lock);
  1177. -   ttwu_do_activate(rq, p, 0);
  1178. +   cookie = lockdep_pin_lock(&rq->lock);
  1179. +   ttwu_do_activate(rq, p, 0, cookie);
  1180. +   lockdep_unpin_lock(&rq->lock, cookie);
  1181.     raw_spin_unlock(&rq->lock);
  1182.  }
  1183.  
  1184. +/*
  1185. + * Notes on Program-Order guarantees on SMP systems.
  1186. + *
  1187. + *  MIGRATION
  1188. + *
  1189. + * The basic program-order guarantee on SMP systems is that when a task [t]
  1190. + * migrates, all its activity on its old cpu [c0] happens-before any subsequent
  1191. + * execution on its new cpu [c1].
  1192. + *
  1193. + * For migration (of runnable tasks) this is provided by the following means:
  1194. + *
  1195. + *  A) UNLOCK of the rq(c0)->lock scheduling out task t
  1196. + *  B) migration for t is required to synchronize *both* rq(c0)->lock and
  1197. + *     rq(c1)->lock (if not at the same time, then in that order).
  1198. + *  C) LOCK of the rq(c1)->lock scheduling in task
  1199. + *
  1200. + * Transitivity guarantees that B happens after A and C after B.
  1201. + * Note: we only require RCpc transitivity.
  1202. + * Note: the cpu doing B need not be c0 or c1
  1203. + *
  1204. + * Example:
  1205. + *
  1206. + *   CPU0            CPU1            CPU2
  1207. + *
  1208. + *   LOCK rq(0)->lock
  1209. + *   sched-out X
  1210. + *   sched-in Y
  1211. + *   UNLOCK rq(0)->lock
  1212. + *
  1213. + *                                   LOCK rq(0)->lock // orders against CPU0
  1214. + *                                   dequeue X
  1215. + *                                   UNLOCK rq(0)->lock
  1216. + *
  1217. + *                                   LOCK rq(1)->lock
  1218. + *                                   enqueue X
  1219. + *                                   UNLOCK rq(1)->lock
  1220. + *
  1221. + *                   LOCK rq(1)->lock // orders against CPU2
  1222. + *                   sched-out Z
  1223. + *                   sched-in X
  1224. + *                   UNLOCK rq(1)->lock
  1225. + *
  1226. + *
  1227. + *  BLOCKING -- aka. SLEEP + WAKEUP
  1228. + *
  1229. + * For blocking we (obviously) need to provide the same guarantee as for
  1230. + * migration. However the means are completely different as there is no lock
  1231. + * chain to provide order. Instead we do:
  1232. + *
  1233. + *   1) smp_store_release(X->on_cpu, 0)
  1234. + *   2) smp_cond_load_acquire(!X->on_cpu)
  1235. + *
  1236. + * Example:
  1237. + *
  1238. + *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
  1239. + *
  1240. + *   LOCK rq(0)->lock LOCK X->pi_lock
  1241. + *   dequeue X
  1242. + *   sched-out X
  1243. + *   smp_store_release(X->on_cpu, 0);
  1244. + *
  1245. + *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
  1246. + *                    X->state = WAKING
  1247. + *                    set_task_cpu(X,2)
  1248. + *
  1249. + *                    LOCK rq(2)->lock
  1250. + *                    enqueue X
  1251. + *                    X->state = RUNNING
  1252. + *                    UNLOCK rq(2)->lock
  1253. + *
  1254. + *                                          LOCK rq(2)->lock // orders against CPU1
  1255. + *                                          sched-out Z
  1256. + *                                          sched-in X
  1257. + *                                          UNLOCK rq(2)->lock
  1258. + *
  1259. + *                    UNLOCK X->pi_lock
  1260. + *   UNLOCK rq(0)->lock
  1261. + *
  1262. + *
  1263. + * However; for wakeups there is a second guarantee we must provide, namely we
  1264. + * must observe the state that lead to our wakeup. That is, not only must our
  1265. + * task observe its own prior state, it must also observe the stores prior to
  1266. + * its wakeup.
  1267. + *
  1268. + * This means that any means of doing remote wakeups must order the CPU doing
  1269. + * the wakeup against the CPU the task is going to end up running on. This,
  1270. + * however, is already required for the regular Program-Order guarantee above,
  1271. + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
  1272. + *
  1273. + */
  1274. +
  1275.  /**
  1276.   * try_to_wake_up - wake up a thread
  1277.   * @p: the thread to be awakened
  1278.   * @state: the mask of task states that can be woken
  1279.   * @wake_flags: wake modifier flags (WF_*)
  1280. + * @sibling_count_hint: A hint at the number of threads that are being woken up
  1281. + *                      in this event.
  1282.   *
  1283.   * Put it on the run-queue if it's not already there. The "current"
  1284.   * thread is always on the run-queue (except when the actual
  1285. @@ -1907,7 +2217,8 @@
  1286.   * or @state didn't match @p's state.
  1287.   */
  1288.  static int
  1289. -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
  1290. +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
  1291. +          int sibling_count_hint)
  1292.  {
  1293.     unsigned long flags;
  1294.     int cpu, success = 0;
  1295. @@ -1959,15 +2270,34 @@
  1296.  
  1297.  #ifdef CONFIG_SMP
  1298.     /*
  1299. -    * If the owning (remote) cpu is still in the middle of schedule() with
  1300. -    * this task as prev, wait until its done referencing the task.
  1301. +    * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
  1302. +    * possible to, falsely, observe p->on_cpu == 0.
  1303. +    *
  1304. +    * One must be running (->on_cpu == 1) in order to remove oneself
  1305. +    * from the runqueue.
  1306. +    *
  1307. +    *  [S] ->on_cpu = 1;   [L] ->on_rq
  1308. +    *      UNLOCK rq->lock
  1309. +    *          RMB
  1310. +    *      LOCK   rq->lock
  1311. +    *  [S] ->on_rq = 0;    [L] ->on_cpu
  1312. +    *
  1313. +    * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
  1314. +    * from the consecutive calls to schedule(); the first switching to our
  1315. +    * task, the second putting it to sleep.
  1316.      */
  1317. -   while (p->on_cpu)
  1318. -       cpu_relax();
  1319. +   smp_rmb();
  1320. +
  1321.     /*
  1322. -    * Pairs with the smp_wmb() in finish_lock_switch().
  1323. +    * If the owning (remote) cpu is still in the middle of schedule() with
  1324. +    * this task as prev, wait until its done referencing the task.
  1325. +    *
  1326. +    * Pairs with the smp_store_release() in finish_lock_switch().
  1327. +    *
  1328. +    * This ensures that tasks getting woken will be fully ordered against
  1329. +    * their previous state and preserve Program Order.
  1330.      */
  1331. -   smp_rmb();
  1332. +   smp_cond_load_acquire(&p->on_cpu, !VAL);
  1333.  
  1334.     rq = cpu_rq(task_cpu(p));
  1335.  
  1336. @@ -1983,8 +2313,8 @@
  1337.     if (p->sched_class->task_waking)
  1338.         p->sched_class->task_waking(p);
  1339.  
  1340. -   cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
  1341. -
  1342. +   cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
  1343. +                sibling_count_hint);
  1344.     if (task_cpu(p) != cpu) {
  1345.         wake_flags |= WF_MIGRATED;
  1346.         set_task_cpu(p, cpu);
  1347. @@ -2002,47 +2332,6 @@
  1348.  }
  1349.  
  1350.  /**
  1351. - * try_to_wake_up_local - try to wake up a local task with rq lock held
  1352. - * @p: the thread to be awakened
  1353. - *
  1354. - * Put @p on the run-queue if it's not already there. The caller must
  1355. - * ensure that this_rq() is locked, @p is bound to this_rq() and not
  1356. - * the current task.
  1357. - */
  1358. -static void try_to_wake_up_local(struct task_struct *p)
  1359. -{
  1360. -   struct rq *rq = task_rq(p);
  1361. -
  1362. -   if (WARN_ON_ONCE(rq != this_rq()) ||
  1363. -       WARN_ON_ONCE(p == current))
  1364. -       return;
  1365. -
  1366. -   lockdep_assert_held(&rq->lock);
  1367. -
  1368. -   if (!raw_spin_trylock(&p->pi_lock)) {
  1369. -       raw_spin_unlock(&rq->lock);
  1370. -       raw_spin_lock(&p->pi_lock);
  1371. -       raw_spin_lock(&rq->lock);
  1372. -   }
  1373. -
  1374. -   if (!(p->state & TASK_NORMAL))
  1375. -       goto out;
  1376. -
  1377. -   if (!task_on_rq_queued(p)) {
  1378. -       u64 wallclock = walt_ktime_clock();
  1379. -
  1380. -       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
  1381. -       walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
  1382. -       ttwu_activate(rq, p, ENQUEUE_WAKEUP);
  1383. -   }
  1384. -
  1385. -   ttwu_do_wakeup(rq, p, 0);
  1386. -   ttwu_stat(p, smp_processor_id(), 0);
  1387. -out:
  1388. -   raw_spin_unlock(&p->pi_lock);
  1389. -}
  1390. -
  1391. -/**
  1392.   * wake_up_process - Wake up a specific process
  1393.   * @p: The process to be woken up.
  1394.   *
  1395. @@ -2056,34 +2345,13 @@
  1396.   */
  1397.  int wake_up_process(struct task_struct *p)
  1398.  {
  1399. -   WARN_ON(task_is_stopped_or_traced(p));
  1400. -   return try_to_wake_up(p, TASK_NORMAL, 0);
  1401. +   return try_to_wake_up(p, TASK_NORMAL, 0, 1);
  1402.  }
  1403.  EXPORT_SYMBOL(wake_up_process);
  1404.  
  1405. -/**
  1406. - * wake_up_process_no_notif - Wake up a specific process without notifying
  1407. - * governor
  1408. - * @p: The process to be woken up.
  1409. - *
  1410. - * Attempt to wake up the nominated process and move it to the set of runnable
  1411. - * processes.
  1412. - *
  1413. - * Return: 1 if the process was woken up, 0 if it was already running.
  1414. - *
  1415. - * It may be assumed that this function implies a write memory barrier before
  1416. - * changing the task state if and only if any tasks are woken up.
  1417. - */
  1418. -int wake_up_process_no_notif(struct task_struct *p)
  1419. -{
  1420. -   WARN_ON(task_is_stopped_or_traced(p));
  1421. -   return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER);
  1422. -}
  1423. -EXPORT_SYMBOL(wake_up_process_no_notif);
  1424. -
  1425.  int wake_up_state(struct task_struct *p, unsigned int state)
  1426.  {
  1427. -   return try_to_wake_up(p, state, 0);
  1428. +   return try_to_wake_up(p, state, 0, 1);
  1429.  }
  1430.  
  1431.  /*
  1432. @@ -2120,6 +2388,10 @@
  1433.     p->se.prev_sum_exec_runtime = 0;
  1434.     p->se.nr_migrations     = 0;
  1435.     p->se.vruntime          = 0;
  1436. +#ifdef CONFIG_SCHED_WALT
  1437. +   p->last_sleep_ts        = 0;
  1438. +#endif
  1439. +
  1440.     INIT_LIST_HEAD(&p->se.group_node);
  1441.     walt_init_new_task_load(p);
  1442.  
  1443. @@ -2128,19 +2400,19 @@
  1444.  #endif
  1445.  
  1446.  #ifdef CONFIG_SCHEDSTATS
  1447. +   /* Even if schedstat is disabled, there should not be garbage */
  1448.     memset(&p->se.statistics, 0, sizeof(p->se.statistics));
  1449.  #endif
  1450.  
  1451. -#ifdef CONFIG_CPU_FREQ_STAT
  1452. -   cpufreq_task_stats_init(p);
  1453. -#endif
  1454. -
  1455.     RB_CLEAR_NODE(&p->dl.rb_node);
  1456.     init_dl_task_timer(&p->dl);
  1457.     __dl_clear_params(p);
  1458.  
  1459. -   init_rt_schedtune_timer(&p->rt);
  1460.     INIT_LIST_HEAD(&p->rt.run_list);
  1461. +   p->rt.timeout       = 0;
  1462. +   p->rt.time_slice    = sched_rr_timeslice;
  1463. +   p->rt.on_rq     = 0;
  1464. +   p->rt.on_list       = 0;
  1465.  
  1466.  #ifdef CONFIG_PREEMPT_NOTIFIERS
  1467.     INIT_HLIST_HEAD(&p->preempt_notifiers);
  1468. @@ -2171,31 +2443,88 @@
  1469.  #endif /* CONFIG_NUMA_BALANCING */
  1470.  }
  1471.  
  1472. +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
  1473. +
  1474.  #ifdef CONFIG_NUMA_BALANCING
  1475. -#ifdef CONFIG_SCHED_DEBUG
  1476. +
  1477.  void set_numabalancing_state(bool enabled)
  1478.  {
  1479.     if (enabled)
  1480. -       sched_feat_set("NUMA");
  1481. +       static_branch_enable(&sched_numa_balancing);
  1482.     else
  1483. -       sched_feat_set("NO_NUMA");
  1484. +       static_branch_disable(&sched_numa_balancing);
  1485.  }
  1486. -#else
  1487. -__read_mostly bool numabalancing_enabled;
  1488.  
  1489. -void set_numabalancing_state(bool enabled)
  1490. +#ifdef CONFIG_PROC_SYSCTL
  1491. +int sysctl_numa_balancing(struct ctl_table *table, int write,
  1492. +            void __user *buffer, size_t *lenp, loff_t *ppos)
  1493.  {
  1494. -   numabalancing_enabled = enabled;
  1495. +   struct ctl_table t;
  1496. +   int err;
  1497. +   int state = static_branch_likely(&sched_numa_balancing);
  1498. +
  1499. +   if (write && !capable(CAP_SYS_ADMIN))
  1500. +       return -EPERM;
  1501. +
  1502. +   t = *table;
  1503. +   t.data = &state;
  1504. +   err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
  1505. +   if (err < 0)
  1506. +       return err;
  1507. +   if (write)
  1508. +       set_numabalancing_state(state);
  1509. +   return err;
  1510.  }
  1511. -#endif /* CONFIG_SCHED_DEBUG */
  1512. +#endif
  1513. +#endif
  1514. +
  1515. +DEFINE_STATIC_KEY_FALSE(sched_schedstats);
  1516. +
  1517. +#ifdef CONFIG_SCHEDSTATS
  1518. +static void set_schedstats(bool enabled)
  1519. +{
  1520. +   if (enabled)
  1521. +       static_branch_enable(&sched_schedstats);
  1522. +   else
  1523. +       static_branch_disable(&sched_schedstats);
  1524. +}
  1525. +
  1526. +void force_schedstat_enabled(void)
  1527. +{
  1528. +   if (!schedstat_enabled()) {
  1529. +       pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
  1530. +       static_branch_enable(&sched_schedstats);
  1531. +   }
  1532. +}
  1533. +
  1534. +static int __init setup_schedstats(char *str)
  1535. +{
  1536. +   int ret = 0;
  1537. +   if (!str)
  1538. +       goto out;
  1539. +
  1540. +   if (!strcmp(str, "enable")) {
  1541. +       set_schedstats(true);
  1542. +       ret = 1;
  1543. +   } else if (!strcmp(str, "disable")) {
  1544. +       set_schedstats(false);
  1545. +       ret = 1;
  1546. +   }
  1547. +out:
  1548. +   if (!ret)
  1549. +       pr_warn("Unable to parse schedstats=\n");
  1550. +
  1551. +   return ret;
  1552. +}
  1553. +__setup("schedstats=", setup_schedstats);
  1554.  
  1555.  #ifdef CONFIG_PROC_SYSCTL
  1556. -int sysctl_numa_balancing(struct ctl_table *table, int write,
  1557. +int sysctl_schedstats(struct ctl_table *table, int write,
  1558.              void __user *buffer, size_t *lenp, loff_t *ppos)
  1559.  {
  1560.     struct ctl_table t;
  1561.     int err;
  1562. -   int state = numabalancing_enabled;
  1563. +   int state = static_branch_likely(&sched_schedstats);
  1564.  
  1565.     if (write && !capable(CAP_SYS_ADMIN))
  1566.         return -EPERM;
  1567. @@ -2206,7 +2535,7 @@
  1568.     if (err < 0)
  1569.         return err;
  1570.     if (write)
  1571. -       set_numabalancing_state(state);
  1572. +       set_schedstats(state);
  1573.     return err;
  1574.  }
  1575.  #endif
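
For reference, the schedstats= early parameter introduced above accepts only the literal strings "enable" and "disable"; anything else leaves the static key untouched and logs a warning, and the same toggle is reachable at runtime through the kernel.sched_schedstats sysctl handled by sysctl_schedstats(). A minimal user-space sketch of that accept/reject logic follows; schedstats_enabled and parse_schedstats() are illustrative stand-ins, not kernel symbols:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool schedstats_enabled;         /* stand-in for the sched_schedstats static key */

/* Mirrors the accept/reject behaviour of setup_schedstats() above. */
static int parse_schedstats(const char *str)
{
        int ret = 0;

        if (!str)
                goto out;

        if (!strcmp(str, "enable")) {
                schedstats_enabled = true;
                ret = 1;
        } else if (!strcmp(str, "disable")) {
                schedstats_enabled = false;
                ret = 1;
        }
out:
        if (!ret)
                fprintf(stderr, "Unable to parse schedstats=%s\n", str ? str : "");
        return ret;
}

int main(void)
{
        parse_schedstats("enable");     /* accepted: flag set */
        parse_schedstats("on");         /* rejected: warning printed */
        printf("schedstats %s\n", schedstats_enabled ? "enabled" : "disabled");
        return 0;
}
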
  1576. @@ -2220,12 +2549,11 @@
  1577.     unsigned long flags;
  1578.     int cpu = get_cpu();
  1579.  
  1580. -   __sched_fork(clone_flags, p);
  1581. -
  1582.  #ifdef CONFIG_CPU_FREQ_STAT
  1583. -   cpufreq_task_stats_alloc(p);
  1584. +   cpufreq_task_stats_init(p);
  1585.  #endif
  1586.  
  1587. +   __sched_fork(clone_flags, p);
  1588.     /*
  1589.      * We mark the process as running here. This guarantees that
  1590.      * nobody will actually run it, and a signal or other external
  1591. @@ -2268,8 +2596,7 @@
  1592.         p->sched_class = &fair_sched_class;
  1593.     }
  1594.  
  1595. -   if (p->sched_class->task_fork)
  1596. -       p->sched_class->task_fork(p);
  1597. +   init_entity_runnable_average(&p->se);
  1598.  
  1599.     /*
  1600.      * The child is not yet in the pid-hash so no cgroup attach races,
  1601. @@ -2279,7 +2606,13 @@
  1602.      * Silence PROVE_RCU.
  1603.      */
  1604.     raw_spin_lock_irqsave(&p->pi_lock, flags);
  1605. -   set_task_cpu(p, cpu);
  1606. +   /*
  1607. +    * We're setting the cpu for the first time, we don't migrate,
  1608. +    * so use __set_task_cpu().
  1609. +    */
  1610. +   __set_task_cpu(p, cpu);
  1611. +   if (p->sched_class->task_fork)
  1612. +       p->sched_class->task_fork(p);
  1613.     raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  1614.  
  1615.  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
  1616. @@ -2318,8 +2651,8 @@
  1617.  #ifdef CONFIG_SMP
  1618.  inline struct dl_bw *dl_bw_of(int i)
  1619.  {
  1620. -   rcu_lockdep_assert(rcu_read_lock_sched_held(),
  1621. -              "sched RCU must be held");
  1622. +   RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
  1623. +            "sched RCU must be held");
  1624.     return &cpu_rq(i)->rd->dl_bw;
  1625.  }
  1626.  
  1627. @@ -2328,8 +2661,8 @@
  1628.     struct root_domain *rd = cpu_rq(i)->rd;
  1629.     int cpus = 0;
  1630.  
  1631. -   rcu_lockdep_assert(rcu_read_lock_sched_held(),
  1632. -              "sched RCU must be held");
  1633. +   RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
  1634. +            "sched RCU must be held");
  1635.     for_each_cpu_and(i, rd->span, cpu_active_mask)
  1636.         cpus++;
  1637.  
  1638. @@ -2347,25 +2680,6 @@
  1639.  }
  1640.  #endif
  1641.  
  1642. -static inline
  1643. -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
  1644. -{
  1645. -   dl_b->total_bw -= tsk_bw;
  1646. -}
  1647. -
  1648. -static inline
  1649. -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
  1650. -{
  1651. -   dl_b->total_bw += tsk_bw;
  1652. -}
  1653. -
  1654. -static inline
  1655. -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
  1656. -{
  1657. -   return dl_b->bw != -1 &&
  1658. -          dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
  1659. -}
  1660. -
  1661.  /*
  1662.   * We must be sure that accepting a new task (or allowing changing the
  1663.   * parameters of an existing one) is consistent with the bandwidth
  1664. @@ -2387,7 +2701,8 @@
  1665.     u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
  1666.     int cpus, err = -1;
  1667.  
  1668. -   if (new_bw == p->dl.dl_bw)
  1669. +   /* !deadline task may carry old deadline bandwidth */
  1670. +   if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
  1671.         return 0;
  1672.  
  1673.     /*
  1674. @@ -2426,45 +2741,76 @@
  1675.   */
  1676.  void wake_up_new_task(struct task_struct *p)
  1677.  {
  1678. -   unsigned long flags;
  1679. +   struct rq_flags rf;
  1680.     struct rq *rq;
  1681.  
  1682. -   raw_spin_lock_irqsave(&p->pi_lock, flags);
  1683. +   /* Initialize new task's runnable average */
  1684. +   init_entity_runnable_average(&p->se);
  1685. +   raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
  1686.  
  1687.     walt_init_new_task_load(p);
  1688.  
  1689. -   /* Initialize new task's runnable average */
  1690. -   init_entity_runnable_average(&p->se);
  1691. +   p->state = TASK_RUNNING;
  1692.  #ifdef CONFIG_SMP
  1693.     /*
  1694.      * Fork balancing, do it here and not earlier because:
  1695.      *  - cpus_allowed can change in the fork path
  1696.      *  - any previously selected cpu might disappear through hotplug
  1697. +    *
  1698. +    * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
  1699. +    * as we're not fully set-up yet.
  1700.      */
  1701. -   set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  1702. +   __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
  1703.  #endif
  1704. +   rq = __task_rq_lock(p, &rf);
  1705. +   update_rq_clock(rq);
  1706. +   post_init_entity_util_avg(&p->se);
  1707.  
  1708. -   rq = __task_rq_lock(p);
  1709.     walt_mark_task_starting(p);
  1710. +
  1711.     activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
  1712.     p->on_rq = TASK_ON_RQ_QUEUED;
  1713.     trace_sched_wakeup_new(p);
  1714.     check_preempt_curr(rq, p, WF_FORK);
  1715.  #ifdef CONFIG_SMP
  1716. -   if (p->sched_class->task_woken)
  1717. +   if (p->sched_class->task_woken) {
  1718. +       /*
  1719. +        * Nothing relies on rq->lock after this, so it's fine to
  1720. +        * drop it.
  1721. +        */
  1722. +       lockdep_unpin_lock(&rq->lock, rf.cookie);
  1723.         p->sched_class->task_woken(rq, p);
  1724. +       lockdep_repin_lock(&rq->lock, rf.cookie);
  1725. +   }
  1726.  #endif
  1727. -   task_rq_unlock(rq, p, &flags);
  1728. +   task_rq_unlock(rq, p, &rf);
  1729.  }
  1730.  
  1731.  #ifdef CONFIG_PREEMPT_NOTIFIERS
  1732.  
  1733. +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
  1734. +
  1735. +void preempt_notifier_inc(void)
  1736. +{
  1737. +   static_key_slow_inc(&preempt_notifier_key);
  1738. +}
  1739. +EXPORT_SYMBOL_GPL(preempt_notifier_inc);
  1740. +
  1741. +void preempt_notifier_dec(void)
  1742. +{
  1743. +   static_key_slow_dec(&preempt_notifier_key);
  1744. +}
  1745. +EXPORT_SYMBOL_GPL(preempt_notifier_dec);
  1746. +
  1747.  /**
  1748.   * preempt_notifier_register - tell me when current is being preempted & rescheduled
  1749.   * @notifier: notifier struct to register
  1750.   */
  1751.  void preempt_notifier_register(struct preempt_notifier *notifier)
  1752.  {
  1753. +   if (!static_key_false(&preempt_notifier_key))
  1754. +       WARN(1, "registering preempt_notifier while notifiers disabled\n");
  1755. +
  1756.     hlist_add_head(&notifier->link, &current->preempt_notifiers);
  1757.  }
  1758.  EXPORT_SYMBOL_GPL(preempt_notifier_register);
  1759. @@ -2473,7 +2819,7 @@
  1760.   * preempt_notifier_unregister - no longer interested in preemption notifications
  1761.   * @notifier: notifier struct to unregister
  1762.   *
  1763. - * This is safe to call from within a preemption notifier.
  1764. + * This is *not* safe to call from within a preemption notifier.
  1765.   */
  1766.  void preempt_notifier_unregister(struct preempt_notifier *notifier)
  1767.  {
  1768. @@ -2481,7 +2827,7 @@
  1769.  }
  1770.  EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
  1771.  
  1772. -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  1773. +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
  1774.  {
  1775.     struct preempt_notifier *notifier;
  1776.  
  1777. @@ -2489,9 +2835,15 @@
  1778.         notifier->ops->sched_in(notifier, raw_smp_processor_id());
  1779.  }
  1780.  
  1781. +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  1782. +{
  1783. +   if (static_key_false(&preempt_notifier_key))
  1784. +       __fire_sched_in_preempt_notifiers(curr);
  1785. +}
  1786. +
  1787.  static void
  1788. -fire_sched_out_preempt_notifiers(struct task_struct *curr,
  1789. -                struct task_struct *next)
  1790. +__fire_sched_out_preempt_notifiers(struct task_struct *curr,
  1791. +                  struct task_struct *next)
  1792.  {
  1793.     struct preempt_notifier *notifier;
  1794.  
  1795. @@ -2499,13 +2851,21 @@
  1796.         notifier->ops->sched_out(notifier, next);
  1797.  }
  1798.  
  1799. +static __always_inline void
  1800. +fire_sched_out_preempt_notifiers(struct task_struct *curr,
  1801. +                struct task_struct *next)
  1802. +{
  1803. +   if (static_key_false(&preempt_notifier_key))
  1804. +       __fire_sched_out_preempt_notifiers(curr, next);
  1805. +}
  1806. +
  1807.  #else /* !CONFIG_PREEMPT_NOTIFIERS */
  1808.  
  1809. -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  1810. +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  1811.  {
  1812.  }
  1813.  
  1814. -static void
  1815. +static inline void
  1816.  fire_sched_out_preempt_notifiers(struct task_struct *curr,
  1817.                  struct task_struct *next)
  1818.  {
  1819. @@ -2530,7 +2890,6 @@
  1820.  prepare_task_switch(struct rq *rq, struct task_struct *prev,
  1821.             struct task_struct *next)
  1822.  {
  1823. -   trace_sched_switch(prev, next);
  1824.     sched_info_switch(rq, prev, next);
  1825.     perf_event_task_sched_out(prev, next);
  1826.     fire_sched_out_preempt_notifiers(prev, next);
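
The preempt-notifier hunks above gate the list walk in fire_sched_in/out_preempt_notifiers() behind the preempt_notifier_key static key, so the common case of no registered notifiers costs only a predicted branch. A rough user-space analogue of that register-then-fire pattern, with a plain counter standing in for the static key (all identifiers here are illustrative, not kernel API):

#include <stdio.h>

struct notifier {
        void (*fn)(void *ctx);
        struct notifier *next;
};

static struct notifier *notifier_list;
static int notifier_count;              /* plays the role of preempt_notifier_key */

static void notifier_register(struct notifier *n)
{
        n->next = notifier_list;
        notifier_list = n;
        notifier_count++;               /* cf. preempt_notifier_inc() */
}

static void __fire_notifiers(void *ctx)
{
        struct notifier *n;

        for (n = notifier_list; n; n = n->next)
                n->fn(ctx);
}

/* Fast path: skip the list walk entirely while nothing is registered. */
static inline void fire_notifiers(void *ctx)
{
        if (notifier_count)
                __fire_notifiers(ctx);
}

static void say_hello(void *ctx)
{
        printf("notified: %s\n", (const char *)ctx);
}

int main(void)
{
        struct notifier n = { .fn = say_hello };

        fire_notifiers("before register");      /* cheap no-op */
        notifier_register(&n);
        fire_notifiers("after register");       /* walks the list */
        return 0;
}
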
  1827. @@ -2545,7 +2904,6 @@
  1828.  
  1829.  /**
  1830.   * finish_task_switch - clean up after a task-switch
  1831. - * @rq: runqueue associated with task-switch
  1832.   * @prev: the thread we just switched away from.
  1833.   *
  1834.   * finish_task_switch must be called after the context switch, paired
  1835. @@ -2557,13 +2915,35 @@
  1836.   * so, we finish that here outside of the runqueue lock. (Doing it
  1837.   * with the lock held can cause deadlocks; see schedule() for
  1838.   * details.)
  1839. + *
  1840. + * The context switch has flipped the stack from under us and restored the
  1841. + * local variables which were saved when this task called schedule() in the
  1842. + * past. prev == current is still correct but we need to recalculate this_rq
  1843. + * because prev may have moved to another CPU.
  1844.   */
  1845. -static void finish_task_switch(struct rq *rq, struct task_struct *prev)
  1846. +static struct rq *finish_task_switch(struct task_struct *prev)
  1847.     __releases(rq->lock)
  1848.  {
  1849. +   struct rq *rq = this_rq();
  1850.     struct mm_struct *mm = rq->prev_mm;
  1851.     long prev_state;
  1852.  
  1853. +   /*
  1854. +    * The previous task will have left us with a preempt_count of 2
  1855. +    * because it left us after:
  1856. +    *
  1857. +    *  schedule()
  1858. +    *    preempt_disable();            // 1
  1859. +    *    __schedule()
  1860. +    *      raw_spin_lock_irq(&rq->lock)    // 2
  1861. +    *
  1862. +    * Also, see FORK_PREEMPT_COUNT.
  1863. +    */
  1864. +   if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
  1865. +             "corrupted preempt_count: %s/%d/0x%x\n",
  1866. +             current->comm, current->pid, preempt_count()))
  1867. +       preempt_count_set(FORK_PREEMPT_COUNT);
  1868. +
  1869.     rq->prev_mm = NULL;
  1870.  
  1871.     /*
  1872. @@ -2579,7 +2959,6 @@
  1873.      */
  1874.     prev_state = prev->state;
  1875.     vtime_task_switch(prev);
  1876. -   finish_arch_switch(prev);
  1877.     perf_event_task_sched_in(prev, current);
  1878.     finish_lock_switch(rq, prev);
  1879.     finish_arch_post_lock_switch();
  1880. @@ -2596,10 +2975,15 @@
  1881.          * task and put them back on the free list.
  1882.          */
  1883.         kprobe_flush_task(prev);
  1884. +
  1885. +       /* Task is done with its stack. */
  1886. +       put_task_stack(prev);
  1887. +
  1888.         put_task_struct(prev);
  1889.     }
  1890.  
  1891.     tick_nohz_task_switch(current);
  1892. +   return rq;
  1893.  }
  1894.  
  1895.  #ifdef CONFIG_SMP
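
The WARN_ONCE added to finish_task_switch() above checks the invariant spelled out in its comment: on entry the preempt count must be exactly two disable levels deep, one from schedule()'s preempt_disable() and one from the rq->lock acquisition in __schedule(). A toy model of that bookkeeping; PREEMPT_DISABLE_OFFSET is assumed to be 1 here purely for illustration, the real value depends on the kernel configuration:

#include <assert.h>
#include <stdio.h>

#define PREEMPT_DISABLE_OFFSET 1        /* assumed value, for illustration only */

static int preempt_count;

static void preempt_disable(void) { preempt_count += PREEMPT_DISABLE_OFFSET; }
static void preempt_enable(void)  { preempt_count -= PREEMPT_DISABLE_OFFSET; }

/* Models the sanity check at the top of finish_task_switch(). */
static void finish_task_switch_check(void)
{
        if (preempt_count != 2 * PREEMPT_DISABLE_OFFSET)
                fprintf(stderr, "corrupted preempt_count: 0x%x\n", preempt_count);
        else
                printf("preempt_count ok: %d\n", preempt_count);
}

int main(void)
{
        preempt_disable();              /* schedule():   preempt_disable()   */
        preempt_disable();              /* __schedule(): raw_spin_lock_irq() */
        finish_task_switch_check();
        preempt_enable();
        preempt_enable();
        assert(preempt_count == 0);
        return 0;
}
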
  1896. @@ -2646,27 +3030,31 @@
  1897.  asmlinkage __visible void schedule_tail(struct task_struct *prev)
  1898.     __releases(rq->lock)
  1899.  {
  1900. -   struct rq *rq = this_rq();
  1901. -
  1902. -   finish_task_switch(rq, prev);
  1903. +   struct rq *rq;
  1904.  
  1905.     /*
  1906. -    * FIXME: do we need to worry about rq being invalidated by the
  1907. -    * task_switch?
  1908. +    * New tasks start with FORK_PREEMPT_COUNT, see there and
  1909. +    * finish_task_switch() for details.
  1910. +    *
  1911. +    * finish_task_switch() will drop rq->lock() and lower preempt_count
  1912. +    * and the preempt_enable() will end up enabling preemption (on
  1913. +    * PREEMPT_COUNT kernels).
  1914.      */
  1915. +
  1916. +   rq = finish_task_switch(prev);
  1917.     balance_callback(rq);
  1918. +   preempt_enable();
  1919.  
  1920.     if (current->set_child_tid)
  1921.         put_user(task_pid_vnr(current), current->set_child_tid);
  1922.  }
  1923.  
  1924.  /*
  1925. - * context_switch - switch to the new MM and the new
  1926. - * thread's register state.
  1927. + * context_switch - switch to the new MM and the new thread's register state.
  1928.   */
  1929. -static inline void
  1930. +static inline struct rq *
  1931.  context_switch(struct rq *rq, struct task_struct *prev,
  1932. -          struct task_struct *next)
  1933. +          struct task_struct *next, struct pin_cookie cookie)
  1934.  {
  1935.     struct mm_struct *mm, *oldmm;
  1936.  
  1937. @@ -2698,19 +3086,15 @@
  1938.      * of the scheduler it's an obvious special-case), so we
  1939.      * do an early lockdep release here:
  1940.      */
  1941. +   lockdep_unpin_lock(&rq->lock, cookie);
  1942.     spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
  1943.  
  1944.     context_tracking_task_switch(prev, next);
  1945.     /* Here we just switch the register state and the stack. */
  1946.     switch_to(prev, next, prev);
  1947. -
  1948.     barrier();
  1949. -   /*
  1950. -    * this_rq must be evaluated again because prev may have moved
  1951. -    * CPUs since it called schedule(), thus the 'rq' on its stack
  1952. -    * frame will be invalid.
  1953. -    */
  1954. -   finish_task_switch(this_rq(), prev);
  1955. +
  1956. +   return finish_task_switch(prev);
  1957.  }
  1958.  
  1959.  /*
  1960. @@ -2775,6 +3159,36 @@
  1961.     return atomic_read(&this->nr_iowait);
  1962.  }
  1963.  
  1964. +#ifdef CONFIG_CPU_QUIET
  1965. +u64 nr_running_integral(unsigned int cpu)
  1966. +{
  1967. +   unsigned int seqcnt;
  1968. +   u64 integral;
  1969. +   struct rq *q;
  1970. +
  1971. +   if (cpu >= nr_cpu_ids)
  1972. +       return 0;
  1973. +
  1974. +   q = cpu_rq(cpu);
  1975. +
  1976. +   /*
  1977. +    * Update the average to avoid reading a stale value if there were
  1978. +    * no run-queue changes for a long time. On the other hand, if
  1979. +    * the changes are happening right now, just read the current value
  1980. +    * directly.
  1981. +    */
  1982. +
  1983. +   seqcnt = read_seqcount_begin(&q->ave_seqcnt);
  1984. +   integral = do_nr_running_integral(q);
  1985. +   if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
  1986. +       read_seqcount_begin(&q->ave_seqcnt);
  1987. +       integral = q->nr_running_integral;
  1988. +   }
  1989. +
  1990. +   return integral;
  1991. +}
  1992. +#endif
  1993. +
  1994.  void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
  1995.  {
  1996.     struct rq *rq = this_rq();
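
The nr_running_integral() helper added above uses the usual seqcount read pattern with one twist: if the optimistic read races with a writer, it does not loop but falls back to the raw stored nr_running_integral value. A single-threaded sketch of that control flow; there are no memory barriers here, so this only illustrates the retry logic, not a usable concurrent seqlock:

#include <stdio.h>

struct averaged_counter {
        unsigned int seq;               /* odd while an update is in flight */
        unsigned long long integral;    /* last stored value */
};

static unsigned int read_begin(const struct averaged_counter *c)
{
        return c->seq;
}

static int read_retry(const struct averaged_counter *c, unsigned int start)
{
        /* Retry if a write was in progress or has completed since. */
        return (start & 1) || c->seq != start;
}

static unsigned long long compute_refreshed(const struct averaged_counter *c)
{
        /* Stands in for do_nr_running_integral(); simplified, no time component. */
        return c->integral;
}

static unsigned long long read_integral(const struct averaged_counter *c)
{
        unsigned int start = read_begin(c);
        unsigned long long val = compute_refreshed(c);  /* optimistic read */

        if (read_retry(c, start)) {
                /* Mirror the code above: give up and take the stored value. */
                read_begin(c);
                val = c->integral;
        }
        return val;
}

int main(void)
{
        struct averaged_counter c = { .seq = 2, .integral = 12345 };

        printf("integral = %llu\n", read_integral(&c));
        return 0;
}
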
  1997. @@ -2795,7 +3209,7 @@
  1998.     int dest_cpu;
  1999.  
  2000.     raw_spin_lock_irqsave(&p->pi_lock, flags);
  2001. -   dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
  2002. +   dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
  2003.     if (dest_cpu == smp_processor_id())
  2004.         goto unlock;
  2005.  
  2006. @@ -2825,7 +3239,7 @@
  2007.   */
  2008.  unsigned long long task_sched_runtime(struct task_struct *p)
  2009.  {
  2010. -   unsigned long flags;
  2011. +   struct rq_flags rf;
  2012.     struct rq *rq;
  2013.     u64 ns;
  2014.  
  2015. @@ -2845,7 +3259,7 @@
  2016.         return p->se.sum_exec_runtime;
  2017.  #endif
  2018.  
  2019. -   rq = task_rq_lock(p, &flags);
  2020. +   rq = task_rq_lock(p, &rf);
  2021.     /*
  2022.      * Must be ->curr _and_ ->on_rq.  If dequeued, we would
  2023.      * project cycles that may never be accounted to this
  2024. @@ -2856,7 +3270,7 @@
  2025.         p->sched_class->update_curr(rq);
  2026.     }
  2027.     ns = p->se.sum_exec_runtime;
  2028. -   task_rq_unlock(rq, p, &flags);
  2029. +   task_rq_unlock(rq, p, &rf);
  2030.  
  2031.     return ns;
  2032.  }
  2033. @@ -2879,6 +3293,7 @@
  2034.     return total += scr->dl;
  2035.  }
  2036.  
  2037. +unsigned long boosted_cpu_util(int cpu);
  2038.  static void sched_freq_tick_pelt(int cpu)
  2039.  {
  2040.     unsigned long cpu_utilization = boosted_cpu_util(cpu);
  2041. @@ -2889,47 +3304,45 @@
  2042.     if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
  2043.         return;
  2044.  
  2045. -   if (!use_util_est())
  2046. -       cpu_utilization = capacity_max;
  2047. -
  2048.     /*
  2049.      * To make free room for a task that is building up its "real"
  2050.      * utilization and to harm its performance the least, request
  2051.      * a jump to a higher OPP as soon as the margin of free capacity
  2052.      * is impacted (specified by capacity_margin).
  2053. +    * Remember CPU utilization in sched_capacity_reqs should be normalised.
  2054.      */
  2055. +   cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
  2056.     set_cfs_cpu_capacity(cpu, true, cpu_utilization);
  2057.  }
  2058.  
  2059.  #ifdef CONFIG_SCHED_WALT
  2060.  static void sched_freq_tick_walt(int cpu)
  2061.  {
  2062. -   unsigned long cpu_utilization = cpu_util(cpu, UTIL_EST);
  2063. +   unsigned long cpu_utilization = cpu_util_freq(cpu);
  2064.     unsigned long capacity_curr = capacity_curr_of(cpu);
  2065.  
  2066.     if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
  2067.         return sched_freq_tick_pelt(cpu);
  2068.  
  2069.     /*
  2070. -    * Add a margin to the WALT utilization.
  2071. +    * Add a margin to the WALT utilization to check if we will need to
  2072. +    * increase frequency.
  2073.      * NOTE: WALT tracks a single CPU signal for all the scheduling
  2074.      * classes, thus this margin is going to be added to the DL class as
  2075.      * well, which is something we do not do in sched_freq_tick_pelt case.
  2076. -    *
  2077. -    * TODO:
  2078. -    * Here we're adding margin, but we're also adding margin in cpufreq.
  2079. -    * There shouldn't be a double addition.
  2080.      */
  2081. -   cpu_utilization = add_capacity_margin(cpu_utilization);
  2082. -   if (cpu_utilization <= capacity_curr)
  2083. +   if (add_capacity_margin(cpu_utilization) <= capacity_curr)
  2084.         return;
  2085.  
  2086.     /*
  2087.      * It is likely that the load is growing so we
  2088.      * keep the added margin in our request as an
  2089.      * extra boost.
  2090. +    * Remember CPU utilization in sched_capacity_reqs should be normalised.
  2091.      */
  2092. +   cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
  2093.     set_cfs_cpu_capacity(cpu, true, cpu_utilization);
  2094. +
  2095.  }
  2096.  #define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
  2097.  #else
  2098. @@ -2938,16 +3351,9 @@
  2099.  
  2100.  static void sched_freq_tick(int cpu)
  2101.  {
  2102. -   unsigned long capacity_orig, capacity_curr;
  2103. -
  2104.     if (!sched_freq())
  2105.         return;
  2106.  
  2107. -   capacity_orig = capacity_orig_of(cpu);
  2108. -   capacity_curr = capacity_curr_of(cpu);
  2109. -   if (capacity_curr == capacity_orig)
  2110. -       return;
  2111. -
  2112.     _sched_freq_tick(cpu);
  2113.  }
  2114.  #else
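
Both tick paths above now rescale the raw utilisation into the 0..SCHED_CAPACITY_SCALE range before passing it to set_cfs_cpu_capacity(), per the "should be normalised" comments. The arithmetic is a straight rescale against the CPU's original capacity; a stand-alone illustration with made-up numbers (the standard 1024 scale and an example little core of capacity 462):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* Rescale a raw utilisation (0..capacity_orig) into 0..SCHED_CAPACITY_SCALE. */
static unsigned long normalise_util(unsigned long util, unsigned long capacity_orig)
{
        return util * SCHED_CAPACITY_SCALE / capacity_orig;
}

int main(void)
{
        unsigned long capacity_orig = 462;      /* example little-core capacity */
        unsigned long util = 231;               /* half of that capacity in use */

        /* 231 * 1024 / 462 = 512, i.e. 50% of the normalised scale. */
        printf("normalised util = %lu\n", normalise_util(util, capacity_orig));
        return 0;
}
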
  2115. @@ -2968,11 +3374,11 @@
  2116.  
  2117.     raw_spin_lock(&rq->lock);
  2118.     walt_set_window_start(rq);
  2119. +   walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
  2120. +           walt_ktime_clock(), 0);
  2121.     update_rq_clock(rq);
  2122.     curr->sched_class->task_tick(rq, curr, 0);
  2123.     update_cpu_load_active(rq);
  2124. -   walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
  2125. -           walt_ktime_clock(), 0);
  2126.     calc_global_load_tick(rq);
  2127.     sched_freq_tick(cpu);
  2128.     raw_spin_unlock(&rq->lock);
  2129. @@ -3115,25 +3521,23 @@
  2130.     if (task_stack_end_corrupted(prev))
  2131.         panic("corrupted stack end detected inside scheduler\n");
  2132.  #endif
  2133. -   /*
  2134. -    * Test if we are atomic. Since do_exit() needs to call into
  2135. -    * schedule() atomically, we ignore that path. Otherwise whine
  2136. -    * if we are scheduling when we should not.
  2137. -    */
  2138. -   if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
  2139. +
  2140. +   if (unlikely(in_atomic_preempt_off())) {
  2141.         __schedule_bug(prev);
  2142. +       preempt_count_set(PREEMPT_DISABLED);
  2143. +   }
  2144.     rcu_sleep_check();
  2145.  
  2146.     profile_hit(SCHED_PROFILING, __builtin_return_address(0));
  2147.  
  2148. -   schedstat_inc(this_rq(), sched_count);
  2149. +   schedstat_inc(this_rq()->sched_count);
  2150.  }
  2151.  
  2152.  /*
  2153.   * Pick up the highest-prio task:
  2154.   */
  2155.  static inline struct task_struct *
  2156. -pick_next_task(struct rq *rq, struct task_struct *prev)
  2157. +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  2158.  {
  2159.     const struct sched_class *class = &fair_sched_class;
  2160.     struct task_struct *p;
  2161. @@ -3144,20 +3548,21 @@
  2162.      */
  2163.     if (likely(prev->sched_class == class &&
  2164.            rq->nr_running == rq->cfs.h_nr_running)) {
  2165. -       p = fair_sched_class.pick_next_task(rq, prev);
  2166. +       p = fair_sched_class.pick_next_task(rq, prev, cookie);
  2167.         if (unlikely(p == RETRY_TASK))
  2168.             goto again;
  2169.  
  2170.         /* assumes fair_sched_class->next == idle_sched_class */
  2171.         if (unlikely(!p))
  2172. -           p = idle_sched_class.pick_next_task(rq, prev);
  2173. +           p = idle_sched_class.pick_next_task(rq, prev, cookie);
  2174.  
  2175. -       return p;
  2176. +       if (likely(p != RETRY_TASK))
  2177. +           return p;
  2178.     }
  2179.  
  2180.  again:
  2181.     for_each_class(class) {
  2182. -       p = class->pick_next_task(rq, prev);
  2183. +       p = class->pick_next_task(rq, prev, cookie);
  2184.         if (p) {
  2185.             if (unlikely(p == RETRY_TASK))
  2186.                 goto again;
  2187. @@ -3204,20 +3609,20 @@
  2188.   *          - explicit schedule() call
  2189.   *          - return from syscall or exception to user-space
  2190.   *          - return from interrupt-handler to user-space
  2191. + *
  2192. + * WARNING: must be called with preemption disabled!
  2193.   */
  2194. -static void __sched __schedule(void)
  2195. +static void __sched notrace __schedule(bool preempt)
  2196.  {
  2197.     struct task_struct *prev, *next;
  2198.     unsigned long *switch_count;
  2199. +   struct pin_cookie cookie;
  2200.     struct rq *rq;
  2201.     int cpu;
  2202.     u64 wallclock;
  2203.  
  2204. -need_resched:
  2205. -   preempt_disable();
  2206.     cpu = smp_processor_id();
  2207.     rq = cpu_rq(cpu);
  2208. -   rcu_note_context_switch(cpu);
  2209.     prev = rq->curr;
  2210.  
  2211.     schedule_debug(prev);
  2212. @@ -3225,77 +3630,105 @@
  2213.     if (sched_feat(HRTICK))
  2214.         hrtick_clear(rq);
  2215.  
  2216. +   local_irq_disable();
  2217. +   rcu_note_context_switch();
  2218. +
  2219.     /*
  2220.      * Make sure that signal_pending_state()->signal_pending() below
  2221.      * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
  2222.      * done by the caller to avoid the race with signal_wake_up().
  2223.      */
  2224.     smp_mb__before_spinlock();
  2225. -   raw_spin_lock_irq(&rq->lock);
  2226. +   raw_spin_lock(&rq->lock);
  2227. +   cookie = lockdep_pin_lock(&rq->lock);
  2228. +
  2229. +   rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  2230.  
  2231.     switch_count = &prev->nivcsw;
  2232. -   if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
  2233. +   if (!preempt && prev->state) {
  2234.         if (unlikely(signal_pending_state(prev->state, prev))) {
  2235.             prev->state = TASK_RUNNING;
  2236.         } else {
  2237.             deactivate_task(rq, prev, DEQUEUE_SLEEP);
  2238.             prev->on_rq = 0;
  2239.  
  2240. -           /*
  2241. -            * If a worker went to sleep, notify and ask workqueue
  2242. -            * whether it wants to wake up a task to maintain
  2243. -            * concurrency.
  2244. -            */
  2245. -           if (prev->flags & PF_WQ_WORKER) {
  2246. -               struct task_struct *to_wakeup;
  2247. -
  2248. -               to_wakeup = wq_worker_sleeping(prev, cpu);
  2249. -               if (to_wakeup)
  2250. -                   try_to_wake_up_local(to_wakeup);
  2251. -           }
  2252.         }
  2253.         switch_count = &prev->nvcsw;
  2254.     }
  2255.  
  2256. -   if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
  2257. +   if (task_on_rq_queued(prev))
  2258.         update_rq_clock(rq);
  2259.  
  2260. -   next = pick_next_task(rq, prev);
  2261. +   next = pick_next_task(rq, prev, cookie);
  2262.     wallclock = walt_ktime_clock();
  2263.     walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
  2264.     walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
  2265.     clear_tsk_need_resched(prev);
  2266.     clear_preempt_need_resched();
  2267. -   rq->skip_clock_update = 0;
  2268. +   rq->clock_skip_update = 0;
  2269.  
  2270.     if (likely(prev != next)) {
  2271. +#ifdef CONFIG_SCHED_WALT
  2272. +       if (!prev->on_rq)
  2273. +           prev->last_sleep_ts = wallclock;
  2274. +#endif
  2275.         rq->nr_switches++;
  2276.         rq->curr = next;
  2277.         ++*switch_count;
  2278.  
  2279. -       context_switch(rq, prev, next); /* unlocks the rq */
  2280. -       /*
  2281. -        * The context switch have flipped the stack from under us
  2282. -        * and restored the local variables which were saved when
  2283. -        * this task called schedule() in the past. prev == current
  2284. -        * is still correct, but it can be moved to another cpu/rq.
  2285. -        */
  2286. -       cpu = smp_processor_id();
  2287. -       rq = cpu_rq(cpu);
  2288. -   } else
  2289. +       //trace_sched_switch(preempt, prev, next);
  2290. +       rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
  2291. +       cpu = cpu_of(rq);
  2292. +   } else {
  2293. +       lockdep_unpin_lock(&rq->lock, cookie);
  2294.         raw_spin_unlock_irq(&rq->lock);
  2295. +   }
  2296.  
  2297.     balance_callback(rq);
  2298. +}
  2299.  
  2300. -   sched_preempt_enable_no_resched();
  2301. -   if (need_resched())
  2302. -       goto need_resched;
  2303. +void __noreturn do_task_dead(void)
  2304. +{
  2305. +   /*
  2306. +    * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
  2307. +    * when the following two conditions become true.
  2308. +    *   - There is a race condition on mmap_sem (it is acquired by
  2309. +    *     exit_mm()), and
  2310. +    *   - An SMI occurs before setting TASK_RUNNING
  2311. +    *     (or the hypervisor of a virtual machine switches to another guest).
  2312. +    *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
  2313. +    *
  2314. +    * To avoid this, we have to wait until tsk->pi_lock, which is held by
  2315. +    * try_to_wake_up(), is released.
  2316. +    */
  2317. +   smp_mb();
  2318. +   raw_spin_unlock_wait(&current->pi_lock);
  2319. +
  2320. +   /* causes final put_task_struct in finish_task_switch(). */
  2321. +   __set_current_state(TASK_DEAD);
  2322. +   current->flags |= PF_NOFREEZE;  /* tell freezer to ignore us */
  2323. +   __schedule(false);
  2324. +   BUG();
  2325. +   /* Avoid "noreturn function does return".  */
  2326. +   for (;;)
  2327. +       cpu_relax();    /* For when BUG is null */
  2328.  }
  2329.  
  2330.  static inline void sched_submit_work(struct task_struct *tsk)
  2331.  {
  2332. -   if (!tsk->state || tsk_is_pi_blocked(tsk))
  2333. +   if (!tsk->state)
  2334. +       return;
  2335. +   /*
  2336. +    * If a worker went to sleep, notify and ask workqueue whether
  2337. +    * it wants to wake up a task to maintain concurrency.
  2338. +    */
  2339. +   if (tsk->flags & PF_WQ_WORKER)
  2340. +       wq_worker_sleeping(tsk);
  2341. +
  2342. +
  2343. +   if (tsk_is_pi_blocked(tsk))
  2344.         return;
  2345. +
  2346.     /*
  2347.      * If we are going to sleep and we have plugged IO queued,
  2348.      * make sure to submit it to avoid deadlocks.
  2349. @@ -3304,12 +3737,23 @@
  2350.         blk_schedule_flush_plug(tsk);
  2351.  }
  2352.  
  2353. +static void sched_update_worker(struct task_struct *tsk)
  2354. +{
  2355. +   if (tsk->flags & PF_WQ_WORKER)
  2356. +       wq_worker_running(tsk);
  2357. +}
  2358. +
  2359.  asmlinkage __visible void __sched schedule(void)
  2360.  {
  2361.     struct task_struct *tsk = current;
  2362.  
  2363.     sched_submit_work(tsk);
  2364. -   __schedule();
  2365. +   do {
  2366. +       preempt_disable();
  2367. +       __schedule(false);
  2368. +       sched_preempt_enable_no_resched();
  2369. +   } while (need_resched());
  2370. +   sched_update_worker(tsk);
  2371.  }
  2372.  EXPORT_SYMBOL(schedule);
  2373.  
  2374. @@ -3344,6 +3788,20 @@
  2375.     preempt_disable();
  2376.  }
  2377.  
  2378. +static void preempt_schedule_common(void)
  2379. +{
  2380. +   do {
  2381. +       preempt_disable_notrace();
  2382. +       __schedule(true);
  2383. +       preempt_enable_no_resched_notrace();
  2384. +
  2385. +       /*
  2386. +        * Check again in case we missed a preemption opportunity
  2387. +        * between schedule and now.
  2388. +        */
  2389. +   } while (need_resched());
  2390. +}
  2391. +
  2392.  #ifdef CONFIG_PREEMPT
  2393.  /*
  2394.   * this is the entry point to schedule() from in-kernel preemption
  2395. @@ -3359,24 +3817,13 @@
  2396.     if (likely(!preemptible()))
  2397.         return;
  2398.  
  2399. -   do {
  2400. -       __preempt_count_add(PREEMPT_ACTIVE);
  2401. -       __schedule();
  2402. -       __preempt_count_sub(PREEMPT_ACTIVE);
  2403. -
  2404. -       /*
  2405. -        * Check again in case we missed a preemption opportunity
  2406. -        * between schedule and now.
  2407. -        */
  2408. -       barrier();
  2409. -   } while (need_resched());
  2410. +   preempt_schedule_common();
  2411.  }
  2412.  NOKPROBE_SYMBOL(preempt_schedule);
  2413.  EXPORT_SYMBOL(preempt_schedule);
  2414.  
  2415. -#ifdef CONFIG_CONTEXT_TRACKING
  2416.  /**
  2417. - * preempt_schedule_context - preempt_schedule called by tracing
  2418. + * preempt_schedule_notrace - preempt_schedule called by tracing
  2419.   *
  2420.   * The tracing infrastructure uses preempt_enable_notrace to prevent
  2421.   * recursion and tracing preempt enabling caused by the tracing
  2422. @@ -3389,7 +3836,7 @@
  2423.   * instead of preempt_schedule() to exit user context if needed before
  2424.   * calling the scheduler.
  2425.   */
  2426. -asmlinkage __visible void __sched notrace preempt_schedule_context(void)
  2427. +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  2428.  {
  2429.     enum ctx_state prev_ctx;
  2430.  
  2431. @@ -3397,22 +3844,20 @@
  2432.         return;
  2433.  
  2434.     do {
  2435. -       __preempt_count_add(PREEMPT_ACTIVE);
  2436. +       preempt_disable_notrace();
  2437.         /*
  2438.          * Needs preempt disabled in case user_exit() is traced
  2439.          * and the tracer calls preempt_enable_notrace() causing
  2440.          * an infinite recursion.
  2441.          */
  2442.         prev_ctx = exception_enter();
  2443. -       __schedule();
  2444. +       __schedule(true);
  2445.         exception_exit(prev_ctx);
  2446.  
  2447. -       __preempt_count_sub(PREEMPT_ACTIVE);
  2448. -       barrier();
  2449. +       preempt_enable_no_resched_notrace();
  2450.     } while (need_resched());
  2451.  }
  2452. -EXPORT_SYMBOL_GPL(preempt_schedule_context);
  2453. -#endif /* CONFIG_CONTEXT_TRACKING */
  2454. +EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  2455.  
  2456.  #endif /* CONFIG_PREEMPT */
  2457.  
  2458. @@ -3432,17 +3877,11 @@
  2459.     prev_state = exception_enter();
  2460.  
  2461.     do {
  2462. -       __preempt_count_add(PREEMPT_ACTIVE);
  2463. +       preempt_disable();
  2464.         local_irq_enable();
  2465. -       __schedule();
  2466. +       __schedule(true);
  2467.         local_irq_disable();
  2468. -       __preempt_count_sub(PREEMPT_ACTIVE);
  2469. -
  2470. -       /*
  2471. -        * Check again in case we missed a preemption opportunity
  2472. -        * between schedule and now.
  2473. -        */
  2474. -       barrier();
  2475. +       sched_preempt_enable_no_resched();
  2476.     } while (need_resched());
  2477.  
  2478.     exception_exit(prev_state);
  2479. @@ -3451,7 +3890,7 @@
  2480.  int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
  2481.               void *key)
  2482.  {
  2483. -   return try_to_wake_up(curr->private, mode, wake_flags);
  2484. +   return try_to_wake_up(curr->private, mode, wake_flags, 1);
  2485.  }
  2486.  EXPORT_SYMBOL(default_wake_function);
  2487.  
  2488. @@ -3470,13 +3909,15 @@
  2489.   */
  2490.  void rt_mutex_setprio(struct task_struct *p, int prio)
  2491.  {
  2492. -   int oldprio, queued, running, enqueue_flag = 0;
  2493. -   struct rq *rq;
  2494. +   int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
  2495.     const struct sched_class *prev_class;
  2496. +   struct rq_flags rf;
  2497. +   struct rq *rq;
  2498.  
  2499.     BUG_ON(prio > MAX_PRIO);
  2500.  
  2501. -   rq = __task_rq_lock(p);
  2502. +   rq = __task_rq_lock(p, &rf);
  2503. +   update_rq_clock(rq);
  2504.  
  2505.     /*
  2506.      * Idle task boosting is a nono in general. There is one
  2507. @@ -3498,11 +3939,15 @@
  2508.  
  2509.     trace_sched_pi_setprio(p, prio);
  2510.     oldprio = p->prio;
  2511. +
  2512. +   if (oldprio == prio)
  2513. +       queue_flag &= ~DEQUEUE_MOVE;
  2514. +
  2515.     prev_class = p->sched_class;
  2516.     queued = task_on_rq_queued(p);
  2517.     running = task_current(rq, p);
  2518.     if (queued)
  2519. -       dequeue_task(rq, p, 0);
  2520. +       dequeue_task(rq, p, queue_flag);
  2521.     if (running)
  2522.         put_prev_task(rq, p);
  2523.  
  2524. @@ -3520,8 +3965,7 @@
  2525.         if (!dl_prio(p->normal_prio) ||
  2526.             (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
  2527.             p->dl.dl_boosted = 1;
  2528. -           p->dl.dl_throttled = 0;
  2529. -           enqueue_flag = ENQUEUE_REPLENISH;
  2530. +           queue_flag |= ENQUEUE_REPLENISH;
  2531.         } else
  2532.             p->dl.dl_boosted = 0;
  2533.         p->sched_class = &dl_sched_class;
  2534. @@ -3529,7 +3973,7 @@
  2535.         if (dl_prio(oldprio))
  2536.             p->dl.dl_boosted = 0;
  2537.         if (oldprio < prio)
  2538. -           enqueue_flag = ENQUEUE_HEAD;
  2539. +           queue_flag |= ENQUEUE_HEAD;
  2540.         p->sched_class = &rt_sched_class;
  2541.     } else {
  2542.         if (dl_prio(oldprio))
  2543. @@ -3541,15 +3985,15 @@
  2544.  
  2545.     p->prio = prio;
  2546.  
  2547. -   if (running)
  2548. -       p->sched_class->set_curr_task(rq);
  2549.     if (queued)
  2550. -       enqueue_task(rq, p, enqueue_flag);
  2551. +       enqueue_task(rq, p, queue_flag);
  2552. +   if (running)
  2553. +       set_curr_task(rq, p);
  2554.  
  2555.     check_class_changed(rq, p, prev_class, oldprio);
  2556.  out_unlock:
  2557.     preempt_disable(); /* avoid rq from going away on us */
  2558. -   __task_rq_unlock(rq);
  2559. +   __task_rq_unlock(rq, &rf);
  2560.  
  2561.     balance_callback(rq);
  2562.     preempt_enable();
  2563. @@ -3558,8 +4002,9 @@
  2564.  
  2565.  void set_user_nice(struct task_struct *p, long nice)
  2566.  {
  2567. -   int old_prio, delta, queued;
  2568. -   unsigned long flags;
  2569. +   bool queued, running;
  2570. +   int old_prio, delta;
  2571. +   struct rq_flags rf;
  2572.     struct rq *rq;
  2573.  
  2574.     if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
  2575. @@ -3568,7 +4013,9 @@
  2576.      * We have to be careful, if called from sys_setpriority(),
  2577.      * the task might be in the middle of scheduling on another CPU.
  2578.      */
  2579. -   rq = task_rq_lock(p, &flags);
  2580. +   rq = task_rq_lock(p, &rf);
  2581. +   update_rq_clock(rq);
  2582. +
  2583.     /*
  2584.      * The RT priorities are set via sched_setscheduler(), but we still
  2585.      * allow the 'normal' nice value to be set - but as expected
  2586. @@ -3580,8 +4027,11 @@
  2587.         goto out_unlock;
  2588.     }
  2589.     queued = task_on_rq_queued(p);
  2590. +   running = task_current(rq, p);
  2591.     if (queued)
  2592. -       dequeue_task(rq, p, 0);
  2593. +       dequeue_task(rq, p, DEQUEUE_SAVE);
  2594. +   if (running)
  2595. +       put_prev_task(rq, p);
  2596.  
  2597.     p->static_prio = NICE_TO_PRIO(nice);
  2598.     set_load_weight(p);
  2599. @@ -3590,7 +4040,7 @@
  2600.     delta = p->prio - old_prio;
  2601.  
  2602.     if (queued) {
  2603. -       enqueue_task(rq, p, 0);
  2604. +       enqueue_task(rq, p, ENQUEUE_RESTORE);
  2605.         /*
  2606.          * If the task increased its priority or is running and
  2607.          * lowered its priority, then reschedule its CPU:
  2608. @@ -3598,8 +4048,10 @@
  2609.         if (delta < 0 || (delta > 0 && task_running(rq, p)))
  2610.             resched_curr(rq);
  2611.     }
  2612. +   if (running)
  2613. +       set_curr_task(rq, p);
  2614.  out_unlock:
  2615. -   task_rq_unlock(rq, p, &flags);
  2616. +   task_rq_unlock(rq, p, &rf);
  2617.  }
  2618.  EXPORT_SYMBOL(set_user_nice);
  2619.  
  2620. @@ -3874,18 +4326,33 @@
  2621.     return match;
  2622.  }
  2623.  
  2624. +static bool dl_param_changed(struct task_struct *p,
  2625. +       const struct sched_attr *attr)
  2626. +{
  2627. +   struct sched_dl_entity *dl_se = &p->dl;
  2628. +
  2629. +   if (dl_se->dl_runtime != attr->sched_runtime ||
  2630. +       dl_se->dl_deadline != attr->sched_deadline ||
  2631. +       dl_se->dl_period != attr->sched_period ||
  2632. +       dl_se->flags != attr->sched_flags)
  2633. +       return true;
  2634. +
  2635. +   return false;
  2636. +}
  2637. +
  2638.  static int __sched_setscheduler(struct task_struct *p,
  2639.                 const struct sched_attr *attr,
  2640. -               bool user)
  2641. +               bool user, bool pi)
  2642.  {
  2643.     int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
  2644.               MAX_RT_PRIO - 1 - attr->sched_priority;
  2645.     int retval, oldprio, oldpolicy = -1, queued, running;
  2646.     int new_effective_prio, policy = attr->sched_policy;
  2647. -   unsigned long flags;
  2648.     const struct sched_class *prev_class;
  2649. -   struct rq *rq;
  2650. +   struct rq_flags rf;
  2651.     int reset_on_fork;
  2652. +   int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
  2653. +   struct rq *rq;
  2654.  
  2655.     /* may grab non-irq protected spin_locks */
  2656.     BUG_ON(in_interrupt());
  2657. @@ -3897,10 +4364,7 @@
  2658.     } else {
  2659.         reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
  2660.  
  2661. -       if (policy != SCHED_DEADLINE &&
  2662. -               policy != SCHED_FIFO && policy != SCHED_RR &&
  2663. -               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
  2664. -               policy != SCHED_IDLE)
  2665. +       if (!valid_policy(policy))
  2666.             return -EINVAL;
  2667.     }
  2668.  
  2669. @@ -3956,7 +4420,7 @@
  2670.          * Treat SCHED_IDLE as nice 20. Only allow a switch to
  2671.          * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
  2672.          */
  2673. -       if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
  2674. +       if (idle_policy(p->policy) && !idle_policy(policy)) {
  2675.             if (!can_nice(p, task_nice(p)))
  2676.                 return -EPERM;
  2677.         }
  2678. @@ -3983,13 +4447,14 @@
  2679.      * To be able to change p->policy safely, the appropriate
  2680.      * runqueue lock must be held.
  2681.      */
  2682. -   rq = task_rq_lock(p, &flags);
  2683. +   rq = task_rq_lock(p, &rf);
  2684. +   update_rq_clock(rq);
  2685.  
  2686.     /*
  2687.      * Changing the policy of the stop threads is a very bad idea
  2688.      */
  2689.     if (p == rq->stop) {
  2690. -       task_rq_unlock(rq, p, &flags);
  2691. +       task_rq_unlock(rq, p, &rf);
  2692.         return -EINVAL;
  2693.     }
  2694.  
  2695. @@ -4002,11 +4467,11 @@
  2696.             goto change;
  2697.         if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
  2698.             goto change;
  2699. -       if (dl_policy(policy))
  2700. +       if (dl_policy(policy) && dl_param_changed(p, attr))
  2701.             goto change;
  2702.  
  2703.         p->sched_reset_on_fork = reset_on_fork;
  2704. -       task_rq_unlock(rq, p, &flags);
  2705. +       task_rq_unlock(rq, p, &rf);
  2706.         return 0;
  2707.     }
  2708.  change:
  2709. @@ -4020,7 +4485,7 @@
  2710.         if (rt_bandwidth_enabled() && rt_policy(policy) &&
  2711.                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
  2712.                 !task_group_is_autogroup(task_group(p))) {
  2713. -           task_rq_unlock(rq, p, &flags);
  2714. +           task_rq_unlock(rq, p, &rf);
  2715.             return -EPERM;
  2716.         }
  2717.  #endif
  2718. @@ -4035,7 +4500,7 @@
  2719.              */
  2720.             if (!cpumask_subset(span, &p->cpus_allowed) ||
  2721.                 rq->rd->dl_bw.bw == 0) {
  2722. -               task_rq_unlock(rq, p, &flags);
  2723. +               task_rq_unlock(rq, p, &rf);
  2724.                 return -EPERM;
  2725.             }
  2726.         }
  2727. @@ -4045,7 +4510,7 @@
  2728.     /* recheck policy now with rq lock held */
  2729.     if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
  2730.         policy = oldpolicy = -1;
  2731. -       task_rq_unlock(rq, p, &flags);
  2732. +       task_rq_unlock(rq, p, &rf);
  2733.         goto recheck;
  2734.     }
  2735.  
  2736. @@ -4055,52 +4520,55 @@
  2737.      * is available.
  2738.      */
  2739.     if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
  2740. -       task_rq_unlock(rq, p, &flags);
  2741. +       task_rq_unlock(rq, p, &rf);
  2742.         return -EBUSY;
  2743.     }
  2744.  
  2745.     p->sched_reset_on_fork = reset_on_fork;
  2746.     oldprio = p->prio;
  2747.  
  2748. -   /*
  2749. -    * Take priority boosted tasks into account. If the new
  2750. -    * effective priority is unchanged, we just store the new
  2751. -    * normal parameters and do not touch the scheduler class and
  2752. -    * the runqueue. This will be done when the task deboost
  2753. -    * itself.
  2754. -    */
  2755. -   new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
  2756. -   if (new_effective_prio == oldprio) {
  2757. -       __setscheduler_params(p, attr);
  2758. -       task_rq_unlock(rq, p, &flags);
  2759. -       return 0;
  2760. +   if (pi) {
  2761. +       /*
  2762. +        * Take priority boosted tasks into account. If the new
  2763. +        * effective priority is unchanged, we just store the new
  2764. +        * normal parameters and do not touch the scheduler class and
  2765. +        * the runqueue. This will be done when the task deboost
  2766. +        * itself.
  2767. +        */
  2768. +       new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
  2769. +       if (new_effective_prio == oldprio)
  2770. +           queue_flags &= ~DEQUEUE_MOVE;
  2771.     }
  2772.  
  2773.     queued = task_on_rq_queued(p);
  2774.     running = task_current(rq, p);
  2775.     if (queued)
  2776. -       dequeue_task(rq, p, 0);
  2777. +       dequeue_task(rq, p, queue_flags);
  2778.     if (running)
  2779.         put_prev_task(rq, p);
  2780.  
  2781.     prev_class = p->sched_class;
  2782. -   __setscheduler(rq, p, attr, true);
  2783. +   __setscheduler(rq, p, attr, pi);
  2784.  
  2785. -   if (running)
  2786. -       p->sched_class->set_curr_task(rq);
  2787.     if (queued) {
  2788.         /*
  2789.          * We enqueue to tail when the priority of a task is
  2790.          * increased (user space view).
  2791.          */
  2792. -       enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
  2793. +       if (oldprio < p->prio)
  2794. +           queue_flags |= ENQUEUE_HEAD;
  2795. +
  2796. +       enqueue_task(rq, p, queue_flags);
  2797.     }
  2798. +   if (running)
  2799. +       set_curr_task(rq, p);
  2800.  
  2801.     check_class_changed(rq, p, prev_class, oldprio);
  2802.     preempt_disable(); /* avoid rq from going away on us */
  2803. -   task_rq_unlock(rq, p, &flags);
  2804. +   task_rq_unlock(rq, p, &rf);
  2805.  
  2806. -   rt_mutex_adjust_pi(p);
  2807. +   if (pi)
  2808. +       rt_mutex_adjust_pi(p);
  2809.  
  2810.     /*
  2811.      * Run balance callbacks after we've adjusted the PI chain.
  2812. @@ -4127,7 +4595,7 @@
  2813.         attr.sched_policy = policy;
  2814.     }
  2815.  
  2816. -   return __sched_setscheduler(p, &attr, check);
  2817. +   return __sched_setscheduler(p, &attr, check, true);
  2818.  }
  2819.  /**
  2820.   * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
  2821. @@ -4148,7 +4616,7 @@
  2822.  
  2823.  int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
  2824.  {
  2825. -   return __sched_setscheduler(p, attr, true);
  2826. +   return __sched_setscheduler(p, attr, true, true);
  2827.  }
  2828.  EXPORT_SYMBOL_GPL(sched_setattr);
  2829.  
  2830. @@ -4170,6 +4638,7 @@
  2831.  {
  2832.     return _sched_setscheduler(p, policy, param, false);
  2833.  }
  2834. +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
  2835.  
  2836.  static int
  2837.  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  2838. @@ -4683,7 +5152,7 @@
  2839.  {
  2840.     struct rq *rq = this_rq_lock();
  2841.  
  2842. -   schedstat_inc(rq, yld_count);
  2843. +   schedstat_inc(rq->yld_count);
  2844.     current->sched_class->yield_task(rq);
  2845.  
  2846.     /*
  2847. @@ -4700,22 +5169,17 @@
  2848.     return 0;
  2849.  }
  2850.  
  2851. -static void __cond_resched(void)
  2852. -{
  2853. -   __preempt_count_add(PREEMPT_ACTIVE);
  2854. -   __schedule();
  2855. -   __preempt_count_sub(PREEMPT_ACTIVE);
  2856. -}
  2857. -
  2858. +#ifndef CONFIG_PREEMPT
  2859.  int __sched _cond_resched(void)
  2860.  {
  2861. -   if (should_resched()) {
  2862. -       __cond_resched();
  2863. +   if (should_resched(0)) {
  2864. +       preempt_schedule_common();
  2865.         return 1;
  2866.     }
  2867.     return 0;
  2868.  }
  2869.  EXPORT_SYMBOL(_cond_resched);
  2870. +#endif
  2871.  
  2872.  /*
  2873.   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  2874. @@ -4727,7 +5191,7 @@
  2875.   */
  2876.  int __cond_resched_lock(spinlock_t *lock)
  2877.  {
  2878. -   int resched = should_resched();
  2879. +   int resched = should_resched(PREEMPT_LOCK_OFFSET);
  2880.     int ret = 0;
  2881.  
  2882.     lockdep_assert_held(lock);
  2883. @@ -4735,7 +5199,7 @@
  2884.     if (spin_needbreak(lock) || resched) {
  2885.         spin_unlock(lock);
  2886.         if (resched)
  2887. -           __cond_resched();
  2888. +           preempt_schedule_common();
  2889.         else
  2890.             cpu_relax();
  2891.         ret = 1;
  2892. @@ -4749,9 +5213,9 @@
  2893.  {
  2894.     BUG_ON(!in_softirq());
  2895.  
  2896. -   if (should_resched()) {
  2897. +   if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
  2898.         local_bh_enable();
  2899. -       __cond_resched();
  2900. +       preempt_schedule_common();
  2901.         local_bh_disable();
  2902.         return 1;
  2903.     }
  2904. @@ -4841,7 +5305,7 @@
  2905.  
  2906.     yielded = curr->sched_class->yield_to_task(rq, p, preempt);
  2907.     if (yielded) {
  2908. -       schedstat_inc(rq, yld_count);
  2909. +       schedstat_inc(rq->yld_count);
  2910.         /*
  2911.          * Make p's CPU reschedule; pick_next_entity takes care of
  2912.          * fairness.
  2913. @@ -4866,36 +5330,26 @@
  2914.   * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  2915.   * that process accounting knows that this is a task in IO wait state.
  2916.   */
  2917. -void __sched io_schedule(void)
  2918. -{
  2919. -   struct rq *rq = raw_rq();
  2920. -
  2921. -   delayacct_blkio_start();
  2922. -   atomic_inc(&rq->nr_iowait);
  2923. -   blk_flush_plug(current);
  2924. -   current->in_iowait = 1;
  2925. -   schedule();
  2926. -   current->in_iowait = 0;
  2927. -   atomic_dec(&rq->nr_iowait);
  2928. -   delayacct_blkio_end();
  2929. -}
  2930. -EXPORT_SYMBOL(io_schedule);
  2931. -
  2932.  long __sched io_schedule_timeout(long timeout)
  2933.  {
  2934. -   struct rq *rq = raw_rq();
  2935. +   int old_iowait = current->in_iowait;
  2936. +   struct rq *rq;
  2937.     long ret;
  2938.  
  2939. +   current->in_iowait = 1;
  2940. +   blk_schedule_flush_plug(current);
  2941. +
  2942.     delayacct_blkio_start();
  2943. +   rq = raw_rq();
  2944.     atomic_inc(&rq->nr_iowait);
  2945. -   blk_flush_plug(current);
  2946. -   current->in_iowait = 1;
  2947.     ret = schedule_timeout(timeout);
  2948. -   current->in_iowait = 0;
  2949. +   current->in_iowait = old_iowait;
  2950.     atomic_dec(&rq->nr_iowait);
  2951.     delayacct_blkio_end();
  2952. +
  2953.     return ret;
  2954.  }
  2955. +EXPORT_SYMBOL(io_schedule_timeout);
  2956.  
  2957.  /**
  2958.   * sys_sched_get_priority_max - return maximum RT priority.
  2959. @@ -4966,10 +5420,10 @@
  2960.  {
  2961.     struct task_struct *p;
  2962.     unsigned int time_slice;
  2963. -   unsigned long flags;
  2964. +   struct rq_flags rf;
  2965. +   struct timespec t;
  2966.     struct rq *rq;
  2967.     int retval;
  2968. -   struct timespec t;
  2969.  
  2970.     if (pid < 0)
  2971.         return -EINVAL;
  2972. @@ -4984,11 +5438,11 @@
  2973.     if (retval)
  2974.         goto out_unlock;
  2975.  
  2976. -   rq = task_rq_lock(p, &flags);
  2977. +   rq = task_rq_lock(p, &rf);
  2978.     time_slice = 0;
  2979.     if (p->sched_class->get_rr_interval)
  2980.         time_slice = p->sched_class->get_rr_interval(rq, p);
  2981. -   task_rq_unlock(rq, p, &flags);
  2982. +   task_rq_unlock(rq, p, &rf);
  2983.  
  2984.     rcu_read_unlock();
  2985.     jiffies_to_timespec(time_slice, &t);
  2986. @@ -5006,9 +5460,12 @@
  2987.  {
  2988.     unsigned long free = 0;
  2989.     int ppid;
  2990. -   unsigned state;
  2991. +   unsigned long state = p->state;
  2992.  
  2993. -   state = p->state ? __ffs(p->state) + 1 : 0;
  2994. +   if (!try_get_task_stack(p))
  2995. +       return;
  2996. +   if (state)
  2997. +       state = __ffs(state) + 1;
  2998.     printk(KERN_INFO "%-15.15s %c", p->comm,
  2999.         state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
  3000.  #if BITS_PER_LONG == 32
  3001. @@ -5025,8 +5482,10 @@
  3002.  #ifdef CONFIG_DEBUG_STACK_USAGE
  3003.     free = stack_not_used(p);
  3004.  #endif
  3005. +   ppid = 0;
  3006.     rcu_read_lock();
  3007. -   ppid = task_pid_nr(rcu_dereference(p->real_parent));
  3008. +   if (pid_alive(p))
  3009. +       ppid = task_pid_nr(rcu_dereference(p->real_parent));
  3010.     rcu_read_unlock();
  3011.     printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
  3012.         task_pid_nr(p), ppid,
  3013. @@ -5034,6 +5493,7 @@
  3014.  
  3015.     print_worker_info(KERN_INFO, p);
  3016.     show_stack(p, NULL);
  3017. +   put_task_stack(p);
  3018.  }
  3019.  
  3020.  void show_state_filter(unsigned long state_filter)
  3021. @@ -5095,7 +5555,6 @@
  3022.     raw_spin_lock(&rq->lock);
  3023.  
  3024.     __sched_fork(0, idle);
  3025. -
  3026.     idle->state = TASK_RUNNING;
  3027.     idle->se.exec_start = sched_clock();
  3028.  
  3029. @@ -5163,26 +5622,26 @@
  3030.   */
  3031.  void sched_setnuma(struct task_struct *p, int nid)
  3032.  {
  3033. -   struct rq *rq;
  3034. -   unsigned long flags;
  3035.     bool queued, running;
  3036. +   struct rq_flags rf;
  3037. +   struct rq *rq;
  3038.  
  3039. -   rq = task_rq_lock(p, &flags);
  3040. +   rq = task_rq_lock(p, &rf);
  3041.     queued = task_on_rq_queued(p);
  3042.     running = task_current(rq, p);
  3043.  
  3044.     if (queued)
  3045. -       dequeue_task(rq, p, 0);
  3046. +       dequeue_task(rq, p, DEQUEUE_SAVE);
  3047.     if (running)
  3048.         put_prev_task(rq, p);
  3049.  
  3050.     p->numa_preferred_nid = nid;
  3051.  
  3052. -   if (running)
  3053. -       p->sched_class->set_curr_task(rq);
  3054.     if (queued)
  3055. -       enqueue_task(rq, p, 0);
  3056. -   task_rq_unlock(rq, p, &flags);
  3057. +       enqueue_task(rq, p, ENQUEUE_RESTORE);
  3058. +   if (running)
  3059. +       set_curr_task(rq, p);
  3060. +   task_rq_unlock(rq, p, &rf);
  3061.  }
  3062.  #endif /* CONFIG_NUMA_BALANCING */
  3063.  
  3064. @@ -5242,10 +5701,11 @@
  3065.   * there's no concurrency possible, we hold the required locks anyway
  3066.   * because of lock validation efforts.
  3067.   */
  3068. -static void migrate_tasks(unsigned int dead_cpu)
  3069. +static void migrate_tasks(struct rq *dead_rq)
  3070.  {
  3071. -   struct rq *rq = cpu_rq(dead_cpu);
  3072. +   struct rq *rq = dead_rq;
  3073.     struct task_struct *next, *stop = rq->stop;
  3074. +   struct pin_cookie cookie;
  3075.     int dest_cpu;
  3076.  
  3077.     /*
  3078. @@ -5266,7 +5726,7 @@
  3079.      */
  3080.     update_rq_clock(rq);
  3081.  
  3082. -   for ( ; ; ) {
  3083. +   for (;;) {
  3084.         /*
  3085.          * There's this thread running, bail when that's the only
  3086.          * remaining thread.
  3087. @@ -5274,17 +5734,48 @@
  3088.         if (rq->nr_running == 1)
  3089.             break;
  3090.  
  3091. -       next = pick_next_task(rq, &fake_task);
  3092. +       /*
  3093. +        * pick_next_task assumes pinned rq->lock.
  3094. +        */
  3095. +       cookie = lockdep_pin_lock(&rq->lock);
  3096. +       next = pick_next_task(rq, &fake_task, cookie);
  3097.         BUG_ON(!next);
  3098.         next->sched_class->put_prev_task(rq, next);
  3099.  
  3100. -       /* Find suitable destination for @next, with force if needed. */
  3101. -       dest_cpu = select_fallback_rq(dead_cpu, next);
  3102. +       /*
  3103. +        * Rules for changing task_struct::cpus_allowed are holding
  3104. +        * both pi_lock and rq->lock, such that holding either
  3105. +        * stabilizes the mask.
  3106. +        *
  3107. +        * Drop rq->lock is not quite as disastrous as it usually is
  3108. +        * because !cpu_active at this point, which means load-balance
  3109. +        * will not interfere. Also, stop-machine.
  3110. +        */
  3111. +       lockdep_unpin_lock(&rq->lock, cookie);
  3112.         raw_spin_unlock(&rq->lock);
  3113. +       raw_spin_lock(&next->pi_lock);
  3114. +       raw_spin_lock(&rq->lock);
  3115. +
  3116. +       /*
  3117. +        * Since we're inside stop-machine, _nothing_ should have
  3118. +        * changed the task, WARN if weird stuff happened, because in
  3119. +        * that case the above rq->lock drop is a fail too.
  3120. +        */
  3121. +       if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
  3122. +           raw_spin_unlock(&next->pi_lock);
  3123. +           continue;
  3124. +       }
  3125.  
  3126. -       __migrate_task(next, dead_cpu, dest_cpu);
  3127. +       /* Find suitable destination for @next, with force if needed. */
  3128. +       dest_cpu = select_fallback_rq(dead_rq->cpu, next);
  3129.  
  3130. -       raw_spin_lock(&rq->lock);
  3131. +       rq = __migrate_task(rq, next, dest_cpu);
  3132. +       if (rq != dead_rq) {
  3133. +           raw_spin_unlock(&rq->lock);
  3134. +           rq = dead_rq;
  3135. +           raw_spin_lock(&rq->lock);
  3136. +       }
  3137. +       raw_spin_unlock(&next->pi_lock);
  3138.     }
  3139.  
  3140.     rq->stop = stop;
  3141. @@ -5517,8 +6008,7 @@
  3142.  /* may be called multiple times per register */
  3143.  static void unregister_sched_domain_sysctl(void)
  3144.  {
  3145. -   if (sd_sysctl_header)
  3146. -       unregister_sysctl_table(sd_sysctl_header);
  3147. +   unregister_sysctl_table(sd_sysctl_header);
  3148.     sd_sysctl_header = NULL;
  3149.     if (sd_ctl_dir[0].child)
  3150.         sd_free_ctl_entry(&sd_ctl_dir[0].child);
  3151. @@ -5603,7 +6093,7 @@
  3152.             BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  3153.             set_rq_offline(rq);
  3154.         }
  3155. -       migrate_tasks(cpu);
  3156. +       migrate_tasks(rq);
  3157.         BUG_ON(rq->nr_running != 1); /* the migration thread */
  3158.         raw_spin_unlock_irqrestore(&rq->lock, flags);
  3159.         break;
  3160. @@ -5629,7 +6119,7 @@
  3161.     .priority = CPU_PRI_MIGRATION,
  3162.  };
  3163.  
  3164. -static void __cpuinit set_cpu_rq_start_time(void)
  3165. +static void set_cpu_rq_start_time(void)
  3166.  {
  3167.     int cpu = smp_processor_id();
  3168.     struct rq *rq = cpu_rq(cpu);
  3169. @@ -5745,9 +6235,6 @@
  3170.  
  3171.     if (!(sd->flags & SD_LOAD_BALANCE)) {
  3172.         printk("does not load-balance\n");
  3173. -       if (sd->parent)
  3174. -           printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
  3175. -                   " has parent");
  3176.         return -1;
  3177.     }
  3178.  
  3179. @@ -5840,8 +6327,12 @@
  3180.  
  3181.  static int sd_degenerate(struct sched_domain *sd)
  3182.  {
  3183. -   if (cpumask_weight(sched_domain_span(sd)) == 1)
  3184. -       return 1;
  3185. +   if (cpumask_weight(sched_domain_span(sd)) == 1) {
  3186. +       if (sd->groups->sge)
  3187. +           sd->flags &= ~SD_LOAD_BALANCE;
  3188. +       else
  3189. +           return 1;
  3190. +   }
  3191.  
  3192.     /* Following flags need at least 2 groups */
  3193.     if (sd->flags & (SD_LOAD_BALANCE |
  3194. @@ -5849,6 +6340,7 @@
  3195.              SD_BALANCE_FORK |
  3196.              SD_BALANCE_EXEC |
  3197.              SD_SHARE_CPUCAPACITY |
  3198. +            SD_ASYM_CPUCAPACITY |
  3199.              SD_SHARE_PKG_RESOURCES |
  3200.              SD_SHARE_POWERDOMAIN |
  3201.              SD_SHARE_CAP_STATES)) {
  3202. @@ -5880,11 +6372,16 @@
  3203.                 SD_BALANCE_NEWIDLE |
  3204.                 SD_BALANCE_FORK |
  3205.                 SD_BALANCE_EXEC |
  3206. +               SD_ASYM_CPUCAPACITY |
  3207.                 SD_SHARE_CPUCAPACITY |
  3208.                 SD_SHARE_PKG_RESOURCES |
  3209.                 SD_PREFER_SIBLING |
  3210.                 SD_SHARE_POWERDOMAIN |
  3211.                 SD_SHARE_CAP_STATES);
  3212. +       if (parent->groups->sge) {
  3213. +           parent->flags &= ~SD_LOAD_BALANCE;
  3214. +           return 0;
  3215. +       }
  3216.         if (nr_node_ids == 1)
  3217.             pflags &= ~SD_SERIALIZE;
  3218.     }
  3219. @@ -5944,6 +6441,19 @@
  3220.         call_rcu_sched(&old_rd->rcu, free_rootdomain);
  3221.  }
  3222.  
  3223. +void sched_get_rd(struct root_domain *rd)
  3224. +{
  3225. +   atomic_inc(&rd->refcount);
  3226. +}
  3227. +
  3228. +void sched_put_rd(struct root_domain *rd)
  3229. +{
  3230. +   if (!atomic_dec_and_test(&rd->refcount))
  3231. +       return;
  3232. +
  3233. +   call_rcu_sched(&rd->rcu, free_rootdomain);
  3234. +}
  3235. +
  3236.  static int init_rootdomain(struct root_domain *rd)
  3237.  {
  3238.     memset(rd, 0, sizeof(*rd));
  3239. @@ -5957,6 +6467,12 @@
  3240.     if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
  3241.         goto free_dlo_mask;
  3242.  
  3243. +#ifdef HAVE_RT_PUSH_IPI
  3244. +   rd->rto_cpu = -1;
  3245. +   raw_spin_lock_init(&rd->rto_lock);
  3246. +   init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
  3247. +#endif
  3248. +
  3249.     init_dl_bw(&rd->dl_bw);
  3250.     if (cpudl_init(&rd->cpudl) != 0)
  3251.         goto free_dlo_mask;
  3252. @@ -5965,6 +6481,9 @@
  3253.         goto free_rto_mask;
  3254.  
  3255.     init_max_cpu_capacity(&rd->max_cpu_capacity);
  3256. +
  3257. +   rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
  3258. +
  3259.     return 0;
  3260.  
  3261.  free_rto_mask:
  3262. @@ -6027,10 +6546,8 @@
  3263.     } while (sg != first);
  3264.  }
  3265.  
  3266. -static void free_sched_domain(struct rcu_head *rcu)
  3267. +static void destroy_sched_domain(struct sched_domain *sd)
  3268.  {
  3269. -   struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
  3270. -
  3271.     /*
  3272.      * If its an overlapping domain it has private groups, iterate and
  3273.      * nuke them all.
  3274. @@ -6041,18 +6558,26 @@
  3275.         kfree(sd->groups->sgc);
  3276.         kfree(sd->groups);
  3277.     }
  3278. +   if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
  3279. +       kfree(sd->shared);
  3280.     kfree(sd);
  3281.  }
  3282.  
  3283. -static void destroy_sched_domain(struct sched_domain *sd, int cpu)
  3284. +static void destroy_sched_domains_rcu(struct rcu_head *rcu)
  3285.  {
  3286. -   call_rcu(&sd->rcu, free_sched_domain);
  3287. +   struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
  3288. +
  3289. +   while (sd) {
  3290. +       struct sched_domain *parent = sd->parent;
  3291. +       destroy_sched_domain(sd);
  3292. +       sd = parent;
  3293. +   }
  3294.  }
  3295.  
  3296. -static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  3297. +static void destroy_sched_domains(struct sched_domain *sd)
  3298.  {
  3299. -   for (; sd; sd = sd->parent)
  3300. -       destroy_sched_domain(sd, cpu);
  3301. +   if (sd)
  3302. +       call_rcu(&sd->rcu, destroy_sched_domains_rcu);
  3303.  }
  3304.  
  3305.  /*
  3306. @@ -6067,16 +6592,17 @@
  3307.  DEFINE_PER_CPU(struct sched_domain *, sd_llc);
  3308.  DEFINE_PER_CPU(int, sd_llc_size);
  3309.  DEFINE_PER_CPU(int, sd_llc_id);
  3310. +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
  3311.  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
  3312. -DEFINE_PER_CPU(struct sched_domain *, sd_busy);
  3313.  DEFINE_PER_CPU(struct sched_domain *, sd_asym);
  3314.  DEFINE_PER_CPU(struct sched_domain *, sd_ea);
  3315.  DEFINE_PER_CPU(struct sched_domain *, sd_scs);
  3316.  
  3317.  static void update_top_cache_domain(int cpu)
  3318.  {
  3319. +   struct sched_domain_shared *sds = NULL;
  3320.     struct sched_domain *sd;
  3321. -   struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
  3322. +   struct sched_domain *ea_sd = NULL;
  3323.     int id = cpu;
  3324.     int size = 1;
  3325.  
  3326. @@ -6084,13 +6610,13 @@
  3327.     if (sd) {
  3328.         id = cpumask_first(sched_domain_span(sd));
  3329.         size = cpumask_weight(sched_domain_span(sd));
  3330. -       busy_sd = sd->parent; /* sd_busy */
  3331. +       sds = sd->shared;
  3332.     }
  3333. -   rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
  3334.  
  3335.     rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
  3336.     per_cpu(sd_llc_size, cpu) = size;
  3337.     per_cpu(sd_llc_id, cpu) = id;
  3338. +   rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
  3339.  
  3340.     sd = lowest_flag_domain(cpu, SD_NUMA);
  3341.     rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
  3342. @@ -6137,7 +6663,7 @@
  3343.              */
  3344.             if (parent->flags & SD_PREFER_SIBLING)
  3345.                 tmp->flags |= SD_PREFER_SIBLING;
  3346. -           destroy_sched_domain(parent, cpu);
  3347. +           destroy_sched_domain(parent);
  3348.         } else
  3349.             tmp = tmp->parent;
  3350.     }
  3351. @@ -6145,7 +6671,7 @@
  3352.     if (sd && sd_degenerate(sd)) {
  3353.         tmp = sd;
  3354.         sd = sd->parent;
  3355. -       destroy_sched_domain(tmp, cpu);
  3356. +       destroy_sched_domain(tmp);
  3357.         if (sd)
  3358.             sd->child = NULL;
  3359.     }
  3360. @@ -6155,14 +6681,11 @@
  3361.     rq_attach_root(rq, rd);
  3362.     tmp = rq->sd;
  3363.     rcu_assign_pointer(rq->sd, sd);
  3364. -   destroy_sched_domains(tmp, cpu);
  3365. +   destroy_sched_domains(tmp);
  3366.  
  3367.     update_top_cache_domain(cpu);
  3368.  }
  3369.  
  3370. -/* cpus with isolated domains */
  3371. -static cpumask_var_t cpu_isolated_map;
  3372. -
  3373.  /* Setup the mask of cpus configured for isolated domains */
  3374.  static int __init isolated_cpu_setup(char *str)
  3375.  {
  3376. @@ -6288,6 +6811,7 @@
  3377.          */
  3378.         sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
  3379.         sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
  3380. +       sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
  3381.  
  3382.         /*
  3383.          * Make sure the first group of this domain contains the
  3384. @@ -6413,7 +6937,6 @@
  3385.         return;
  3386.  
  3387.     update_group_capacity(sd, cpu);
  3388. -   atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
  3389.  }
  3390.  
  3391.  /*
  3392. @@ -6476,28 +6999,6 @@
  3393.     sd->groups->sge = fn(cpu);
  3394.  }
  3395.  
  3396. -#ifdef CONFIG_SCHED_DEBUG
  3397. -void set_energy_aware()
  3398. -{
  3399. -   sched_feat_set("ENERGY_AWARE");
  3400. -}
  3401. -void clear_energy_aware()
  3402. -{
  3403. -   sched_feat_set("NO_ENERGY_AWARE");
  3404. -}
  3405. -#else
  3406. -struct static_key __read_mostly __energy_aware = STATIC_KEY_INIT_FALSE;
  3407. -
  3408. -void set_energy_aware()
  3409. -{
  3410. -   static_key_slow_inc(&__energy_aware);
  3411. -}
  3412. -void clear_energy_aware()
  3413. -{
  3414. -   static_key_slow_dec(&__energy_aware);
  3415. -}
  3416. -#endif /* CONFIG_SCHED_DEBUG */
  3417. -
  3418.  /*
  3419.   * Initializers for schedule domains
  3420.   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  3421. @@ -6583,6 +7084,9 @@
  3422.     WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
  3423.     *per_cpu_ptr(sdd->sd, cpu) = NULL;
  3424.  
  3425. +   if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
  3426. +       *per_cpu_ptr(sdd->sds, cpu) = NULL;
  3427. +
  3428.     if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
  3429.         *per_cpu_ptr(sdd->sg, cpu) = NULL;
  3430.  
  3431. @@ -6600,13 +7104,20 @@
  3432.  /*
  3433.   * SD_flags allowed in topology descriptions.
  3434.   *
  3435. - * SD_SHARE_CPUCAPACITY      - describes SMT topologies
  3436. - * SD_SHARE_PKG_RESOURCES - describes shared caches
  3437. - * SD_NUMA                - describes NUMA topologies
  3438. - * SD_SHARE_POWERDOMAIN   - describes shared power domain
  3439. - * SD_SHARE_CAP_STATES    - describes shared capacity states
  3440. + * These flags are purely descriptive of the topology and do not prescribe
  3441. + * behaviour. Behaviour is artificial and mapped in the below sd_init()
  3442. + * function:
  3443. + *
  3444. + *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
  3445. + *   SD_SHARE_PKG_RESOURCES - describes shared caches
  3446. + *   SD_NUMA                - describes NUMA topologies
  3447. + *   SD_SHARE_POWERDOMAIN   - describes shared power domain
  3448. + *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
  3449. + *   SD_SHARE_CAP_STATES    - describes shared capacity states
  3450. + *
  3451. + * Odd one out, which beside describing the topology has a quirk also
  3452. + * prescribes the desired behaviour that goes along with it:
  3453.   *
  3454. - * Odd one out:
  3455.   * SD_ASYM_PACKING        - describes SMT quirks
  3456.   */
  3457.  #define TOPOLOGY_SD_FLAGS      \
  3458. @@ -6614,14 +7125,18 @@
  3459.      SD_SHARE_PKG_RESOURCES |   \
  3460.      SD_NUMA |          \
  3461.      SD_ASYM_PACKING |      \
  3462. +    SD_ASYM_CPUCAPACITY |      \
  3463.      SD_SHARE_POWERDOMAIN |     \
  3464.      SD_SHARE_CAP_STATES)
  3465.  
  3466.  static struct sched_domain *
  3467. -sd_init(struct sched_domain_topology_level *tl, int cpu)
  3468. +sd_init(struct sched_domain_topology_level *tl,
  3469. +   const struct cpumask *cpu_map,
  3470. +   struct sched_domain *child, int cpu)
  3471.  {
  3472. -   struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
  3473. -   int sd_weight, sd_flags = 0;
  3474. +   struct sd_data *sdd = &tl->data;
  3475. +   struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
  3476. +   int sd_id, sd_weight, sd_flags = 0;
  3477.  
  3478.  #ifdef CONFIG_NUMA
  3479.     /*
  3480. @@ -6670,15 +7185,26 @@
  3481.         .smt_gain       = 0,
  3482.         .max_newidle_lb_cost    = 0,
  3483.         .next_decay_max_lb_cost = jiffies,
  3484. +       .child          = child,
  3485.  #ifdef CONFIG_SCHED_DEBUG
  3486.         .name           = tl->name,
  3487.  #endif
  3488.     };
  3489.  
  3490. +   cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
  3491. +   sd_id = cpumask_first(sched_domain_span(sd));
  3492. +
  3493.     /*
  3494.      * Convert topological properties into behaviour.
  3495.      */
  3496.  
  3497. +   if (sd->flags & SD_ASYM_CPUCAPACITY) {
  3498. +       struct sched_domain *t = sd;
  3499. +
  3500. +       for_each_lower_domain(t)
  3501. +           t->flags |= SD_BALANCE_WAKE;
  3502. +   }
  3503. +
  3504.     if (sd->flags & SD_SHARE_CPUCAPACITY) {
  3505.         sd->flags |= SD_PREFER_SIBLING;
  3506.         sd->imbalance_pct = 110;
  3507. @@ -6710,7 +7236,17 @@
  3508.         sd->idle_idx = 1;
  3509.     }
  3510.  
  3511. -   sd->private = &tl->data;
  3512. +   /*
  3513. +    * For all levels sharing cache; connect a sched_domain_shared
  3514. +    * instance.
  3515. +    */
  3516. +   if (sd->flags & SD_SHARE_PKG_RESOURCES) {
  3517. +       sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
  3518. +       atomic_inc(&sd->shared->ref);
  3519. +       atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
  3520. +   }
  3521. +
  3522. +   sd->private = sdd;
  3523.  
  3524.     return sd;
  3525.  }
  3526. @@ -6729,7 +7265,8 @@
  3527.     { NULL, },
  3528.  };
  3529.  
  3530. -struct sched_domain_topology_level *sched_domain_topology = default_topology;
  3531. +static struct sched_domain_topology_level *sched_domain_topology =
  3532. +   default_topology;
  3533.  
  3534.  #define for_each_sd_topology(tl)           \
  3535.     for (tl = sched_domain_topology; tl->mask; tl++)
  3536. @@ -6992,6 +7529,10 @@
  3537.         if (!sdd->sd)
  3538.             return -ENOMEM;
  3539.  
  3540. +       sdd->sds = alloc_percpu(struct sched_domain_shared *);
  3541. +       if (!sdd->sds)
  3542. +           return -ENOMEM;
  3543. +
  3544.         sdd->sg = alloc_percpu(struct sched_group *);
  3545.         if (!sdd->sg)
  3546.             return -ENOMEM;
  3547. @@ -7002,6 +7543,7 @@
  3548.  
  3549.         for_each_cpu(j, cpu_map) {
  3550.             struct sched_domain *sd;
  3551. +           struct sched_domain_shared *sds;
  3552.             struct sched_group *sg;
  3553.             struct sched_group_capacity *sgc;
  3554.  
  3555. @@ -7012,6 +7554,13 @@
  3556.  
  3557.             *per_cpu_ptr(sdd->sd, j) = sd;
  3558.  
  3559. +           sds = kzalloc_node(sizeof(struct sched_domain_shared),
  3560. +                   GFP_KERNEL, cpu_to_node(j));
  3561. +           if (!sds)
  3562. +               return -ENOMEM;
  3563. +
  3564. +           *per_cpu_ptr(sdd->sds, j) = sds;
  3565. +
  3566.             sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
  3567.                     GFP_KERNEL, cpu_to_node(j));
  3568.             if (!sg)
  3569. @@ -7051,6 +7600,8 @@
  3570.                 kfree(*per_cpu_ptr(sdd->sd, j));
  3571.             }
  3572.  
  3573. +           if (sdd->sds)
  3574. +               kfree(*per_cpu_ptr(sdd->sds, j));
  3575.             if (sdd->sg)
  3576.                 kfree(*per_cpu_ptr(sdd->sg, j));
  3577.             if (sdd->sgc)
  3578. @@ -7058,6 +7609,8 @@
  3579.         }
  3580.         free_percpu(sdd->sd);
  3581.         sdd->sd = NULL;
  3582. +       free_percpu(sdd->sds);
  3583. +       sdd->sds = NULL;
  3584.         free_percpu(sdd->sg);
  3585.         sdd->sg = NULL;
  3586.         free_percpu(sdd->sgc);
  3587. @@ -7069,16 +7622,12 @@
  3588.         const struct cpumask *cpu_map, struct sched_domain_attr *attr,
  3589.         struct sched_domain *child, int cpu)
  3590.  {
  3591. -   struct sched_domain *sd = sd_init(tl, cpu);
  3592. -   if (!sd)
  3593. -       return child;
  3594. +   struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
  3595.  
  3596. -   cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
  3597.     if (child) {
  3598.         sd->level = child->level + 1;
  3599.         sched_domain_level_max = max(sched_domain_level_max, sd->level);
  3600.         child->parent = sd;
  3601. -       sd->child = child;
  3602.  
  3603.         if (!cpumask_subset(sched_domain_span(child),
  3604.                     sched_domain_span(sd))) {
  3605. @@ -7109,7 +7658,6 @@
  3606.     enum s_alloc alloc_state;
  3607.     struct sched_domain *sd;
  3608.     struct s_data d;
  3609. -   struct rq *rq = NULL;
  3610.     int i, ret = -ENOMEM;
  3611.  
  3612.     alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
  3613. @@ -7127,8 +7675,6 @@
  3614.                 *per_cpu_ptr(d.sd, i) = sd;
  3615.             if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
  3616.                 sd->flags |= SD_OVERLAP;
  3617. -           if (cpumask_equal(cpu_map, sched_domain_span(sd)))
  3618. -               break;
  3619.         }
  3620.     }
  3621.  
  3622. @@ -7163,8 +7709,19 @@
  3623.     /* Attach the domains */
  3624.     rcu_read_lock();
  3625.     for_each_cpu(i, cpu_map) {
  3626. -       rq = cpu_rq(i);
  3627. +       int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
  3628. +       int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
  3629. +
  3630. +       if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
  3631. +           cpu_rq(max_cpu)->cpu_capacity_orig))
  3632. +           WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
  3633. +
  3634. +       if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
  3635. +           cpu_rq(min_cpu)->cpu_capacity_orig))
  3636. +           WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
  3637. +
  3638.         sd = *per_cpu_ptr(d.sd, i);
  3639. +
  3640.         cpu_attach_domain(sd, d.rd, i);
  3641.     }
  3642.     rcu_read_unlock();
  3643. @@ -7385,17 +7942,16 @@
  3644.          * operation in the resume sequence, just build a single sched
  3645.          * domain, ignoring cpusets.
  3646.          */
  3647. -       num_cpus_frozen--;
  3648. -       if (likely(num_cpus_frozen)) {
  3649. -           partition_sched_domains(1, NULL, NULL);
  3650. +       partition_sched_domains(1, NULL, NULL);
  3651. +       if (--num_cpus_frozen)
  3652.             break;
  3653. -       }
  3654.  
  3655.         /*
  3656.          * This is the last CPU online operation. So fall through and
  3657.          * restore the original sched domains by considering the
  3658.          * cpuset configurations.
  3659.          */
  3660. +       cpuset_force_rebuild();
  3661.  
  3662.     case CPU_ONLINE:
  3663.     case CPU_DOWN_FAILED:
  3664. @@ -7428,7 +7984,6 @@
  3665.  {
  3666.     cpumask_var_t non_isolated_cpus;
  3667.  
  3668. -   walt_init_cpu_efficiency();
  3669.     alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
  3670.     alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  3671.  
  3672. @@ -7490,6 +8045,7 @@
  3673.  #endif
  3674.  
  3675.  DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
  3676. +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
  3677.  
  3678.  void __init sched_init(void)
  3679.  {
  3680. @@ -7528,6 +8084,8 @@
  3681.         for_each_possible_cpu(i) {
  3682.             per_cpu(load_balance_mask, i) = (void *)ptr;
  3683.             ptr += cpumask_size();
  3684. +       per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
  3685. +           cpumask_size(), GFP_KERNEL, cpu_to_node(i));
  3686.         }
  3687.  #endif /* CONFIG_CPUMASK_OFFSTACK */
  3688.     }
  3689. @@ -7553,6 +8111,7 @@
  3690.     INIT_LIST_HEAD(&root_task_group.children);
  3691.     INIT_LIST_HEAD(&root_task_group.siblings);
  3692.     autogroup_init(&init_task);
  3693. +
  3694.  #endif /* CONFIG_CGROUP_SCHED */
  3695.  
  3696.     for_each_possible_cpu(i) {
  3697. @@ -7564,11 +8123,12 @@
  3698.         rq->calc_load_active = 0;
  3699.         rq->calc_load_update = jiffies + LOAD_FREQ;
  3700.         init_cfs_rq(&rq->cfs);
  3701. -       init_rt_rq(&rq->rt, rq);
  3702. -       init_dl_rq(&rq->dl, rq);
  3703. +       init_rt_rq(&rq->rt);
  3704. +       init_dl_rq(&rq->dl);
  3705.  #ifdef CONFIG_FAIR_GROUP_SCHED
  3706.         root_task_group.shares = ROOT_TASK_GROUP_LOAD;
  3707.         INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
  3708. +       rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
  3709.         /*
  3710.          * How much cpu bandwidth does root_task_group get?
  3711.          *
  3712. @@ -7610,6 +8170,7 @@
  3713.         rq->active_balance = 0;
  3714.         rq->next_balance = jiffies;
  3715.         rq->push_cpu = 0;
  3716. +       rq->push_task = NULL;
  3717.         rq->cpu = i;
  3718.         rq->online = 0;
  3719.         rq->idle_stamp = 0;
  3720. @@ -7695,15 +8256,34 @@
  3721.  
  3722.  void __might_sleep(const char *file, int line, int preempt_offset)
  3723.  {
  3724. +   /*
  3725. +    * Blocking primitives will set (and therefore destroy) current->state,
  3726. +    * since we will exit with TASK_RUNNING make sure we enter with it,
  3727. +    * otherwise we will destroy state.
  3728. +    */
  3729. +   if (WARN_ONCE(current->state != TASK_RUNNING,
  3730. +           "do not call blocking ops when !TASK_RUNNING; "
  3731. +           "state=%lx set at [<%p>] %pS\n",
  3732. +           current->state,
  3733. +           (void *)current->task_state_change,
  3734. +           (void *)current->task_state_change))
  3735. +       __set_current_state(TASK_RUNNING);
  3736. +
  3737. +   ___might_sleep(file, line, preempt_offset);
  3738. +}
  3739. +EXPORT_SYMBOL(__might_sleep);
  3740. +
  3741. +void ___might_sleep(const char *file, int line, int preempt_offset)
  3742. +{
  3743.     static unsigned long prev_jiffy;    /* ratelimiting */
  3744.  
  3745.     rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
  3746.     if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
  3747. -        !is_idle_task(current)) || oops_in_progress)
  3748. -       return;
  3749. -   if (system_state != SYSTEM_RUNNING &&
  3750. -       (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
  3751. +        !is_idle_task(current)) ||
  3752. +       system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
  3753. +       oops_in_progress)
  3754.         return;
  3755. +
  3756.     if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
  3757.         return;
  3758.     prev_jiffy = jiffies;
  3759. @@ -7716,6 +8296,9 @@
  3760.             in_atomic(), irqs_disabled(),
  3761.             current->pid, current->comm);
  3762.  
  3763. +   if (task_stack_end_corrupted(current))
  3764. +       printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
  3765. +
  3766.     debug_show_held_locks(current);
  3767.     if (irqs_disabled())
  3768.         print_irqtrace_events(current);
  3769. @@ -7728,36 +8311,16 @@
  3770.  #endif
  3771.     dump_stack();
  3772.  }
  3773. -EXPORT_SYMBOL(__might_sleep);
  3774. +EXPORT_SYMBOL(___might_sleep);
  3775.  #endif
  3776.  
  3777.  #ifdef CONFIG_MAGIC_SYSRQ
  3778. -static void normalize_task(struct rq *rq, struct task_struct *p)
  3779. +void normalize_rt_tasks(void)
  3780.  {
  3781. -   const struct sched_class *prev_class = p->sched_class;
  3782. +   struct task_struct *g, *p;
  3783.     struct sched_attr attr = {
  3784.         .sched_policy = SCHED_NORMAL,
  3785.     };
  3786. -   int old_prio = p->prio;
  3787. -   int queued;
  3788. -
  3789. -   queued = task_on_rq_queued(p);
  3790. -   if (queued)
  3791. -       dequeue_task(rq, p, 0);
  3792. -   __setscheduler(rq, p, &attr, false);
  3793. -   if (queued) {
  3794. -       enqueue_task(rq, p, 0);
  3795. -       resched_curr(rq);
  3796. -   }
  3797. -
  3798. -   check_class_changed(rq, p, prev_class, old_prio);
  3799. -}
  3800. -
  3801. -void normalize_rt_tasks(void)
  3802. -{
  3803. -   struct task_struct *g, *p;
  3804. -   unsigned long flags;
  3805. -   struct rq *rq;
  3806.  
  3807.     read_lock(&tasklist_lock);
  3808.     for_each_process_thread(g, p) {
  3809. @@ -7767,12 +8330,10 @@
  3810.         if (p->flags & PF_KTHREAD)
  3811.             continue;
  3812.  
  3813. -       p->se.exec_start        = 0;
  3814. -#ifdef CONFIG_SCHEDSTATS
  3815. -       p->se.statistics.wait_start = 0;
  3816. -       p->se.statistics.sleep_start    = 0;
  3817. -       p->se.statistics.block_start    = 0;
  3818. -#endif
  3819. +       p->se.exec_start = 0;
  3820. +       schedstat_set(p->se.statistics.wait_start,  0);
  3821. +       schedstat_set(p->se.statistics.sleep_start, 0);
  3822. +       schedstat_set(p->se.statistics.block_start, 0);
  3823.  
  3824.         if (!dl_task(p) && !rt_task(p)) {
  3825.             /*
  3826. @@ -7784,9 +8345,7 @@
  3827.             continue;
  3828.         }
  3829.  
  3830. -       rq = task_rq_lock(p, &flags);
  3831. -       normalize_task(rq, p);
  3832. -       task_rq_unlock(rq, p, &flags);
  3833. +       __sched_setscheduler(p, &attr, false, false);
  3834.     }
  3835.     read_unlock(&tasklist_lock);
  3836.  }
  3837. @@ -7920,27 +8479,9 @@
  3838.     spin_unlock_irqrestore(&task_group_lock, flags);
  3839.  }
  3840.  
  3841. -/* change task's runqueue when it moves between groups.
  3842. - * The caller of this function should have put the task in its new group
  3843. - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
  3844. - * reflect its new group.
  3845. - */
  3846. -void sched_move_task(struct task_struct *tsk)
  3847. +static void sched_change_group(struct task_struct *tsk, int type)
  3848.  {
  3849.     struct task_group *tg;
  3850. -   int queued, running;
  3851. -   unsigned long flags;
  3852. -   struct rq *rq;
  3853. -
  3854. -   rq = task_rq_lock(tsk, &flags);
  3855. -
  3856. -   running = task_current(rq, tsk);
  3857. -   queued = task_on_rq_queued(tsk);
  3858. -
  3859. -   if (queued)
  3860. -       dequeue_task(rq, tsk, 0);
  3861. -   if (unlikely(running))
  3862. -       put_prev_task(rq, tsk);
  3863.  
  3864.     /*
  3865.      * All callers are synchronized by task_rq_lock(); we do not use RCU
  3866. @@ -7953,18 +8494,45 @@
  3867.     tsk->sched_task_group = tg;
  3868.  
  3869.  #ifdef CONFIG_FAIR_GROUP_SCHED
  3870. -   if (tsk->sched_class->task_move_group)
  3871. -       tsk->sched_class->task_move_group(tsk);
  3872. +   if (tsk->sched_class->task_change_group)
  3873. +       tsk->sched_class->task_change_group(tsk, type);
  3874.     else
  3875.  #endif
  3876.         set_task_rq(tsk, task_cpu(tsk));
  3877. +}
  3878. +
  3879. +/*
  3880. + * Change task's runqueue when it moves between groups.
  3881. + *
  3882. + * The caller of this function should have put the task in its new group by
  3883. + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
  3884. + * its new group.
  3885. + */
  3886. +void sched_move_task(struct task_struct *tsk)
  3887. +{
  3888. +   int queued, running;
  3889. +   struct rq_flags rf;
  3890. +   struct rq *rq;
  3891.  
  3892. +   rq = task_rq_lock(tsk, &rf);
  3893. +   update_rq_clock(rq);
  3894. +
  3895. +   running = task_current(rq, tsk);
  3896. +   queued = task_on_rq_queued(tsk);
  3897. +
  3898. +   if (queued)
  3899. +       dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
  3900.     if (unlikely(running))
  3901. -       tsk->sched_class->set_curr_task(rq);
  3902. +       put_prev_task(rq, tsk);
  3903. +
  3904. +   sched_change_group(tsk, TASK_MOVE_GROUP);
  3905. +
  3906.     if (queued)
  3907. -       enqueue_task(rq, tsk, 0);
  3908. +       enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
  3909. +   if (unlikely(running))
  3910. +       set_curr_task(rq, tsk);
  3911.  
  3912. -   task_rq_unlock(rq, tsk, &flags);
  3913. +   task_rq_unlock(rq, tsk, &rf);
  3914.  }
  3915.  #endif /* CONFIG_CGROUP_SCHED */
  3916.  
  3917. @@ -8077,6 +8645,17 @@
  3918.  {
  3919.     int i, err = 0;
  3920.  
  3921. +   /*
  3922. +    * Disallowing the root group RT runtime is BAD, it would disallow the
  3923. +    * kernel creating (and or operating) RT threads.
  3924. +    */
  3925. +   if (tg == &root_task_group && rt_runtime == 0)
  3926. +       return -EINVAL;
  3927. +
  3928. +   /* No period doesn't make any sense. */
  3929. +   if (rt_period == 0)
  3930. +       return -EINVAL;
  3931. +
  3932.     mutex_lock(&rt_constraints_mutex);
  3933.     read_lock(&tasklist_lock);
  3934.     err = __rt_schedulable(tg, rt_period, rt_runtime);
  3935. @@ -8126,16 +8705,13 @@
  3936.     return rt_runtime_us;
  3937.  }
  3938.  
  3939. -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
  3940. +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
  3941.  {
  3942.     u64 rt_runtime, rt_period;
  3943.  
  3944. -   rt_period = (u64)rt_period_us * NSEC_PER_USEC;
  3945. +   rt_period = rt_period_us * NSEC_PER_USEC;
  3946.     rt_runtime = tg->rt_bandwidth.rt_runtime;
  3947.  
  3948. -   if (rt_period == 0)
  3949. -       return -EINVAL;
  3950. -
  3951.     return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  3952.  }
  3953.  
  3954. @@ -8382,9 +8958,21 @@
  3955.     sched_offline_group(tg);
  3956.  }
  3957.  
  3958. +/*
  3959. + * This is called before wake_up_new_task(), therefore we really only
  3960. + * have to set its group bits, all the other stuff does not apply.
  3961. + */
  3962.  static void cpu_cgroup_fork(struct task_struct *task)
  3963.  {
  3964. -   sched_move_task(task);
  3965. +   struct rq_flags rf;
  3966. +   struct rq *rq;
  3967. +
  3968. +   rq = task_rq_lock(task, &rf);
  3969. +
  3970. +   update_rq_clock(rq);
  3971. +   sched_change_group(task, TASK_SET_GROUP);
  3972. +
  3973. +   task_rq_unlock(rq, task, &rf);
  3974.  }
  3975.  
  3976.  static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
  3977. @@ -8765,3 +9353,44 @@
  3978.     pr_info("Task dump for CPU %d:\n", cpu);
  3979.     sched_show_task(cpu_curr(cpu));
  3980.  }
  3981. +
  3982. +/*
  3983. + * Nice levels are multiplicative, with a gentle 10% change for every
  3984. + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
  3985. + * nice 1, it will get ~10% less CPU time than another CPU-bound task
  3986. + * that remained on nice 0.
  3987. + *
  3988. + * The "10% effect" is relative and cumulative: from _any_ nice level,
  3989. + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
  3990. + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
  3991. + * If a task goes up by ~10% and another task goes down by ~10% then
  3992. + * the relative distance between them is ~25%.)
  3993. + */
  3994. +const int sched_prio_to_weight[40] = {
  3995. + /* -20 */     88761,     71755,     56483,     46273,     36291,
  3996. + /* -15 */     29154,     23254,     18705,     14949,     11916,
  3997. + /* -10 */      9548,      7620,      6100,      4904,      3906,
  3998. + /*  -5 */      3121,      2501,      1991,      1586,      1277,
  3999. + /*   0 */      1024,       820,       655,       526,       423,
  4000. + /*   5 */       335,       272,       215,       172,       137,
  4001. + /*  10 */       110,        87,        70,        56,        45,
  4002. + /*  15 */        36,        29,        23,        18,        15,
  4003. +};
  4004. +
  4005. +/*
  4006. + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
  4007. + *
  4008. + * In cases where the weight does not change often, we can use the
  4009. + * precalculated inverse to speed up arithmetics by turning divisions
  4010. + * into multiplications:
  4011. + */
  4012. +const u32 sched_prio_to_wmult[40] = {
  4013. + /* -20 */     48388,     59856,     76040,     92818,    118348,
  4014. + /* -15 */    147320,    184698,    229616,    287308,    360437,
  4015. + /* -10 */    449829,    563644,    704093,    875809,   1099582,
  4016. + /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
  4017. + /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
  4018. + /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
  4019. + /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
  4020. + /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
  4021. +};
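
For reference, the sched_prio_to_weight[]/sched_prio_to_wmult[] tables added above encode the "10% per nice level" rule described in the comment as a geometric series with ratio ~1.25 (1024 at nice 0, 820 at nice 1, and so on), and wmult holds the precomputed 2^32/weight so the fair class can turn weight divisions into multiplications. A minimal standalone C sketch -- illustration only, not part of either tree -- that reproduces the relative-share arithmetic from exactly those table values:

    #include <stdio.h>

    int main(void)
    {
        /* Weights taken from sched_prio_to_weight[] above: nice 0 and nice 1. */
        const int w0 = 1024, w1 = 820;

        /* Two CPU-bound tasks share the CPU in proportion to their weights:
         * 1024/1844 ~= 55.5% vs 820/1844 ~= 44.5% -- the "10% effect" from
         * the comment, i.e. successive weights differ by a factor of ~1.25. */
        printf("nice 0 share: %.1f%%\n", 100.0 * w0 / (w0 + w1));
        printf("nice 1 share: %.1f%%\n", 100.0 * w1 / (w0 + w1));

        /* sched_prio_to_wmult[] stores 2^32 / weight so a division by the
         * weight becomes a multiply + shift; 2^32 / 1024 = 4194304 matches
         * the nice-0 entry of the table above. */
        printf("2^32 / %d = %u\n", w0, (unsigned)((1ULL << 32) / w0));
        return 0;
    }
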
  4022. diff -Nur /home/ninez/android/marlin/kernel/sched/cpudeadline.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.c
  4023. --- /home/ninez/android/marlin/kernel/sched/cpudeadline.c   2018-08-10 01:54:08.563395055 -0400
  4024. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.c   2018-08-11 23:57:17.128607487 -0400
  4025. @@ -31,11 +31,6 @@
  4026.     return (i << 1) + 2;
  4027.  }
  4028.  
  4029. -static inline int dl_time_before(u64 a, u64 b)
  4030. -{
  4031. -   return (s64)(a - b) < 0;
  4032. -}
  4033. -
  4034.  static void cpudl_exchange(struct cpudl *cp, int a, int b)
  4035.  {
  4036.     int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
  4037. @@ -107,7 +102,9 @@
  4038.     int best_cpu = -1;
  4039.     const struct sched_dl_entity *dl_se = &p->dl;
  4040.  
  4041. -   if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
  4042. +   if (later_mask &&
  4043. +       cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed) &&
  4044. +       cpumask_and(later_mask, later_mask, cpu_active_mask)) {
  4045.         best_cpu = cpumask_any(later_mask);
  4046.         goto out;
  4047.     } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
  4048. @@ -186,6 +183,26 @@
  4049.  }
  4050.  
  4051.  /*
  4052. + * cpudl_set_freecpu - Set the cpudl.free_cpus
  4053. + * @cp: the cpudl max-heap context
  4054. + * @cpu: rd attached cpu
  4055. + */
  4056. +void cpudl_set_freecpu(struct cpudl *cp, int cpu)
  4057. +{
  4058. +   cpumask_set_cpu(cpu, cp->free_cpus);
  4059. +}
  4060. +
  4061. +/*
  4062. + * cpudl_clear_freecpu - Clear the cpudl.free_cpus
  4063. + * @cp: the cpudl max-heap context
  4064. + * @cpu: rd attached cpu
  4065. + */
  4066. +void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
  4067. +{
  4068. +   cpumask_clear_cpu(cpu, cp->free_cpus);
  4069. +}
  4070. +
  4071. +/*
  4072.   * cpudl_init - initialize the cpudl structure
  4073.   * @cp: the cpudl max-heap context
  4074.   */
  4075. @@ -203,7 +220,7 @@
  4076.     if (!cp->elements)
  4077.         return -ENOMEM;
  4078.  
  4079. -   if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
  4080. +   if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
  4081.         kfree(cp->elements);
  4082.         return -ENOMEM;
  4083.     }
  4084. @@ -211,8 +228,6 @@
  4085.     for_each_possible_cpu(i)
  4086.         cp->elements[i].idx = IDX_INVALID;
  4087.  
  4088. -   cpumask_setall(cp->free_cpus);
  4089. -
  4090.     return 0;
  4091.  }
  4092.  
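The reworked cpudl_find() above now only reports a candidate from later_mask after intersecting the heap's free_cpus with the task's cpus_allowed and with cpu_active_mask (and cpudl_init() now starts free_cpus empty, with CPUs added via cpudl_set_freecpu() as they attach to a root domain). A toy userspace sketch of that intersection -- plain bitmasks stand in for struct cpumask, and all values here are made up for illustration:

    #include <stdio.h>

    int main(void)
    {
        /* Bit i set == CPU i is in the mask. */
        unsigned int free_cpus    = 0xC;  /* CPUs 2,3 free of deadline work */
        unsigned int cpus_allowed = 0x6;  /* the task may run on CPUs 1,2   */
        unsigned int cpu_active   = 0xF;  /* all four CPUs online           */

        unsigned int later_mask = free_cpus & cpus_allowed & cpu_active;

        if (later_mask)
            printf("later_mask = 0x%x -> pick any CPU in it\n", later_mask);
        else
            printf("no suitable free CPU -> fall back to the heap maximum\n");
        return 0;
    }
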
  4093. diff -Nur /home/ninez/android/marlin/kernel/sched/cpudeadline.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.h
  4094. --- /home/ninez/android/marlin/kernel/sched/cpudeadline.h   2018-08-10 01:54:08.563395055 -0400
  4095. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.h   2018-08-11 23:57:17.128607487 -0400
  4096. @@ -2,6 +2,7 @@
  4097.  #define _LINUX_CPUDL_H
  4098.  
  4099.  #include <linux/sched.h>
  4100. +#include <linux/sched/deadline.h>
  4101.  
  4102.  #define IDX_INVALID     -1
  4103.  
  4104. @@ -24,6 +25,8 @@
  4105.            struct cpumask *later_mask);
  4106.  void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
  4107.  int cpudl_init(struct cpudl *cp);
  4108. +void cpudl_set_freecpu(struct cpudl *cp, int cpu);
  4109. +void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
  4110.  void cpudl_cleanup(struct cpudl *cp);
  4111.  #else
  4112.  #define cpudl_set(cp, cpu, dl) do { } while (0)
  4113. diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq.c
  4114. --- /home/ninez/android/marlin/kernel/sched/cpufreq.c   1969-12-31 19:00:00.000000000 -0500
  4115. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq.c   2018-08-11 23:57:17.128607487 -0400
  4116. @@ -0,0 +1,63 @@
  4117. +/*
  4118. + * Scheduler code and data structures related to cpufreq.
  4119. + *
  4120. + * Copyright (C) 2016, Intel Corporation
  4121. + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
  4122. + *
  4123. + * This program is free software; you can redistribute it and/or modify
  4124. + * it under the terms of the GNU General Public License version 2 as
  4125. + * published by the Free Software Foundation.
  4126. + */
  4127. +
  4128. +#include "sched.h"
  4129. +
  4130. +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
  4131. +
  4132. +/**
  4133. + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
  4134. + * @cpu: The CPU to set the pointer for.
  4135. + * @data: New pointer value.
  4136. + * @func: Callback function to set for the CPU.
  4137. + *
  4138. + * Set and publish the update_util_data pointer for the given CPU.
  4139. + *
  4140. + * The update_util_data pointer of @cpu is set to @data and the callback
  4141. + * function pointer in the target struct update_util_data is set to @func.
  4142. + * That function will be called by cpufreq_update_util() from RCU-sched
  4143. + * read-side critical sections, so it must not sleep.  @data will always be
  4144. + * passed to it as the first argument which allows the function to get to the
  4145. + * target update_util_data structure and its container.
  4146. + *
  4147. + * The update_util_data pointer of @cpu must be NULL when this function is
  4148. + * called or it will WARN() and return with no effect.
  4149. + */
  4150. +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
  4151. +           void (*func)(struct update_util_data *data, u64 time,
  4152. +                    unsigned int flags))
  4153. +{
  4154. +   if (WARN_ON(!data || !func))
  4155. +       return;
  4156. +
  4157. +   if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
  4158. +       return;
  4159. +
  4160. +   data->func = func;
  4161. +   rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
  4162. +}
  4163. +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
  4164. +
  4165. +/**
  4166. + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
  4167. + * @cpu: The CPU to clear the pointer for.
  4168. + *
  4169. + * Clear the update_util_data pointer for the given CPU.
  4170. + *
  4171. + * Callers must use RCU-sched callbacks to free any memory that might be
  4172. + * accessed via the old update_util_data pointer or invoke synchronize_sched()
  4173. + * right after this function to avoid use-after-free.
  4174. + */
  4175. +void cpufreq_remove_update_util_hook(int cpu)
  4176. +{
  4177. +   rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
  4178. +}
  4179. +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
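
The kernel-doc above spells out the contract for the new per-CPU update_util hook: the slot must be NULL when a governor registers, the callback runs from RCU-sched read-side sections (so it must not sleep), and it is always handed its own update_util_data so the governor can recover the enclosing per-CPU state. A userspace mock of that registration pattern -- the names loosely mirror the kernel API for readability, but none of this is kernel code, and the real implementation uses per-CPU variables, rcu_assign_pointer() and container_of():

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    struct update_util_data {
        void (*func)(struct update_util_data *data, uint64_t time, unsigned int flags);
    };

    #define NR_CPUS 4
    static struct update_util_data *update_util_slot[NR_CPUS];

    /* Register a hook only if the CPU's slot is currently empty. */
    static void add_update_util_hook(int cpu, struct update_util_data *data,
            void (*func)(struct update_util_data *, uint64_t, unsigned int))
    {
        if (!data || !func || update_util_slot[cpu]) {
            fprintf(stderr, "refusing to overwrite hook on cpu %d\n", cpu);
            return;
        }
        data->func = func;
        update_util_slot[cpu] = data;
    }

    /* A governor embeds update_util_data in its own per-CPU state ... */
    struct gov_cpu {
        struct update_util_data update_util;
        unsigned long util;
    };

    /* ... and recovers that state from the pointer handed back to it. */
    static void gov_update(struct update_util_data *data, uint64_t time, unsigned int flags)
    {
        struct gov_cpu *gc = (struct gov_cpu *)((char *)data -
                    offsetof(struct gov_cpu, update_util));
        printf("hook fired at t=%llu, util=%lu, flags=%#x\n",
               (unsigned long long)time, gc->util, flags);
    }

    int main(void)
    {
        static struct gov_cpu gc0 = { .util = 123 };

        add_update_util_hook(0, &gc0.update_util, gov_update);

        /* The scheduler side (cpufreq_update_util()) then does roughly this: */
        struct update_util_data *d = update_util_slot[0];
        if (d)
            d->func(d, 1000000ULL, 0);
        return 0;
    }
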
  4180. diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq_sched.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_sched.c
  4181. --- /home/ninez/android/marlin/kernel/sched/cpufreq_sched.c 2018-08-10 01:54:08.563395055 -0400
  4182. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_sched.c 2018-08-11 23:57:17.128607487 -0400
  4183. @@ -32,6 +32,12 @@
  4184.  static DEFINE_PER_CPU(unsigned long, enabled);
  4185.  DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
  4186.  
  4187. +struct gov_tunables {
  4188. +   struct gov_attr_set attr_set;
  4189. +   unsigned int up_throttle_nsec;
  4190. +   unsigned int down_throttle_nsec;
  4191. +};
  4192. +
  4193.  /**
  4194.   * gov_data - per-policy data internal to the governor
  4195.   * @up_throttle: next throttling period expiry if increasing OPP
  4196. @@ -53,8 +59,8 @@
  4197.  struct gov_data {
  4198.     ktime_t up_throttle;
  4199.     ktime_t down_throttle;
  4200. -   unsigned int up_throttle_nsec;
  4201. -   unsigned int down_throttle_nsec;
  4202. +   struct gov_tunables *tunables;
  4203. +   struct list_head tunables_hook;
  4204.     struct task_struct *task;
  4205.     struct irq_work irq_work;
  4206.     unsigned int requested_freq;
  4207. @@ -71,8 +77,10 @@
  4208.  
  4209.     __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
  4210.  
  4211. -   gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
  4212. -   gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
  4213. +   gd->up_throttle = ktime_add_ns(ktime_get(),
  4214. +                      gd->tunables->up_throttle_nsec);
  4215. +   gd->down_throttle = ktime_add_ns(ktime_get(),
  4216. +                    gd->tunables->down_throttle_nsec);
  4217.     up_write(&policy->rwsem);
  4218.  }
  4219.  
  4220. @@ -194,7 +202,7 @@
  4221.     }
  4222.  
  4223.     /* Convert the new maximum capacity request into a cpu frequency */
  4224. -   freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
  4225. +   freq_new = capacity * policy->cpuinfo.max_freq >> SCHED_CAPACITY_SHIFT;
  4226.     if (cpufreq_frequency_table_target(policy, policy->freq_table,
  4227.                        freq_new, CPUFREQ_RELATION_L,
  4228.                        &index_new))
  4229. @@ -227,6 +235,18 @@
  4230.     cpufreq_cpu_put(policy);
  4231.  }
  4232.  
  4233. +#ifdef CONFIG_SCHED_WALT
  4234. +static inline unsigned long
  4235. +requested_capacity(struct sched_capacity_reqs *scr)
  4236. +{
  4237. +   if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
  4238. +       return scr->cfs;
  4239. +   return scr->cfs + scr->rt;
  4240. +}
  4241. +#else
  4242. +#define requested_capacity(scr) (scr->cfs + scr->rt)
  4243. +#endif
  4244. +
  4245.  void update_cpu_capacity_request(int cpu, bool request)
  4246.  {
  4247.     unsigned long new_capacity;
  4248. @@ -237,25 +257,10 @@
  4249.  
  4250.     scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
  4251.  
  4252. -#ifdef CONFIG_SCHED_WALT
  4253. -   if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
  4254. -       /*
  4255. -        * Same WALT signal is set at different places, take the max
  4256. -        * reported utilization
  4257. -        */
  4258. -       new_capacity = max(scr->cfs, scr->rt);
  4259. -       new_capacity = max(new_capacity, scr->dl);
  4260. -   } else {
  4261. -       /*
  4262. -        * For PELT, utilization is aggregated
  4263. -        */
  4264. -       new_capacity = scr->cfs + scr->rt + scr->dl;
  4265. -   }
  4266. -#else
  4267. -   new_capacity = scr->cfs + scr->rt + scr->dl;
  4268. -#endif
  4269. +   new_capacity = requested_capacity(scr);
  4270.     new_capacity = new_capacity * capacity_margin
  4271.         / SCHED_CAPACITY_SCALE;
  4272. +   new_capacity += scr->dl;
  4273.  
  4274.     if (new_capacity == scr->total)
  4275.         return;
  4276. @@ -277,12 +282,70 @@
  4277.     static_key_slow_dec(&__sched_freq);
  4278.  }
  4279.  
  4280. -static struct attribute_group sched_attr_group_gov_pol;
  4281. -static struct attribute_group *get_sysfs_attr(void)
  4282. +/* Tunables */
  4283. +static struct gov_tunables *global_tunables;
  4284. +
  4285. +static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set)
  4286. +{
  4287. +   return container_of(attr_set, struct gov_tunables, attr_set);
  4288. +}
  4289. +
  4290. +static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
  4291. +{
  4292. +   struct gov_tunables *tunables = to_tunables(attr_set);
  4293. +
  4294. +   return sprintf(buf, "%u\n", tunables->up_throttle_nsec);
  4295. +}
  4296. +
  4297. +static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set,
  4298. +                     const char *buf, size_t count)
  4299.  {
  4300. -   return &sched_attr_group_gov_pol;
  4301. +   struct gov_tunables *tunables = to_tunables(attr_set);
  4302. +   int ret;
  4303. +   long unsigned int val;
  4304. +
  4305. +   ret = kstrtoul(buf, 0, &val);
  4306. +   if (ret < 0)
  4307. +       return ret;
  4308. +   tunables->up_throttle_nsec = val;
  4309. +   return count;
  4310.  }
  4311.  
  4312. +static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
  4313. +{
  4314. +   struct gov_tunables *tunables = to_tunables(attr_set);
  4315. +
  4316. +   return sprintf(buf, "%u\n", tunables->down_throttle_nsec);
  4317. +}
  4318. +
  4319. +static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set,
  4320. +                   const char *buf, size_t count)
  4321. +{
  4322. +   struct gov_tunables *tunables = to_tunables(attr_set);
  4323. +   int ret;
  4324. +   long unsigned int val;
  4325. +
  4326. +   ret = kstrtoul(buf, 0, &val);
  4327. +   if (ret < 0)
  4328. +       return ret;
  4329. +   tunables->down_throttle_nsec = val;
  4330. +   return count;
  4331. +}
  4332. +
  4333. +static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec);
  4334. +static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec);
  4335. +
  4336. +static struct attribute *schedfreq_attributes[] = {
  4337. +   &up_throttle_nsec.attr,
  4338. +   &down_throttle_nsec.attr,
  4339. +   NULL
  4340. +};
  4341. +
  4342. +static struct kobj_type tunables_ktype = {
  4343. +   .default_attrs = schedfreq_attributes,
  4344. +   .sysfs_ops = &governor_sysfs_ops,
  4345. +};
  4346. +
  4347.  static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
  4348.  {
  4349.     struct gov_data *gd;
  4350. @@ -297,20 +360,40 @@
  4351.     if (!gd)
  4352.         return -ENOMEM;
  4353.  
  4354. -   gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
  4355. -               policy->cpuinfo.transition_latency :
  4356. -               THROTTLE_UP_NSEC;
  4357. -   gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
  4358. -   pr_debug("%s: throttle threshold = %u [ns]\n",
  4359. -         __func__, gd->up_throttle_nsec);
  4360. -
  4361.     policy->governor_data = gd;
  4362.  
  4363. -   rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
  4364. -   if (rc) {
  4365. -       pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
  4366. -       goto err;
  4367. -   }
  4368. +   if (!global_tunables) {
  4369. +       gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
  4370. +       if (!gd->tunables)
  4371. +           goto free_gd;
  4372. +
  4373. +       gd->tunables->up_throttle_nsec =
  4374. +           policy->cpuinfo.transition_latency ?
  4375. +           policy->cpuinfo.transition_latency :
  4376. +           THROTTLE_UP_NSEC;
  4377. +       gd->tunables->down_throttle_nsec =
  4378. +           THROTTLE_DOWN_NSEC;
  4379. +
  4380. +       rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,
  4381. +                     &tunables_ktype,
  4382. +                     get_governor_parent_kobj(policy),
  4383. +                     "%s", cpufreq_gov_sched.name);
  4384. +       if (rc)
  4385. +           goto free_tunables;
  4386. +
  4387. +       gov_attr_set_init(&gd->tunables->attr_set,
  4388. +                 &gd->tunables_hook);
  4389. +
  4390. +       pr_debug("%s: throttle_threshold = %u [ns]\n",
  4391. +            __func__, gd->tunables->up_throttle_nsec);
  4392. +
  4393. +       if (!have_governor_per_policy())
  4394. +           global_tunables = gd->tunables;
  4395. +   } else {
  4396. +       gd->tunables = global_tunables;
  4397. +       gov_attr_set_get(&global_tunables->attr_set,
  4398. +                &gd->tunables_hook);
  4399. +   }
  4400.  
  4401.     if (cpufreq_driver_is_slow()) {
  4402.         cpufreq_driver_slow = true;
  4403. @@ -320,7 +403,7 @@
  4404.         if (IS_ERR_OR_NULL(gd->task)) {
  4405.             pr_err("%s: failed to create kschedfreq thread\n",
  4406.                    __func__);
  4407. -           goto err;
  4408. +           goto free_tunables;
  4409.         }
  4410.         get_task_struct(gd->task);
  4411.         kthread_bind_mask(gd->task, policy->related_cpus);
  4412. @@ -332,7 +415,9 @@
  4413.  
  4414.     return 0;
  4415.  
  4416. -err:
  4417. +free_tunables:
  4418. +   kfree(gd->tunables);
  4419. +free_gd:
  4420.     policy->governor_data = NULL;
  4421.     kfree(gd);
  4422.     return -ENOMEM;
  4423. @@ -340,6 +425,7 @@
  4424.  
  4425.  static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
  4426.  {
  4427. +   unsigned int count;
  4428.     struct gov_data *gd = policy->governor_data;
  4429.  
  4430.     if (!gd)
  4431. @@ -351,7 +437,12 @@
  4432.         put_task_struct(gd->task);
  4433.     }
  4434.  
  4435. -   sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr());
  4436. +   count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook);
  4437. +   if (!count) {
  4438. +       if (!have_governor_per_policy())
  4439. +           global_tunables = NULL;
  4440. +       kfree(gd->tunables);
  4441. +   }
  4442.  
  4443.     policy->governor_data = NULL;
  4444.  
  4445. @@ -413,88 +504,6 @@
  4446.     return 0;
  4447.  }
  4448.  
  4449. -/* Tunables */
  4450. -static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
  4451. -{
  4452. -   return sprintf(buf, "%u\n", gd->up_throttle_nsec);
  4453. -}
  4454. -
  4455. -static ssize_t store_up_throttle_nsec(struct gov_data *gd,
  4456. -       const char *buf, size_t count)
  4457. -{
  4458. -   int ret;
  4459. -   long unsigned int val;
  4460. -
  4461. -   ret = kstrtoul(buf, 0, &val);
  4462. -   if (ret < 0)
  4463. -       return ret;
  4464. -   gd->up_throttle_nsec = val;
  4465. -   return count;
  4466. -}
  4467. -
  4468. -static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
  4469. -{
  4470. -   return sprintf(buf, "%u\n", gd->down_throttle_nsec);
  4471. -}
  4472. -
  4473. -static ssize_t store_down_throttle_nsec(struct gov_data *gd,
  4474. -       const char *buf, size_t count)
  4475. -{
  4476. -   int ret;
  4477. -   long unsigned int val;
  4478. -
  4479. -   ret = kstrtoul(buf, 0, &val);
  4480. -   if (ret < 0)
  4481. -       return ret;
  4482. -   gd->down_throttle_nsec = val;
  4483. -   return count;
  4484. -}
  4485. -
  4486. -/*
  4487. - * Create show/store routines
  4488. - * - sys: One governor instance for complete SYSTEM
  4489. - * - pol: One governor instance per struct cpufreq_policy
  4490. - */
  4491. -#define show_gov_pol_sys(file_name)                    \
  4492. -static ssize_t show_##file_name##_gov_pol              \
  4493. -(struct cpufreq_policy *policy, char *buf)             \
  4494. -{                                  \
  4495. -   return show_##file_name(policy->governor_data, buf);        \
  4496. -}
  4497. -
  4498. -#define store_gov_pol_sys(file_name)                   \
  4499. -static ssize_t store_##file_name##_gov_pol             \
  4500. -(struct cpufreq_policy *policy, const char *buf, size_t count)     \
  4501. -{                                  \
  4502. -   return store_##file_name(policy->governor_data, buf, count);    \
  4503. -}
  4504. -
  4505. -#define gov_pol_attr_rw(_name)                     \
  4506. -   static struct freq_attr _name##_gov_pol =               \
  4507. -   __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
  4508. -
  4509. -#define show_store_gov_pol_sys(file_name)              \
  4510. -   show_gov_pol_sys(file_name);                        \
  4511. -   store_gov_pol_sys(file_name)
  4512. -#define tunable_handlers(file_name) \
  4513. -   show_gov_pol_sys(file_name); \
  4514. -   store_gov_pol_sys(file_name); \
  4515. -   gov_pol_attr_rw(file_name)
  4516. -
  4517. -tunable_handlers(down_throttle_nsec);
  4518. -tunable_handlers(up_throttle_nsec);
  4519. -
  4520. -/* Per policy governor instance */
  4521. -static struct attribute *sched_attributes_gov_pol[] = {
  4522. -   &up_throttle_nsec_gov_pol.attr,
  4523. -   &down_throttle_nsec_gov_pol.attr,
  4524. -   NULL,
  4525. -};
  4526. -
  4527. -static struct attribute_group sched_attr_group_gov_pol = {
  4528. -   .attrs = sched_attributes_gov_pol,
  4529. -   .name = "sched",
  4530. -};
  4531.  
  4532.  #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
  4533.  static
  4534. diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq_schedutil.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_schedutil.c
  4535. --- /home/ninez/android/marlin/kernel/sched/cpufreq_schedutil.c 1969-12-31 19:00:00.000000000 -0500
  4536. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_schedutil.c 2018-08-21 13:56:47.913412345 -0400
  4537. @@ -0,0 +1,874 @@
  4538. +/*
  4539. + * CPUFreq governor based on scheduler-provided CPU utilization data.
  4540. + *
  4541. + * Copyright (C) 2016, Intel Corporation
  4542. + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
  4543. + *
  4544. + * This program is free software; you can redistribute it and/or modify
  4545. + * it under the terms of the GNU General Public License version 2 as
  4546. + * published by the Free Software Foundation.
  4547. + */
  4548. +
  4549. +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  4550. +
  4551. +#include <linux/cpufreq.h>
  4552. +#include <linux/kthread.h>
  4553. +#include <linux/slab.h>
  4554. +#include <trace/events/power.h>
  4555. +
  4556. +#include "sched.h"
  4557. +#include "tune.h"
  4558. +
  4559. +unsigned long boosted_cpu_util(int cpu);
  4560. +
  4561. +/* Stub out fast switch routines present on mainline to reduce the backport
  4562. + * overhead. */
  4563. +#define cpufreq_driver_fast_switch(x, y) 0
  4564. +#define cpufreq_enable_fast_switch(x)
  4565. +#define cpufreq_disable_fast_switch(x)
  4566. +#define LATENCY_MULTIPLIER         (1000)
  4567. +#define SUGOV_KTHREAD_PRIORITY 80
  4568. +
  4569. +struct sugov_tunables {
  4570. +   struct gov_attr_set attr_set;
  4571. +   unsigned int up_rate_limit_us;
  4572. +   unsigned int down_rate_limit_us;
  4573. +   bool iowait_boost_enable;
  4574. +};
  4575. +
  4576. +struct sugov_policy {
  4577. +   struct cpufreq_policy *policy;
  4578. +
  4579. +   struct sugov_tunables *tunables;
  4580. +   struct list_head tunables_hook;
  4581. +
  4582. +   raw_spinlock_t update_lock;  /* For shared policies */
  4583. +   u64 last_freq_update_time;
  4584. +   s64 min_rate_limit_ns;
  4585. +   s64 up_rate_delay_ns;
  4586. +   s64 down_rate_delay_ns;
  4587. +   unsigned int next_freq;
  4588. +   unsigned int cached_raw_freq;
  4589. +
  4590. +   /* The next fields are only needed if fast switch cannot be used. */
  4591. +   struct irq_work irq_work;
  4592. +   struct kthread_work work;
  4593. +   struct mutex work_lock;
  4594. +   struct kthread_worker worker;
  4595. +   struct task_struct *thread;
  4596. +   bool work_in_progress;
  4597. +
  4598. +   bool need_freq_update;
  4599. +};
  4600. +
  4601. +struct sugov_cpu {
  4602. +   struct update_util_data update_util;
  4603. +   struct sugov_policy *sg_policy;
  4604. +
  4605. +   bool iowait_boost_pending;
  4606. +   unsigned int iowait_boost;
  4607. +   unsigned int iowait_boost_max;
  4608. +   u64 last_update;
  4609. +
  4610. +   /* The fields below are only needed when sharing a policy. */
  4611. +   unsigned long util;
  4612. +   unsigned long max;
  4613. +   unsigned int flags;
  4614. +
  4615. +   /* The field below is for single-CPU policies only. */
  4616. +#ifdef CONFIG_NO_HZ_COMMON
  4617. +   unsigned long saved_idle_calls;
  4618. +#endif
  4619. +};
  4620. +
  4621. +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
  4622. +
  4623. +/************************ Governor internals ***********************/
  4624. +
  4625. +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
  4626. +{
  4627. +   s64 delta_ns;
  4628. +
  4629. +   if (unlikely(sg_policy->need_freq_update))
  4630. +       return true;
  4631. +
  4632. +   delta_ns = time - sg_policy->last_freq_update_time;
  4633. +
  4634. +   /* No need to recalculate next freq for at least min_rate_limit_ns */
  4635. +   return delta_ns >= sg_policy->min_rate_limit_ns;
  4636. +}
  4637. +
  4638. +static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
  4639. +                    unsigned int next_freq)
  4640. +{
  4641. +   s64 delta_ns;
  4642. +
  4643. +   delta_ns = time - sg_policy->last_freq_update_time;
  4644. +
  4645. +   if (next_freq > sg_policy->next_freq &&
  4646. +       delta_ns < sg_policy->up_rate_delay_ns)
  4647. +           return true;
  4648. +
  4649. +   if (next_freq < sg_policy->next_freq &&
  4650. +       delta_ns < sg_policy->down_rate_delay_ns)
  4651. +           return true;
  4652. +
  4653. +   return false;
  4654. +}
  4655. +
  4656. +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
  4657. +               unsigned int next_freq)
  4658. +{
  4659. +   struct cpufreq_policy *policy = sg_policy->policy;
  4660. +
  4661. +   if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) {
  4662. +       /* Reset cached freq as next_freq isn't changed */
  4663. +       sg_policy->cached_raw_freq = 0;
  4664. +       return;
  4665. +   }
  4666. +
  4667. +   if (sg_policy->next_freq == next_freq)
  4668. +       return;
  4669. +
  4670. +   sg_policy->next_freq = next_freq;
  4671. +   sg_policy->last_freq_update_time = time;
  4672. +
  4673. +   if (policy->fast_switch_enabled) {
  4674. +       next_freq = cpufreq_driver_fast_switch(policy, next_freq);
  4675. +       if (next_freq == CPUFREQ_ENTRY_INVALID)
  4676. +           return;
  4677. +
  4678. +       policy->cur = next_freq;
  4679. +       trace_cpu_frequency(next_freq, smp_processor_id());
  4680. +   } else if (!sg_policy->work_in_progress) {
  4681. +       sg_policy->work_in_progress = true;
  4682. +       irq_work_queue(&sg_policy->irq_work);
  4683. +   }
  4684. +}
  4685. +
  4686. +/**
  4687. + * get_next_freq - Compute a new frequency for a given cpufreq policy.
  4688. + * @sg_policy: schedutil policy object to compute the new frequency for.
  4689. + * @util: Current CPU utilization.
  4690. + * @max: CPU capacity.
  4691. + *
  4692. + * If the utilization is frequency-invariant, choose the new frequency to be
  4693. + * proportional to it, that is
  4694. + *
  4695. + * next_freq = C * max_freq * util / max
  4696. + *
  4697. + * Otherwise, approximate the would-be frequency-invariant utilization by
  4698. + * util_raw * (curr_freq / max_freq) which leads to
  4699. + *
  4700. + * next_freq = C * curr_freq * util_raw / max
  4701. + *
  4702. + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
  4703. + *
  4704. + * The lowest driver-supported frequency which is equal or greater than the raw
  4705. + * next_freq (as calculated above) is returned, subject to policy min/max and
  4706. + * cpufreq driver limitations.
  4707. + */
  4708. +static unsigned int get_next_freq(struct sugov_policy *sg_policy,
  4709. +                 unsigned long util, unsigned long max)
  4710. +{
  4711. +   struct cpufreq_policy *policy = sg_policy->policy;
  4712. +   unsigned int freq = arch_scale_freq_invariant() ?
  4713. +               policy->cpuinfo.max_freq : policy->cur;
  4714. +
  4715. +   freq = (freq + (freq >> 2)) * util / max;
  4716. +
  4717. +   if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
  4718. +       return sg_policy->next_freq;
  4719. +
  4720. +   sg_policy->need_freq_update = false;
  4721. +   sg_policy->cached_raw_freq = freq;
  4722. +   return cpufreq_driver_resolve_freq(policy, freq);
  4723. +}
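For concreteness, here is the C = 1.25 mapping of get_next_freq() worked through once; the figures are invented purely for illustration and do not come from either tree.

/*
 * Illustrative numbers only: assume frequency-invariant utilization,
 * policy->cpuinfo.max_freq = 1824000 kHz, util = 512, max = 1024.
 *
 *   freq = (1824000 + (1824000 >> 2)) * 512 / 1024
 *        = 2280000 * 512 / 1024
 *        = 1140000 kHz
 *
 * which is 1.25 * max_freq * (util / max); cpufreq_driver_resolve_freq()
 * then returns the lowest driver-supported frequency at or above that,
 * clamped to the policy limits.
 */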
  4724. +
  4725. +static inline bool use_pelt(void)
  4726. +{
  4727. +#ifdef CONFIG_SCHED_WALT
  4728. +   return (!sysctl_sched_use_walt_cpu_util || walt_disabled);
  4729. +#else
  4730. +   return true;
  4731. +#endif
  4732. +}
  4733. +
  4734. +static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
  4735. +{
  4736. +   int cpu = smp_processor_id();
  4737. +   struct rq *rq = cpu_rq(cpu);
  4738. +   unsigned long max_cap, rt;
  4739. +   s64 delta;
  4740. +
  4741. +   max_cap = arch_scale_cpu_capacity(NULL, cpu);
  4742. +
  4743. +   sched_avg_update(rq);
  4744. +   delta = time - rq->age_stamp;
  4745. +   if (unlikely(delta < 0))
  4746. +       delta = 0;
  4747. +   rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
  4748. +   rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
  4749. +
  4750. +   *util = boosted_cpu_util(cpu);
  4751. +   if (use_pelt())
  4752. +       *util = *util + rt;
  4753. +
  4754. +   *util = min(*util, max_cap);
  4755. +   *max = max_cap;
  4756. +}
  4757. +
  4758. +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
  4759. +                  unsigned int flags)
  4760. +{
  4761. +   struct sugov_policy *sg_policy = sg_cpu->sg_policy;
  4762. +
  4763. +   if (!sg_policy->tunables->iowait_boost_enable)
  4764. +       return;
  4765. +
  4766. +   /* Clear iowait_boost if the CPU appears to have been idle. */
  4767. +   if (sg_cpu->iowait_boost) {
  4768. +       s64 delta_ns = time - sg_cpu->last_update;
  4769. +
  4770. +       if (delta_ns > TICK_NSEC) {
  4771. +           sg_cpu->iowait_boost = 0;
  4772. +           sg_cpu->iowait_boost_pending = false;
  4773. +       }
  4774. +   }
  4775. +
  4776. +   if (flags & SCHED_CPUFREQ_IOWAIT) {
  4777. +       if (sg_cpu->iowait_boost_pending)
  4778. +           return;
  4779. +
  4780. +       sg_cpu->iowait_boost_pending = true;
  4781. +
  4782. +       if (sg_cpu->iowait_boost) {
  4783. +           sg_cpu->iowait_boost <<= 1;
  4784. +           if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
  4785. +               sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
  4786. +       } else {
  4787. +           sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
  4788. +       }
  4789. +   }
  4790. +}
  4791. +
  4792. +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
  4793. +                  unsigned long *max)
  4794. +{
  4795. +   unsigned int boost_util, boost_max;
  4796. +
  4797. +   if (!sg_cpu->iowait_boost)
  4798. +       return;
  4799. +
  4800. +   if (sg_cpu->iowait_boost_pending) {
  4801. +       sg_cpu->iowait_boost_pending = false;
  4802. +   } else {
  4803. +       sg_cpu->iowait_boost >>= 1;
  4804. +       if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
  4805. +           sg_cpu->iowait_boost = 0;
  4806. +           return;
  4807. +       }
  4808. +   }
  4809. +
  4810. +   boost_util = sg_cpu->iowait_boost;
  4811. +   boost_max = sg_cpu->iowait_boost_max;
  4812. +
  4813. +   if (*util * boost_max < *max * boost_util) {
  4814. +       *util = boost_util;
  4815. +       *max = boost_max;
  4816. +   }
  4817. +}
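To make the doubling/halving behaviour of the two iowait helpers above concrete, a sketch of one boost trajectory; the policy->min and iowait_boost_max values are assumed for illustration only.

/*
 * Assumed values: policy->min = 300000 kHz, iowait_boost_max = 1824000 kHz.
 *
 *   Consecutive SCHED_CPUFREQ_IOWAIT wakeups:
 *     300000 -> 600000 -> 1200000 -> 1824000 (clamped to iowait_boost_max)
 *
 *   Updates without a fresh iowait wakeup halve the boost:
 *     1824000 -> 912000 -> 456000 -> 228000 -> cleared
 *     (dropped as soon as it falls below policy->min)
 */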
  4818. +
  4819. +#ifdef CONFIG_NO_HZ_COMMON
  4820. +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
  4821. +{
  4822. +   unsigned long idle_calls = tick_nohz_get_idle_calls();
  4823. +   bool ret = idle_calls == sg_cpu->saved_idle_calls;
  4824. +
  4825. +   sg_cpu->saved_idle_calls = idle_calls;
  4826. +   return ret;
  4827. +}
  4828. +#else
  4829. +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
  4830. +#endif /* CONFIG_NO_HZ_COMMON */
  4831. +
  4832. +static void sugov_update_single(struct update_util_data *hook, u64 time,
  4833. +               unsigned int flags)
  4834. +{
  4835. +   struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
  4836. +   struct sugov_policy *sg_policy = sg_cpu->sg_policy;
  4837. +   struct cpufreq_policy *policy = sg_policy->policy;
  4838. +   unsigned long util, max;
  4839. +   unsigned int next_f;
  4840. +   bool busy;
  4841. +
  4842. +   sugov_set_iowait_boost(sg_cpu, time, flags);
  4843. +   sg_cpu->last_update = time;
  4844. +
  4845. +   /*
  4846. +    * For slow-switch systems, a single-policy request cannot run while an
  4847. +    * update is already in progress, unless we acquire update_lock.
  4848. +    */
  4849. +   if (sg_policy->work_in_progress)
  4850. +       return;
  4851. +
  4852. +   if (!sugov_should_update_freq(sg_policy, time))
  4853. +       return;
  4854. +
  4855. +   busy = sugov_cpu_is_busy(sg_cpu);
  4856. +
  4857. +   if (flags & SCHED_CPUFREQ_DL) {
  4858. +       next_f = policy->cpuinfo.max_freq;
  4859. +   } else {
  4860. +       sugov_get_util(&util, &max, time);
  4861. +       sugov_iowait_boost(sg_cpu, &util, &max);
  4862. +       next_f = get_next_freq(sg_policy, util, max);
  4863. +       /*
  4864. +        * Do not reduce the frequency if the CPU has not been idle
  4865. +        * recently, as the reduction is likely to be premature then.
  4866. +        */
  4867. +       if (busy && next_f < sg_policy->next_freq) {
  4868. +           next_f = sg_policy->next_freq;
  4869. +
  4870. +           /* Reset cached freq as next_freq has changed */
  4871. +           sg_policy->cached_raw_freq = 0;
  4872. +       }
  4873. +   }
  4874. +   sugov_update_commit(sg_policy, time, next_f);
  4875. +}
  4876. +
  4877. +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
  4878. +{
  4879. +   struct sugov_policy *sg_policy = sg_cpu->sg_policy;
  4880. +   struct cpufreq_policy *policy = sg_policy->policy;
  4881. +   unsigned long util = 0, max = 1;
  4882. +   unsigned int j;
  4883. +
  4884. +   for_each_cpu(j, policy->cpus) {
  4885. +       struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
  4886. +       unsigned long j_util, j_max;
  4887. +       s64 delta_ns;
  4888. +
  4889. +       /*
  4890. +        * If the CPU utilization was last updated before the previous
  4891. +        * frequency update and the time elapsed between the last update
  4892. +        * of the CPU utilization and the last frequency update is long
  4893. +        * enough, don't take the CPU into account as it probably is
  4894. +        * idle now (and clear iowait_boost for it).
  4895. +        */
  4896. +       delta_ns = time - j_sg_cpu->last_update;
  4897. +       if (delta_ns > TICK_NSEC) {
  4898. +           j_sg_cpu->iowait_boost = 0;
  4899. +           j_sg_cpu->iowait_boost_pending = false;
  4900. +           continue;
  4901. +       }
  4902. +       if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
  4903. +           return policy->cpuinfo.max_freq;
  4904. +
  4905. +       j_util = j_sg_cpu->util;
  4906. +       j_max = j_sg_cpu->max;
  4907. +       if (j_util * max > j_max * util) {
  4908. +           util = j_util;
  4909. +           max = j_max;
  4910. +       }
  4911. +
  4912. +       sugov_iowait_boost(j_sg_cpu, &util, &max);
  4913. +   }
  4914. +
  4915. +   return get_next_freq(sg_policy, util, max);
  4916. +}
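The per-CPU scan above keeps whichever (util, max) pair has the largest util/max ratio, and the cross-multiplication avoids a division. A quick numeric check with invented values:

/*
 * Invented values: CPU0 reports util = 400, max = 1024 (ratio ~0.39);
 * CPU1 reports util = 300, max = 512 (ratio ~0.59).
 *
 *   j_util * max > j_max * util
 *   300 * 1024   > 512 * 400
 *   307200       > 204800      -> true
 *
 * so (util, max) becomes (300, 512) and the shared policy is driven by
 * the CPU with the highest relative load.
 */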
  4917. +
  4918. +static void sugov_update_shared(struct update_util_data *hook, u64 time,
  4919. +               unsigned int flags)
  4920. +{
  4921. +   struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
  4922. +   struct sugov_policy *sg_policy = sg_cpu->sg_policy;
  4923. +   unsigned long util, max;
  4924. +   unsigned int next_f;
  4925. +
  4926. +   sugov_get_util(&util, &max, time);
  4927. +
  4928. +   raw_spin_lock(&sg_policy->update_lock);
  4929. +
  4930. +   sg_cpu->util = util;
  4931. +   sg_cpu->max = max;
  4932. +   sg_cpu->flags = flags;
  4933. +
  4934. +   sugov_set_iowait_boost(sg_cpu, time, flags);
  4935. +   sg_cpu->last_update = time;
  4936. +
  4937. +   if (sugov_should_update_freq(sg_policy, time)) {
  4938. +       if (flags & SCHED_CPUFREQ_DL)
  4939. +           next_f = sg_policy->policy->cpuinfo.max_freq;
  4940. +       else
  4941. +           next_f = sugov_next_freq_shared(sg_cpu, time);
  4942. +
  4943. +       sugov_update_commit(sg_policy, time, next_f);
  4944. +   }
  4945. +
  4946. +   raw_spin_unlock(&sg_policy->update_lock);
  4947. +}
  4948. +
  4949. +static void sugov_work(struct kthread_work *work)
  4950. +{
  4951. +   struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
  4952. +   unsigned int freq;
  4953. +   unsigned long flags;
  4954. +
  4955. +   /*
  4956. +    * Hold sg_policy->update_lock briefly to handle the case where
  4957. +    * sg_policy->next_freq is read here and then updated by
  4958. +    * sugov_update_shared() just before work_in_progress is set to false
  4959. +    * here; without the lock we could miss queueing the new update.
  4960. +    *
  4961. +    * Note: if a work item is queued after the update_lock is released,
  4962. +    * sugov_work() will simply be called again by the kthread_work code;
  4963. +    * the request will be processed before the sugov thread sleeps.
  4964. +    */
  4965. +   raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
  4966. +   freq = sg_policy->next_freq;
  4967. +   sg_policy->work_in_progress = false;
  4968. +   raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
  4969. +
  4970. +   mutex_lock(&sg_policy->work_lock);
  4971. +   __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
  4972. +   mutex_unlock(&sg_policy->work_lock);
  4973. +}
  4974. +
  4975. +static void sugov_irq_work(struct irq_work *irq_work)
  4976. +{
  4977. +   struct sugov_policy *sg_policy;
  4978. +
  4979. +   sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
  4980. +
  4981. +   /*
  4982. +    * For RT and deadline tasks, the schedutil governor shoots the
  4983. +    * frequency to maximum. Special care must be taken to ensure that this
  4984. +    * kthread doesn't result in the same behavior.
  4985. +    *
  4986. +    * This is (mostly) guaranteed by the work_in_progress flag. The flag is
  4987. +    * updated only at the end of the sugov_work() function and before that
  4988. +    * the schedutil governor rejects all other frequency scaling requests.
  4989. +    *
  4990. +    * There is a very rare case though, where the RT thread yields right
  4991. +    * after the work_in_progress flag is cleared. The effects of that are
  4992. +    * neglected for now.
  4993. +    */
  4994. +   kthread_queue_work(&sg_policy->worker, &sg_policy->work);
  4995. +}
  4996. +
  4997. +/************************** sysfs interface ************************/
  4998. +
  4999. +static struct sugov_tunables *global_tunables;
  5000. +static DEFINE_MUTEX(global_tunables_lock);
  5001. +
  5002. +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
  5003. +{
  5004. +   return container_of(attr_set, struct sugov_tunables, attr_set);
  5005. +}
  5006. +
  5007. +static DEFINE_MUTEX(min_rate_lock);
  5008. +
  5009. +static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
  5010. +{
  5011. +   mutex_lock(&min_rate_lock);
  5012. +   sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
  5013. +                      sg_policy->down_rate_delay_ns);
  5014. +   mutex_unlock(&min_rate_lock);
  5015. +}
  5016. +
  5017. +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
  5018. +{
  5019. +   struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
  5020. +
  5021. +   return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
  5022. +}
  5023. +
  5024. +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
  5025. +{
  5026. +   struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
  5027. +
  5028. +   return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
  5029. +}
  5030. +
  5031. +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
  5032. +                     const char *buf, size_t count)
  5033. +{
  5034. +   struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
  5035. +   struct sugov_policy *sg_policy;
  5036. +   unsigned int rate_limit_us;
  5037. +
  5038. +   if (kstrtouint(buf, 10, &rate_limit_us))
  5039. +       return -EINVAL;
  5040. +
  5041. +   tunables->up_rate_limit_us = rate_limit_us;
  5042. +
  5043. +   list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
  5044. +       sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
  5045. +       update_min_rate_limit_us(sg_policy);
  5046. +   }
  5047. +
  5048. +   return count;
  5049. +}
  5050. +
  5051. +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
  5052. +                   const char *buf, size_t count)
  5053. +{
  5054. +   struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
  5055. +   struct sugov_policy *sg_policy;
  5056. +   unsigned int rate_limit_us;
  5057. +
  5058. +   if (kstrtouint(buf, 10, &rate_limit_us))
  5059. +       return -EINVAL;
  5060. +
  5061. +   tunables->down_rate_limit_us = rate_limit_us;
  5062. +
  5063. +   list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
  5064. +       sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
  5065. +       update_min_rate_limit_us(sg_policy);
  5066. +   }
  5067. +
  5068. +   return count;
  5069. +}
  5070. +
  5071. +static ssize_t iowait_boost_enable_show(struct gov_attr_set *attr_set,
  5072. +                   char *buf)
  5073. +{
  5074. +   struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
  5075. +
  5076. +   return sprintf(buf, "%u\n", tunables->iowait_boost_enable);
  5077. +}
  5078. +
  5079. +static ssize_t iowait_boost_enable_store(struct gov_attr_set *attr_set,
  5080. +                    const char *buf, size_t count)
  5081. +{
  5082. +   struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
  5083. +   bool enable;
  5084. +
  5085. +   if (kstrtobool(buf, &enable))
  5086. +       return -EINVAL;
  5087. +
  5088. +   tunables->iowait_boost_enable = enable;
  5089. +
  5090. +   return count;
  5091. +}
  5092. +
  5093. +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
  5094. +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
  5095. +static struct governor_attr iowait_boost_enable = __ATTR_RW(iowait_boost_enable);
  5096. +
  5097. +static struct attribute *sugov_attributes[] = {
  5098. +   &up_rate_limit_us.attr,
  5099. +   &down_rate_limit_us.attr,
  5100. +   &iowait_boost_enable.attr,
  5101. +   NULL
  5102. +};
  5103. +
  5104. +static struct kobj_type sugov_tunables_ktype = {
  5105. +   .default_attrs = sugov_attributes,
  5106. +   .sysfs_ops = &governor_sysfs_ops,
  5107. +};
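Putting the tunables above into numbers (the values are examples, not defaults shipped by either tree): with up_rate_limit_us = 500 and down_rate_limit_us = 20000, the store hooks and sugov_start() derive the nanosecond delays consumed by the rate-limit checks earlier in the file.

/*
 * Example values only:
 *   up_rate_delay_ns   = 500   * NSEC_PER_USEC = 500000 ns
 *   down_rate_delay_ns = 20000 * NSEC_PER_USEC = 20000000 ns
 *   min_rate_limit_ns  = min(500000, 20000000) = 500000 ns
 *
 * sugov_should_update_freq() then ignores updates arriving less than
 * 500 us after the last frequency change, and sugov_up_down_rate_limit()
 * additionally holds back frequency decreases for 20 ms.
 */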
  5108. +
  5109. +/********************** cpufreq governor interface *********************/
  5110. +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
  5111. +static
  5112. +#endif
  5113. +struct cpufreq_governor cpufreq_gov_schedutil;
  5114. +
  5115. +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
  5116. +{
  5117. +   struct sugov_policy *sg_policy;
  5118. +
  5119. +   sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
  5120. +   if (!sg_policy)
  5121. +       return NULL;
  5122. +
  5123. +   sg_policy->policy = policy;
  5124. +   raw_spin_lock_init(&sg_policy->update_lock);
  5125. +   return sg_policy;
  5126. +}
  5127. +
  5128. +static void sugov_policy_free(struct sugov_policy *sg_policy)
  5129. +{
  5130. +   kfree(sg_policy);
  5131. +}
  5132. +
  5133. +static int sugov_kthread_create(struct sugov_policy *sg_policy)
  5134. +{
  5135. +   struct task_struct *thread;
  5136. +   struct sched_param param = { .sched_priority = SUGOV_KTHREAD_PRIORITY };
  5137. +   struct cpufreq_policy *policy = sg_policy->policy;
  5138. +   int ret;
  5139. +
  5140. +   /* kthread only required for slow path */
  5141. +   if (policy->fast_switch_enabled)
  5142. +       return 0;
  5143. +
  5144. +   kthread_init_work(&sg_policy->work, sugov_work);
  5145. +   kthread_init_worker(&sg_policy->worker);
  5146. +   thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
  5147. +               "sugov:%d",
  5148. +               cpumask_first(policy->related_cpus));
  5149. +   if (IS_ERR(thread)) {
  5150. +       pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
  5151. +       return PTR_ERR(thread);
  5152. +   }
  5153. +
  5154. +   ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
  5155. +   if (ret) {
  5156. +       kthread_stop(thread);
  5157. +       pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
  5158. +       return ret;
  5159. +   }
  5160. +
  5161. +   sg_policy->thread = thread;
  5162. +   kthread_bind_mask(thread, policy->related_cpus);
  5163. +   init_irq_work(&sg_policy->irq_work, sugov_irq_work);
  5164. +   mutex_init(&sg_policy->work_lock);
  5165. +
  5166. +   wake_up_process(thread);
  5167. +
  5168. +   return 0;
  5169. +}
  5170. +
  5171. +static void sugov_kthread_stop(struct sugov_policy *sg_policy)
  5172. +{
  5173. +   /* kthread only required for slow path */
  5174. +   if (sg_policy->policy->fast_switch_enabled)
  5175. +       return;
  5176. +
  5177. +   kthread_flush_worker(&sg_policy->worker);
  5178. +   kthread_stop(sg_policy->thread);
  5179. +   mutex_destroy(&sg_policy->work_lock);
  5180. +}
  5181. +
  5182. +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
  5183. +{
  5184. +   struct sugov_tunables *tunables;
  5185. +
  5186. +   tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
  5187. +   if (tunables) {
  5188. +       gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
  5189. +       if (!have_governor_per_policy())
  5190. +           global_tunables = tunables;
  5191. +   }
  5192. +   return tunables;
  5193. +}
  5194. +
  5195. +static void sugov_tunables_free(struct sugov_tunables *tunables)
  5196. +{
  5197. +   if (!have_governor_per_policy())
  5198. +       global_tunables = NULL;
  5199. +
  5200. +   kfree(tunables);
  5201. +}
  5202. +
  5203. +static int sugov_init(struct cpufreq_policy *policy)
  5204. +{
  5205. +   struct sugov_policy *sg_policy;
  5206. +   struct sugov_tunables *tunables;
  5207. +   int ret = 0;
  5208. +
  5209. +   /* State should be equivalent to EXIT */
  5210. +   if (policy->governor_data)
  5211. +       return -EBUSY;
  5212. +
  5213. +   cpufreq_enable_fast_switch(policy);
  5214. +
  5215. +   sg_policy = sugov_policy_alloc(policy);
  5216. +   if (!sg_policy) {
  5217. +       ret = -ENOMEM;
  5218. +       goto disable_fast_switch;
  5219. +   }
  5220. +
  5221. +   ret = sugov_kthread_create(sg_policy);
  5222. +   if (ret)
  5223. +       goto free_sg_policy;
  5224. +
  5225. +   mutex_lock(&global_tunables_lock);
  5226. +
  5227. +   if (global_tunables) {
  5228. +       if (WARN_ON(have_governor_per_policy())) {
  5229. +           ret = -EINVAL;
  5230. +           goto stop_kthread;
  5231. +       }
  5232. +       policy->governor_data = sg_policy;
  5233. +       sg_policy->tunables = global_tunables;
  5234. +
  5235. +       gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
  5236. +       goto out;
  5237. +   }
  5238. +
  5239. +   tunables = sugov_tunables_alloc(sg_policy);
  5240. +   if (!tunables) {
  5241. +       ret = -ENOMEM;
  5242. +       goto stop_kthread;
  5243. +   }
  5244. +
  5245. +   if (policy->up_transition_delay_us && policy->down_transition_delay_us) {
  5246. +       tunables->up_rate_limit_us = policy->up_transition_delay_us;
  5247. +       tunables->down_rate_limit_us = policy->down_transition_delay_us;
  5248. +   } else {
  5249. +       unsigned int lat;
  5250. +
  5251. +       tunables->up_rate_limit_us = LATENCY_MULTIPLIER;
  5252. +       tunables->down_rate_limit_us = LATENCY_MULTIPLIER;
  5253. +       lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
  5254. +       if (lat) {
  5255. +           tunables->up_rate_limit_us *= lat;
  5256. +           tunables->down_rate_limit_us *= lat;
  5257. +       }
  5258. +   }
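/*
 * Worked example (figures are hypothetical, not from either tree): if the
 * driver supplies no transition delays and
 * policy->cpuinfo.transition_latency = 10000 ns, then
 * lat = 10000 / NSEC_PER_USEC = 10 and both rate limits default to
 * LATENCY_MULTIPLIER * 10 = 10000 us (10 ms), until overridden through
 * the sysfs tunables above.
 */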
  5259. +
  5260. +   tunables->iowait_boost_enable = policy->iowait_boost_enable;
  5261. +
  5262. +   policy->governor_data = sg_policy;
  5263. +   sg_policy->tunables = tunables;
  5264. +
  5265. +   ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
  5266. +                  get_governor_parent_kobj(policy), "%s",
  5267. +                  cpufreq_gov_schedutil.name);
  5268. +   if (ret)
  5269. +       goto fail;
  5270. +
  5271. +out:
  5272. +   mutex_unlock(&global_tunables_lock);
  5273. +   return 0;
  5274. +
  5275. +fail:
  5276. +   policy->governor_data = NULL;
  5277. +   sugov_tunables_free(tunables);
  5278. +
  5279. +stop_kthread:
  5280. +   sugov_kthread_stop(sg_policy);
  5281. +
  5282. +free_sg_policy:
  5283. +   mutex_unlock(&global_tunables_lock);
  5284. +
  5285. +   sugov_policy_free(sg_policy);
  5286. +
  5287. +disable_fast_switch:
  5288. +   cpufreq_disable_fast_switch(policy);
  5289. +
  5290. +   pr_err("initialization failed (error %d)\n", ret);
  5291. +   return ret;
  5292. +}
  5293. +
  5294. +static int sugov_exit(struct cpufreq_policy *policy)
  5295. +{
  5296. +   struct sugov_policy *sg_policy = policy->governor_data;
  5297. +   struct sugov_tunables *tunables = sg_policy->tunables;
  5298. +   unsigned int count;
  5299. +
  5300. +   mutex_lock(&global_tunables_lock);
  5301. +
  5302. +   count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
  5303. +   policy->governor_data = NULL;
  5304. +   if (!count)
  5305. +       sugov_tunables_free(tunables);
  5306. +
  5307. +   mutex_unlock(&global_tunables_lock);
  5308. +
  5309. +   sugov_kthread_stop(sg_policy);
  5310. +   sugov_policy_free(sg_policy);
  5311. +
  5312. +   cpufreq_disable_fast_switch(policy);
  5313. +   return 0;
  5314. +}
  5315. +
  5316. +static int sugov_start(struct cpufreq_policy *policy)
  5317. +{
  5318. +   struct sugov_policy *sg_policy = policy->governor_data;
  5319. +   unsigned int cpu;
  5320. +
  5321. +   sg_policy->up_rate_delay_ns =
  5322. +       sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
  5323. +   sg_policy->down_rate_delay_ns =
  5324. +       sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
  5325. +   update_min_rate_limit_us(sg_policy);
  5326. +   sg_policy->last_freq_update_time = 0;
  5327. +   sg_policy->next_freq = 0;
  5328. +   sg_policy->work_in_progress = false;
  5329. +   sg_policy->need_freq_update = false;
  5330. +   sg_policy->cached_raw_freq = 0;
  5331. +
  5332. +   for_each_cpu(cpu, policy->cpus) {
  5333. +       struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
  5334. +
  5335. +       memset(sg_cpu, 0, sizeof(*sg_cpu));
  5336. +       sg_cpu->sg_policy = sg_policy;
  5337. +       sg_cpu->flags = SCHED_CPUFREQ_DL;
  5338. +       sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
  5339. +       cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
  5340. +                        policy_is_shared(policy) ?
  5341. +                           sugov_update_shared :
  5342. +                           sugov_update_single);
  5343. +   }
  5344. +   return 0;
  5345. +}
  5346. +
  5347. +static int sugov_stop(struct cpufreq_policy *policy)
  5348. +{
  5349. +   struct sugov_policy *sg_policy = policy->governor_data;
  5350. +   unsigned int cpu;
  5351. +
  5352. +   for_each_cpu(cpu, policy->cpus)
  5353. +       cpufreq_remove_update_util_hook(cpu);
  5354. +
  5355. +   synchronize_sched();
  5356. +
  5357. +   if (!policy->fast_switch_enabled) {
  5358. +       irq_work_sync(&sg_policy->irq_work);
  5359. +       kthread_cancel_work_sync(&sg_policy->work);
  5360. +   }
  5361. +   return 0;
  5362. +}
  5363. +
  5364. +static int sugov_limits(struct cpufreq_policy *policy)
  5365. +{
  5366. +   struct sugov_policy *sg_policy = policy->governor_data;
  5367. +
  5368. +   if (!policy->fast_switch_enabled) {
  5369. +       mutex_lock(&sg_policy->work_lock);
  5370. +       cpufreq_policy_apply_limits(policy);
  5371. +       mutex_unlock(&sg_policy->work_lock);
  5372. +   }
  5373. +
  5374. +   sg_policy->need_freq_update = true;
  5375. +
  5376. +   return 0;
  5377. +}
  5378. +
  5379. +static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
  5380. +               unsigned int event)
  5381. +{
  5382. +   switch(event) {
  5383. +   case CPUFREQ_GOV_POLICY_INIT:
  5384. +       return sugov_init(policy);
  5385. +   case CPUFREQ_GOV_POLICY_EXIT:
  5386. +       return sugov_exit(policy);
  5387. +   case CPUFREQ_GOV_START:
  5388. +       return sugov_start(policy);
  5389. +   case CPUFREQ_GOV_STOP:
  5390. +       return sugov_stop(policy);
  5391. +   case CPUFREQ_GOV_LIMITS:
  5392. +       return sugov_limits(policy);
  5393. +   default:
  5394. +       BUG();
  5395. +   }
  5396. +}
  5397. +
  5398. +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
  5399. +static
  5400. +#endif
  5401. +struct cpufreq_governor cpufreq_gov_schedutil = {
  5402. +   .name = "schedutil",
  5403. +   .governor = cpufreq_schedutil_cb,
  5404. +   .owner = THIS_MODULE,
  5405. +};
  5406. +
  5407. +static int __init sugov_register(void)
  5408. +{
  5409. +   return cpufreq_register_governor(&cpufreq_gov_schedutil);
  5410. +}
  5411. +fs_initcall(sugov_register);
  5412. diff -Nur /home/ninez/android/marlin/kernel/sched/cputime.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cputime.c
  5413. --- /home/ninez/android/marlin/kernel/sched/cputime.c   2018-08-10 01:54:08.563395055 -0400
  5414. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cputime.c   2018-08-11 23:57:17.128607487 -0400
  5415. @@ -306,6 +306,26 @@
  5416.     return false;
  5417.  }
  5418.  
  5419. +#ifdef CONFIG_64BIT
  5420. +static inline u64 read_sum_exec_runtime(struct task_struct *t)
  5421. +{
  5422. +   return t->se.sum_exec_runtime;
  5423. +}
  5424. +#else
  5425. +static u64 read_sum_exec_runtime(struct task_struct *t)
  5426. +{
  5427. +   u64 ns;
  5428. +   struct rq_flags rf;
  5429. +   struct rq *rq;
  5430. +
  5431. +   rq = task_rq_lock(t, &rf);
  5432. +   ns = t->se.sum_exec_runtime;
  5433. +   task_rq_unlock(rq, t, &rf);
  5434. +
  5435. +   return ns;
  5436. +}
  5437. +#endif
  5438. +
  5439.  /*
  5440.   * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
  5441.   * tasks (sum on group iteration) belonging to @tsk's group.
  5442. @@ -318,6 +338,17 @@
  5443.     unsigned int seq, nextseq;
  5444.     unsigned long flags;
  5445.  
  5446. +   /*
  5447. +    * Update current task runtime to account pending time since last
  5448. +    * scheduler action or thread_group_cputime() call. This thread group
  5449. +    * might have other running tasks on different CPUs, but updating
  5450. +    * their runtime can affect syscall performance, so we skip accounting
  5451. +    * those pending times and rely only on values updated on tick or
  5452. +    * other scheduler action.
  5453. +    */
  5454. +   if (same_thread_group(current, tsk))
  5455. +       (void) task_sched_runtime(current);
  5456. +
  5457.     rcu_read_lock();
  5458.     /* Attempt a lockless read on the first round. */
  5459.     nextseq = 0;
  5460. @@ -332,7 +363,7 @@
  5461.             task_cputime(t, &utime, &stime);
  5462.             times->utime += utime;
  5463.             times->stime += stime;
  5464. -           times->sum_exec_runtime += task_sched_runtime(t);
  5465. +           times->sum_exec_runtime += read_sum_exec_runtime(t);
  5466.         }
  5467.         /* If lockless access failed, take the lock. */
  5468.         nextseq = 1;
  5469. @@ -582,48 +613,43 @@
  5470.  }
  5471.  
  5472.  /*
  5473. - * Atomically advance counter to the new value. Interrupts, vcpu
  5474. - * scheduling, and scaling inaccuracies can cause cputime_advance
  5475. - * to be occasionally called with a new value smaller than counter.
  5476. - * Let's enforce atomicity.
  5477. + * Adjust tick based cputime random precision against scheduler runtime
  5478. + * accounting.
  5479.   *
  5480. - * Normally a caller will only go through this loop once, or not
  5481. - * at all in case a previous caller updated counter the same jiffy.
  5482. - */
  5483. -static void cputime_advance(cputime_t *counter, cputime_t new)
  5484. -{
  5485. -   cputime_t old;
  5486. -
  5487. -   while (new > (old = READ_ONCE(*counter)))
  5488. -       cmpxchg_cputime(counter, old, new);
  5489. -}
  5490. -
  5491. -/*
  5492. - * Adjust tick based cputime random precision against scheduler
  5493. - * runtime accounting.
  5494. + * Tick based cputime accounting depends on whether the scheduling timeslices
  5495. + * of a task happen to be interrupted by the timer or not.  Depending on these
  5496. + * circumstances, the number of these interrupts may be over- or
  5497. + * under-estimated, matching the real user and system cputime with a variable
  5498. + * precision.
  5499. + *
  5500. + * Fix this by scaling these tick based values against the total runtime
  5501. + * accounted by the CFS scheduler.
  5502. + *
  5503. + * This code provides the following guarantees:
  5504. + *
  5505. + *   stime + utime == rtime
  5506. + *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
  5507. + *
  5508. + * Assuming that rtime_i+1 >= rtime_i.
  5509.   */
  5510.  static void cputime_adjust(struct task_cputime *curr,
  5511. -              struct cputime *prev,
  5512. +              struct prev_cputime *prev,
  5513.                cputime_t *ut, cputime_t *st)
  5514.  {
  5515.     cputime_t rtime, stime, utime;
  5516. +   unsigned long flags;
  5517.  
  5518. -   /*
  5519. -    * Tick based cputime accounting depend on random scheduling
  5520. -    * timeslices of a task to be interrupted or not by the timer.
  5521. -    * Depending on these circumstances, the number of these interrupts
  5522. -    * may be over or under-optimistic, matching the real user and system
  5523. -    * cputime with a variable precision.
  5524. -    *
  5525. -    * Fix this by scaling these tick based values against the total
  5526. -    * runtime accounted by the CFS scheduler.
  5527. -    */
  5528. +   /* Serialize concurrent callers such that we can honour our guarantees */
  5529. +   raw_spin_lock_irqsave(&prev->lock, flags);
  5530.     rtime = nsecs_to_cputime(curr->sum_exec_runtime);
  5531.  
  5532.     /*
  5533. -    * Update userspace visible utime/stime values only if actual execution
  5534. -    * time is bigger than already exported. Note that can happen, that we
  5535. -    * provided bigger values due to scaling inaccuracy on big numbers.
  5536. +    * This is possible under two circumstances:
  5537. +    *  - rtime isn't monotonic after all (a bug);
  5538. +    *  - we got reordered by the lock.
  5539. +    *
  5540. +    * In both cases this acts as a filter such that the rest of the code
  5541. +    * can assume it is monotonic regardless of anything else.
  5542.      */
  5543.     if (prev->stime + prev->utime >= rtime)
  5544.         goto out;
  5545. @@ -633,22 +659,46 @@
  5546.  
  5547.     if (utime == 0) {
  5548.         stime = rtime;
  5549. -   } else if (stime == 0) {
  5550. -       utime = rtime;
  5551. -   } else {
  5552. -       cputime_t total = stime + utime;
  5553. +       goto update;
  5554. +   }
  5555.  
  5556. -       stime = scale_stime((__force u64)stime,
  5557. -                   (__force u64)rtime, (__force u64)total);
  5558. -       utime = rtime - stime;
  5559. +   if (stime == 0) {
  5560. +       utime = rtime;
  5561. +       goto update;
  5562.     }
  5563.  
  5564. -   cputime_advance(&prev->stime, stime);
  5565. -   cputime_advance(&prev->utime, utime);
  5566. +   stime = scale_stime((__force u64)stime, (__force u64)rtime,
  5567. +               (__force u64)(stime + utime));
  5568. +
  5569. +   /*
  5570. +    * Make sure stime doesn't go backwards; this preserves monotonicity
  5571. +    * for utime because rtime is monotonic.
  5572. +    *
  5573. +    *  utime_i+1 = rtime_i+1 - stime_i
  5574. +    *            = rtime_i+1 - (rtime_i - utime_i)
  5575. +    *            = (rtime_i+1 - rtime_i) + utime_i
  5576. +    *            >= utime_i
  5577. +    */
  5578. +   if (stime < prev->stime)
  5579. +       stime = prev->stime;
  5580. +   utime = rtime - stime;
  5581. +
  5582. +   /*
  5583. +    * Make sure utime doesn't go backwards; this still preserves
  5584. +    * monotonicity for stime, analogous argument to above.
  5585. +    */
  5586. +   if (utime < prev->utime) {
  5587. +       utime = prev->utime;
  5588. +       stime = rtime - utime;
  5589. +   }
  5590.  
  5591. +update:
  5592. +   prev->stime = stime;
  5593. +   prev->utime = utime;
  5594.  out:
  5595.     *ut = prev->utime;
  5596.     *st = prev->stime;
  5597. +   raw_spin_unlock_irqrestore(&prev->lock, flags);
  5598.  }
  5599.  
  5600.  void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
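One pass through the rewritten cputime_adjust() with small invented numbers shows how the clamps preserve the guarantees stated in its comment block.

/*
 * Invented sample: prev->stime = 6, prev->utime = 4, new rtime = 12,
 * tick-based stime = 9, utime = 1 (units are arbitrary cputime ticks).
 *
 *   scale_stime(9, 12, 10) ~= 10         (tick stime scaled onto rtime)
 *   stime = max(10, prev->stime) = 10
 *   utime = 12 - 10 = 2, but 2 < prev->utime, so
 *   utime = 4 and stime = 12 - 4 = 8
 *
 * Result: stime = 8 >= 6, utime = 4 >= 4 and stime + utime == rtime,
 * i.e. both monotonicity and the sum invariant hold.
 */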
  5601. diff -Nur /home/ninez/android/marlin/kernel/sched/deadline.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/deadline.c
  5602. --- /home/ninez/android/marlin/kernel/sched/deadline.c  2018-08-10 01:54:08.563395055 -0400
  5603. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/deadline.c  2018-08-26 16:43:11.647206295 -0400
  5604. @@ -18,6 +18,8 @@
  5605.  
  5606.  #include <linux/slab.h>
  5607.  
  5608. +#include "walt.h"
  5609. +
  5610.  struct dl_bandwidth def_dl_bandwidth;
  5611.  
  5612.  static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
  5613. @@ -87,7 +89,7 @@
  5614.     dl_b->total_bw = 0;
  5615.  }
  5616.  
  5617. -void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
  5618. +void init_dl_rq(struct dl_rq *dl_rq)
  5619.  {
  5620.     dl_rq->rb_root = RB_ROOT;
  5621.  
  5622. @@ -152,7 +154,7 @@
  5623.  {
  5624.     struct task_struct *p = dl_task_of(dl_se);
  5625.  
  5626. -   if (p->nr_cpus_allowed > 1)
  5627. +   if (tsk_nr_cpus_allowed(p) > 1)
  5628.         dl_rq->dl_nr_migratory++;
  5629.  
  5630.     update_dl_migration(dl_rq);
  5631. @@ -162,7 +164,7 @@
  5632.  {
  5633.     struct task_struct *p = dl_task_of(dl_se);
  5634.  
  5635. -   if (p->nr_cpus_allowed > 1)
  5636. +   if (tsk_nr_cpus_allowed(p) > 1)
  5637.         dl_rq->dl_nr_migratory--;
  5638.  
  5639.     update_dl_migration(dl_rq);
  5640. @@ -231,17 +233,23 @@
  5641.     return dl_task(prev);
  5642.  }
  5643.  
  5644. -static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
  5645. +static DEFINE_PER_CPU(struct callback_head, dl_push_head);
  5646. +static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
  5647.  
  5648.  static void push_dl_tasks(struct rq *);
  5649. +static void pull_dl_task(struct rq *);
  5650.  
  5651.  static inline void queue_push_tasks(struct rq *rq)
  5652.  {
  5653.     if (!has_pushable_dl_tasks(rq))
  5654.         return;
  5655.  
  5656. -   queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu),
  5657. -       push_dl_tasks);
  5658. +   queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
  5659. +}
  5660. +
  5661. +static inline void queue_pull_task(struct rq *rq)
  5662. +{
  5663. +   queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
  5664.  }
  5665.  
  5666.  static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
  5667. @@ -322,14 +330,17 @@
  5668.     return false;
  5669.  }
  5670.  
  5671. -static inline int pull_dl_task(struct rq *rq)
  5672. +static inline void pull_dl_task(struct rq *rq)
  5673.  {
  5674. -   return 0;
  5675.  }
  5676.  
  5677.  static inline void queue_push_tasks(struct rq *rq)
  5678.  {
  5679.  }
  5680. +
  5681. +static inline void queue_pull_task(struct rq *rq)
  5682. +{
  5683. +}
  5684.  #endif /* CONFIG_SMP */
  5685.  
  5686.  static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
  5687. @@ -450,13 +461,13 @@
  5688.   *
  5689.   * This function returns true if:
  5690.   *
  5691. - *   runtime / (deadline - t) > dl_runtime / dl_deadline ,
  5692. + *   runtime / (deadline - t) > dl_runtime / dl_period ,
  5693.   *
  5694.   * IOW we can't recycle current parameters.
  5695.   *
  5696. - * Notice that the bandwidth check is done against the deadline. For
  5697. + * Notice that the bandwidth check is done against the period. For
  5698.   * task with deadline equal to period this is the same of using
  5699. - * dl_period instead of dl_deadline in the equation above.
  5700. + * dl_deadline instead of dl_period in the equation above.
  5701.   */
  5702.  static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
  5703.                    struct sched_dl_entity *pi_se, u64 t)
  5704. @@ -481,7 +492,7 @@
  5705.      * of anything below microseconds resolution is actually fiction
  5706.      * (but still we want to give the user that illusion >;).
  5707.      */
  5708. -   left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
  5709. +   left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
  5710.     right = ((dl_se->deadline - t) >> DL_SCALE) *
  5711.         (pi_se->dl_runtime >> DL_SCALE);
  5712.  
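The overflow test above compares runtime / (deadline - t) against dl_runtime / dl_period by cross-multiplying, with each operand shifted right by DL_SCALE first so the 64-bit products cannot overflow. A small numeric instance, with invented values and the shifts omitted for readability:

/*
 * Invented values: remaining runtime = 5 ms, deadline - t = 8 ms,
 * dl_runtime = 10 ms, dl_period = 20 ms.
 *
 *   runtime / (deadline - t) = 5 / 8   = 0.625
 *   dl_runtime / dl_period   = 10 / 20 = 0.5
 *
 * Cross-multiplied: left = 20 * 5 = 100 > right = 8 * 10 = 80, so the
 * entity is declared to overflow and cannot keep its current parameters.
 */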
  5713. @@ -596,16 +607,10 @@
  5714.                              struct sched_dl_entity,
  5715.                              dl_timer);
  5716.     struct task_struct *p = dl_task_of(dl_se);
  5717. +   struct rq_flags rf;
  5718.     struct rq *rq;
  5719. -again:
  5720. -   rq = task_rq(p);
  5721. -   raw_spin_lock(&rq->lock);
  5722.  
  5723. -   if (rq != task_rq(p)) {
  5724. -       /* Task was moved, retrying. */
  5725. -       raw_spin_unlock(&rq->lock);
  5726. -       goto again;
  5727. -   }
  5728. +   rq = task_rq_lock(p, &rf);
  5729.  
  5730.     /*
  5731.      * The task might have changed its scheduling policy to something
  5732. @@ -686,12 +691,19 @@
  5733.      * Queueing this task back might have overloaded rq, check if we need
  5734.      * to kick someone away.
  5735.      */
  5736. -   if (has_pushable_dl_tasks(rq))
  5737. +   if (has_pushable_dl_tasks(rq)) {
  5738. +       /*
  5739. +        * Nothing relies on rq->lock after this, so it's safe to drop
  5740. +        * rq->lock.
  5741. +        */
  5742. +       lockdep_unpin_lock(&rq->lock, rf.cookie);
  5743.         push_dl_task(rq);
  5744. +       lockdep_repin_lock(&rq->lock, rf.cookie);
  5745. +   }
  5746.  #endif
  5747.  
  5748.  unlock:
  5749. -   raw_spin_unlock(&rq->lock);
  5750. +   task_rq_unlock(rq, p, &rf);
  5751.  
  5752.     /*
  5753.      * This can free the task_struct, including this hrtimer, do not touch
  5754. @@ -711,7 +723,7 @@
  5755.  }
  5756.  
  5757.  static
  5758. -int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
  5759. +int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
  5760.  {
  5761.     return (dl_se->runtime <= 0);
  5762.  }
  5763. @@ -743,6 +755,9 @@
  5764.     if (unlikely((s64)delta_exec <= 0))
  5765.         return;
  5766.  
  5767. +   /* kick cpufreq (see the comment in kernel/sched/sched.h). */
  5768. +   cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
  5769. +
  5770.     schedstat_set(curr->se.statistics.exec_max,
  5771.               max(curr->se.statistics.exec_max, delta_exec));
  5772.  
  5773. @@ -753,7 +768,7 @@
  5774.     cpuacct_charge(curr, delta_exec);
  5775.  
  5776.     dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
  5777. -   if (dl_runtime_exceeded(rq, dl_se)) {
  5778. +   if (dl_runtime_exceeded(dl_se)) {
  5779.         dl_se->dl_throttled = 1;
  5780.         __dequeue_task_dl(rq, curr, 0);
  5781.         if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
  5782. @@ -869,6 +884,7 @@
  5783.     WARN_ON(!dl_prio(prio));
  5784.     dl_rq->dl_nr_running++;
  5785.     add_nr_running(rq_of_dl_rq(dl_rq), 1);
  5786. +   walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
  5787.  
  5788.     inc_dl_deadline(dl_rq, deadline);
  5789.     inc_dl_migration(dl_se, dl_rq);
  5790. @@ -883,6 +899,7 @@
  5791.     WARN_ON(!dl_rq->dl_nr_running);
  5792.     dl_rq->dl_nr_running--;
  5793.     sub_nr_running(rq_of_dl_rq(dl_rq), 1);
  5794. +   walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
  5795.  
  5796.     dec_dl_deadline(dl_rq, dl_se->deadline);
  5797.     dec_dl_migration(dl_se, dl_rq);
  5798. @@ -969,7 +986,7 @@
  5799.  
  5800.     /*
  5801.      * Use the scheduling parameters of the top pi-waiter
  5802. -    * task if we have one and its (relative) deadline is
  5803. +    * task if we have one and its (absolute) deadline is
  5804.      * smaller than our one... OTW we keep our runtime and
  5805.      * deadline.
  5806.      */
  5807. @@ -998,7 +1015,7 @@
  5808.  
  5809.     enqueue_dl_entity(&p->dl, pi_se, flags);
  5810.  
  5811. -   if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
  5812. +   if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
  5813.         enqueue_pushable_dl_task(rq, p);
  5814.  }
  5815.  
  5816. @@ -1038,7 +1055,14 @@
  5817.         rq->curr->dl.dl_yielded = 1;
  5818.         p->dl.runtime = 0;
  5819.     }
  5820. +   update_rq_clock(rq);
  5821.     update_curr_dl(rq);
  5822. +   /*
  5823. +    * Tell update_rq_clock() that we've just updated,
  5824. +    * so we don't do microscopic update in schedule()
  5825. +    * and double the fastpath cost.
  5826. +    */
  5827. +   rq_clock_skip_update(rq, true);
  5828.  }
  5829.  
  5830.  #ifdef CONFIG_SMP
  5831. @@ -1046,12 +1070,13 @@
  5832.  static int find_later_rq(struct task_struct *task);
  5833.  
  5834.  static int
  5835. -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
  5836. +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
  5837. +         int sibling_count_hint)
  5838.  {
  5839.     struct task_struct *curr;
  5840.     struct rq *rq;
  5841.  
  5842. -   if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
  5843. +   if (sd_flag != SD_BALANCE_WAKE)
  5844.         goto out;
  5845.  
  5846.     rq = cpu_rq(cpu);
  5847. @@ -1069,12 +1094,15 @@
  5848.      * try to make it stay here, it might be important.
  5849.      */
  5850.     if (unlikely(dl_task(curr)) &&
  5851. -       (curr->nr_cpus_allowed < 2 ||
  5852. +       (tsk_nr_cpus_allowed(curr) < 2 ||
  5853.          !dl_entity_preempt(&p->dl, &curr->dl)) &&
  5854. -       (p->nr_cpus_allowed > 1)) {
  5855. +       (tsk_nr_cpus_allowed(p) > 1)) {
  5856.         int target = find_later_rq(p);
  5857.  
  5858. -       if (target != -1)
  5859. +       if (target != -1 &&
  5860. +               (dl_time_before(p->dl.deadline,
  5861. +                   cpu_rq(target)->dl.earliest_dl.curr) ||
  5862. +               (cpu_rq(target)->dl.dl_nr_running == 0)))
  5863.             cpu = target;
  5864.     }
  5865.     rcu_read_unlock();
  5866. @@ -1089,7 +1117,7 @@
  5867.      * Current can't be migrated, useless to reschedule,
  5868.      * let's hope p can move out.
  5869.      */
  5870. -   if (rq->curr->nr_cpus_allowed == 1 ||
  5871. +   if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
  5872.         cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
  5873.         return;
  5874.  
  5875. @@ -1097,15 +1125,13 @@
  5876.      * p is migratable, so let's not schedule it and
  5877.      * see if it is pushed or pulled somewhere else.
  5878.      */
  5879. -   if (p->nr_cpus_allowed != 1 &&
  5880. +   if (tsk_nr_cpus_allowed(p) != 1 &&
  5881.         cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
  5882.         return;
  5883.  
  5884.     resched_curr(rq);
  5885.  }
  5886.  
  5887. -static int pull_dl_task(struct rq *this_rq);
  5888. -
  5889.  #endif /* CONFIG_SMP */
  5890.  
  5891.  /*
  5892. @@ -1136,6 +1162,10 @@
  5893.  {
  5894.     hrtick_start(rq, p->dl.runtime);
  5895.  }
  5896. +#else /* !CONFIG_SCHED_HRTICK */
  5897. +static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
  5898. +{
  5899. +}
  5900.  #endif
  5901.  
  5902.  static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
  5903. @@ -1149,7 +1179,8 @@
  5904.     return rb_entry(left, struct sched_dl_entity, rb_node);
  5905.  }
  5906.  
  5907. -struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
  5908. +struct task_struct *
  5909. +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  5910.  {
  5911.     struct sched_dl_entity *dl_se;
  5912.     struct task_struct *p;
  5913. @@ -1158,7 +1189,15 @@
  5914.     dl_rq = &rq->dl;
  5915.  
  5916.     if (need_pull_dl_task(rq, prev)) {
  5917. +       /*
  5918. +        * This is OK: current is on_cpu, which keeps it from being
  5919. +        * picked for load-balance; preemption/IRQs are still disabled,
  5920. +        * avoiding further scheduler activity on it; and we're being
  5921. +        * very careful to re-start the picking loop.
  5922. +        */
  5923. +       lockdep_unpin_lock(&rq->lock, cookie);
  5924.         pull_dl_task(rq);
  5925. +       lockdep_repin_lock(&rq->lock, cookie);
  5926.         /*
  5927.          * pull_rt_task() can drop (and re-acquire) rq->lock; this
  5928.          * means a stop task can slip in, in which case we need to
  5929. @@ -1189,10 +1228,8 @@
  5930.     /* Running task will never be pushed. */
  5931.         dequeue_pushable_dl_task(rq, p);
  5932.  
  5933. -#ifdef CONFIG_SCHED_HRTICK
  5934.     if (hrtick_enabled(rq))
  5935.         start_hrtick_dl(rq, p);
  5936. -#endif
  5937.  
  5938.     queue_push_tasks(rq);
  5939.  
  5940. @@ -1203,7 +1240,7 @@
  5941.  {
  5942.     update_curr_dl(rq);
  5943.  
  5944. -   if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
  5945. +   if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
  5946.         enqueue_pushable_dl_task(rq, p);
  5947.  }
  5948.  
  5949. @@ -1211,10 +1248,14 @@
  5950.  {
  5951.     update_curr_dl(rq);
  5952.  
  5953. -#ifdef CONFIG_SCHED_HRTICK
  5954. -   if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
  5955. +   /*
  5956. +    * Even when we have runtime, update_curr_dl() might have resulted in us
  5957. +    * not being the leftmost task anymore. In that case NEED_RESCHED will
  5958. +    * be set and schedule() will start a new hrtick for the next task.
  5959. +    */
  5960. +   if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
  5961. +       is_leftmost(p, &rq->dl))
  5962.         start_hrtick_dl(rq, p);
  5963. -#endif
  5964.  }
  5965.  
  5966.  static void task_fork_dl(struct task_struct *p)
  5967. @@ -1287,6 +1328,32 @@
  5968.     return NULL;
  5969.  }
  5970.  
  5971. +/*
  5972. + * Return the earliest pushable rq's task, which is suitable to be executed
  5973. + * on the CPU, NULL otherwise:
  5974. + */
  5975. +static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
  5976. +{
  5977. +   struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
  5978. +   struct task_struct *p = NULL;
  5979. +
  5980. +   if (!has_pushable_dl_tasks(rq))
  5981. +       return NULL;
  5982. +
  5983. +next_node:
  5984. +   if (next_node) {
  5985. +       p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
  5986. +
  5987. +       if (pick_dl_task(rq, p, cpu))
  5988. +           return p;
  5989. +
  5990. +       next_node = rb_next(next_node);
  5991. +       goto next_node;
  5992. +   }
  5993. +
  5994. +   return NULL;
  5995. +}
  5996. +
  5997.  static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
  5998.  
  5999.  static int find_later_rq(struct task_struct *task)
  6000. @@ -1300,16 +1367,13 @@
  6001.     if (unlikely(!later_mask))
  6002.         return -1;
  6003.  
  6004. -   if (task->nr_cpus_allowed == 1)
  6005. +   if (tsk_nr_cpus_allowed(task) == 1)
  6006.         return -1;
  6007.  
  6008.     /*
  6009.      * We have to consider system topology and task affinity
  6010.      * first, then we can look for a suitable cpu.
  6011.      */
  6012. -   cpumask_copy(later_mask, task_rq(task)->rd->span);
  6013. -   cpumask_and(later_mask, later_mask, cpu_active_mask);
  6014. -   cpumask_and(later_mask, later_mask, &task->cpus_allowed);
  6015.     best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
  6016.             task, later_mask);
  6017.     if (best_cpu == -1)
  6018. @@ -1393,6 +1457,18 @@
  6019.  
  6020.         later_rq = cpu_rq(cpu);
  6021.  
  6022. +       if (later_rq->dl.dl_nr_running &&
  6023. +           !dl_time_before(task->dl.deadline,
  6024. +                   later_rq->dl.earliest_dl.curr)) {
  6025. +           /*
  6026. +            * Target rq has tasks of equal or earlier deadline,
  6027. +            * retrying does not release any lock and is unlikely
  6028. +            * to yield a different result.
  6029. +            */
  6030. +           later_rq = NULL;
  6031. +           break;
  6032. +       }
  6033. +
  6034.         /* Retry if something changed. */
  6035.         if (double_lock_balance(rq, later_rq)) {
  6036.             if (unlikely(task_rq(task) != rq ||
  6037. @@ -1436,7 +1512,7 @@
  6038.  
  6039.     BUG_ON(rq->cpu != task_cpu(p));
  6040.     BUG_ON(task_current(rq, p));
  6041. -   BUG_ON(p->nr_cpus_allowed <= 1);
  6042. +   BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
  6043.  
  6044.     BUG_ON(!task_on_rq_queued(p));
  6045.     BUG_ON(!dl_task(p));
  6046. @@ -1453,6 +1529,7 @@
  6047.  {
  6048.     struct task_struct *next_task;
  6049.     struct rq *later_rq;
  6050. +   int ret = 0;
  6051.  
  6052.     if (!rq->dl.overloaded)
  6053.         return 0;
  6054. @@ -1474,7 +1551,7 @@
  6055.      */
  6056.     if (dl_task(rq->curr) &&
  6057.         dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
  6058. -       rq->curr->nr_cpus_allowed > 1) {
  6059. +       tsk_nr_cpus_allowed(rq->curr) > 1) {
  6060.         resched_curr(rq);
  6061.         return 0;
  6062.     }
  6063. @@ -1498,7 +1575,6 @@
  6064.              * The task is still there. We don't try
  6065.              * again, some other cpu will pull it when ready.
  6066.              */
  6067. -           dequeue_pushable_dl_task(rq, next_task);
  6068.             goto out;
  6069.         }
  6070.  
  6071. @@ -1513,9 +1589,12 @@
  6072.  
  6073.     deactivate_task(rq, next_task, 0);
  6074.     clear_average_bw(&next_task->dl, &rq->dl);
  6075. +   next_task->on_rq = TASK_ON_RQ_MIGRATING;
  6076.     set_task_cpu(next_task, later_rq->cpu);
  6077. +   next_task->on_rq = TASK_ON_RQ_QUEUED;
  6078.     add_average_bw(&next_task->dl, &later_rq->dl);
  6079.     activate_task(later_rq, next_task, 0);
  6080. +   ret = 1;
  6081.  
  6082.     resched_curr(later_rq);
  6083.  
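
push_dl_task() and pull_dl_task() now mark the task TASK_ON_RQ_MIGRATING around set_task_cpu(), so code that observes the task mid-move (for example the wait-time accounting reworked later in this diff) can tell a migration apart from an ordinary dequeue/enqueue. A toy sketch of that window; the state names follow the kernel's, everything else is illustrative:

#include <stdio.h>

#define TOY_ON_RQ_QUEUED	1
#define TOY_ON_RQ_MIGRATING	2

struct toy_task {
	int on_rq;
	int cpu;
};

/* Observer: is this task currently being moved between runqueues? */
static int toy_on_rq_migrating(const struct toy_task *p)
{
	return p->on_rq == TOY_ON_RQ_MIGRATING;
}

static void toy_set_task_cpu(struct toy_task *p, int cpu)
{
	if (toy_on_rq_migrating(p))
		printf("observer sees a migration, not a fresh wakeup\n");
	p->cpu = cpu;
}

int main(void)
{
	struct toy_task p = { TOY_ON_RQ_QUEUED, 0 };

	p.on_rq = TOY_ON_RQ_MIGRATING;	/* open the migration window */
	toy_set_task_cpu(&p, 2);
	p.on_rq = TOY_ON_RQ_QUEUED;	/* close it on the destination rq */

	printf("now on CPU%d\n", p.cpu);
	return 0;
}
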
  6084. @@ -1524,25 +1603,26 @@
  6085.  out:
  6086.     put_task_struct(next_task);
  6087.  
  6088. -   return 1;
  6089. +   return ret;
  6090.  }
  6091.  
  6092.  static void push_dl_tasks(struct rq *rq)
  6093.  {
  6094. -   /* Terminates as it moves a -deadline task */
  6095. +   /* push_dl_task() will return true if it moved a -deadline task */
  6096.     while (push_dl_task(rq))
  6097.         ;
  6098.  }
  6099.  
  6100. -static int pull_dl_task(struct rq *this_rq)
  6101. +static void pull_dl_task(struct rq *this_rq)
  6102.  {
  6103. -   int this_cpu = this_rq->cpu, ret = 0, cpu;
  6104. +   int this_cpu = this_rq->cpu, cpu;
  6105.     struct task_struct *p;
  6106. +   bool resched = false;
  6107.     struct rq *src_rq;
  6108.     u64 dmin = LONG_MAX;
  6109.  
  6110.     if (likely(!dl_overloaded(this_rq)))
  6111. -       return 0;
  6112. +       return;
  6113.  
  6114.     /*
  6115.      * Match the barrier from dl_set_overloaded; this guarantees that if we
  6116. @@ -1575,7 +1655,7 @@
  6117.         if (src_rq->dl.dl_nr_running <= 1)
  6118.             goto skip;
  6119.  
  6120. -       p = pick_next_earliest_dl_task(src_rq, this_cpu);
  6121. +       p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
  6122.  
  6123.         /*
  6124.          * We found a task to be pulled if:
  6125. @@ -1597,11 +1677,13 @@
  6126.                        src_rq->curr->dl.deadline))
  6127.                 goto skip;
  6128.  
  6129. -           ret = 1;
  6130. +           resched = true;
  6131.  
  6132.             deactivate_task(src_rq, p, 0);
  6133.             clear_average_bw(&p->dl, &src_rq->dl);
  6134. +           p->on_rq = TASK_ON_RQ_MIGRATING;
  6135.             set_task_cpu(p, this_cpu);
  6136. +           p->on_rq = TASK_ON_RQ_QUEUED;
  6137.             add_average_bw(&p->dl, &this_rq->dl);
  6138.             activate_task(this_rq, p, 0);
  6139.             dmin = p->dl.deadline;
  6140. @@ -1612,7 +1694,8 @@
  6141.         double_unlock_balance(this_rq, src_rq);
  6142.     }
  6143.  
  6144. -   return ret;
  6145. +   if (resched)
  6146. +       resched_curr(this_rq);
  6147.  }
  6148.  
  6149.  /*
  6150. @@ -1623,11 +1706,10 @@
  6151.  {
  6152.     if (!task_running(rq, p) &&
  6153.         !test_tsk_need_resched(rq->curr) &&
  6154. -       has_pushable_dl_tasks(rq) &&
  6155. -       p->nr_cpus_allowed > 1 &&
  6156. +       tsk_nr_cpus_allowed(p) > 1 &&
  6157.         dl_task(rq->curr) &&
  6158. -       (rq->curr->nr_cpus_allowed < 2 ||
  6159. -        dl_entity_preempt(&rq->curr->dl, &p->dl))) {
  6160. +       (tsk_nr_cpus_allowed(rq->curr) < 2 ||
  6161. +        !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
  6162.         push_dl_tasks(rq);
  6163.     }
  6164.  }
  6165. @@ -1635,44 +1717,34 @@
  6166.  static void set_cpus_allowed_dl(struct task_struct *p,
  6167.                 const struct cpumask *new_mask)
  6168.  {
  6169. +   struct root_domain *src_rd;
  6170.     struct rq *rq;
  6171. -   int weight;
  6172.  
  6173.     BUG_ON(!dl_task(p));
  6174.  
  6175. -   /*
  6176. -    * Update only if the task is actually running (i.e.,
  6177. -    * it is on the rq AND it is not throttled).
  6178. -    */
  6179. -   if (!on_dl_rq(&p->dl))
  6180. -       return;
  6181. -
  6182. -   weight = cpumask_weight(new_mask);
  6183. -
  6184. -   /*
  6185. -    * Only update if the process changes its state from whether it
  6186. -    * can migrate or not.
  6187. -    */
  6188. -   if ((p->nr_cpus_allowed > 1) == (weight > 1))
  6189. -       return;
  6190. -
  6191.     rq = task_rq(p);
  6192. -
  6193. +   src_rd = rq->rd;
  6194.     /*
  6195. -    * The process used to be able to migrate OR it can now migrate
  6196. +    * Migrating a SCHED_DEADLINE task between exclusive
  6197. +    * cpusets (different root_domains) entails a bandwidth
  6198. +    * update. We already made space for us in the destination
  6199. +    * domain (see cpuset_can_attach()).
  6200.      */
  6201. -   if (weight <= 1) {
  6202. -       if (!task_current(rq, p))
  6203. -           dequeue_pushable_dl_task(rq, p);
  6204. -       BUG_ON(!rq->dl.dl_nr_migratory);
  6205. -       rq->dl.dl_nr_migratory--;
  6206. -   } else {
  6207. -       if (!task_current(rq, p))
  6208. -           enqueue_pushable_dl_task(rq, p);
  6209. -       rq->dl.dl_nr_migratory++;
  6210. +   if (!cpumask_intersects(src_rd->span, new_mask)) {
  6211. +       struct dl_bw *src_dl_b;
  6212. +
  6213. +       src_dl_b = dl_bw_of(cpu_of(rq));
  6214. +       /*
  6215. +        * We now free resources of the root_domain we are migrating
  6216. +        * off. In the worst case, sched_setattr() may temporarily fail
  6217. +        * until we complete the update.
  6218. +        */
  6219. +       raw_spin_lock(&src_dl_b->lock);
  6220. +       __dl_clear(src_dl_b, p->dl.dl_bw);
  6221. +       raw_spin_unlock(&src_dl_b->lock);
  6222.     }
  6223.  
  6224. -   update_dl_migration(&rq->dl);
  6225. +   set_cpus_allowed_common(p, new_mask);
  6226.  }
  6227.  
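
The rewritten set_cpus_allowed_dl() releases the task's admitted bandwidth from the root domain it leaves once the new mask no longer intersects the old span; per the comment, the destination domain was already charged in cpuset_can_attach(). Per-domain admission control is essentially a running total, as in this minimal sketch (struct and field names here are illustrative, not the kernel's exact layout):

#include <stdio.h>
#include <stdint.h>

/* One admission-control bucket per root domain */
struct toy_dl_bw {
	uint64_t bw;		/* bandwidth available to -deadline tasks */
	uint64_t total_bw;	/* bandwidth already promised to tasks */
};

static void toy_dl_clear(struct toy_dl_bw *b, uint64_t tsk_bw)
{
	b->total_bw -= tsk_bw;	/* task leaves this domain */
}

static void toy_dl_add(struct toy_dl_bw *b, uint64_t tsk_bw)
{
	b->total_bw += tsk_bw;	/* task admitted into this domain */
}

int main(void)
{
	struct toy_dl_bw src = { 1000, 300 }, dst = { 1000, 100 };
	uint64_t tsk_bw = 200;

	toy_dl_add(&dst, tsk_bw);	/* done at cpuset_can_attach() time */
	toy_dl_clear(&src, tsk_bw);	/* done here, when the mask changes */

	printf("src=%llu dst=%llu\n",
	       (unsigned long long)src.total_bw,
	       (unsigned long long)dst.total_bw);
	return 0;
}
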
  6228.  /* Assumes rq->lock is held */
  6229. @@ -1681,6 +1753,7 @@
  6230.     if (rq->dl.overloaded)
  6231.         dl_set_overload(rq);
  6232.  
  6233. +   cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
  6234.     if (rq->dl.dl_nr_running > 0)
  6235.         cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
  6236.  }
  6237. @@ -1692,9 +1765,10 @@
  6238.         dl_clear_overload(rq);
  6239.  
  6240.     cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
  6241. +   cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
  6242.  }
  6243.  
  6244. -void init_sched_dl_class(void)
  6245. +void __init init_sched_dl_class(void)
  6246.  {
  6247.     unsigned int i;
  6248.  
  6249. @@ -1726,8 +1800,7 @@
  6250.     if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
  6251.         return;
  6252.  
  6253. -   if (pull_dl_task(rq))
  6254. -       resched_curr(rq);
  6255. +   queue_pull_task(rq);
  6256.  }
  6257.  
  6258.  /*
  6259. @@ -1736,28 +1809,15 @@
  6260.   */
  6261.  static void switched_to_dl(struct rq *rq, struct task_struct *p)
  6262.  {
  6263. -   int check_resched = 1;
  6264. -
  6265. -   /*
  6266. -    * If p is throttled, don't consider the possibility
  6267. -    * of preempting rq->curr, the check will be done right
  6268. -    * after its runtime will get replenished.
  6269. -    */
  6270. -   if (unlikely(p->dl.dl_throttled))
  6271. -       return;
  6272. -
  6273.     if (task_on_rq_queued(p) && rq->curr != p) {
  6274.  #ifdef CONFIG_SMP
  6275. -       if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
  6276. -           /* Only reschedule if pushing failed */
  6277. -           check_resched = 0;
  6278. -#endif /* CONFIG_SMP */
  6279. -       if (check_resched) {
  6280. -           if (dl_task(rq->curr))
  6281. -               check_preempt_curr_dl(rq, p, 0);
  6282. -           else
  6283. -               resched_curr(rq);
  6284. -       }
  6285. +       if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
  6286. +           queue_push_tasks(rq);
  6287. +#endif
  6288. +       if (dl_task(rq->curr))
  6289. +           check_preempt_curr_dl(rq, p, 0);
  6290. +       else
  6291. +           resched_curr(rq);
  6292.     }
  6293.  }
  6294.  
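
switched_from_dl(), switched_to_dl() and prio_changed_dl() no longer push or pull directly; they queue the work through queue_pull_task()/queue_push_tasks(), which defer the actual balancing until the caller is done juggling runqueue locks. The underlying idea is a per-runqueue list of callbacks drained at a safe point; a compact userspace analogue (the list handling below is a simplified assumption, not the kernel's exact implementation):

#include <stdio.h>
#include <stddef.h>

struct toy_rq;

struct balance_cb {
	void (*func)(struct toy_rq *rq);
	struct balance_cb *next;
	int queued;
};

struct toy_rq {
	struct balance_cb *balance_callback;	/* pending deferred work */
};

static void queue_balance_cb(struct toy_rq *rq, struct balance_cb *cb,
			     void (*func)(struct toy_rq *))
{
	if (cb->queued)			/* only queue each callback once */
		return;
	cb->queued = 1;
	cb->func = func;
	cb->next = rq->balance_callback;
	rq->balance_callback = cb;
}

static void run_balance_cbs(struct toy_rq *rq)	/* called once locking is safe */
{
	struct balance_cb *cb = rq->balance_callback;

	rq->balance_callback = NULL;
	while (cb) {
		struct balance_cb *next = cb->next;

		cb->queued = 0;
		cb->next = NULL;
		cb->func(rq);
		cb = next;
	}
}

static void toy_push_dl_tasks(struct toy_rq *rq) { printf("push ran late\n"); }

int main(void)
{
	struct toy_rq rq = { NULL };
	struct balance_cb cb = { NULL, NULL, 0 };

	queue_balance_cb(&rq, &cb, toy_push_dl_tasks);	/* e.g. switched_to_dl() */
	run_balance_cbs(&rq);				/* after the lock dance */
	return 0;
}
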
  6295. @@ -1777,15 +1837,14 @@
  6296.          * or lowering its prio, so...
  6297.          */
  6298.         if (!rq->dl.overloaded)
  6299. -           pull_dl_task(rq);
  6300. +           queue_pull_task(rq);
  6301.  
  6302.         /*
  6303.          * If we now have a earlier deadline task than p,
  6304.          * then reschedule, provided p is still on this
  6305.          * runqueue.
  6306.          */
  6307. -       if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
  6308. -           rq->curr == p)
  6309. +       if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
  6310.             resched_curr(rq);
  6311.  #else
  6312.         /*
  6313. diff -Nur /home/ninez/android/marlin/kernel/sched/debug.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/debug.c
  6314. --- /home/ninez/android/marlin/kernel/sched/debug.c 2018-08-10 01:54:08.563395055 -0400
  6315. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/debug.c 2018-08-26 16:43:11.647206295 -0400
  6316. @@ -65,8 +65,12 @@
  6317.  
  6318.  #define P(F) \
  6319.     SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
  6320. +#define P_SCHEDSTAT(F) \
  6321. +   SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)schedstat_val(F))
  6322.  #define PN(F) \
  6323.     SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
  6324. +#define PN_SCHEDSTAT(F) \
  6325. +   SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
  6326.  
  6327.     if (!se)
  6328.         return;
  6329. @@ -74,25 +78,27 @@
  6330.     PN(se->exec_start);
  6331.     PN(se->vruntime);
  6332.     PN(se->sum_exec_runtime);
  6333. -#ifdef CONFIG_SCHEDSTATS
  6334. -   PN(se->statistics.wait_start);
  6335. -   PN(se->statistics.sleep_start);
  6336. -   PN(se->statistics.block_start);
  6337. -   PN(se->statistics.sleep_max);
  6338. -   PN(se->statistics.block_max);
  6339. -   PN(se->statistics.exec_max);
  6340. -   PN(se->statistics.slice_max);
  6341. -   PN(se->statistics.wait_max);
  6342. -   PN(se->statistics.wait_sum);
  6343. -   P(se->statistics.wait_count);
  6344. -#endif
  6345. +   if (schedstat_enabled()) {
  6346. +       PN_SCHEDSTAT(se->statistics.wait_start);
  6347. +       PN_SCHEDSTAT(se->statistics.sleep_start);
  6348. +       PN_SCHEDSTAT(se->statistics.block_start);
  6349. +       PN_SCHEDSTAT(se->statistics.sleep_max);
  6350. +       PN_SCHEDSTAT(se->statistics.block_max);
  6351. +       PN_SCHEDSTAT(se->statistics.exec_max);
  6352. +       PN_SCHEDSTAT(se->statistics.slice_max);
  6353. +       PN_SCHEDSTAT(se->statistics.wait_max);
  6354. +       PN_SCHEDSTAT(se->statistics.wait_sum);
  6355. +       P_SCHEDSTAT(se->statistics.wait_count);
  6356. +   }
  6357.     P(se->load.weight);
  6358.  #ifdef CONFIG_SMP
  6359.     P(se->avg.load_avg);
  6360.     P(se->avg.util_avg);
  6361. -   P(se->avg.util_est);
  6362.  #endif
  6363. +
  6364. +#undef PN_SCHEDSTAT
  6365.  #undef PN
  6366. +#undef P_SCHEDSTAT
  6367.  #undef P
  6368.  }
  6369.  #endif
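
The debug printers above move from compile-time #ifdef CONFIG_SCHEDSTATS blocks to a runtime schedstat_enabled() check plus P_SCHEDSTAT()/PN_SCHEDSTAT() helpers built on schedstat_val(). The gate-then-stringify pattern in isolation (toy macros, not the kernel's exact definitions):

#include <stdio.h>

static int stats_enabled = 1;		/* stand-in for schedstat_enabled() */

struct toy_stats {
	long long wait_sum;
	long long wait_count;
};

/* Print "name: value", deriving the label from the expression itself,
 * just like the P()/P_SCHEDSTAT() printers in debug.c. */
#define P_STAT(F) \
	printf("  .%-30s: %lld\n", #F, (long long)(F))

int main(void)
{
	struct toy_stats st = { 123456, 7 };

	if (stats_enabled) {		/* runtime gate replaces the old #ifdef */
		P_STAT(st.wait_sum);
		P_STAT(st.wait_count);
	}
	return 0;
}
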
  6370. @@ -123,13 +129,17 @@
  6371.         (long long)(p->nvcsw + p->nivcsw),
  6372.         p->prio);
  6373.  #ifdef CONFIG_SCHEDSTATS
  6374. +   if (schedstat_enabled()) {
  6375. +       SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
  6376. +           SPLIT_NS(p->se.statistics.wait_sum),
  6377. +           SPLIT_NS(p->se.sum_exec_runtime),
  6378. +           SPLIT_NS(p->se.statistics.sum_sleep_runtime));
  6379. +   }
  6380. +#else
  6381.     SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
  6382. -       SPLIT_NS(p->se.vruntime),
  6383. +       0LL, 0L,
  6384.         SPLIT_NS(p->se.sum_exec_runtime),
  6385. -       SPLIT_NS(p->se.statistics.sum_sleep_runtime));
  6386. -#else
  6387. -   SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
  6388. -       0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
  6389. +       0LL, 0L);
  6390.  #endif
  6391.  #ifdef CONFIG_NUMA_BALANCING
  6392.     SEQ_printf(m, " %d", task_node(p));
  6393. @@ -148,7 +158,7 @@
  6394.     SEQ_printf(m,
  6395.     "\nrunnable tasks:\n"
  6396.     "            task   PID         tree-key  switches  prio"
  6397. -   "     exec-runtime         sum-exec        sum-sleep\n"
  6398. +   "     wait-time             sum-exec        sum-sleep\n"
  6399.     "------------------------------------------------------"
  6400.     "----------------------------------------------------\n");
  6401.  
  6402. @@ -210,8 +220,6 @@
  6403.             cfs_rq->runnable_load_avg);
  6404.     SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
  6405.             cfs_rq->avg.util_avg);
  6406. -   SEQ_printf(m, "  .%-30s: %lu\n", "util_est",
  6407. -           cfs_rq->avg.util_est);
  6408.     SEQ_printf(m, "  .%-30s: %ld\n", "removed_load_avg",
  6409.             atomic_long_read(&cfs_rq->removed_load_avg));
  6410.     SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
  6411. @@ -297,6 +305,7 @@
  6412.     PN(next_balance);
  6413.     SEQ_printf(m, "  .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
  6414.     PN(clock);
  6415. +   PN(clock_task);
  6416.     P(cpu_load[0]);
  6417.     P(cpu_load[1]);
  6418.     P(cpu_load[2]);
  6419. @@ -305,25 +314,23 @@
  6420.  #undef P
  6421.  #undef PN
  6422.  
  6423. -#ifdef CONFIG_SCHEDSTATS
  6424. -#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
  6425. -#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
  6426. -
  6427. -   P(yld_count);
  6428. -
  6429. -   P(sched_count);
  6430. -   P(sched_goidle);
  6431.  #ifdef CONFIG_SMP
  6432. +#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
  6433.     P64(avg_idle);
  6434.     P64(max_idle_balance_cost);
  6435. +#undef P64
  6436.  #endif
  6437.  
  6438. -   P(ttwu_count);
  6439. -   P(ttwu_local);
  6440. -
  6441. +#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, schedstat_val(rq->n));
  6442. +   if (schedstat_enabled()) {
  6443. +       P(yld_count);
  6444. +       P(sched_count);
  6445. +       P(sched_goidle);
  6446. +       P(ttwu_count);
  6447. +       P(ttwu_local);
  6448. +   }
  6449.  #undef P
  6450. -#undef P64
  6451. -#endif
  6452. +
  6453.     spin_lock_irqsave(&sched_debug_lock, flags);
  6454.     print_cfs_stats(m, cpu);
  6455.     print_rt_stats(m, cpu);
  6456. @@ -556,10 +563,14 @@
  6457.     SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
  6458.  #define P(F) \
  6459.     SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
  6460. +#define P_SCHEDSTAT(F) \
  6461. +   SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
  6462.  #define __PN(F) \
  6463.     SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
  6464.  #define PN(F) \
  6465.     SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
  6466. +#define PN_SCHEDSTAT(F) \
  6467. +   SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
  6468.  
  6469.     PN(se.exec_start);
  6470.     PN(se.vruntime);
  6471. @@ -567,38 +578,66 @@
  6472.  
  6473.     nr_switches = p->nvcsw + p->nivcsw;
  6474.  
  6475. -#ifdef CONFIG_SCHEDSTATS
  6476. -   PN(se.statistics.wait_start);
  6477. -   PN(se.statistics.sleep_start);
  6478. -   PN(se.statistics.block_start);
  6479. -   PN(se.statistics.sleep_max);
  6480. -   PN(se.statistics.block_max);
  6481. -   PN(se.statistics.exec_max);
  6482. -   PN(se.statistics.slice_max);
  6483. -   PN(se.statistics.wait_max);
  6484. -   PN(se.statistics.wait_sum);
  6485. -   P(se.statistics.wait_count);
  6486. -   PN(se.statistics.iowait_sum);
  6487. -   P(se.statistics.iowait_count);
  6488. +
  6489.     P(se.nr_migrations);
  6490. -   P(se.statistics.nr_migrations_cold);
  6491. -   P(se.statistics.nr_failed_migrations_affine);
  6492. -   P(se.statistics.nr_failed_migrations_running);
  6493. -   P(se.statistics.nr_failed_migrations_hot);
  6494. -   P(se.statistics.nr_forced_migrations);
  6495. -   P(se.statistics.nr_wakeups);
  6496. -   P(se.statistics.nr_wakeups_sync);
  6497. -   P(se.statistics.nr_wakeups_migrate);
  6498. -   P(se.statistics.nr_wakeups_local);
  6499. -   P(se.statistics.nr_wakeups_remote);
  6500. -   P(se.statistics.nr_wakeups_affine);
  6501. -   P(se.statistics.nr_wakeups_affine_attempts);
  6502. -   P(se.statistics.nr_wakeups_passive);
  6503. -   P(se.statistics.nr_wakeups_idle);
  6504.  
  6505. -   {
  6506. +   if (schedstat_enabled()) {
  6507.         u64 avg_atom, avg_per_cpu;
  6508.  
  6509. +       PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
  6510. +       PN_SCHEDSTAT(se.statistics.wait_start);
  6511. +       PN_SCHEDSTAT(se.statistics.sleep_start);
  6512. +       PN_SCHEDSTAT(se.statistics.block_start);
  6513. +       PN_SCHEDSTAT(se.statistics.sleep_max);
  6514. +       PN_SCHEDSTAT(se.statistics.block_max);
  6515. +       PN_SCHEDSTAT(se.statistics.exec_max);
  6516. +       PN_SCHEDSTAT(se.statistics.slice_max);
  6517. +       PN_SCHEDSTAT(se.statistics.wait_max);
  6518. +       PN_SCHEDSTAT(se.statistics.wait_sum);
  6519. +       P_SCHEDSTAT(se.statistics.wait_count);
  6520. +       PN_SCHEDSTAT(se.statistics.iowait_sum);
  6521. +       P_SCHEDSTAT(se.statistics.iowait_count);
  6522. +       P_SCHEDSTAT(se.statistics.nr_migrations_cold);
  6523. +       P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
  6524. +       P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
  6525. +       P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
  6526. +       P_SCHEDSTAT(se.statistics.nr_forced_migrations);
  6527. +       P_SCHEDSTAT(se.statistics.nr_wakeups);
  6528. +       P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
  6529. +       P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
  6530. +       P_SCHEDSTAT(se.statistics.nr_wakeups_local);
  6531. +       P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
  6532. +       P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
  6533. +       P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
  6534. +       P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
  6535. +       P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
  6536. +       /* eas */
  6537. +       /* select_idle_sibling() */
  6538. +       P_SCHEDSTAT(se.statistics.nr_wakeups_sis_attempts);
  6539. +       P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle);
  6540. +       P_SCHEDSTAT(se.statistics.nr_wakeups_sis_cache_affine);
  6541. +       P_SCHEDSTAT(se.statistics.nr_wakeups_sis_suff_cap);
  6542. +       P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle_cpu);
  6543. +       P_SCHEDSTAT(se.statistics.nr_wakeups_sis_count);
  6544. +       /* select_energy_cpu_brute() */
  6545. +       P_SCHEDSTAT(se.statistics.nr_wakeups_secb_attempts);
  6546. +       P_SCHEDSTAT(se.statistics.nr_wakeups_secb_sync);
  6547. +       P_SCHEDSTAT(se.statistics.nr_wakeups_secb_idle_bt);
  6548. +       P_SCHEDSTAT(se.statistics.nr_wakeups_secb_insuff_cap);
  6549. +       P_SCHEDSTAT(se.statistics.nr_wakeups_secb_no_nrg_sav);
  6550. +       P_SCHEDSTAT(se.statistics.nr_wakeups_secb_nrg_sav);
  6551. +       P_SCHEDSTAT(se.statistics.nr_wakeups_secb_count);
  6552. +       /* find_best_target() */
  6553. +       P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_attempts);
  6554. +       P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_cpu);
  6555. +       P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_sd);
  6556. +       P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_pref_idle);
  6557. +       P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_count);
  6558. +       /* cas */
  6559. +       /* select_task_rq_fair() */
  6560. +       P_SCHEDSTAT(se.statistics.nr_wakeups_cas_attempts);
  6561. +       P_SCHEDSTAT(se.statistics.nr_wakeups_cas_count);
  6562. +      
  6563.         avg_atom = p->se.sum_exec_runtime;
  6564.         if (nr_switches)
  6565.             avg_atom = div64_ul(avg_atom, nr_switches);
  6566. @@ -616,7 +655,7 @@
  6567.         __PN(avg_atom);
  6568.         __PN(avg_per_cpu);
  6569.     }
  6570. -#endif
  6571. +
  6572.     __P(nr_switches);
  6573.     SEQ_printf(m, "%-45s:%21Ld\n",
  6574.            "nr_voluntary_switches", (long long)p->nvcsw);
  6575. @@ -629,13 +668,14 @@
  6576.     P(se.avg.util_sum);
  6577.     P(se.avg.load_avg);
  6578.     P(se.avg.util_avg);
  6579. -   P(se.avg.util_est);
  6580.     P(se.avg.last_update_time);
  6581.  #endif
  6582.     P(policy);
  6583.     P(prio);
  6584. +#undef PN_SCHEDSTAT
  6585.  #undef PN
  6586.  #undef __PN
  6587. +#undef P_SCHEDSTAT
  6588.  #undef P
  6589.  #undef __P
  6590.  
  6591. diff -Nur /home/ninez/android/marlin/kernel/sched/energy.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/energy.c
  6592. --- /home/ninez/android/marlin/kernel/sched/energy.c    2018-08-10 01:54:08.563395055 -0400
  6593. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/energy.c    2018-08-11 23:57:17.128607487 -0400
  6594. @@ -27,8 +27,6 @@
  6595.  #include <linux/sched_energy.h>
  6596.  #include <linux/stddef.h>
  6597.  
  6598. -#include "sched.h"
  6599. -
  6600.  struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
  6601.  
  6602.  static void free_resources(void)
  6603. @@ -61,14 +59,12 @@
  6604.     for_each_possible_cpu(cpu) {
  6605.         cn = of_get_cpu_node(cpu, NULL);
  6606.         if (!cn) {
  6607. -           if (sched_feat(ENERGY_AWARE))
  6608. -               pr_warn("CPU device node missing for CPU %d\n", cpu);
  6609. +           pr_warn("CPU device node missing for CPU %d\n", cpu);
  6610.             return;
  6611.         }
  6612.  
  6613.         if (!of_find_property(cn, "sched-energy-costs", NULL)) {
  6614. -           if (sched_feat(ENERGY_AWARE))
  6615. -               pr_warn("CPU device node has no sched-energy-costs\n");
  6616. +           pr_warn("CPU device node has no sched-energy-costs\n");
  6617.             return;
  6618.         }
  6619.  
  6620. @@ -79,8 +75,7 @@
  6621.  
  6622.             prop = of_find_property(cp, "busy-cost-data", NULL);
  6623.             if (!prop || !prop->value) {
  6624. -               if (sched_feat(ENERGY_AWARE))
  6625. -                   pr_warn("No busy-cost data, skipping sched_energy init\n");
  6626. +               pr_warn("No busy-cost data, skipping sched_energy init\n");
  6627.                 goto out;
  6628.             }
  6629.  
  6630. @@ -102,8 +97,7 @@
  6631.  
  6632.             prop = of_find_property(cp, "idle-cost-data", NULL);
  6633.             if (!prop || !prop->value) {
  6634. -               if (sched_feat(ENERGY_AWARE))
  6635. -                   pr_warn("No idle-cost data, skipping sched_energy init\n");
  6636. +               pr_warn("No idle-cost data, skipping sched_energy init\n");
  6637.                 goto out;
  6638.             }
  6639.  
  6640. @@ -123,7 +117,6 @@
  6641.     }
  6642.  
  6643.     pr_info("Sched-energy-costs installed from DT\n");
  6644. -   set_energy_aware();
  6645.     return;
  6646.  
  6647.  out:
  6648. diff -Nur /home/ninez/android/marlin/kernel/sched/fair.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/fair.c
  6649. --- /home/ninez/android/marlin/kernel/sched/fair.c  2018-08-10 01:54:08.566728454 -0400
  6650. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/fair.c  2018-08-26 16:43:11.647206295 -0400
  6651. @@ -20,8 +20,8 @@
  6652.   *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
  6653.   */
  6654.  
  6655. -#include <linux/latencytop.h>
  6656.  #include <linux/sched.h>
  6657. +#include <linux/latencytop.h>
  6658.  #include <linux/cpumask.h>
  6659.  #include <linux/cpuidle.h>
  6660.  #include <linux/slab.h>
  6661. @@ -53,14 +53,18 @@
  6662.  unsigned int sysctl_sched_latency = 6000000ULL;
  6663.  unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  6664.  
  6665. -unsigned int sysctl_sched_is_big_little = 0;
  6666.  unsigned int sysctl_sched_sync_hint_enable = 1;
  6667.  unsigned int sysctl_sched_initial_task_util = 0;
  6668.  unsigned int sysctl_sched_cstate_aware = 1;
  6669.  
  6670.  #ifdef CONFIG_SCHED_WALT
  6671. +#ifdef CONFIG_SCHED_WALT_DEFAULT
  6672.  unsigned int sysctl_sched_use_walt_cpu_util = 1;
  6673.  unsigned int sysctl_sched_use_walt_task_util = 1;
  6674. +#else
  6675. +unsigned int sysctl_sched_use_walt_cpu_util = 0;
  6676. +unsigned int sysctl_sched_use_walt_task_util = 0;
  6677. +#endif
  6678.  __read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
  6679.      (10 * NSEC_PER_MSEC);
  6680.  #endif
  6681. @@ -128,6 +132,12 @@
  6682.  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
  6683.  #endif
  6684.  
  6685. +/*
  6686. + * The margin used when comparing utilization with CPU capacity:
  6687. + * util * margin < capacity * 1024
  6688. + */
  6689. +unsigned int capacity_margin = 1280; /* ~20% */
  6690. +
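
capacity_margin = 1280 encodes the rule stated in the comment: a utilization fits a CPU only if util * 1280 < capacity * 1024, i.e. roughly 20% headroom is left. A one-function illustration (the helper name is ours, not a function from this file):

#include <stdio.h>

static unsigned int capacity_margin = 1280;	/* ~20%, as above */

/* nonzero if @util (0..1024 scale) fits a CPU of @capacity with margin */
static int toy_fits_capacity(unsigned long util, unsigned long capacity)
{
	return util * capacity_margin < capacity * 1024;
}

int main(void)
{
	/* 900 does not fit a 1024-capacity CPU: 900*1280 >= 1024*1024 */
	printf("%d\n", toy_fits_capacity(900, 1024));
	/* 600 does: 600*1280 = 768000 < 1048576 */
	printf("%d\n", toy_fits_capacity(600, 1024));
	return 0;
}
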
  6691.  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  6692.  {
  6693.     lw->weight += inc;
  6694. @@ -155,9 +165,9 @@
  6695.   *
  6696.   * This idea comes from the SD scheduler of Con Kolivas:
  6697.   */
  6698. -static int get_update_sysctl_factor(void)
  6699. +static unsigned int get_update_sysctl_factor(void)
  6700.  {
  6701. -   unsigned int cpus = min_t(int, num_online_cpus(), 8);
  6702. +   unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
  6703.     unsigned int factor;
  6704.  
  6705.     switch (sysctl_sched_tunable_scaling) {
  6706. @@ -270,9 +280,7 @@
  6707.  
  6708.  static inline struct task_struct *task_of(struct sched_entity *se)
  6709.  {
  6710. -#ifdef CONFIG_SCHED_DEBUG
  6711. -   WARN_ON_ONCE(!entity_is_task(se));
  6712. -#endif
  6713. +   SCHED_WARN_ON(!entity_is_task(se));
  6714.     return container_of(se, struct task_struct, se);
  6715.  }
  6716.  
  6717. @@ -300,19 +308,59 @@
  6718.  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  6719.  {
  6720.     if (!cfs_rq->on_list) {
  6721. +       struct rq *rq = rq_of(cfs_rq);
  6722. +       int cpu = cpu_of(rq);
  6723.         /*
  6724.          * Ensure we either appear before our parent (if already
  6725.          * enqueued) or force our parent to appear after us when it is
  6726. -        * enqueued.  The fact that we always enqueue bottom-up
  6727. -        * reduces this to two cases.
  6728. +        * enqueued. The fact that we always enqueue bottom-up
  6729. +        * reduces this to two cases and a special case for the root
  6730. +        * cfs_rq. Furthermore, it also means that we will always reset
  6731. +        * tmp_alone_branch either when the branch is connected
  6732. +        * to a tree or when we reach the beg of the tree
  6733.          */
  6734.         if (cfs_rq->tg->parent &&
  6735. -           cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
  6736. -           list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
  6737. -               &rq_of(cfs_rq)->leaf_cfs_rq_list);
  6738. -       } else {
  6739. +           cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
  6740. +           /*
  6741. +            * If parent is already on the list, we add the child
  6742. +            * just before. Thanks to circular linked property of
  6743. +            * the list, this means to put the child at the tail
  6744. +            * of the list that starts by parent.
  6745. +            */
  6746. +           list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
  6747. +               &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
  6748. +           /*
  6749. +            * The branch is now connected to its tree so we can
  6750. +            * reset tmp_alone_branch to the beginning of the
  6751. +            * list.
  6752. +            */
  6753. +           rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
  6754. +       } else if (!cfs_rq->tg->parent) {
  6755. +           /*
  6756. +            * cfs rq without parent should be put
  6757. +            * at the tail of the list.
  6758. +            */
  6759.             list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
  6760. -               &rq_of(cfs_rq)->leaf_cfs_rq_list);
  6761. +               &rq->leaf_cfs_rq_list);
  6762. +           /*
  6763. +            * We have reached the beginning of a tree so we can reset
  6764. +            * tmp_alone_branch to the beginning of the list.
  6765. +            */
  6766. +           rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
  6767. +       } else {
  6768. +           /*
  6769. +            * The parent has not already been added so we want to
  6770. +            * make sure that it will be put after us.
  6771. +            * tmp_alone_branch points to the beginning of the branch
  6772. +            * where we will add parent.
  6773. +            */
  6774. +           list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
  6775. +               rq->tmp_alone_branch);
  6776. +           /*
  6777. +            * update tmp_alone_branch to point to the new beginning
  6778. +            * of the branch
  6779. +            */
  6780. +           rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
  6781.         }
  6782.  
  6783.         cfs_rq->on_list = 1;
  6784. @@ -470,17 +518,23 @@
  6785.  
  6786.  static void update_min_vruntime(struct cfs_rq *cfs_rq)
  6787.  {
  6788. +   struct sched_entity *curr = cfs_rq->curr;
  6789. +
  6790.     u64 vruntime = cfs_rq->min_vruntime;
  6791.  
  6792. -   if (cfs_rq->curr)
  6793. -       vruntime = cfs_rq->curr->vruntime;
  6794. +   if (curr) {
  6795. +       if (curr->on_rq)
  6796. +           vruntime = curr->vruntime;
  6797. +       else
  6798. +           curr = NULL;
  6799. +   }
  6800.  
  6801.     if (cfs_rq->rb_leftmost) {
  6802.         struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
  6803.                            struct sched_entity,
  6804.                            run_node);
  6805.  
  6806. -       if (!cfs_rq->curr)
  6807. +       if (!curr)
  6808.             vruntime = se->vruntime;
  6809.         else
  6810.             vruntime = min_vruntime(vruntime, se->vruntime);
  6811. @@ -585,7 +639,7 @@
  6812.         loff_t *ppos)
  6813.  {
  6814.     int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  6815. -   int factor = get_update_sysctl_factor();
  6816. +   unsigned int factor = get_update_sysctl_factor();
  6817.  
  6818.     if (ret || !write)
  6819.         return ret;
  6820. @@ -670,16 +724,17 @@
  6821.  }
  6822.  
  6823.  #ifdef CONFIG_SMP
  6824. -static int select_idle_sibling(struct task_struct *p, int cpu);
  6825. +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
  6826.  static unsigned long task_h_load(struct task_struct *p);
  6827.  
  6828.  /*
  6829.   * We choose a half-life close to 1 scheduling period.
  6830. - * Note: The tables below are dependent on this value.
  6831. + * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
  6832. + * dependent on this value.
  6833.   */
  6834. -#define LOAD_AVG_PERIOD 16
  6835. -#define LOAD_AVG_MAX 24117 /* maximum possible load avg */
  6836. -#define LOAD_AVG_MAX_N 172 /* number of full periods to produce LOAD_AVG_MAX */
  6837. +#define LOAD_AVG_PERIOD 32
  6838. +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
  6839. +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
  6840.  
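
These constants restore the mainline 32-period half-life: y is chosen so that y^32 = 1/2, LOAD_AVG_MAX is the value a permanently runnable entity converges to, and the tables that follow are precomputed fixed-point powers and partial sums of y. A short floating-point sanity check of those numbers (build with -lm; the kernel's integer arithmetic settles a few counts below the ideal sum):

#include <stdio.h>
#include <math.h>

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 == 1/2 */
	double sum = 0.0;
	int n;

	/* A task running flat out contributes 1024 per period, decayed by y
	 * each step; after LOAD_AVG_MAX_N periods this has converged. */
	for (n = 0; n < 345; n++)
		sum = sum * y + 1024.0;

	printf("y^32          = %.6f (expect 0.5)\n", pow(y, 32));
	printf("converged sum = %.1f (LOAD_AVG_MAX above is 47742)\n", sum);
	return 0;
}
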
  6841.  /* Give new sched_entity start runnable values to heavy its load in infant time */
  6842.  void init_entity_runnable_average(struct sched_entity *se)
  6843. @@ -693,23 +748,117 @@
  6844.      * will definitely be update (after enqueue).
  6845.      */
  6846.     sa->period_contrib = 1023;
  6847. -   sa->load_avg = scale_load_down(se->load.weight);
  6848. +   /*
  6849. +    * Tasks are initialized with full load to be seen as heavy tasks until
  6850. +    * they get a chance to stabilize to their real load level.
  6851. +    * Group entities are initialized with zero load to reflect the fact that
  6852. +    * nothing has been attached to the task group yet.
  6853. +    */
  6854. +   if (entity_is_task(se))
  6855. +       sa->load_avg = scale_load_down(se->load.weight);
  6856.     sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
  6857. -   sa->util_avg =  sched_freq() ?
  6858. -       sysctl_sched_initial_task_util :
  6859. -       scale_load_down(SCHED_LOAD_SCALE);
  6860. -   sa->util_est = sa->util_avg;
  6861. -   sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
  6862. +   /*
  6863. +    * In previous Android versions, we used to have:
  6864. +    *  sa->util_avg =  sched_freq() ?
  6865. +    *      sysctl_sched_initial_task_util :
  6866. +    *      scale_load_down(SCHED_LOAD_SCALE);
  6867. +    *  sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
  6868. +    * However, that functionality has been moved to enqueue.
  6869. +    * It is unclear if we should restore this in enqueue.
  6870. +    */
  6871. +   /*
  6872. +    * At this point, util_avg won't be used in select_task_rq_fair anyway
  6873. +    */
  6874. +   sa->util_avg = 0;
  6875. +   sa->util_sum = 0;
  6876.     /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  6877.  }
  6878.  
  6879. -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
  6880. -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
  6881. +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  6882. +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
  6883. +static void attach_entity_cfs_rq(struct sched_entity *se);
  6884. +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
  6885. +
  6886. +/*
  6887. + * With new tasks being created, their initial util_avgs are extrapolated
  6888. + * based on the cfs_rq's current util_avg:
  6889. + *
  6890. + *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
  6891. + *
  6892. + * However, in many cases, the above util_avg does not give a desired
  6893. + * value. Moreover, the sum of the util_avgs may be divergent, such
  6894. + * as when the series is a harmonic series.
  6895. + *
  6896. + * To solve this problem, we also cap the util_avg of successive tasks to
  6897. + * only 1/2 of the left utilization budget:
  6898. + *
  6899. + *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
  6900. + *
  6901. + * where n denotes the nth task.
  6902. + *
  6903. + * For example, a simplest series from the beginning would be like:
  6904. + *
  6905. + *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
  6906. + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
  6907. + *
  6908. + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
  6909. + * if util_avg > util_avg_cap.
  6910. + */
  6911. +void post_init_entity_util_avg(struct sched_entity *se)
  6912. +{
  6913. +   struct cfs_rq *cfs_rq = cfs_rq_of(se);
  6914. +   struct sched_avg *sa = &se->avg;
  6915. +   long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
  6916. +
  6917. +   if (cap > 0) {
  6918. +       if (cfs_rq->avg.util_avg != 0) {
  6919. +           sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
  6920. +           sa->util_avg /= (cfs_rq->avg.load_avg + 1);
  6921. +
  6922. +           if (sa->util_avg > cap)
  6923. +               sa->util_avg = cap;
  6924. +       } else {
  6925. +           sa->util_avg = cap;
  6926. +       }
  6927. +       /*
  6928. +        * If we wish to restore tuning via setting initial util,
  6929. +        * this is where we should do it.
  6930. +        */
  6931. +       sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
  6932. +   }
  6933. +
  6934. +   if (entity_is_task(se)) {
  6935. +       struct task_struct *p = task_of(se);
  6936. +       if (p->sched_class != &fair_sched_class) {
  6937. +           /*
  6938. +            * For !fair tasks do:
  6939. +            *
  6940. +           update_cfs_rq_load_avg(now, cfs_rq, false);
  6941. +           attach_entity_load_avg(cfs_rq, se);
  6942. +           switched_from_fair(rq, p);
  6943. +            *
  6944. +            * such that the next switched_to_fair() has the
  6945. +            * expected state.
  6946. +            */
  6947. +           se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
  6948. +           return;
  6949. +       }
  6950. +   }
  6951. +
  6952. +   attach_entity_cfs_rq(se);
  6953. +}
  6954. +
  6955.  #else
  6956.  void init_entity_runnable_average(struct sched_entity *se)
  6957.  {
  6958.  }
  6959. -#endif
  6960. +void post_init_entity_util_avg(struct sched_entity *se)
  6961. +{
  6962. +}
  6963. +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  6964. +{
  6965. +}
  6966. +#endif /* CONFIG_SMP */
  6967.  
  6968.  /*
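
post_init_entity_util_avg() seeds a freshly created task's util_avg from its cfs_rq exactly as the comment block above describes: util_avg = cfs_rq->util_avg * se->load.weight / (cfs_rq->load_avg + 1), clamped to half of the remaining utilization budget. The arithmetic on its own, stripped of the scheduler plumbing (weights on the usual 1024 scale):

#include <stdio.h>

#define TOY_CAPACITY_SCALE 1024L

/* Same formula and cap as post_init_entity_util_avg() above. */
static long toy_initial_util(long cfs_util_avg, long cfs_load_avg, long weight)
{
	long cap = (TOY_CAPACITY_SCALE - cfs_util_avg) / 2;
	long util;

	if (cap <= 0)
		return 0;	/* parent hierarchy already saturated */

	if (cfs_util_avg == 0)
		return cap;

	util = cfs_util_avg * weight / (cfs_load_avg + 1);
	return util > cap ? cap : util;
}

int main(void)
{
	/* idle hierarchy: the first task gets half the budget, 512 */
	printf("%ld\n", toy_initial_util(0, 0, 1024));
	/* busy hierarchy: extrapolated value, clamped to the cap */
	printf("%ld\n", toy_initial_util(512, 1024, 1024));
	return 0;
}
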
  6969.   * Update the current task's runtime statistics.
  6970. @@ -733,7 +882,7 @@
  6971.               max(delta_exec, curr->statistics.exec_max));
  6972.  
  6973.     curr->sum_exec_runtime += delta_exec;
  6974. -   schedstat_add(cfs_rq, exec_clock, delta_exec);
  6975. +   schedstat_add(cfs_rq->exec_clock, delta_exec);
  6976.  
  6977.     curr->vruntime += calc_delta_fair(delta_exec, curr);
  6978.     update_min_vruntime(cfs_rq);
  6979. @@ -757,48 +906,165 @@
  6980.  static inline void
  6981.  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  6982.  {
  6983. -   schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
  6984. +   u64 wait_start, prev_wait_start;
  6985. +
  6986. +   if (!schedstat_enabled())
  6987. +       return;
  6988. +
  6989. +   wait_start = rq_clock(rq_of(cfs_rq));
  6990. +   prev_wait_start = schedstat_val(se->statistics.wait_start);
  6991. +
  6992. +   if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
  6993. +       likely(wait_start > prev_wait_start))
  6994. +       wait_start -= prev_wait_start;
  6995. +
  6996. +   schedstat_set(se->statistics.wait_start, wait_start);
  6997. +}
  6998. +
  6999. +static inline void
  7000. +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7001. +{
  7002. +   struct task_struct *p;
  7003. +   u64 delta;
  7004. +
  7005. +   if (!schedstat_enabled())
  7006. +       return;
  7007. +
  7008. +   delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
  7009. +
  7010. +   if (entity_is_task(se)) {
  7011. +       p = task_of(se);
  7012. +       if (task_on_rq_migrating(p)) {
  7013. +           /*
  7014. +            * Preserve migrating task's wait time so wait_start
  7015. +            * time stamp can be adjusted to accumulate wait time
  7016. +            * prior to migration.
  7017. +            */
  7018. +           schedstat_set(se->statistics.wait_start, delta);
  7019. +           return;
  7020. +       }
  7021. +       trace_sched_stat_wait(p, delta);
  7022. +   }
  7023. +
  7024. +   schedstat_set(se->statistics.wait_max,
  7025. +             max(schedstat_val(se->statistics.wait_max), delta));
  7026. +   schedstat_inc(se->statistics.wait_count);
  7027. +   schedstat_add(se->statistics.wait_sum, delta);
  7028. +   schedstat_set(se->statistics.wait_start, 0);
  7029. +}
  7030. +
  7031. +static inline void
  7032. +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7033. +{
  7034. +   struct task_struct *tsk = NULL;
  7035. +   u64 sleep_start, block_start;
  7036. +
  7037. +   if (!schedstat_enabled())
  7038. +       return;
  7039. +
  7040. +   sleep_start = schedstat_val(se->statistics.sleep_start);
  7041. +   block_start = schedstat_val(se->statistics.block_start);
  7042. +
  7043. +   if (entity_is_task(se))
  7044. +       tsk = task_of(se);
  7045. +
  7046. +   if (sleep_start) {
  7047. +       u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
  7048. +
  7049. +       if ((s64)delta < 0)
  7050. +           delta = 0;
  7051. +
  7052. +       if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
  7053. +           schedstat_set(se->statistics.sleep_max, delta);
  7054. +
  7055. +       schedstat_set(se->statistics.sleep_start, 0);
  7056. +       schedstat_add(se->statistics.sum_sleep_runtime, delta);
  7057. +
  7058. +       if (tsk) {
  7059. +           account_scheduler_latency(tsk, delta >> 10, 1);
  7060. +           trace_sched_stat_sleep(tsk, delta);
  7061. +       }
  7062. +   }
  7063. +   if (block_start) {
  7064. +       u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
  7065. +
  7066. +       if ((s64)delta < 0)
  7067. +           delta = 0;
  7068. +
  7069. +       if (unlikely(delta > schedstat_val(se->statistics.block_max)))
  7070. +           schedstat_set(se->statistics.block_max, delta);
  7071. +
  7072. +       schedstat_set(se->statistics.block_start, 0);
  7073. +       schedstat_add(se->statistics.sum_sleep_runtime, delta);
  7074. +
  7075. +       if (tsk) {
  7076. +           if (tsk->in_iowait) {
  7077. +               schedstat_add(se->statistics.iowait_sum, delta);
  7078. +               schedstat_inc(se->statistics.iowait_count);
  7079. +               trace_sched_stat_iowait(tsk, delta);
  7080. +           }
  7081. +
  7082. +           trace_sched_stat_blocked(tsk, delta);
  7083. +
  7084. +           /*
  7085. +            * Blocking time is in units of nanosecs, so shift by
  7086. +            * 20 to get a milliseconds-range estimation of the
  7087. +            * amount of time that the task spent sleeping:
  7088. +            */
  7089. +           if (unlikely(prof_on == SLEEP_PROFILING)) {
  7090. +               profile_hits(SLEEP_PROFILING,
  7091. +                       (void *)get_wchan(tsk),
  7092. +                       delta >> 20);
  7093. +           }
  7094. +           account_scheduler_latency(tsk, delta >> 10, 0);
  7095. +       }
  7096. +   }
  7097.  }
  7098.  
  7099.  /*
  7100.   * Task is being enqueued - update stats:
  7101.   */
  7102. -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7103. +static inline void
  7104. +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  7105.  {
  7106. +   if (!schedstat_enabled())
  7107. +       return;
  7108. +
  7109.     /*
  7110.      * Are we enqueueing a waiting task? (for current tasks
  7111.      * a dequeue/enqueue event is a NOP)
  7112.      */
  7113.     if (se != cfs_rq->curr)
  7114.         update_stats_wait_start(cfs_rq, se);
  7115. -}
  7116.  
  7117. -static void
  7118. -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7119. -{
  7120. -   schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
  7121. -           rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
  7122. -   schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
  7123. -   schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
  7124. -           rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
  7125. -#ifdef CONFIG_SCHEDSTATS
  7126. -   if (entity_is_task(se)) {
  7127. -       trace_sched_stat_wait(task_of(se),
  7128. -           rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
  7129. -   }
  7130. -#endif
  7131. -   schedstat_set(se->statistics.wait_start, 0);
  7132. +   if (flags & ENQUEUE_WAKEUP)
  7133. +       update_stats_enqueue_sleeper(cfs_rq, se);
  7134.  }
  7135.  
  7136.  static inline void
  7137. -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7138. +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  7139.  {
  7140. +
  7141. +   if (!schedstat_enabled())
  7142. +       return;
  7143. +
  7144.     /*
  7145.      * Mark the end of the wait period if dequeueing a
  7146.      * waiting task:
  7147.      */
  7148.     if (se != cfs_rq->curr)
  7149.         update_stats_wait_end(cfs_rq, se);
  7150. +
  7151. +   if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
  7152. +       struct task_struct *tsk = task_of(se);
  7153. +
  7154. +       if (tsk->state & TASK_INTERRUPTIBLE)
  7155. +           schedstat_set(se->statistics.sleep_start,
  7156. +                     rq_clock(rq_of(cfs_rq)));
  7157. +       if (tsk->state & TASK_UNINTERRUPTIBLE)
  7158. +           schedstat_set(se->statistics.block_start,
  7159. +                     rq_clock(rq_of(cfs_rq)));
  7160. +   }
  7161.  }
  7162.  
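
The reworked update_stats_wait_start()/update_stats_wait_end() cooperate across migrations: on a migrating dequeue the accumulated wait is parked in wait_start, and on the destination CPU the new start stamp is reduced by that amount, so wait time carries over even though the two runqueue clocks are unrelated. The bookkeeping in isolation (toy clock values):

#include <stdio.h>
#include <stdint.h>

static uint64_t wait_start;	/* per-entity statistic, as in the hunk above */

static void toy_wait_start(uint64_t rq_clock, int migrating)
{
	uint64_t start = rq_clock;

	/* on the destination rq, wait_start holds the carried-over delta */
	if (migrating && start > wait_start)
		start -= wait_start;
	wait_start = start;
}

static uint64_t toy_wait_end(uint64_t rq_clock, int migrating)
{
	uint64_t delta = rq_clock - wait_start;

	if (migrating) {
		wait_start = delta;	/* park the delta for the next CPU */
		return 0;
	}
	wait_start = 0;
	return delta;
}

int main(void)
{
	/* waited 50 on CPU0, migrated, then waited 70 more on CPU1 */
	toy_wait_start(1000, 0);
	toy_wait_end(1050, 1);		/* dequeue as part of the migration */
	toy_wait_start(9000, 1);	/* enqueue on CPU1, unrelated clock */
	printf("total wait = %llu\n",
	       (unsigned long long)toy_wait_end(9070, 0));
	return 0;
}
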
  7163.  /*
  7164. @@ -1309,8 +1575,16 @@
  7165.      * One idle CPU per node is evaluated for a task numa move.
  7166.      * Call select_idle_sibling to maybe find a better one.
  7167.      */
  7168. -   if (!cur)
  7169. -       env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
  7170. +   if (!cur) {
  7171. +       /*
  7172. +        * select_idle_siblings() uses a per-cpu cpumask that
  7173. +        * can be used from IRQ context.
  7174. +        */
  7175. +       local_irq_disable();
  7176. +       env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
  7177. +                          env->dst_cpu);
  7178. +       local_irq_enable();
  7179. +   }
  7180.  
  7181.  assign:
  7182.     task_numa_assign(env, cur, imp);
  7183. @@ -1612,6 +1886,11 @@
  7184.     u64 runtime, period;
  7185.     spinlock_t *group_lock = NULL;
  7186.  
  7187. +   /*
  7188. +    * The p->mm->numa_scan_seq field gets updated without
  7189. +    * exclusive access. Use READ_ONCE() here to ensure
  7190. +    * that the field is read in a single access:
  7191. +    */
  7192.     seq = READ_ONCE(p->mm->numa_scan_seq);
  7193.     if (p->numa_scan_seq == seq)
  7194.         return;
  7195. @@ -1857,7 +2136,7 @@
  7196.     int local = !!(flags & TNF_FAULT_LOCAL);
  7197.     int priv;
  7198.  
  7199. -   if (!numabalancing_enabled)
  7200. +   if (!static_branch_likely(&sched_numa_balancing))
  7201.         return;
  7202.  
  7203.     /* for example, ksmd faulting in a user's mm */
  7204. @@ -1929,6 +2208,14 @@
  7205.  
  7206.  static void reset_ptenuma_scan(struct task_struct *p)
  7207.  {
  7208. +   /*
  7209. +    * We only did a read acquisition of the mmap sem, so
  7210. +    * p->mm->numa_scan_seq is written to without exclusive access
  7211. +    * and the update is not guaranteed to be atomic. That's not
  7212. +    * much of an issue though, since this is just used for
  7213. +    * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
  7214. +    * expensive, to avoid any form of compiler optimizations:
  7215. +    */
  7216.     WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
  7217.     p->mm->numa_scan_offset = 0;
  7218.  }
  7219. @@ -1945,9 +2232,9 @@
  7220.     struct vm_area_struct *vma;
  7221.     unsigned long start, end;
  7222.     unsigned long nr_pte_updates = 0;
  7223. -   long pages;
  7224. +   long pages, virtpages;
  7225.  
  7226. -   WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
  7227. +   SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
  7228.  
  7229.     work->next = work; /* protect against double add */
  7230.     /*
  7231. @@ -1991,9 +2278,11 @@
  7232.     start = mm->numa_scan_offset;
  7233.     pages = sysctl_numa_balancing_scan_size;
  7234.     pages <<= 20 - PAGE_SHIFT; /* MB in pages */
  7235. +   virtpages = pages * 8;     /* Scan up to this much virtual space */
  7236.     if (!pages)
  7237.         return;
  7238.  
  7239. +
  7240.     down_read(&mm->mmap_sem);
  7241.     vma = find_vma(mm, start);
  7242.     if (!vma) {
  7243. @@ -2003,7 +2292,7 @@
  7244.     }
  7245.     for (; vma; vma = vma->vm_next) {
  7246.         if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
  7247. -           is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
  7248. +           is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
  7249.             continue;
  7250.         }
  7251.  
  7252. @@ -2028,18 +2317,22 @@
  7253.             start = max(start, vma->vm_start);
  7254.             end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
  7255.             end = min(end, vma->vm_end);
  7256. -           nr_pte_updates += change_prot_numa(vma, start, end);
  7257. +           nr_pte_updates = change_prot_numa(vma, start, end);
  7258.  
  7259.             /*
  7260. -            * Scan sysctl_numa_balancing_scan_size but ensure that
  7261. -            * at least one PTE is updated so that unused virtual
  7262. -            * address space is quickly skipped.
  7263. +            * Try to scan sysctl_numa_balancing_size worth of
  7264. +            * hpages that have at least one present PTE that
  7265. +            * is not already pte-numa. If the VMA contains
  7266. +            * areas that are unused or already full of prot_numa
  7267. +            * PTEs, scan up to virtpages, to skip through those
  7268. +            * areas faster.
  7269.              */
  7270.             if (nr_pte_updates)
  7271.                 pages -= (end - start) >> PAGE_SHIFT;
  7272. +           virtpages -= (end - start) >> PAGE_SHIFT;
  7273.  
  7274.             start = end;
  7275. -           if (pages <= 0)
  7276. +           if (pages <= 0 || virtpages <= 0)
  7277.                 goto out;
  7278.  
  7279.             cond_resched();
  7280. @@ -2140,28 +2433,22 @@
  7281.  
  7282.  #ifdef CONFIG_FAIR_GROUP_SCHED
  7283.  # ifdef CONFIG_SMP
  7284. -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
  7285. +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  7286.  {
  7287. -   long tg_weight;
  7288. +   long tg_weight, load, shares;
  7289.  
  7290.     /*
  7291. -    * Use this CPU's real-time load instead of the last load contribution
  7292. -    * as the updating of the contribution is delayed, and we will use the
  7293. -    * the real-time load to calc the share. See update_tg_load_avg().
  7294. +    * This really should be: cfs_rq->avg.load_avg, but instead we use
  7295. +    * cfs_rq->load.weight, which is its upper bound. This helps ramp up
  7296. +    * the shares for small weight interactive tasks.
  7297.      */
  7298. -   tg_weight = atomic_long_read(&tg->load_avg);
  7299. -   tg_weight -= cfs_rq->tg_load_avg_contrib;
  7300. -   tg_weight += cfs_rq->load.weight;
  7301. +   load = scale_load_down(cfs_rq->load.weight);
  7302.  
  7303. -   return tg_weight;
  7304. -}
  7305. -
  7306. -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  7307. -{
  7308. -   long tg_weight, load, shares;
  7309. +   tg_weight = atomic_long_read(&tg->load_avg);
  7310.  
  7311. -   tg_weight = calc_tg_weight(tg, cfs_rq);
  7312. -   load = cfs_rq->load.weight;
  7313. +   /* Ensure tg_weight >= load */
  7314. +   tg_weight -= cfs_rq->tg_load_avg_contrib;
  7315. +   tg_weight += load;
  7316.  
  7317.     shares = (tg->shares * load);
  7318.     if (tg_weight)
  7319. @@ -2198,16 +2485,20 @@
  7320.  
  7321.  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
  7322.  
  7323. -static void update_cfs_shares(struct cfs_rq *cfs_rq)
  7324. +static void update_cfs_shares(struct sched_entity *se)
  7325.  {
  7326. +   struct cfs_rq *cfs_rq = group_cfs_rq(se);
  7327.     struct task_group *tg;
  7328. -   struct sched_entity *se;
  7329.     long shares;
  7330.  
  7331. -   tg = cfs_rq->tg;
  7332. -   se = tg->se[cpu_of(rq_of(cfs_rq))];
  7333. -   if (!se || throttled_hierarchy(cfs_rq))
  7334. +   if (!cfs_rq)
  7335. +       return;
  7336. +
  7337. +   if (throttled_hierarchy(cfs_rq))
  7338.         return;
  7339. +
  7340. +   tg = cfs_rq->tg;
  7341. +
  7342.  #ifndef CONFIG_SMP
  7343.     if (likely(se->load.weight == tg->shares))
  7344.         return;
  7345. @@ -2216,8 +2507,9 @@
  7346.  
  7347.     reweight_entity(cfs_rq_of(se), se, shares);
  7348.  }
  7349. +
  7350.  #else /* CONFIG_FAIR_GROUP_SCHED */
  7351. -static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
  7352. +static inline void update_cfs_shares(struct sched_entity *se)
  7353.  {
  7354.  }
  7355.  #endif /* CONFIG_FAIR_GROUP_SCHED */
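
The calc_cfs_shares() rework above computes shares from the instantaneous cfs_rq->load.weight and first adjusts the group weight so it can never be smaller than that local load, giving shares = tg->shares * load / tg_weight. The arithmetic alone (the clamping the full function applies afterwards is left out here):

#include <stdio.h>

/* shares of this CPU's group entity, following the hunk above */
static long toy_calc_shares(long tg_shares, long tg_load_avg,
			    long cfs_contrib, long cfs_weight)
{
	long tg_weight, shares;

	/* ensure tg_weight >= local load, replacing a stale contribution */
	tg_weight = tg_load_avg - cfs_contrib + cfs_weight;

	shares = tg_shares * cfs_weight;
	if (tg_weight)
		shares /= tg_weight;

	return shares;
}

int main(void)
{
	/* this CPU holds half of the group's weight -> half the shares */
	printf("%ld\n", toy_calc_shares(1024, 2048, 1024, 1024));
	/* stale contribution: the adjustment keeps the result sane */
	printf("%ld\n", toy_calc_shares(1024, 100, 100, 1024));
	return 0;
}
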
  7356. @@ -2225,8 +2517,12 @@
  7357.  #ifdef CONFIG_SMP
  7358.  /* Precomputed fixed inverse multiplies for multiplication by y^n */
  7359.  static const u32 runnable_avg_yN_inv[] = {
  7360. -   0xffff, 0xf524, 0xeabf, 0xe0cb, 0xd744, 0xce23, 0xc566, 0xbd07,
  7361. -   0xb504, 0xad57, 0xa5fe, 0x9ef4, 0x9837, 0x91c3, 0x8b95, 0x85aa,
  7362. +   0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
  7363. +   0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
  7364. +   0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
  7365. +   0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
  7366. +   0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
  7367. +   0x85aac367, 0x82cd8698,
  7368.  };
  7369.  
  7370.  /*
  7371. @@ -2234,8 +2530,19 @@
  7372.   * over-estimates when re-combining.
  7373.   */
  7374.  static const u32 runnable_avg_yN_sum[] = {
  7375. -       0,  980, 1919, 2818, 3679, 4503, 5292, 6048, 6772, 7465, 8129,
  7376. -    8764, 9373, 9956,10514,11048,11560,
  7377. +       0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
  7378. +    9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
  7379. +   17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
  7380. +};
  7381. +
  7382. +/*
  7383. + * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
  7384. + * lower integers. See Documentation/scheduler/sched-avg.txt how these
  7385. + * were generated:
  7386. + */
  7387. +static const u32 __accumulated_sum_N32[] = {
  7388. +       0, 23371, 35056, 40899, 43820, 45281,
  7389. +   46011, 46376, 46559, 46650, 46696, 46719,
  7390.  };
  7391.  
  7392.  /*
  7393. @@ -2266,8 +2573,7 @@
  7394.         local_n %= LOAD_AVG_PERIOD;
  7395.     }
  7396.  
  7397. -   val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n],
  7398. -           LOAD_AVG_PERIOD);
  7399. +   val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
  7400.     return val;
  7401.  }
  7402.  
  7403. @@ -2287,22 +2593,13 @@
  7404.     else if (unlikely(n >= LOAD_AVG_MAX_N))
  7405.         return LOAD_AVG_MAX;
  7406.  
  7407. -   /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
  7408. -   do {
  7409. -       contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
  7410. -       contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
  7411. -
  7412. -       n -= LOAD_AVG_PERIOD;
  7413. -   } while (n > LOAD_AVG_PERIOD);
  7414. -
  7415. +   /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
  7416. +   contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
  7417. +   n %= LOAD_AVG_PERIOD;
  7418.     contrib = decay_load(contrib, n);
  7419.     return contrib + runnable_avg_yN_sum[n];
  7420.  }
  7421.  
  7422. -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
  7423. -#error "load tracking assumes 2^10 as unit"
  7424. -#endif
  7425. -
  7426.  #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
  7427.  
  7428.  /*
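
__compute_runnable_contrib() now splits n into whole 32-period chunks plus a remainder: __accumulated_sum_N32[n/32] supplies the geometric sum for the chunks, it is decayed by y^(n%32), and runnable_avg_yN_sum[n%32] adds the tail. A floating-point check of that identity (build with -lm):

#include <stdio.h>
#include <math.h>

/* sum_{k=1..n} 1024 * y^k, computed directly */
static double direct_sum(double y, int n)
{
	double s = 0.0;
	int k;

	for (k = 1; k <= n; k++)
		s += 1024.0 * pow(y, k);
	return s;
}

int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);
	int n = 100, q = n / 32, r = n % 32;

	/* same decomposition the new code performs with its lookup tables */
	double split = pow(y, r) * direct_sum(y, 32 * q) + direct_sum(y, r);

	printf("direct = %.3f  split = %.3f\n", direct_sum(y, n), split);
	return 0;
}
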
  7429. @@ -2439,10 +2736,42 @@
  7430.     return decayed;
  7431.  }
  7432.  
  7433. -#ifdef CONFIG_FAIR_GROUP_SCHED
  7434.  /*
  7435. - * Updating tg's load_avg is necessary before update_cfs_share (which is done)
  7436. - * and effective_load (which is not done because it is too costly).
  7437. + * Signed add and clamp on underflow.
  7438. + *
  7439. + * Explicitly do a load-store to ensure the intermediate value never hits
  7440. + * memory. This allows lockless observations without ever seeing the negative
  7441. + * values.
  7442. + */
  7443. +#define add_positive(_ptr, _val) do {                           \
  7444. +   typeof(_ptr) ptr = (_ptr);                              \
  7445. +   typeof(_val) val = (_val);                              \
  7446. +   typeof(*ptr) res, var = READ_ONCE(*ptr);                \
  7447. +                               \
  7448. +   res = var + val;                                        \
  7449. +                               \
  7450. +   if (val < 0 && res > var)                               \
  7451. +       res = 0;                                        \
  7452. +                               \
  7453. +   WRITE_ONCE(*ptr, res);                                  \
  7454. +} while (0)
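
A small userspace demonstration of the clamp-on-underflow semantics of add_positive() defined above. READ_ONCE()/WRITE_ONCE() are stubbed to plain accesses here, and typeof is a GCC/Clang extension; this is an illustrative sketch, not kernel code:

#include <stdio.h>

#define READ_ONCE(x)      (x)
#define WRITE_ONCE(x, v)  ((x) = (v))

#define add_positive(_ptr, _val) do {           \
    typeof(_ptr) ptr = (_ptr);                  \
    typeof(_val) val = (_val);                  \
    typeof(*ptr) res, var = READ_ONCE(*ptr);    \
                                                \
    res = var + val;                            \
                                                \
    if (val < 0 && res > var)                   \
        res = 0;                                \
                                                \
    WRITE_ONCE(*ptr, res);                      \
} while (0)

int main(void)
{
    unsigned long load_avg = 100;

    add_positive(&load_avg, -40L);   /* normal case: 100 - 40 = 60        */
    printf("after -40:  %lu\n", load_avg);

    add_positive(&load_avg, -500L);  /* would wrap around: clamped to 0   */
    printf("after -500: %lu\n", load_avg);
    return 0;
}

The point of the single load-store pair is visible in the output: a lockless reader can only ever observe the old value or the clamped result, never a wrapped-around intermediate.
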
  7455. +
  7456. +#ifdef CONFIG_FAIR_GROUP_SCHED
  7457. +/**
  7458. + * update_tg_load_avg - update the tg's load avg
  7459. + * @cfs_rq: the cfs_rq whose avg changed
  7460. + * @force: update regardless of how small the difference
  7461. + *
  7462. + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
  7463. + * However, because tg->load_avg is a global value there are performance
  7464. + * considerations.
  7465. + *
  7466. + * In order to avoid having to look at the other cfs_rq's, we use a
  7467. + * differential update where we store the last value we propagated. This in
  7468. + * turn allows skipping updates if the differential is 'small'.
  7469. + *
  7470. + * Updating tg's load_avg is necessary before update_cfs_share() (which is
  7471. + * done) and effective_load() (which is not done because it is too costly).
  7472.   */
  7473.  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  7474.  {
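
The body of update_tg_load_avg() is not part of this hunk; as a rough userspace sketch of the differential update described in the comment above, here is the idea of storing the last propagated value and skipping small changes (the 1/64 threshold is borrowed from the mainline version of this function and should be treated as an assumption for this tree):

#include <stdio.h>
#include <stdlib.h>

struct tg  { long load_avg; };
struct cfs { long load_avg; long tg_load_avg_contrib; struct tg *tg; };

static void update_tg_load_avg_sketch(struct cfs *cfs_rq, int force)
{
    long delta = cfs_rq->load_avg - cfs_rq->tg_load_avg_contrib;

    /* Only propagate to the global tg->load_avg when the change is big enough. */
    if (force || labs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
        cfs_rq->tg->load_avg += delta;   /* atomic_long_add() in the kernel */
        cfs_rq->tg_load_avg_contrib = cfs_rq->load_avg;
    }
}

int main(void)
{
    struct tg tg = { 0 };
    struct cfs cfs_rq = { .load_avg = 1000, .tg_load_avg_contrib = 0, .tg = &tg };

    update_tg_load_avg_sketch(&cfs_rq, 0);   /* large delta: propagated */
    cfs_rq.load_avg = 1005;
    update_tg_load_avg_sketch(&cfs_rq, 0);   /* tiny delta: skipped     */
    printf("tg->load_avg = %ld, last contrib = %ld\n",
           tg.load_avg, cfs_rq.tg_load_avg_contrib);
    return 0;
}
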
  7475. @@ -2506,29 +2835,249 @@
  7476.         se->avg.last_update_time = n_last_update_time;
  7477.     }
  7478.  }
  7479. +
  7480. +/* Take into account change of utilization of a child task group */
  7481. +static inline void
  7482. +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7483. +{
  7484. +   struct cfs_rq *gcfs_rq = group_cfs_rq(se);
  7485. +   long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
  7486. +
  7487. +   /* Nothing to update */
  7488. +   if (!delta)
  7489. +       return;
  7490. +
  7491. +   /* Set new sched_entity's utilization */
  7492. +   se->avg.util_avg = gcfs_rq->avg.util_avg;
  7493. +   se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
  7494. +
  7495. +   /* Update parent cfs_rq utilization */
  7496. +   add_positive(&cfs_rq->avg.util_avg, delta);
  7497. +   cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
  7498. +}
  7499. +
  7500. +/* Take into account change of load of a child task group */
  7501. +static inline void
  7502. +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7503. +{
  7504. +   struct cfs_rq *gcfs_rq = group_cfs_rq(se);
  7505. +   long delta, load = gcfs_rq->avg.load_avg;
  7506. +
  7507. +   /*
  7508. +    * If the load of group cfs_rq is null, the load of the
  7509. +    * sched_entity will also be null so we can skip the formula
  7510. +    */
  7511. +   if (load) {
  7512. +       long tg_load;
  7513. +
  7514. +       /* Get tg's load and ensure tg_load > 0 */
  7515. +       tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
  7516. +
  7517. +       /* Ensure tg_load >= load and is updated with the current load */
  7518. +       tg_load -= gcfs_rq->tg_load_avg_contrib;
  7519. +       tg_load += load;
  7520. +
  7521. +       /*
  7522. +        * We need to compute a correction term in the case that the
  7523. +        * task group is consuming more CPU than a task of equal
  7524. +        * weight. A task with a weight equal to tg->shares will have
  7525. +        * a load less than or equal to scale_load_down(tg->shares).
  7526. +        * Similarly, the sched_entities that represent the task group
  7527. +        * at parent level, can't have a load higher than
  7528. +        * scale_load_down(tg->shares). And the Sum of sched_entities'
  7529. +        * load must be <= scale_load_down(tg->shares).
  7530. +        */
  7531. +       if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
  7532. +           /* scale gcfs_rq's load into tg's shares */
  7533. +           load *= scale_load_down(gcfs_rq->tg->shares);
  7534. +           load /= tg_load;
  7535. +       }
  7536. +   }
  7537. +
  7538. +   delta = load - se->avg.load_avg;
  7539. +
  7540. +   /* Nothing to update */
  7541. +   if (!delta)
  7542. +       return;
  7543. +
  7544. +   /* Set new sched_entity's load */
  7545. +   se->avg.load_avg = load;
  7546. +   se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
  7547. +
  7548. +   /* Update parent cfs_rq load */
  7549. +   add_positive(&cfs_rq->avg.load_avg, delta);
  7550. +   cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
  7551. +
  7552. +   /*
  7553. +    * If the sched_entity is already enqueued, we also have to update the
  7554. +    * runnable load avg.
  7555. +    */
  7556. +   if (se->on_rq) {
  7557. +       /* Update parent cfs_rq runnable_load_avg */
  7558. +       add_positive(&cfs_rq->runnable_load_avg, delta);
  7559. +       cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
  7560. +   }
  7561. +}
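
A worked example (invented numbers, plain C) of the correction term described in the comment inside update_tg_cfs_load() above: when the whole task group is worth more load than its shares allow, the group cfs_rq's load is scaled down proportionally before being propagated into the parent entity:

#include <stdio.h>

int main(void)
{
    long shares  = 1024;   /* scale_load_down(tg->shares)            */
    long load    = 800;    /* gcfs_rq->avg.load_avg on this CPU      */
    long tg_load = 2000;   /* group-wide load, already >= local load */

    if (tg_load > shares) {
        load *= shares;
        load /= tg_load;
    }
    /* 800 * 1024 / 2000 = 409: this CPU's proportional share of the group weight */
    printf("propagated load = %ld\n", load);
    return 0;
}
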
  7562. +
  7563. +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
  7564. +{
  7565. +   cfs_rq->propagate_avg = 1;
  7566. +}
  7567. +
  7568. +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
  7569. +{
  7570. +   struct cfs_rq *cfs_rq = group_cfs_rq(se);
  7571. +
  7572. +   if (!cfs_rq->propagate_avg)
  7573. +       return 0;
  7574. +
  7575. +   cfs_rq->propagate_avg = 0;
  7576. +   return 1;
  7577. +}
  7578. +
  7579. +/* Update task and its cfs_rq load average */
  7580. +static inline int propagate_entity_load_avg(struct sched_entity *se)
  7581. +{
  7582. +   struct cfs_rq *cfs_rq;
  7583. +
  7584. +   if (entity_is_task(se))
  7585. +       return 0;
  7586. +
  7587. +   if (!test_and_clear_tg_cfs_propagate(se))
  7588. +       return 0;
  7589. +
  7590. +   cfs_rq = cfs_rq_of(se);
  7591. +
  7592. +   set_tg_cfs_propagate(cfs_rq);
  7593. +
  7594. +   update_tg_cfs_util(cfs_rq, se);
  7595. +   update_tg_cfs_load(cfs_rq, se);
  7596. +
  7597. +   return 1;
  7598. +}
  7599. +
  7600. +/*
  7601. + * Check if we need to update the load and the utilization of a blocked
  7602. + * group_entity:
  7603. + */
  7604. +static inline bool skip_blocked_update(struct sched_entity *se)
  7605. +{
  7606. +   struct cfs_rq *gcfs_rq = group_cfs_rq(se);
  7607. +
  7608. +   /*
  7609. +    * If the sched_entity still has a non-zero load or utilization, we have to
  7610. +    * decay it:
  7611. +    */
  7612. +   if (se->avg.load_avg || se->avg.util_avg)
  7613. +       return false;
  7614. +
  7615. +   /*
  7616. +    * If there is a pending propagation, we have to update the load and
  7617. +    * the utilization of the sched_entity:
  7618. +    */
  7619. +   if (gcfs_rq->propagate_avg)
  7620. +       return false;
  7621. +
  7622. +   /*
  7623. +    * Otherwise, the load and the utilization of the sched_entity is
  7624. +    * already zero and there is no pending propagation, so it will be a
  7625. +    * waste of time to try to decay it:
  7626. +    */
  7627. +   return true;
  7628. +}
  7629. +
  7630.  #else /* CONFIG_FAIR_GROUP_SCHED */
  7631. +
  7632.  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
  7633. +
  7634. +static inline int propagate_entity_load_avg(struct sched_entity *se)
  7635. +{
  7636. +   return 0;
  7637. +}
  7638. +
  7639. +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
  7640. +
  7641.  #endif /* CONFIG_FAIR_GROUP_SCHED */
  7642.  
  7643. +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
  7644. +{
  7645. +   if (&this_rq()->cfs == cfs_rq) {
  7646. +       /*
  7647. +        * There are a few boundary cases this might miss but it should
  7648. +        * get called often enough that that should (hopefully) not be
  7649. +        * a real problem -- added to that it only calls on the local
  7650. +        * CPU, so if we enqueue remotely we'll miss an update, but
  7651. +        * the next tick/schedule should update.
  7652. +        *
  7653. +        * It will not get called when we go idle, because the idle
  7654. +        * thread is a different class (!fair), nor will the utilization
  7655. +        * number include things like RT tasks.
  7656. +        *
  7657. +        * As is, the util number is not freq-invariant (we'd have to
  7658. +        * implement arch_scale_freq_capacity() for that).
  7659. +        *
  7660. +        * See cpu_util().
  7661. +        */
  7662. +       cpufreq_update_util(rq_of(cfs_rq), 0);
  7663. +   }
  7664. +}
  7665. +
  7666.  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  7667.  
  7668. -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
  7669. -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  7670. +/*
  7671. + * Unsigned subtract and clamp on underflow.
  7672. + *
  7673. + * Explicitly do a load-store to ensure the intermediate value never hits
  7674. + * memory. This allows lockless observations without ever seeing the negative
  7675. + * values.
  7676. + */
  7677. +#define sub_positive(_ptr, _val) do {              \
  7678. +   typeof(_ptr) ptr = (_ptr);              \
  7679. +   typeof(*ptr) val = (_val);              \
  7680. +   typeof(*ptr) res, var = READ_ONCE(*ptr);        \
  7681. +   res = var - val;                    \
  7682. +   if (res > var)                      \
  7683. +       res = 0;                    \
  7684. +   WRITE_ONCE(*ptr, res);                  \
  7685. +} while (0)
  7686. +
  7687. +/**
  7688. + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
  7689. + * @now: current time, as per cfs_rq_clock_task()
  7690. + * @cfs_rq: cfs_rq to update
  7691. + * @update_freq: should we call cfs_rq_util_change() or will the call do so
  7692. + *
  7693. + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
  7694. + * avg. The immediate corollary is that all (fair) tasks must be attached, see
  7695. + * post_init_entity_util_avg().
  7696. + *
  7697. + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
  7698. + *
  7699. + * Returns true if the load decayed or we removed load.
  7700. + *
  7701. + * Since both these conditions indicate a changed cfs_rq->avg.load we should
  7702. + * call update_tg_load_avg() when this function returns true.
  7703. + */
  7704. +static inline int
  7705. +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  7706.  {
  7707.     struct sched_avg *sa = &cfs_rq->avg;
  7708. -   int decayed, removed = 0;
  7709. +   int decayed, removed = 0, removed_util = 0;
  7710.  
  7711.     if (atomic_long_read(&cfs_rq->removed_load_avg)) {
  7712. -       long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
  7713. -       sa->load_avg = max_t(long, sa->load_avg - r, 0);
  7714. -       sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
  7715. +       s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
  7716. +       sub_positive(&sa->load_avg, r);
  7717. +       sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
  7718.         removed = 1;
  7719. +       set_tg_cfs_propagate(cfs_rq);
  7720.     }
  7721.  
  7722.     if (atomic_long_read(&cfs_rq->removed_util_avg)) {
  7723.         long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
  7724. -       sa->util_avg = max_t(long, sa->util_avg - r, 0);
  7725. -       sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
  7726. +       sub_positive(&sa->util_avg, r);
  7727. +       sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
  7728. +       removed_util = 1;
  7729. +       set_tg_cfs_propagate(cfs_rq);
  7730.     }
  7731.  
  7732.     decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
  7733. @@ -2539,81 +3088,94 @@
  7734.     cfs_rq->load_last_update_time_copy = sa->last_update_time;
  7735.  #endif
  7736.  
  7737. +   /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
  7738. +   if (cfs_rq == &rq_of(cfs_rq)->cfs)
  7739. +       trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
  7740. +
  7741. +   if (update_freq && (decayed || removed_util))
  7742. +       cfs_rq_util_change(cfs_rq);
  7743. +
  7744.     return decayed || removed;
  7745.  }
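
A userspace sketch of the removed-load handling in update_cfs_rq_load_avg() above: departing tasks add their load to an atomic "removed" accumulator, which the owning CPU later drains with an exchange and subtracts with the clamp-on-underflow behaviour of sub_positive(). C11 atomics stand in for the kernel's atomic_long_*() helpers; numbers are invented:

#include <stdio.h>
#include <stdatomic.h>

static atomic_long   removed_load_avg;     /* cfs_rq->removed_load_avg */
static unsigned long load_avg = 300;       /* cfs_rq->avg.load_avg     */

static void remove_task_load(long task_load)
{
    atomic_fetch_add(&removed_load_avg, task_load);
}

static void drain_removed_load(void)
{
    long r = atomic_exchange(&removed_load_avg, 0);  /* atomic_long_xchg() */
    unsigned long res = load_avg - (unsigned long)r;

    if (res > load_avg)   /* wrapped around: clamp like sub_positive() */
        res = 0;
    load_avg = res;
}

int main(void)
{
    remove_task_load(120);
    remove_task_load(250);   /* more than is accounted for: will clamp */
    drain_removed_load();
    printf("load_avg after drain: %lu\n", load_avg);   /* 0, not a huge wrap */
    return 0;
}
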
  7746.  
  7747. -static inline unsigned long task_util_est(struct task_struct *p)
  7748. -{
  7749. -   return p->se.avg.util_est;
  7750. -}
  7751. +/*
  7752. + * Optional action to be done while updating the load average
  7753. + */
  7754. +#define UPDATE_TG  0x1
  7755. +#define SKIP_AGE_LOAD  0x2
  7756. +#define SKIP_CPUFREQ   0x4
  7757.  
  7758.  /* Update task and its cfs_rq load average */
  7759. -static inline void update_load_avg(struct sched_entity *se, int update_tg)
  7760. +static inline void update_load_avg(struct sched_entity *se, int flags)
  7761.  {
  7762.     struct cfs_rq *cfs_rq = cfs_rq_of(se);
  7763.     u64 now = cfs_rq_clock_task(cfs_rq);
  7764.     int cpu = cpu_of(rq_of(cfs_rq));
  7765. +   int decayed;
  7766. +   void *ptr = NULL;
  7767.  
  7768.     /*
  7769.      * Track task load average for carrying it to new CPU after migrated, and
  7770.      * track group sched_entity load average for task_h_load calc in migration
  7771.      */
  7772. -   __update_load_avg(now, cpu, &se->avg,
  7773. +   if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
  7774. +       __update_load_avg(now, cpu, &se->avg,
  7775.               se->on_rq * scale_load_down(se->load.weight),
  7776.               cfs_rq->curr == se, NULL);
  7777. +   }
  7778.  
  7779. -   if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
  7780. -       update_tg_load_avg(cfs_rq, 0);
  7781. +   decayed  = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ));
  7782. +   decayed |= propagate_entity_load_avg(se);
  7783.  
  7784. -   if (entity_is_task(se))
  7785. -       trace_sched_load_avg_task(task_of(se), &se->avg);
  7786. -   trace_sched_load_avg_cpu(cpu, cfs_rq);
  7787. +   if (decayed && (flags & UPDATE_TG))
  7788. +       update_tg_load_avg(cfs_rq, 0);
  7789.  
  7790. -   /* Update task estimated utilization */
  7791. -   if (se->avg.util_est < se->avg.util_avg) {
  7792. -       cfs_rq->avg.util_est += (se->avg.util_avg - se->avg.util_est);
  7793. -       se->avg.util_est = se->avg.util_avg;
  7794. +   if (entity_is_task(se)) {
  7795. +#ifdef CONFIG_SCHED_WALT
  7796. +       ptr = (void *)&(task_of(se)->ravg);
  7797. +#endif
  7798. +       trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
  7799.     }
  7800. -
  7801.  }
  7802.  
  7803. +/**
  7804. + * attach_entity_load_avg - attach this entity to its cfs_rq load avg
  7805. + * @cfs_rq: cfs_rq to attach to
  7806. + * @se: sched_entity to attach
  7807. + *
  7808. + * Must call update_cfs_rq_load_avg() before this, since we rely on
  7809. + * cfs_rq->avg.last_update_time being current.
  7810. + */
  7811.  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7812.  {
  7813. -   if (!sched_feat(ATTACH_AGE_LOAD))
  7814. -       goto skip_aging;
  7815. -
  7816. -   /*
  7817. -    * If we got migrated (either between CPUs or between cgroups) we'll
  7818. -    * have aged the average right before clearing @last_update_time.
  7819. -    */
  7820. -   if (se->avg.last_update_time) {
  7821. -       __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
  7822. -                 &se->avg, 0, 0, NULL);
  7823. -
  7824. -       /*
  7825. -        * XXX: we could have just aged the entire load away if we've been
  7826. -        * absent from the fair class for too long.
  7827. -        */
  7828. -   }
  7829. -
  7830. -skip_aging:
  7831.     se->avg.last_update_time = cfs_rq->avg.last_update_time;
  7832.     cfs_rq->avg.load_avg += se->avg.load_avg;
  7833.     cfs_rq->avg.load_sum += se->avg.load_sum;
  7834.     cfs_rq->avg.util_avg += se->avg.util_avg;
  7835.     cfs_rq->avg.util_sum += se->avg.util_sum;
  7836. +   set_tg_cfs_propagate(cfs_rq);
  7837. +
  7838. +   cfs_rq_util_change(cfs_rq);
  7839.  }
  7840.  
  7841. +/**
  7842. + * detach_entity_load_avg - detach this entity from its cfs_rq load avg
  7843. + * @cfs_rq: cfs_rq to detach from
  7844. + * @se: sched_entity to detach
  7845. + *
  7846. + * Must call update_cfs_rq_load_avg() before this, since we rely on
  7847. + * cfs_rq->avg.last_update_time being current.
  7848. + */
  7849.  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7850.  {
  7851. -   __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
  7852. -             &se->avg, se->on_rq * scale_load_down(se->load.weight),
  7853. -             cfs_rq->curr == se, NULL);
  7854.  
  7855. -   cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
  7856. -   cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
  7857. -   cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
  7858. -   cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
  7859. +   sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
  7860. +   sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
  7861. +   sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
  7862. +   sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
  7863. +   set_tg_cfs_propagate(cfs_rq);
  7864. +
  7865. +   cfs_rq_util_change(cfs_rq);
  7866.  }
  7867.  
  7868.  /* Add the load generated by se into cfs_rq's load average */
  7869. @@ -2621,34 +3183,20 @@
  7870.  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7871.  {
  7872.     struct sched_avg *sa = &se->avg;
  7873. -   u64 now = cfs_rq_clock_task(cfs_rq);
  7874. -   int migrated, decayed;
  7875. -
  7876. -   migrated = !sa->last_update_time;
  7877. -   if (!migrated) {
  7878. -       __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
  7879. -           se->on_rq * scale_load_down(se->load.weight),
  7880. -           cfs_rq->curr == se, NULL);
  7881. -   }
  7882. -
  7883. -   decayed = update_cfs_rq_load_avg(now, cfs_rq);
  7884.  
  7885.     cfs_rq->runnable_load_avg += sa->load_avg;
  7886.     cfs_rq->runnable_load_sum += sa->load_sum;
  7887.  
  7888. -   if (migrated)
  7889. +   if (!sa->last_update_time) {
  7890.         attach_entity_load_avg(cfs_rq, se);
  7891. -
  7892. -   if (decayed || migrated)
  7893.         update_tg_load_avg(cfs_rq, 0);
  7894. +   }
  7895.  }
  7896.  
  7897.  /* Remove the runnable load generated by se from cfs_rq's runnable load average */
  7898.  static inline void
  7899.  dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7900.  {
  7901. -   update_load_avg(se, 1);
  7902. -
  7903.     cfs_rq->runnable_load_avg =
  7904.         max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
  7905.     cfs_rq->runnable_load_sum =
  7906. @@ -2677,46 +3225,36 @@
  7907.  #endif
  7908.  
  7909.  /*
  7910. + * Synchronize entity load avg of dequeued entity without locking
  7911. + * the previous rq.
  7912. + */
  7913. +void sync_entity_load_avg(struct sched_entity *se)
  7914. +{
  7915. +   struct cfs_rq *cfs_rq = cfs_rq_of(se);
  7916. +   u64 last_update_time;
  7917. +
  7918. +   last_update_time = cfs_rq_last_update_time(cfs_rq);
  7919. +   __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
  7920. +}
  7921. +
  7922. +/*
  7923.   * Task first catches up with cfs_rq, and then subtract
  7924.   * itself from the cfs_rq (task must be off the queue now).
  7925.   */
  7926.  void remove_entity_load_avg(struct sched_entity *se)
  7927.  {
  7928.     struct cfs_rq *cfs_rq = cfs_rq_of(se);
  7929. -   u64 last_update_time;
  7930.  
  7931.     /*
  7932.      * Newly created task or never used group entity should not be removed
  7933.      * from its (source) cfs_rq
  7934.      */
  7935. -   if (se->avg.last_update_time == 0)
  7936. -       return;
  7937. -
  7938. -   last_update_time = cfs_rq_last_update_time(cfs_rq);
  7939.  
  7940. -   __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
  7941. +   sync_entity_load_avg(se);
  7942.     atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
  7943.     atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
  7944.  }
  7945.  
  7946. -/*
  7947. - * Update the rq's load with the elapsed running time before entering
  7948. - * idle. if the last scheduled task is not a CFS task, idle_enter will
  7949. - * be the only way to update the runnable statistic.
  7950. - */
  7951. -void idle_enter_fair(struct rq *this_rq)
  7952. -{
  7953. -}
  7954. -
  7955. -/*
  7956. - * Update the rq's load with the elapsed idle time before a task is
  7957. - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
  7958. - * be the only way to update the runnable statistic.
  7959. - */
  7960. -void idle_exit_fair(struct rq *this_rq)
  7961. -{
  7962. -}
  7963. -
  7964.  static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
  7965.  {
  7966.     return cfs_rq->runnable_load_avg;
  7967. @@ -2731,7 +3269,17 @@
  7968.  
  7969.  #else /* CONFIG_SMP */
  7970.  
  7971. -static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
  7972. +static inline int
  7973. +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  7974. +{
  7975. +   return 0;
  7976. +}
  7977. +
  7978. +#define UPDATE_TG  0x0
  7979. +#define SKIP_AGE_LOAD  0x0
  7980. +#define SKIP_CPUFREQ   0x0
  7981. +
  7982. +static inline void update_load_avg(struct sched_entity *se, int not_used1){}
  7983.  static inline void
  7984.  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  7985.  static inline void
  7986. @@ -2750,69 +3298,6 @@
  7987.  
  7988.  #endif /* CONFIG_SMP */
  7989.  
  7990. -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  7991. -{
  7992. -#ifdef CONFIG_SCHEDSTATS
  7993. -   struct task_struct *tsk = NULL;
  7994. -
  7995. -   if (entity_is_task(se))
  7996. -       tsk = task_of(se);
  7997. -
  7998. -   if (se->statistics.sleep_start) {
  7999. -       u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
  8000. -
  8001. -       if ((s64)delta < 0)
  8002. -           delta = 0;
  8003. -
  8004. -       if (unlikely(delta > se->statistics.sleep_max))
  8005. -           se->statistics.sleep_max = delta;
  8006. -
  8007. -       se->statistics.sleep_start = 0;
  8008. -       se->statistics.sum_sleep_runtime += delta;
  8009. -
  8010. -       if (tsk) {
  8011. -           account_scheduler_latency(tsk, delta >> 10, 1);
  8012. -           trace_sched_stat_sleep(tsk, delta);
  8013. -       }
  8014. -   }
  8015. -   if (se->statistics.block_start) {
  8016. -       u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
  8017. -
  8018. -       if ((s64)delta < 0)
  8019. -           delta = 0;
  8020. -
  8021. -       if (unlikely(delta > se->statistics.block_max))
  8022. -           se->statistics.block_max = delta;
  8023. -
  8024. -       se->statistics.block_start = 0;
  8025. -       se->statistics.sum_sleep_runtime += delta;
  8026. -
  8027. -       if (tsk) {
  8028. -           if (tsk->in_iowait) {
  8029. -               se->statistics.iowait_sum += delta;
  8030. -               se->statistics.iowait_count++;
  8031. -               trace_sched_stat_iowait(tsk, delta);
  8032. -           }
  8033. -
  8034. -           trace_sched_stat_blocked(tsk, delta);
  8035. -           trace_sched_blocked_reason(tsk);
  8036. -
  8037. -           /*
  8038. -            * Blocking time is in units of nanosecs, so shift by
  8039. -            * 20 to get a milliseconds-range estimation of the
  8040. -            * amount of time that the task spent sleeping:
  8041. -            */
  8042. -           if (unlikely(prof_on == SLEEP_PROFILING)) {
  8043. -               profile_hits(SLEEP_PROFILING,
  8044. -                       (void *)get_wchan(tsk),
  8045. -                       delta >> 20);
  8046. -           }
  8047. -           account_scheduler_latency(tsk, delta >> 10, 0);
  8048. -       }
  8049. -   }
  8050. -#endif
  8051. -}
  8052. -
  8053.  static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
  8054.  {
  8055.  #ifdef CONFIG_SCHED_DEBUG
  8056. @@ -2822,7 +3307,7 @@
  8057.         d = -d;
  8058.  
  8059.     if (d > 3*sysctl_sched_latency)
  8060. -       schedstat_inc(cfs_rq, nr_spread_over);
  8061. +       schedstat_inc(cfs_rq->nr_spread_over);
  8062.  #endif
  8063.  }
  8064.  
  8065. @@ -2860,6 +3345,26 @@
  8066.  
  8067.  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
  8068.  
  8069. +static inline void check_schedstat_required(void)
  8070. +{
  8071. +#ifdef CONFIG_SCHEDSTATS
  8072. +   if (schedstat_enabled())
  8073. +       return;
  8074. +
  8075. +   /* Force schedstat enabled if a dependent tracepoint is active */
  8076. +   if (trace_sched_stat_wait_enabled()    ||
  8077. +           trace_sched_stat_sleep_enabled()   ||
  8078. +           trace_sched_stat_iowait_enabled()  ||
  8079. +           trace_sched_stat_blocked_enabled() ||
  8080. +           trace_sched_stat_runtime_enabled())  {
  8081. +       pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
  8082. +                "stat_blocked and stat_runtime require the "
  8083. +                "kernel parameter schedstats=enabled or "
  8084. +                "kernel.sched_schedstats=1\n");
  8085. +   }
  8086. +#endif
  8087. +}
  8088. +
  8089.  static void
  8090.  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  8091.  {
  8092. @@ -2874,16 +3379,16 @@
  8093.      * Update run-time statistics of the 'current'.
  8094.      */
  8095.     update_curr(cfs_rq);
  8096. +   update_load_avg(se, UPDATE_TG);
  8097.     enqueue_entity_load_avg(cfs_rq, se);
  8098. +   update_cfs_shares(se);
  8099.     account_entity_enqueue(cfs_rq, se);
  8100. -   update_cfs_shares(cfs_rq);
  8101.  
  8102. -   if (flags & ENQUEUE_WAKEUP) {
  8103. +   if (flags & ENQUEUE_WAKEUP)
  8104.         place_entity(cfs_rq, se, 0);
  8105. -       enqueue_sleeper(cfs_rq, se);
  8106. -   }
  8107.  
  8108. -   update_stats_enqueue(cfs_rq, se);
  8109. +   check_schedstat_required();
  8110. +   update_stats_enqueue(cfs_rq, se, flags);
  8111.     check_spread(cfs_rq, se);
  8112.     if (se != cfs_rq->curr)
  8113.         __enqueue_entity(cfs_rq, se);
  8114. @@ -2945,25 +3450,30 @@
  8115.  static void
  8116.  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  8117.  {
  8118. +   int update_flags;
  8119. +
  8120.     /*
  8121.      * Update run-time statistics of the 'current'.
  8122.      */
  8123.     update_curr(cfs_rq);
  8124. -   dequeue_entity_load_avg(cfs_rq, se);
  8125.  
  8126. -   update_stats_dequeue(cfs_rq, se);
  8127. -   if (flags & DEQUEUE_SLEEP) {
  8128. -#ifdef CONFIG_SCHEDSTATS
  8129. -       if (entity_is_task(se)) {
  8130. -           struct task_struct *tsk = task_of(se);
  8131. +   /*
  8132. +    * When dequeuing a sched_entity, we must:
  8133. +    *   - Update loads to have both entity and cfs_rq synced with now.
  8134. +    *   - Subtract its load from the cfs_rq->runnable_avg.
  8135. +    *   - Subtract its previous weight from cfs_rq->load.weight.
  8136. +    *   - For group entity, update its weight to reflect the new share
  8137. +    *     of its group cfs_rq.
  8138. +    */
  8139. +   update_flags = UPDATE_TG;
  8140.  
  8141. -           if (tsk->state & TASK_INTERRUPTIBLE)
  8142. -               se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
  8143. -           if (tsk->state & TASK_UNINTERRUPTIBLE)
  8144. -               se->statistics.block_start = rq_clock(rq_of(cfs_rq));
  8145. -       }
  8146. -#endif
  8147. -   }
  8148. +   if (flags & DEQUEUE_IDLE)
  8149. +       update_flags |= SKIP_CPUFREQ;
  8150. +
  8151. +   update_load_avg(se, update_flags);
  8152. +   dequeue_entity_load_avg(cfs_rq, se);
  8153. +
  8154. +   update_stats_dequeue(cfs_rq, se, flags);
  8155.  
  8156.     clear_buddies(cfs_rq, se);
  8157.  
  8158. @@ -2983,8 +3493,16 @@
  8159.     /* return excess runtime on last dequeue */
  8160.     return_cfs_rq_runtime(cfs_rq);
  8161.  
  8162. -   update_min_vruntime(cfs_rq);
  8163. -   update_cfs_shares(cfs_rq);
  8164. +   update_cfs_shares(se);
  8165. +
  8166. +   /*
  8167. +    * Now advance min_vruntime if @se was the entity holding it back,
  8168. +    * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
  8169. +    * put back on, and if we advance min_vruntime, we'll be placed back
  8170. +    * further than we started -- ie. we'll be penalized.
  8171. +    */
  8172. +   if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
  8173. +       update_min_vruntime(cfs_rq);
  8174.  }
  8175.  
  8176.  /*
  8177. @@ -3039,22 +3557,23 @@
  8178.          */
  8179.         update_stats_wait_end(cfs_rq, se);
  8180.         __dequeue_entity(cfs_rq, se);
  8181. -       update_load_avg(se, 1);
  8182. +       update_load_avg(se, UPDATE_TG);
  8183.     }
  8184.  
  8185.     update_stats_curr_start(cfs_rq, se);
  8186.     cfs_rq->curr = se;
  8187. -#ifdef CONFIG_SCHEDSTATS
  8188. +
  8189.     /*
  8190.      * Track our maximum slice length, if the CPU's load is at
  8191.      * least twice that of our own weight (i.e. dont track it
  8192.      * when there are only lesser-weight tasks around):
  8193.      */
  8194. -   if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
  8195. -       se->statistics.slice_max = max(se->statistics.slice_max,
  8196. -           se->sum_exec_runtime - se->prev_sum_exec_runtime);
  8197. +   if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
  8198. +       schedstat_set(se->statistics.slice_max,
  8199. +           max((u64)schedstat_val(se->statistics.slice_max),
  8200. +               se->sum_exec_runtime - se->prev_sum_exec_runtime));
  8201.     }
  8202. -#endif
  8203. +
  8204.     se->prev_sum_exec_runtime = se->sum_exec_runtime;
  8205.  }
  8206.  
  8207. @@ -3134,6 +3653,7 @@
  8208.     check_cfs_rq_runtime(cfs_rq);
  8209.  
  8210.     check_spread(cfs_rq, prev);
  8211. +
  8212.     if (prev->on_rq) {
  8213.         update_stats_wait_start(cfs_rq, prev);
  8214.         /* Put 'current' back into the tree. */
  8215. @@ -3155,8 +3675,8 @@
  8216.     /*
  8217.      * Ensure that runnable average is periodically updated.
  8218.      */
  8219. -   update_load_avg(curr, 1);
  8220. -   update_cfs_shares(cfs_rq);
  8221. +   update_load_avg(curr, UPDATE_TG);
  8222. +   update_cfs_shares(curr);
  8223.  
  8224.  #ifdef CONFIG_SCHED_HRTICK
  8225.     /*
  8226. @@ -3255,7 +3775,7 @@
  8227.  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  8228.  {
  8229.     if (unlikely(cfs_rq->throttle_count))
  8230. -       return cfs_rq->throttled_clock_task;
  8231. +       return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
  8232.  
  8233.     return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
  8234.  }
  8235. @@ -3393,13 +3913,11 @@
  8236.     struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  8237.  
  8238.     cfs_rq->throttle_count--;
  8239. -#ifdef CONFIG_SMP
  8240.     if (!cfs_rq->throttle_count) {
  8241.         /* adjust cfs_rq_clock_task() */
  8242.         cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
  8243.                          cfs_rq->throttled_clock_task;
  8244.     }
  8245. -#endif
  8246.  
  8247.     return 0;
  8248.  }
  8249. @@ -3766,6 +4284,23 @@
  8250.         throttle_cfs_rq(cfs_rq);
  8251.  }
  8252.  
  8253. +static void sync_throttle(struct task_group *tg, int cpu)
  8254. +{
  8255. +   struct cfs_rq *pcfs_rq, *cfs_rq;
  8256. +
  8257. +   if (!cfs_bandwidth_used())
  8258. +       return;
  8259. +
  8260. +   if (!tg->parent)
  8261. +       return;
  8262. +
  8263. +   cfs_rq = tg->cfs_rq[cpu];
  8264. +   pcfs_rq = tg->parent->cfs_rq[cpu];
  8265. +
  8266. +   cfs_rq->throttle_count = pcfs_rq->throttle_count;
  8267. +   pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
  8268. +}
  8269. +
  8270.  /* conditionally throttle active cfs_rq's from put_prev_entity() */
  8271.  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  8272.  {
  8273. @@ -3851,6 +4386,10 @@
  8274.  
  8275.  static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  8276.  {
  8277. +   /* init_cfs_bandwidth() was not called */
  8278. +   if (!cfs_b->throttled_cfs_rq.next)
  8279. +       return;
  8280. +
  8281.     hrtimer_cancel(&cfs_b->period_timer);
  8282.     hrtimer_cancel(&cfs_b->slack_timer);
  8283.  }
  8284. @@ -3901,6 +4440,7 @@
  8285.  static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
  8286.  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
  8287.  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
  8288. +static inline void sync_throttle(struct task_group *tg, int cpu) {}
  8289.  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  8290.  
  8291.  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
  8292. @@ -3945,9 +4485,9 @@
  8293.     struct sched_entity *se = &p->se;
  8294.     struct cfs_rq *cfs_rq = cfs_rq_of(se);
  8295.  
  8296. -   WARN_ON(task_rq(p) != rq);
  8297. +   SCHED_WARN_ON(task_rq(p) != rq);
  8298.  
  8299. -   if (cfs_rq->nr_running > 1) {
  8300. +   if (rq->cfs.h_nr_running > 1) {
  8301.         u64 slice = sched_slice(cfs_rq, se);
  8302.         u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
  8303.         s64 delta = slice - ran;
  8304. @@ -3988,21 +4528,25 @@
  8305.  #endif
  8306.  
  8307.  #ifdef CONFIG_SMP
  8308. +static bool __cpu_overutilized(int cpu, int delta);
  8309.  static bool cpu_overutilized(int cpu);
  8310. +unsigned long boosted_cpu_util(int cpu);
  8311. +#else
  8312. +#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
  8313.  #endif
  8314.  
  8315. -#ifdef CONFIG_CPU_FREQ_GOV_SCHED
  8316. -static void update_capacity_of(int cpu, bool request)
  8317. +#ifdef CONFIG_SMP
  8318. +static void update_capacity_of(int cpu)
  8319.  {
  8320.     unsigned long req_cap;
  8321.  
  8322.     if (!sched_freq())
  8323.         return;
  8324.  
  8325. -   /* Convert scale-invariant capacity to cpu. */
  8326. +   /* Normalize scale-invariant capacity to cpu. */
  8327.     req_cap = boosted_cpu_util(cpu);
  8328.     req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
  8329. -   set_cfs_cpu_capacity(cpu, request, req_cap);
  8330. +   set_cfs_cpu_capacity(cpu, true, req_cap);
  8331.  }
  8332.  #endif
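
A worked example (invented figures) of the normalization performed in update_capacity_of() above: the boosted utilization, expressed in absolute capacity units, is rescaled relative to this CPU's original capacity on the usual 0..1024 scale before being handed to set_cfs_cpu_capacity(). The kernel-side sched_freq() gate is not modelled here:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

int main(void)
{
    unsigned long boosted_util  = 300;   /* boosted_cpu_util(cpu)            */
    unsigned long capacity_orig = 512;   /* capacity_orig_of(cpu), small CPU */
    unsigned long req_cap;

    req_cap = boosted_util * SCHED_CAPACITY_SCALE / capacity_orig;
    /* 300 * 1024 / 512 = 600, i.e. ~59% of this CPU's own capacity */
    printf("requested capacity = %lu / 1024\n", req_cap);
    return 0;
}
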
  8333.  
  8334. @@ -4019,8 +4563,35 @@
  8335.  #ifdef CONFIG_SMP
  8336.     int task_new = flags & ENQUEUE_WAKEUP_NEW;
  8337.     int task_wakeup = flags & ENQUEUE_WAKEUP;
  8338. +
  8339. +   /*
  8340. +    * Update SchedTune accounting.
  8341. +    *
  8342. +    * We do it before updating the CPU capacity to ensure the
  8343. +    * boost value of the current task is accounted for in the
  8344. +    * selection of the OPP.
  8345. +    *
  8346. +    * We do it also in the case where we enqueue a throttled task;
  8347. +    * we could argue that a throttled task should not boost a CPU,
  8348. +    * however:
  8349. +    * a) properly implementing CPU boosting considering throttled
  8350. +    *    tasks would greatly increase the complexity of the solution
  8351. +    * b) it's not easy to quantify the benefits introduced by
  8352. +    *    such a more complex solution.
  8353. +    * Thus, for the time being we go for the simple solution and boost
  8354. +    * also for throttled RQs.
  8355. +    */
  8356. +   schedtune_enqueue_task(p, cpu_of(rq));
  8357.  #endif
  8358.  
  8359. +   /*
  8360. +    * If in_iowait is set, the code below may not trigger any cpufreq
  8361. +    * utilization updates, so do it here explicitly with the IOWAIT flag
  8362. +    * passed.
  8363. +    */
  8364. +   if (p->in_iowait)
  8365. +       cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
  8366. +
  8367.     for_each_sched_entity(se) {
  8368.         if (se->on_rq)
  8369.             break;
  8370. @@ -4032,7 +4603,7 @@
  8371.          *
  8372.          * note: in the case of encountering a throttled cfs_rq we will
  8373.          * post the final h_nr_running increment below.
  8374. -       */
  8375. +        */
  8376.         if (cfs_rq_throttled(cfs_rq))
  8377.             break;
  8378.         cfs_rq->h_nr_running++;
  8379. @@ -4049,34 +4620,14 @@
  8380.         if (cfs_rq_throttled(cfs_rq))
  8381.             break;
  8382.  
  8383. -       update_load_avg(se, 1);
  8384. -       update_cfs_shares(cfs_rq);
  8385. +       update_load_avg(se, UPDATE_TG);
  8386. +       update_cfs_shares(se);
  8387.     }
  8388.  
  8389.     if (!se)
  8390.         add_nr_running(rq, 1);
  8391.  
  8392.  #ifdef CONFIG_SMP
  8393. -
  8394. -   /*
  8395. -    * Update SchedTune accouting.
  8396. -    *
  8397. -    * We do it before updating the CPU capacity to ensure the
  8398. -    * boost value of the current task is accounted for in the
  8399. -    * selection of the OPP.
  8400. -    *
  8401. -    * We do it also in the case where we enqueue a trottled task;
  8402. -    * we could argue that a throttled task should not boost a CPU,
  8403. -    * however:
  8404. -    * a) properly implementing CPU boosting considering throttled
  8405. -    *    tasks will increase a lot the complexity of the solution
  8406. -    * b) it's not easy to quantify the benefits introduced by
  8407. -    *    such a more complex solution.
  8408. -    * Thus, for the time being we go for the simple solution and boost
  8409. -    * also for throttled RQs.
  8410. -    */
  8411. -   schedtune_enqueue_task(p, cpu_of(rq));
  8412. -
  8413.     if (!se) {
  8414.         walt_inc_cumulative_runnable_avg(rq, p);
  8415.         if (!task_new && !rq->rd->overutilized &&
  8416. @@ -4093,17 +4644,10 @@
  8417.          * request after load balancing is done.
  8418.          */
  8419.         if (task_new || task_wakeup)
  8420. -           update_capacity_of(cpu_of(rq), true);
  8421. +           update_capacity_of(cpu_of(rq));
  8422.     }
  8423.  
  8424. -   /* Get the top level CFS RQ for the task CPU */
  8425. -   cfs_rq = &(task_rq(p)->cfs);
  8426. -
  8427. -   /* Update RQ estimated utilization */
  8428. -   cfs_rq->avg.util_est += task_util_est(p);
  8429. -
  8430.  #endif /* CONFIG_SMP */
  8431. -
  8432.     hrtick_update(rq);
  8433.  }
  8434.  
  8435. @@ -4120,6 +4664,20 @@
  8436.     struct sched_entity *se = &p->se;
  8437.     int task_sleep = flags & DEQUEUE_SLEEP;
  8438.  
  8439. +   if (task_sleep && rq->nr_running == 1)
  8440. +       flags |= DEQUEUE_IDLE;
  8441. +
  8442. +#ifdef CONFIG_SMP
  8443. +   /*
  8444. +    * Update SchedTune accounting
  8445. +    *
  8446. +    * We do it before updating the CPU capacity to ensure the
  8447. +    * boost value of the current task is accounted for in the
  8448. +    * selection of the OPP.
  8449. +    */
  8450. +   schedtune_dequeue_task(p, cpu_of(rq));
  8451. +#endif
  8452. +
  8453.     for_each_sched_entity(se) {
  8454.         cfs_rq = cfs_rq_of(se);
  8455.         dequeue_entity(cfs_rq, se, flags);
  8456. @@ -4137,21 +4695,22 @@
  8457.  
  8458.         /* Don't dequeue parent if it has other entities besides us */
  8459.         if (cfs_rq->load.weight) {
  8460. +           /* Avoid re-evaluating load for this entity: */
  8461. +           se = parent_entity(se);
  8462.             /*
  8463.              * Bias pick_next to pick a task from this cfs_rq, as
  8464.              * p is sleeping when it is within its sched_slice.
  8465.              */
  8466. -           if (task_sleep && parent_entity(se))
  8467. -               set_next_buddy(parent_entity(se));
  8468. -
  8469. -           /* avoid re-evaluating load for this entity */
  8470. -           se = parent_entity(se);
  8471. +           if (task_sleep && se && !throttled_hierarchy(cfs_rq))
  8472. +               set_next_buddy(se);
  8473.             break;
  8474.         }
  8475.         flags |= DEQUEUE_SLEEP;
  8476.     }
  8477.  
  8478.     for_each_sched_entity(se) {
  8479. +       int update_flags;
  8480. +
  8481.         cfs_rq = cfs_rq_of(se);
  8482.         cfs_rq->h_nr_running--;
  8483.         walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
  8484. @@ -4159,24 +4718,19 @@
  8485.         if (cfs_rq_throttled(cfs_rq))
  8486.             break;
  8487.  
  8488. -       update_load_avg(se, 1);
  8489. -       update_cfs_shares(cfs_rq);
  8490. +       update_flags = UPDATE_TG;
  8491. +
  8492. +       if (flags & DEQUEUE_IDLE)
  8493. +           update_flags |= SKIP_CPUFREQ;
  8494. +
  8495. +       update_load_avg(se, update_flags);
  8496. +       update_cfs_shares(se);
  8497.     }
  8498.  
  8499.     if (!se)
  8500.         sub_nr_running(rq, 1);
  8501.  
  8502.  #ifdef CONFIG_SMP
  8503. -
  8504. -   /*
  8505. -    * Update SchedTune accouting
  8506. -    *
  8507. -    * We do it before updating the CPU capacity to ensure the
  8508. -    * boost value of the current task is accounted for in the
  8509. -    * selection of the OPP.
  8510. -    */
  8511. -   schedtune_dequeue_task(p, cpu_of(rq));
  8512. -
  8513.     if (!se) {
  8514.         walt_dec_cumulative_runnable_avg(rq, p);
  8515.  
  8516. @@ -4190,26 +4744,12 @@
  8517.          */
  8518.         if (task_sleep) {
  8519.             if (rq->cfs.nr_running)
  8520. -               update_capacity_of(cpu_of(rq), true);
  8521. +               update_capacity_of(cpu_of(rq));
  8522.             else if (sched_freq())
  8523. -               update_capacity_of(cpu_of(rq), false);
  8524. +               set_cfs_cpu_capacity(cpu_of(rq), false, 0); /* no normalization required for 0 */
  8525.         }
  8526.     }
  8527.  
  8528. -   /* Get the top level CFS RQ for the task CPU */
  8529. -   cfs_rq = &(task_rq(p)->cfs);
  8530. -
  8531. -   /* Update RQ estimated utilization */
  8532. -   if (cfs_rq->avg.util_est >= task_util_est(p))
  8533. -       cfs_rq->avg.util_est -= task_util_est(p);
  8534. -   else
  8535. -       cfs_rq->avg.util_est = 0;
  8536. -
  8537. -
  8538. -   /* Update estimated utilization */
  8539. -   if (task_sleep)
  8540. -       p->se.avg.util_est = p->se.avg.util_avg;
  8541. -
  8542.  #endif /* CONFIG_SMP */
  8543.  
  8544.     hrtick_update(rq);
  8545. @@ -4545,25 +5085,30 @@
  8546.         return wl;
  8547.  
  8548.     for_each_sched_entity(se) {
  8549. -       long w, W;
  8550. +       struct cfs_rq *cfs_rq = se->my_q;
  8551. +       long W, w = cfs_rq_load_avg(cfs_rq);
  8552.  
  8553. -       tg = se->my_q->tg;
  8554. +       tg = cfs_rq->tg;
  8555.  
  8556.         /*
  8557.          * W = @wg + \Sum rw_j
  8558.          */
  8559. -       W = wg + calc_tg_weight(tg, se->my_q);
  8560. +       W = wg + atomic_long_read(&tg->load_avg);
  8561. +
  8562. +       /* Ensure \Sum rw_j >= rw_i */
  8563. +       W -= cfs_rq->tg_load_avg_contrib;
  8564. +       W += w;
  8565.  
  8566.         /*
  8567.          * w = rw_i + @wl
  8568.          */
  8569. -       w = cfs_rq_load_avg(se->my_q) + wl;
  8570. +       w += wl;
  8571.  
  8572.         /*
  8573.          * wl = S * s'_i; see (2)
  8574.          */
  8575.         if (W > 0 && w < W)
  8576. -           wl = (w * tg->shares) / W;
  8577. +           wl = (w * (long)tg->shares) / W;
  8578.         else
  8579.             wl = tg->shares;
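
A worked example (invented numbers) of the per-level weight computation in the effective_load() hunk above: W is first corrected so that it is at least as large as the locally observed cfs_rq load, and the entity then gets the w/W fraction of tg->shares:

#include <stdio.h>

int main(void)
{
    long shares  = 1024;   /* tg->shares                      */
    long tg_load = 900;    /* atomic_long_read(&tg->load_avg) */
    long contrib = 200;    /* cfs_rq->tg_load_avg_contrib     */
    long w       = 300;    /* cfs_rq_load_avg(cfs_rq)         */
    long wg      = 0;      /* group weight being added        */
    long wl      = 100;    /* load delta being evaluated      */
    long W;

    W  = wg + tg_load;
    W -= contrib;          /* ensure \Sum rw_j >= rw_i ...           */
    W += w;                /* ... by substituting the fresh local w  */

    w += wl;

    if (W > 0 && w < W)
        wl = (w * shares) / W;
    else
        wl = shares;

    /* W = 1000, w = 400  ->  wl = 400 * 1024 / 1000 = 409 */
    printf("effective load contribution: %ld\n", wl);
    return 0;
}
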
  8580.  
  8581. @@ -4612,16 +5157,95 @@
  8582.            >> SCHED_CAPACITY_SHIFT;
  8583.  }
  8584.  
  8585. +/*
  8586. + * Returns the current capacity of cpu after applying both
  8587. + * cpu and min freq scaling.
  8588. + */
  8589. +unsigned long capacity_min_of(int cpu)
  8590. +{
  8591. +   if (!sched_feat(MIN_CAPACITY_CAPPING))
  8592. +       return 0;
  8593. +   return arch_scale_cpu_capacity(NULL, cpu) *
  8594. +          arch_scale_min_freq_capacity(NULL, cpu)
  8595. +          >> SCHED_CAPACITY_SHIFT;
  8596. +}
  8597. +
  8598. +
  8599.  static inline bool energy_aware(void)
  8600.  {
  8601.     return sched_feat(ENERGY_AWARE);
  8602.  }
  8603.  
  8604.  /*
  8605. + * CPU candidates.
  8606. + *
  8607. + * These are labels to reference CPU candidates for an energy_diff.
  8608. + * Currently we support only two possible candidates: the task's previous CPU
  8609. + * and another candidate CPU.
  8610. + * More advanced/aggressive EAS selection policies can consider more
  8611. + * candidates.
  8612. + */
  8613. +#define EAS_CPU_PRV    0
  8614. +#define EAS_CPU_NXT    1
  8615. +#define EAS_CPU_BKP    2
  8616. +#define EAS_CPU_CNT    3
  8617. +
  8618. +/*
  8619. + * energy_diff - supports the computation of the estimated energy impact in
  8620. + * moving a "task"'s "util_delta" between different CPU candidates.
  8621. + */
  8622. +struct energy_env {
  8623. +   /* Utilization to move */
  8624. +   struct task_struct  *p;
  8625. +   int         util_delta;
  8626. +
  8627. +   /* Mask of CPUs candidates to evaluate */
  8628. +   cpumask_t       cpus_mask;
  8629. +
  8630. +   /* CPU candidates to evaluate */
  8631. +   struct {
  8632. +
  8633. +       /* CPU ID, must be in cpus_mask */
  8634. +       int cpu_id;
  8635. +
  8636. +       /*
  8637. +        * Index (into sched_group_energy::cap_states) of the OPP the
  8638. +        * CPU needs to run at if the task is placed on it.
  8639. +        * This includes both the active and blocked load due to
  8640. +        * other tasks on this CPU, as well as the task's own
  8641. +        * utilization.
  8642. +        */
  8643. +       int cap_idx;
  8644. +       int cap;
  8645. +
  8646. +       /* Estimated system energy */
  8647. +       unsigned int energy;
  8648. +
  8649. +       /* Estimated energy variation wrt EAS_CPU_PRV */
  8650. +       int nrg_delta;
  8651. +
  8652. +   } cpu[EAS_CPU_CNT];
  8653. +
  8654. +   /*
  8655. +    * Index (into energy_env::cpu) of the most energy-efficient CPU for
  8656. +    * the specified energy_env::task
  8657. +    */
  8658. +   int         next_idx;
  8659. +
  8660. +   /* Support data */
  8661. +   struct sched_group  *sg_top;
  8662. +   struct sched_group  *sg_cap;
  8663. +   struct sched_group  *sg;
  8664. +};
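
The struct above only defines the bookkeeping; as a small, self-contained sketch of how the candidate array is meant to be used, with energy deltas taken against the task's previous CPU (EAS_CPU_PRV) and next_idx pointing at the cheapest candidate, as the field comments describe. The energy figures are invented:

#include <stdio.h>

#define EAS_CPU_PRV 0
#define EAS_CPU_NXT 1
#define EAS_CPU_BKP 2
#define EAS_CPU_CNT 3

int main(void)
{
    unsigned int energy[EAS_CPU_CNT] = { 500, 430, 470 };  /* PRV, NXT, BKP */
    int nrg_delta[EAS_CPU_CNT];
    int next_idx = EAS_CPU_PRV;
    int i;

    for (i = EAS_CPU_PRV; i < EAS_CPU_CNT; i++) {
        nrg_delta[i] = (int)energy[i] - (int)energy[EAS_CPU_PRV];
        if (nrg_delta[i] < nrg_delta[next_idx])
            next_idx = i;   /* keep the candidate with the lowest energy */
    }
    printf("best candidate index: %d (delta %d)\n",
           next_idx, nrg_delta[next_idx]);
    return 0;
}
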
  8665. +
  8666. +static int cpu_util_wake(int cpu, struct task_struct *p);
  8667. +
  8668. +/*
  8669.   * __cpu_norm_util() returns the cpu util relative to a specific capacity,
  8670. - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
  8671. - * energy calculations. Using the scale-invariant util returned by
  8672. - * cpu_util() and approximating scale-invariant util by:
  8673. + * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
  8674. + * energy calculations.
  8675. + *
  8676. + * Since util is a scale-invariant utilization defined as:
  8677.   *
  8678.   *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
  8679.   *
  8680. @@ -4631,34 +5255,41 @@
  8681.   *
  8682.   *   norm_util = running_time/time ~ util/capacity
  8683.   */
  8684. -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
  8685. +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
  8686.  {
  8687. -   int util = __cpu_util(cpu, delta, UTIL_EST);
  8688. -
  8689.     if (util >= capacity)
  8690.         return SCHED_CAPACITY_SCALE;
  8691.  
  8692.     return (util << SCHED_CAPACITY_SHIFT)/capacity;
  8693.  }
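
A userspace sketch of __cpu_norm_util() above: utilization is expressed as a busy ratio of the capacity the group will run at, saturating at SCHED_CAPACITY_SCALE. The example values are invented:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

static unsigned long cpu_norm_util(unsigned long util, unsigned long capacity)
{
    if (util >= capacity)
        return SCHED_CAPACITY_SCALE;
    return (util << SCHED_CAPACITY_SHIFT) / capacity;
}

int main(void)
{
    /* 150 units of util at a 600-capacity OPP -> 256/1024, i.e. 25% busy */
    printf("norm_util = %lu\n", cpu_norm_util(150, 600));
    /* an over-committed CPU saturates at 1024 */
    printf("norm_util = %lu\n", cpu_norm_util(700, 600));
    return 0;
}
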
  8694.  
  8695. -static int calc_util_delta(struct energy_env *eenv, int cpu)
  8696. -{
  8697. -   if (cpu == eenv->src_cpu)
  8698. -       return -eenv->util_delta;
  8699. -   if (cpu == eenv->dst_cpu)
  8700. -       return eenv->util_delta;
  8701. -   return 0;
  8702. -}
  8703. -
  8704. -static
  8705. -unsigned long group_max_util(struct energy_env *eenv)
  8706. +static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
  8707.  {
  8708. -   int i, delta;
  8709.     unsigned long max_util = 0;
  8710. +   unsigned long util;
  8711. +   int cpu;
  8712. +
  8713. +   for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
  8714. +       util = cpu_util_wake(cpu, eenv->p);
  8715. +
  8716. +       /*
  8717. +        * If we are looking at the target CPU specified by the eenv,
  8718. +        * then we should add the (estimated) utilization of the task
  8719. +        * assuming we will wake it up on that CPU.
  8720. +        */
  8721. +       if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
  8722. +           util += eenv->util_delta;
  8723. +
  8724. +       max_util = max(max_util, util);
  8725. +
  8726. +       /*
  8727. +        * Take into account any minimum frequency imposed
  8728. +        * elsewhere, which limits the energy states available.
  8729. +        * If the MIN_CAPACITY_CAPPING feature is not enabled,
  8730. +        * capacity_min_of() will return 0 (not capped).
  8731. +        */
  8732. +       max_util = max(max_util, capacity_min_of(cpu));
  8733.  
  8734. -   for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
  8735. -       delta = calc_util_delta(eenv, i);
  8736. -       max_util = max(max_util, __cpu_util(i, delta, UTIL_EST));
  8737.     }
  8738.  
  8739.     return max_util;
  8740. @@ -4666,93 +5297,67 @@
  8741.  
  8742.  /*
  8743.   * group_norm_util() returns the approximated group util relative to it's
  8744. - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
  8745. - * energy calculations. Since task executions may or may not overlap in time in
  8746. - * the group the true normalized util is between max(cpu_norm_util(i)) and
  8747. - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
  8748. - * latter is used as the estimate as it leads to a more pessimistic energy
  8749. + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
  8750. + * in energy calculations.
  8751. + *
  8752. + * Since task executions may or may not overlap in time in the group the true
  8753. + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
  8754. + * when iterating over all CPUs in the group.
  8755. + * The latter estimate is used as it leads to a more pessimistic energy
  8756.   * estimate (more busy).
  8757.   */
  8758.  static unsigned
  8759. -long group_norm_util(struct energy_env *eenv)
  8760. +long group_norm_util(struct energy_env *eenv, int cpu_idx)
  8761.  {
  8762. -   int i, delta;
  8763. -   unsigned long util_sum = 0;
  8764. -   struct sched_group *sg = eenv->sg;
  8765. -   unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
  8766. +   unsigned long capacity = eenv->cpu[cpu_idx].cap;
  8767. +   unsigned long util, util_sum = 0;
  8768. +   int cpu;
  8769.  
  8770. -   for_each_cpu(i, sched_group_cpus(sg)) {
  8771. -       delta = calc_util_delta(eenv, i);
  8772. -       util_sum += __cpu_norm_util(i, capacity, delta);
  8773. -   }
  8774. +   for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
  8775. +       util = cpu_util_wake(cpu, eenv->p);
  8776.  
  8777. -   if (util_sum > SCHED_CAPACITY_SCALE)
  8778. -       return SCHED_CAPACITY_SCALE;
  8779. -   return util_sum;
  8780. -}
  8781. +       /*
  8782. +        * If we are looking at the target CPU specified by the eenv,
  8783. +        * then we should add the (estimated) utilization of the task
  8784. +        * assuming we will wake it up on that CPU.
  8785. +        */
  8786. +       if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
  8787. +           util += eenv->util_delta;
  8788.  
  8789. -#ifdef CONFIG_SCHED_TUNE
  8790. -static inline int
  8791. -find_min_capacity(struct energy_env *eenv)
  8792. -{
  8793. -   const struct sched_group_energy const *sge = eenv->sg->sge;
  8794. -   unsigned long min_capacity, cur_capacity;
  8795. -   int min_cap_idx, cap_idx;
  8796. -   unsigned long min_util;
  8797. -
  8798. -   /* Non boosted tasks do not affect the minimum capacity */
  8799. -   if (!schedtune_task_boost(eenv->task))
  8800. -       return eenv->cap_idx;
  8801. -
  8802. -   /* Find minimum capacity to satify the task boost value */
  8803. -   min_util = boosted_task_util(eenv->task);
  8804. -   for (min_cap_idx = 0; min_cap_idx < (sge->nr_cap_states-1); min_cap_idx++) {
  8805. -       if (sge->cap_states[min_cap_idx].cap >= min_util)
  8806. -           break;
  8807. +       util_sum += __cpu_norm_util(util, capacity);
  8808.     }
  8809. -   min_capacity = sge->cap_states[min_cap_idx].cap;
  8810. -
  8811. -   /* The current capacity is the one computed by the caller */
  8812. -   cur_capacity = sge->cap_states[eenv->cap_idx].cap;
  8813. -
  8814. -   /*
  8815. -    * Compute the minumum CPU capacity required to support task boosting
  8816. -    * within this SG.
  8817. -    */
  8818. -   cur_capacity = max(min_capacity, cur_capacity);
  8819. -   cap_idx = max(eenv->cap_idx, min_cap_idx);
  8820.  
  8821. -   return cap_idx;
  8822. +   return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
  8823.  }
  8824. -#else
  8825. -#define find_min_capacity(eenv) eenv->cap_idx
  8826. -#endif /* CONFIG_SCHED_TUNE */
  8827.  
  8828. -static int find_new_capacity(struct energy_env *eenv)
  8829. +static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
  8830.  {
  8831. -   const struct sched_group_energy const *sge = eenv->sg->sge;
  8832. +   const struct sched_group_energy *sge = eenv->sg->sge;
  8833.     int idx, max_idx = sge->nr_cap_states - 1;
  8834. -   unsigned long util = group_max_util(eenv);
  8835. +   unsigned long util = group_max_util(eenv, cpu_idx);
  8836.  
  8837.     /* default is max_cap if we don't find a match */
  8838. -   eenv->cap_idx = max_idx;
  8839. +   eenv->cpu[cpu_idx].cap_idx = max_idx;
  8840. +   eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
  8841.  
  8842.     for (idx = 0; idx < sge->nr_cap_states; idx++) {
  8843.         if (sge->cap_states[idx].cap >= util) {
  8844. -           /* Keep track of SG's capacity index */
  8845. -           eenv->cap_idx = idx;
  8846. +           /* Keep track of SG's capacity */
  8847. +           eenv->cpu[cpu_idx].cap_idx = idx;
  8848. +           eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
  8849.             break;
  8850.         }
  8851.     }
  8852. -   /* Update SG's capacity based on boost value of the current task */
  8853. -   eenv->cap_idx = find_min_capacity(eenv);
  8854.  
  8855. -   return eenv->cap_idx;
  8856. +   return eenv->cpu[cpu_idx].cap_idx;
  8857.  }
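
A userspace sketch of the selection loop in find_new_capacity() above: walk the capacity states from slowest to fastest and pick the first OPP able to serve the group's maximum utilization, defaulting to the top OPP. The capacity table is invented:

#include <stdio.h>

struct cap_state { unsigned long cap; };

int main(void)
{
    struct cap_state cap_states[] = { {250}, {420}, {600}, {780}, {1024} };
    int nr_cap_states = 5;
    unsigned long util = 500;            /* group_max_util() result */
    int idx, cap_idx = nr_cap_states - 1; /* default: highest OPP   */

    for (idx = 0; idx < nr_cap_states; idx++) {
        if (cap_states[idx].cap >= util) {
            cap_idx = idx;
            break;
        }
    }
    printf("selected OPP index %d (cap %lu)\n",
           cap_idx, cap_states[cap_idx].cap);   /* index 2, cap 600 */
    return 0;
}
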
  8858.  
  8859. -static int group_idle_state(struct sched_group *sg)
  8860. +static int group_idle_state(struct energy_env *eenv, int cpu_idx)
  8861.  {
  8862. +   struct sched_group *sg = eenv->sg;
  8863.     int i, state = INT_MAX;
  8864. +   int src_in_grp, dst_in_grp;
  8865. +   long grp_util = 0;
  8866.  
  8867.     /* Find the shallowest idle state in the sched group. */
  8868.     for_each_cpu(i, sched_group_cpus(sg))
  8869. @@ -4761,114 +5366,161 @@
  8870.     /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
  8871.     state++;
  8872.  
  8873. +   src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
  8874. +                     sched_group_cpus(sg));
  8875. +   dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
  8876. +                     sched_group_cpus(sg));
  8877. +   if (src_in_grp == dst_in_grp) {
  8878. +       /* both CPUs under consideration are in the same group or not in
  8879. +        * either group, migration should leave idle state the same.
  8880. +        */
  8881. +       goto end;
  8882. +   }
  8883. +
  8884. +   /*
  8885. +    * Try to estimate if a deeper idle state is
  8886. +    * achievable when we move the task.
  8887. +    */
  8888. +   for_each_cpu(i, sched_group_cpus(sg)) {
  8889. +       grp_util += cpu_util_wake(i, eenv->p);
  8890. +       if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
  8891. +           grp_util += eenv->util_delta;
  8892. +   }
  8893. +
  8894. +   if (grp_util <=
  8895. +       ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
  8896. +       /* after moving, this group is at most partly
  8897. +        * occupied, so it should have some idle time.
  8898. +        */
  8899. +       int max_idle_state_idx = sg->sge->nr_idle_states - 2;
  8900. +       int new_state = grp_util * max_idle_state_idx;
  8901. +       if (grp_util <= 0)
  8902. +           /* group will have no util, use lowest state */
  8903. +           new_state = max_idle_state_idx + 1;
  8904. +       else {
  8905. +           /* for partially idle, linearly map util to idle
  8906. +            * states, excluding the lowest one. This does not
  8907. +            * correspond to the state we expect to enter in
  8908. +            * reality, but an indication of what might happen.
  8909. +            */
  8910. +           new_state = min(max_idle_state_idx, (int)
  8911. +                   (new_state / sg->sgc->max_capacity));
  8912. +           new_state = max_idle_state_idx - new_state;
  8913. +       }
  8914. +       state = new_state;
  8915. +   } else {
  8916. +       /* After moving, the group will be fully occupied
  8917. +        * so assume it will not be idle at all.
  8918. +        */
  8919. +       state = 0;
  8920. +   }
  8921. +end:
  8922.     return state;
  8923.  }
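The new group_idle_state() above estimates how idle the group could still be after a hypothetical migration: a fully occupied group gets state 0, a group with no utilization gets the deepest state, and everything in between is mapped linearly onto the remaining idle states. A small self-contained sketch of that mapping, with invented capacities and state counts:

/* Illustrative sketch only; all numbers are invented. */
#include <stdio.h>

/* Map a group's post-migration utilization to an estimated idle state,
 * in the spirit of group_idle_state() above: 0 = busy, highest = deepest. */
static int estimate_idle_state(long grp_util, long max_capacity,
                               int group_weight, int nr_idle_states)
{
    int max_idle_state_idx = nr_idle_states - 2;
    int new_state;

    if (grp_util > max_capacity * group_weight)
        return 0;                        /* fully occupied: no idle time */
    if (grp_util <= 0)
        return max_idle_state_idx + 1;   /* no utilization: deepest state */

    /* linear mapping onto the idle states, deepest one excluded */
    new_state = grp_util * max_idle_state_idx / max_capacity;
    if (new_state > max_idle_state_idx)
        new_state = max_idle_state_idx;
    return max_idle_state_idx - new_state;
}

int main(void)
{
    /* invented: 3 idle states, capacity 1024, 4 CPUs in the group */
    printf("%d\n", estimate_idle_state(0, 1024, 4, 3));    /* deepest */
    printf("%d\n", estimate_idle_state(512, 1024, 4, 3));  /* shallower */
    printf("%d\n", estimate_idle_state(5000, 1024, 4, 3)); /* 0: busy */
    return 0;
}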
  8924.  
  8925.  /*
  8926. - * Compute energy for the eenv's SG (i.e. eenv->sg).
  8927. + * sched_group_energy(): Computes the absolute energy consumption of cpus
  8928. + * belonging to the sched_group including shared resources shared only by
  8929. + * members of the group. Iterates over all cpus in the hierarchy below the
  8930. + * sched_group starting from the bottom working its way up before going to
  8931. + * the next cpu until all cpus are covered at all levels. The current
  8932. + * implementation is likely to gather the same util statistics multiple times.
  8933. + * This can probably be done in a faster but more complex way.
  8934. + * Note: sched_group_energy() may fail when racing with sched_domain updates.
  8935. + * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
  8936. + *
  8937. + * This works in iterations to compute the SG's energy for each CPU
  8938. + * candidate defined by the energy_env's cpu array.
  8939.   *
  8940. - * This works in two iterations:
  8941. - * first iteration, before moving the utilization, i.e.
  8942. - *   util_delta == 0
  8943. - * second iteration, after moving the utilization, i.e.
  8944. - *   util_delta != 0
  8945. + * NOTE: in the following computations for busy_energy and idle_energy we do
  8946. + * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
  8947. + * The required scaling will be performed just one time, by the calling
  8948. + * functions, once we have accumulated the contributions for all the SGs.
  8949.   */
  8950. -static void before_after_energy(struct energy_env *eenv)
  8951. +static void calc_sg_energy(struct energy_env *eenv)
  8952.  {
  8953. -
  8954. -   int sg_busy_energy, sg_idle_energy;
  8955.     struct sched_group *sg = eenv->sg;
  8956. -   unsigned long util_delta;
  8957. -   unsigned long group_util;
  8958. +   int busy_energy, idle_energy;
  8959. +   unsigned int busy_power;
  8960. +   unsigned int idle_power;
  8961. +   unsigned long sg_util;
  8962.     int cap_idx, idle_idx;
  8963.     int total_energy = 0;
  8964. -   unsigned int cap;
  8965. -   bool after;
  8966. -
  8967. -   util_delta = eenv->util_delta;
  8968. -   eenv->util_delta = 0;
  8969. -   after = false;
  8970. -
  8971. -compute_after:
  8972. +   int cpu_idx;
  8973.  
  8974. -   idle_idx = group_idle_state(sg);
  8975. +   for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
  8976.  
  8977. -   cap_idx = find_new_capacity(eenv);
  8978. -   group_util = group_norm_util(eenv);
  8979. -   cap = sg->sge->cap_states[cap_idx].cap;
  8980.  
  8981. -   sg_busy_energy   = group_util * sg->sge->cap_states[cap_idx].power;
  8982. -   sg_busy_energy >>= SCHED_CAPACITY_SHIFT;
  8983. +       if (eenv->cpu[cpu_idx].cpu_id == -1)
  8984. +           continue;
  8985. +       /* Compute ACTIVE energy */
  8986. +       cap_idx = find_new_capacity(eenv, cpu_idx);
  8987. +       busy_power = sg->sge->cap_states[cap_idx].power;
  8988. +       /*
  8989. +        * in order to calculate cpu_norm_util, we need to know which
  8990. +        * capacity level the group will be at, so calculate that first
  8991. +        */
  8992. +       sg_util = group_norm_util(eenv, cpu_idx);
  8993.  
  8994. -   sg_idle_energy   = SCHED_CAPACITY_SCALE - group_util;
  8995. -   sg_idle_energy  *= sg->sge->idle_states[idle_idx].power;
  8996. -   sg_idle_energy >>= SCHED_CAPACITY_SHIFT;
  8997. +       busy_energy   = sg_util * busy_power;
  8998.  
  8999. -   total_energy = sg_busy_energy + sg_idle_energy;
  9000. +       /* Compute IDLE energy */
  9001. +       idle_idx = group_idle_state(eenv, cpu_idx);
  9002. +       idle_power = sg->sge->idle_states[idle_idx].power;
  9003.  
  9004. -   /* Account for "after" metrics */
  9005. -   if (after) {
  9006. -       if (sg->group_weight == 1 &&
  9007. -           cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
  9008. -           eenv->after.utilization = group_util;
  9009. -           eenv->after.capacity = cap;
  9010. -       }
  9011. -       eenv->after.energy += total_energy;
  9012. -       return;
  9013. -   }
  9014. +       idle_energy   = SCHED_CAPACITY_SCALE - sg_util;
  9015. +       idle_energy  *= idle_power;
  9016.  
  9017. -   /* Account for "before" metrics */
  9018. -   if (sg->group_weight == 1 &&
  9019. -       cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
  9020. -       eenv->after.utilization = group_util;
  9021. -       eenv->before.capacity = cap;
  9022. +       total_energy = busy_energy + idle_energy;
  9023. +       eenv->cpu[cpu_idx].energy += total_energy;
  9024.     }
  9025. -   eenv->before.energy += total_energy;
  9026. -
  9027. -   /* Setup eenv for the "after" case */
  9028. -   eenv->util_delta = util_delta;
  9029. -   after = true;
  9030. -
  9031. -   goto compute_after;
  9032. -
  9033.  }
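calc_sg_energy() above accumulates, for every candidate CPU, a busy term (normalized group utilization times the power of the chosen capacity state) and an idle term (the remaining idle fraction times the idle-state power), deliberately leaving out the SCHED_CAPACITY_SHIFT scaling as its comment explains. A toy example of how the two terms combine; the numbers are invented:

/* Illustrative sketch only; all numbers are invented. */
#include <stdio.h>

#define SCALE 1024   /* stands in for SCHED_CAPACITY_SCALE */
#define SHIFT 10     /* stands in for SCHED_CAPACITY_SHIFT */

int main(void)
{
    unsigned long sg_util = 384;      /* normalized group utilization */
    unsigned long busy_power = 300;   /* power of the selected capacity state */
    unsigned long idle_power = 20;    /* power of the estimated idle state */

    unsigned long busy_energy = sg_util * busy_power;
    unsigned long idle_energy = (SCALE - sg_util) * idle_power;
    unsigned long total_unscaled = busy_energy + idle_energy;

    /* the caller rescales once, after summing every group's contribution */
    printf("unscaled=%lu scaled=%lu\n", total_unscaled, total_unscaled >> SHIFT);
    return 0;
}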
  9034.  
  9035.  /*
  9036. - * sched_group_energy(): Computes the absolute energy consumption of cpus
  9037. - * belonging to the sched_group including shared resources shared only by
  9038. - * members of the group. Iterates over all cpus in the hierarchy below the
  9039. - * sched_group starting from the bottom working it's way up before going to
  9040. - * the next cpu until all cpus are covered at all levels. The current
  9041. - * implementation is likely to gather the same util statistics multiple times.
  9042. - * This can probably be done in a faster but more complex way.
  9043. - * Note: sched_group_energy() may fail when racing with sched_domain updates.
  9044. + * compute_energy() computes the absolute variation in energy consumption by
  9045. + * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
  9046. + *
  9047. + * NOTE: compute_energy() may fail when racing with sched_domain updates, in
  9048. + *       which case we abort by returning -EINVAL.
  9049.   */
  9050. -static int sched_group_energy(struct energy_env *eenv)
  9051. +static int compute_energy(struct energy_env *eenv)
  9052.  {
  9053. -   struct sched_domain *sd;
  9054.     struct cpumask visit_cpus;
  9055. -   struct sched_group *sg;
  9056. -   int cpu;
  9057. +   int cpu_count;
  9058.  
  9059.     WARN_ON(!eenv->sg_top->sge);
  9060.  
  9061.     cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
  9062. +   /* If a cpu is hotplugged in while we are in this function,
  9063. +    * it does not appear in the existing visit_cpus mask
  9064. +    * which came from the sched_group pointer of the
  9065. +    * sched_domain pointed at by sd_ea for either the prev
  9066. +    * or next cpu and was dereferenced in __energy_diff.
  9067. +    * Since we will dereference sd_scs later as we iterate
  9068. +    * through the CPUs we expect to visit, new CPUs can
  9069. +    * be present which are not in the visit_cpus mask.
  9070. +    * Guard this with cpu_count.
  9071. +    */
  9072. +   cpu_count = cpumask_weight(&visit_cpus);
  9073.  
  9074.     while (!cpumask_empty(&visit_cpus)) {
  9075.         struct sched_group *sg_shared_cap = NULL;
  9076. -
  9077. -       cpu = cpumask_first(&visit_cpus);
  9078. +       int cpu = cpumask_first(&visit_cpus);
  9079. +       struct sched_domain *sd;
  9080.  
  9081.         /*
  9082.          * Is the group utilization affected by cpus outside this
  9083.          * sched_group?
  9084. +        * This sd may have groups with cpus which were not present
  9085. +        * when we took visit_cpus.
  9086.          */
  9087.         sd = rcu_dereference(per_cpu(sd_scs, cpu));
  9088. +
  9089.         if (sd && sd->parent)
  9090.             sg_shared_cap = sd->parent->groups;
  9091.  
  9092.         for_each_domain(cpu, sd) {
  9093. -           sg = sd->groups;
  9094. +           struct sched_group *sg = sd->groups;
  9095.  
  9096.             /* Has this sched_domain already been visited? */
  9097.             if (sd->child && group_first_cpu(sg) != cpu)
  9098. @@ -4878,18 +5530,52 @@
  9099.                 eenv->sg_cap = sg;
  9100.                 if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
  9101.                     eenv->sg_cap = sg_shared_cap;
  9102. +               else
  9103. +                   eenv->sg_cap = sg;
  9104.  
  9105. +               /*
  9106. +                * Compute the energy for all the candidate
  9107. +                * CPUs in the current visited SG.
  9108. +                */
  9109.                 eenv->sg = sg;
  9110. -               before_after_energy(eenv);
  9111. -
  9112. -               if (!sd->child)
  9113. +               calc_sg_energy(eenv);
  9114. +
  9115. +               /* remove CPUs we have just visited */
  9116. +               if (!sd->child) {
  9117. +                   /*
  9118. +                    * cpu_count here is the number of
  9119. +                    * cpus we expect to visit in this
  9120. +                    * calculation. If we race against
  9121. +                    * hotplug, we can have extra cpus
  9122. +                    * added to the groups we are
  9123. +                    * iterating which do not appear in
  9124. +                    * the visit_cpus mask. In that case
  9125. +                    * we are not able to calculate energy
  9126. +                    * without restarting so we will bail
  9127. +                    * out and use prev_cpu this time.
  9128. +                    */
  9129. +                   if (!cpu_count)
  9130. +                       return -EINVAL;
  9131.                     cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
  9132. +                   cpu_count--;
  9133. +               }
  9134.  
  9135.                 if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
  9136.                     goto next_cpu;
  9137.  
  9138.             } while (sg = sg->next, sg != sd->groups);
  9139.         }
  9140. +
  9141. +       /*
  9142. +        * If we raced with hotplug and got an sd NULL-pointer;
  9143. +        * returning a wrong energy estimation is better than
  9144. +        * entering an infinite loop.
  9145. +        * Specifically: If a cpu is unplugged after we took
  9146. +        * the visit_cpus mask, it no longer has an sd_scs
  9147. +        * pointer, so when we dereference it, we get NULL.
  9148. +        */
  9149. +       if (cpumask_test_cpu(cpu, &visit_cpus))
  9150. +           return -EINVAL;
  9151.  next_cpu:
  9152.         cpumask_clear_cpu(cpu, &visit_cpus);
  9153.         continue;
  9154. @@ -4903,168 +5589,103 @@
  9155.     return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
  9156.  }
  9157.  
  9158. -static inline int normalize_energy(int energy_diff);
  9159. -
  9160. -#define eenv_before(__X) eenv->before.__X
  9161. -#define eenv_after(__X)  eenv->after.__X
  9162. -#define eenv_delta(__X)  eenv->after.__X - eenv->before.__X
  9163. -
  9164. -static inline void
  9165. -__update_perf_energy_deltas(struct energy_env *eenv)
  9166. -{
  9167. -   unsigned long task_util = eenv->util_delta;
  9168. -
  9169. -   /*
  9170. -    * SpeedUp Index
  9171. -    *
  9172. -    *   SPI := cpu_capacity - task_util
  9173. -    *
  9174. -    * which estimate how sooner a task will complete when running
  9175. -    * on an higher OPP wrt the minimum required.
  9176. -    */
  9177. -   eenv_before(speedup_idx) = eenv_before(capacity) - task_util;
  9178. -   eenv_after(speedup_idx)  = eenv_after(capacity) - task_util;
  9179. -
  9180. -   /*
  9181. -    * Delay Index
  9182. -    *
  9183. -    *   DLI := 1024 * (cpu_util - task_util) / cpu_util
  9184. -    *
  9185. -    * which represents the "fraction" of CPU bandwidth consumed by other
  9186. -    * tasks in the worst case, i.e. assuming all other tasks runs before.
  9187. -    *
  9188. -    * NOTE: in the above formula we assume that "cpu_util" includes
  9189. -    *       already the task utilization.
  9190. -    */
  9191. -   eenv_before(delay_idx)  =  SCHED_CAPACITY_SCALE;
  9192. -   eenv_before(delay_idx) *= (eenv_before(utilization) - task_util);
  9193. -   eenv_before(delay_idx) /=  eenv_before(utilization);
  9194. -   eenv_after(delay_idx)   =  SCHED_CAPACITY_SCALE;
  9195. -   eenv_after(delay_idx)  *= (eenv_after(utilization) - task_util);
  9196. -   eenv_after(delay_idx)  /=  eenv_after(utilization);
  9197. -
  9198. -   /* Performance Variation */
  9199. -   eenv->prf_delta = eenv_delta(speedup_idx) - eenv_delta(delay_idx);
  9200. -
  9201. -   /* Energy Variation */
  9202. -   eenv->nrg_delta = normalize_energy(eenv_delta(energy));
  9203. -
  9204. -}
  9205. -
  9206.  /*
  9207. - * energy_diff(): Estimate the energy impact of changing the utilization
  9208. - * distribution. eenv specifies the change: utilisation amount, source, and
  9209. - * destination cpu. Source or destination cpu may be -1 in which case the
  9210. - * utilization is removed from or added to the system (e.g. task wake-up). If
  9211. - * both are specified, the utilization is migrated.
  9212. + * select_energy_cpu_idx(): estimate the energy impact of changing the
  9213. + * utilization distribution.
  9214. + *
  9215. + * The eenv parameter specifies the changes: utilisation amount and a pair of
  9216. + * possible CPU candidates (the previous CPU and a different target CPU).
  9217. + *
  9218. + * This function returns the index of a CPU candidate specified by the
  9219. + * energy_env which corresponds to the first CPU saving energy.
  9220. + * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
  9221. + * efficient than running on prev_cpu. This is also the value returned in case
  9222. + * of abort due to error conditions during the computations.
  9223. + * A value greater than zero means that the first energy-efficient CPU is the
  9224. + * one represented by eenv->cpu[eenv->next_idx].cpu_id.
  9225.   */
  9226. -static inline int __energy_diff(struct energy_env *eenv)
  9227. +static inline int select_energy_cpu_idx(struct energy_env *eenv)
  9228.  {
  9229.     struct sched_domain *sd;
  9230.     struct sched_group *sg;
  9231.     int sd_cpu = -1;
  9232. +   int cpu_idx;
  9233. +   int margin;
  9234.  
  9235. -   if (eenv->src_cpu == eenv->dst_cpu)
  9236. -       return 0;
  9237. -
  9238. -   sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
  9239. +   sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
  9240.     sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
  9241. +
  9242.     if (!sd)
  9243. -       return 0; /* Error */
  9244. +       return EAS_CPU_PRV;
  9245. +
  9246. +   cpumask_clear(&eenv->cpus_mask);
  9247. +   for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
  9248. +       int cpu = eenv->cpu[cpu_idx].cpu_id;
  9249. +
  9250. +       if (cpu < 0)
  9251. +           continue;
  9252. +       cpumask_set_cpu(cpu, &eenv->cpus_mask);
  9253. +   }
  9254.  
  9255.     sg = sd->groups;
  9256. +
  9257.     do {
  9258. -       if (!cpu_in_sg(sg, eenv->src_cpu) &&
  9259. -           !cpu_in_sg(sg, eenv->dst_cpu))
  9260. +       /* Skip SGs which do not contain a candidate CPU */
  9261. +       if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
  9262.             continue;
  9263.  
  9264.         eenv->sg_top = sg;
  9265. -       if (sched_group_energy(eenv))
  9266. -           return 0; /* Invalid result abort */
  9267. +       /* energy is unscaled to reduce rounding errors */
  9268. +       if (compute_energy(eenv) == -EINVAL)
  9269. +           return EAS_CPU_PRV;
  9270.  
  9271.     } while (sg = sg->next, sg != sd->groups);
  9272.  
  9273. -   __update_perf_energy_deltas(eenv);
  9274. -
  9275. -   trace_sched_energy_diff(eenv);
  9276. -   trace_sched_energy_perf_deltas(eenv);
  9277. -
  9278. -   return eenv->nrg_delta;
  9279. -}
  9280. +   /* Scale energy before comparisons */
  9281. +   for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
  9282. +       eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
  9283.  
  9284. -#ifdef CONFIG_SCHED_TUNE
  9285. -
  9286. -struct target_nrg schedtune_target_nrg;
  9287. -
  9288. -/*
  9289. - * System energy normalization
  9290. - * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
  9291. - * corresponding to the specified energy variation.
  9292. - */
  9293. -static inline int
  9294. -normalize_energy(int energy_diff)
  9295. -{
  9296. -   u32 normalized_nrg;
  9297. -#ifdef CONFIG_SCHED_DEBUG
  9298. -   int max_delta;
  9299. -
  9300. -   /* Check for boundaries */
  9301. -   max_delta  = schedtune_target_nrg.max_power;
  9302. -   max_delta -= schedtune_target_nrg.min_power;
  9303. -   WARN_ON(abs(energy_diff) >= max_delta);
  9304. -#endif
  9305. -
  9306. -   /* Do scaling using positive numbers to increase the range */
  9307. -   normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
  9308. -
  9309. -   /* Scale by energy magnitude */
  9310. -   normalized_nrg <<= SCHED_CAPACITY_SHIFT;
  9311. -
  9312. -   /* Normalize on max energy for target platform */
  9313. -   normalized_nrg = reciprocal_divide(
  9314. -           normalized_nrg, schedtune_target_nrg.rdiv);
  9315. -
  9316. -   return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
  9317. -}
  9318. -
  9319. -static inline bool filter_energy(void)
  9320. -{
  9321. -   return sched_feat(ENERGY_FILTER);
  9322. -}
  9323. -
  9324. -static inline int
  9325. -energy_diff(struct energy_env *eenv)
  9326. -{
  9327. -   int boost;
  9328. +   /*
  9329. +    * Compute the dead-zone margin used to prevent too many task
  9330. +    * migrations with negligible energy savings.
  9331. +    * An energy saving is considered meaningful if it reduces the energy
  9332. +    * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56%
  9333. +    */
  9334. +   margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
  9335.  
  9336. -   /* Conpute "absolute" energy diff */
  9337. -   __energy_diff(eenv);
  9338. -   if (!filter_energy())
  9339. -       return eenv->nrg_delta;
  9340. +   /*
  9341. +    * By default the EAS_CPU_PRV CPU is considered the most energy
  9342. +    * efficient, with a 0 energy variation.
  9343. +    */
  9344. +   eenv->next_idx = EAS_CPU_PRV;
  9345.  
  9346. -   /* Return energy diff when boost margin is 0 */
  9347. -   boost = schedtune_task_boost(eenv->task);
  9348. -   if (boost == 0)
  9349. -       return eenv->nrg_delta;
  9350. +   /*
  9351. +    * Compare the other CPU candidates to find a CPU which can be
  9352. +    * more energy efficient than EAS_CPU_PRV
  9353. +    */
  9354. +   for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
  9355. +       /* Skip candidates without a valid CPU */
  9356. +       if (eenv->cpu[cpu_idx].cpu_id < 0)
  9357. +           continue;
  9358. +       /* Compute energy delta wrt EAS_CPU_PRV */
  9359. +       eenv->cpu[cpu_idx].nrg_delta =
  9360. +           eenv->cpu[cpu_idx].energy -
  9361. +           eenv->cpu[EAS_CPU_PRV].energy;
  9362. +       /* filter energy variations within the dead-zone margin */
  9363. +       if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
  9364. +           eenv->cpu[cpu_idx].nrg_delta = 0;
  9365. +       /* update the schedule candidate with min(nrg_delta) */
  9366. +       if (eenv->cpu[cpu_idx].nrg_delta <
  9367. +           eenv->cpu[eenv->next_idx].nrg_delta) {
  9368. +           eenv->next_idx = cpu_idx;
  9369. +           if (sched_feat(FBT_STRICT_ORDER))
  9370. +               break;
  9371. +       }
  9372. +   }
  9373.  
  9374. -   eenv->payoff = schedtune_accept_deltas(
  9375. -           eenv->nrg_delta,
  9376. -           eenv->prf_delta,
  9377. -           eenv->task);
  9378. -
  9379. -   /*
  9380. -    * When SchedTune is enabled, the energy_diff() function will return
  9381. -    * the computed energy payoff value. Since the energy_diff() return
  9382. -    * value is expected to be negative by its callers, this evaluation
  9383. -    * function return a negative value each time the evaluation return a
  9384. -    * positive payoff, which is the condition for the acceptance of
  9385. -    * a scheduling decision
  9386. -    */
  9387. -   return -eenv->payoff;
  9388. +   return eenv->next_idx;
  9389. +  
  9390.  }
  9391. -#else /* CONFIG_SCHED_TUNE */
  9392. -#define energy_diff(eenv) __energy_diff(eenv)
  9393. -#endif
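select_energy_cpu_idx() above filters candidates through a dead-zone margin of prv_energy >> 6, i.e. roughly 1/64 or ~1.56% of the previous CPU's energy, so migrations with negligible savings are ignored. A standalone sketch of that filter, with invented energy values:

/* Illustrative sketch only; energy values are invented. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    long prv_energy = 20000;           /* scaled energy on the previous CPU */
    long nxt_energy = 19800;           /* scaled energy on the candidate CPU */
    long margin = prv_energy >> 6;     /* ~1/64, i.e. ~1.56% of prv energy */
    long nrg_delta = nxt_energy - prv_energy;

    if (labs(nrg_delta) < margin)
        nrg_delta = 0;                 /* saving inside the dead zone: ignore */

    printf("margin=%ld delta=%ld -> %s\n", margin, nrg_delta,
           nrg_delta < 0 ? "migrate" : "stay on the previous CPU");
    return 0;
}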
  9394.  
  9395.  /*
  9396.   * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  9397. @@ -5078,31 +5699,34 @@
  9398.   * being client/server, worker/dispatcher, interrupt source or whatever is
  9399.   * irrelevant, spread criteria is apparent partner count exceeds socket size.
  9400.   */
  9401. -static int wake_wide(struct task_struct *p)
  9402. +static int wake_wide(struct task_struct *p, int sibling_count_hint)
  9403.  {
  9404.     unsigned int master = current->wakee_flips;
  9405.     unsigned int slave = p->wakee_flips;
  9406. -   int factor = this_cpu_read(sd_llc_size);
  9407. +   int llc_size = this_cpu_read(sd_llc_size);
  9408. +
  9409. +   if (sibling_count_hint >= llc_size)
  9410. +       return 1;
  9411.  
  9412.     if (master < slave)
  9413.         swap(master, slave);
  9414. -   if (slave < factor || master < slave * factor)
  9415. +   if (slave < llc_size || master < slave * llc_size)
  9416.         return 0;
  9417.     return 1;
  9418.  }
  9419.  
  9420. -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  9421. +static int wake_affine(struct sched_domain *sd, struct task_struct *p,
  9422. +              int prev_cpu, int sync)
  9423.  {
  9424.     s64 this_load, load;
  9425.     s64 this_eff_load, prev_eff_load;
  9426. -   int idx, this_cpu, prev_cpu;
  9427. +   int idx, this_cpu;
  9428.     struct task_group *tg;
  9429.     unsigned long weight;
  9430.     int balanced;
  9431.  
  9432.     idx   = sd->wake_idx;
  9433.     this_cpu  = smp_processor_id();
  9434. -   prev_cpu  = task_cpu(p);
  9435.     load      = source_load(prev_cpu, idx);
  9436.     this_load = target_load(this_cpu, idx);
  9437.  
  9438. @@ -5146,18 +5770,29 @@
  9439.  
  9440.     balanced = this_eff_load <= prev_eff_load;
  9441.  
  9442. -   schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
  9443. +   schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
  9444.  
  9445.     if (!balanced)
  9446.         return 0;
  9447.  
  9448. -   schedstat_inc(sd, ttwu_move_affine);
  9449. -   schedstat_inc(p, se.statistics.nr_wakeups_affine);
  9450. +   schedstat_inc(sd->ttwu_move_affine);
  9451. +   schedstat_inc(p->se.statistics.nr_wakeups_affine);
  9452.  
  9453.     return 1;
  9454.  }
  9455.  
  9456. -unsigned int capacity_margin = 1280; /* ~20% margin */
  9457. +static inline unsigned long task_util(struct task_struct *p)
  9458. +{
  9459. +#ifdef CONFIG_SCHED_WALT
  9460. +   if (!walt_disabled && sysctl_sched_use_walt_task_util) {
  9461. +       unsigned long demand = p->ravg.demand;
  9462. +       return (demand << 10) / walt_ravg_window;
  9463. +   }
  9464. +#endif
  9465. +   return p->se.avg.util_avg;
  9466. +}
  9467. +
  9468. +static inline unsigned long boosted_task_util(struct task_struct *p);
  9469.  
  9470.  static inline bool __task_fits(struct task_struct *p, int cpu, int util)
  9471.  {
  9472. @@ -5182,29 +5817,131 @@
  9473.     return __task_fits(p, cpu, 0);
  9474.  }
  9475.  
  9476. -static inline bool task_fits_spare(struct task_struct *p, int cpu)
  9477. +static bool __cpu_overutilized(int cpu, int delta)
  9478.  {
  9479. -   return __task_fits(p, cpu, cpu_util(cpu, UTIL_EST));
  9480. +   return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
  9481.  }
  9482.  
  9483.  static bool cpu_overutilized(int cpu)
  9484.  {
  9485. -   return (capacity_of(cpu) * 1024) < (cpu_util(cpu, UTIL_AVG) * capacity_margin);
  9486. +   return __cpu_overutilized(cpu, 0);
  9487. +}
  9488. +
  9489. +#ifdef CONFIG_SCHED_TUNE
  9490. +
  9491. +struct reciprocal_value schedtune_spc_rdiv;
  9492. +
  9493. +static long
  9494. +schedtune_margin(unsigned long signal, long boost)
  9495. +{
  9496. +   long long margin = 0;
  9497. +
  9498. +   /*
  9499. +    * Signal proportional compensation (SPC)
  9500. +    *
  9501. +    * The Boost (B) value is used to compute a Margin (M) which is
  9502. +    * proportional to the complement of the original Signal (S):
  9503. +    *   M = B * (SCHED_CAPACITY_SCALE - S)
  9504. +    * The obtained M could be used by the caller to "boost" S.
  9505. +    */
  9506. +   if (boost >= 0) {
  9507. +       margin  = SCHED_CAPACITY_SCALE - signal;
  9508. +       margin *= boost;
  9509. +   } else {
  9510. +       margin = -signal * boost;
  9511. +   }
  9512. +
  9513. +   margin  = reciprocal_divide(margin, schedtune_spc_rdiv);
  9514. +   if (boost < 0)
  9515. +       margin *= -1;
  9516. +
  9517. +   return margin;
  9518. +}
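schedtune_margin() above implements Signal Proportional Compensation: a positive boost claims a share of the signal's headroom (SCHED_CAPACITY_SCALE - S), a negative boost gives back a share of the signal itself, and the reciprocal_divide() step performs the division by the boost denominator. A user-space sketch, assuming the divisor behind schedtune_spc_rdiv is 100 (boost expressed as a percentage); the util and boost values are invented:

/* Illustrative sketch only; assumes the divisor behind schedtune_spc_rdiv
 * is 100 (boost as a percentage). util/boost values are invented. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024L

static long spc_margin(long util, long boost)
{
    long margin;

    if (boost >= 0)
        margin = (SCHED_CAPACITY_SCALE - util) * boost; /* boost the headroom */
    else
        margin = -util * boost;                         /* shrink the signal */

    margin /= 100;            /* stands in for the reciprocal_divide() step */
    if (boost < 0)
        margin = -margin;
    return margin;
}

int main(void)
{
    printf("util=200 boost=+30 -> margin=%ld\n", spc_margin(200, 30));  /* 247 */
    printf("util=200 boost=-30 -> margin=%ld\n", spc_margin(200, -30)); /* -60 */
    return 0;
}

With these invented numbers, a boosted utilization would come out as 200 + 247 = 447 for the +30 case and 200 - 60 = 140 for the -30 case.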
  9519. +
  9520. +static inline int
  9521. +schedtune_cpu_margin(unsigned long util, int cpu)
  9522. +{
  9523. +   int boost = schedtune_cpu_boost(cpu);
  9524. +
  9525. +   if (boost == 0)
  9526. +       return 0;
  9527. +
  9528. +   return schedtune_margin(util, boost);
  9529. +}
  9530. +
  9531. +static inline long
  9532. +schedtune_task_margin(struct task_struct *p)
  9533. +{
  9534. +   int boost = schedtune_task_boost(p);
  9535. +   unsigned long util;
  9536. +   long margin;
  9537. +
  9538. +   if (boost == 0)
  9539. +       return 0;
  9540. +
  9541. +   util = task_util(p);
  9542. +   margin = schedtune_margin(util, boost);
  9543. +
  9544. +   return margin;
  9545. +}
  9546. +
  9547. +#else /* CONFIG_SCHED_TUNE */
  9548. +
  9549. +static inline int
  9550. +schedtune_cpu_margin(unsigned long util, int cpu)
  9551. +{
  9552. +   return 0;
  9553. +}
  9554. +
  9555. +static inline int
  9556. +schedtune_task_margin(struct task_struct *p)
  9557. +{
  9558. +   return 0;
  9559. +}
  9560. +
  9561. +#endif /* CONFIG_SCHED_TUNE */
  9562. +
  9563. +unsigned long
  9564. +boosted_cpu_util(int cpu)
  9565. +{
  9566. +   unsigned long util = cpu_util_freq(cpu);
  9567. +   long margin = schedtune_cpu_margin(util, cpu);
  9568. +
  9569. +   trace_sched_boost_cpu(cpu, util, margin);
  9570. +
  9571. +   return util + margin;
  9572. +}
  9573. +
  9574. +static inline unsigned long
  9575. +boosted_task_util(struct task_struct *p)
  9576. +{
  9577. +   unsigned long util = task_util(p);
  9578. +   long margin = schedtune_task_margin(p);
  9579. +
  9580. +   trace_sched_boost_task(p, util, margin);
  9581. +
  9582. +   return util + margin;
  9583. +}
  9584. +
  9585. +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
  9586. +{
  9587. +   return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
  9588.  }
  9589.  
  9590.  /*
  9591.   * find_idlest_group finds and returns the least busy CPU group within the
  9592.   * domain.
  9593. + *
  9594. + * Assumes p is allowed on at least one CPU in sd.
  9595.   */
  9596.  static struct sched_group *
  9597.  find_idlest_group(struct sched_domain *sd, struct task_struct *p,
  9598.           int this_cpu, int sd_flag)
  9599.  {
  9600.     struct sched_group *idlest = NULL, *group = sd->groups;
  9601. -   struct sched_group *fit_group = NULL, *spare_group = NULL;
  9602. -   unsigned long min_load = ULONG_MAX, this_load = 0;
  9603. -   unsigned long fit_capacity = ULONG_MAX;
  9604. -   unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
  9605. +   struct sched_group *most_spare_sg = NULL;
  9606. +   unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
  9607. +   unsigned long most_spare = 0, this_spare = 0;
  9608.     int load_idx = sd->forkexec_idx;
  9609.     int imbalance = 100 + (sd->imbalance_pct-100)/2;
  9610.  
  9611. @@ -5212,7 +5949,7 @@
  9612.         load_idx = sd->wake_idx;
  9613.  
  9614.     do {
  9615. -       unsigned long load, avg_load, spare_capacity;
  9616. +       unsigned long load, avg_load, spare_cap, max_spare_cap;
  9617.         int local_group;
  9618.         int i;
  9619.  
  9620. @@ -5224,8 +5961,12 @@
  9621.         local_group = cpumask_test_cpu(this_cpu,
  9622.                            sched_group_cpus(group));
  9623.  
  9624. -       /* Tally up the load of all CPUs in the group */
  9625. +       /*
  9626. +        * Tally up the load of all CPUs in the group and find
  9627. +        * the group containing the CPU with most spare capacity.
  9628. +        */
  9629.         avg_load = 0;
  9630. +       max_spare_cap = 0;
  9631.  
  9632.         for_each_cpu(i, sched_group_cpus(group)) {
  9633.             /* Bias balancing toward cpus of our domain */
  9634. @@ -5236,24 +5977,10 @@
  9635.  
  9636.             avg_load += load;
  9637.  
  9638. -           /*
  9639. -            * Look for most energy-efficient group that can fit
  9640. -            * that can fit the task.
  9641. -            */
  9642. -           if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
  9643. -               fit_capacity = capacity_of(i);
  9644. -               fit_group = group;
  9645. -           }
  9646. +           spare_cap = capacity_spare_wake(i, p);
  9647.  
  9648. -           /*
  9649. -            * Look for group which has most spare capacity on a
  9650. -            * single cpu.
  9651. -            */
  9652. -           spare_capacity = capacity_of(i) - cpu_util(i, UTIL_EST);
  9653. -           if (spare_capacity > max_spare_capacity) {
  9654. -               max_spare_capacity = spare_capacity;
  9655. -               spare_group = group;
  9656. -           }
  9657. +           if (spare_cap > max_spare_cap)
  9658. +               max_spare_cap = spare_cap;
  9659.         }
  9660.  
  9661.         /* Adjust by relative CPU capacity of the group */
  9662. @@ -5261,28 +5988,51 @@
  9663.  
  9664.         if (local_group) {
  9665.             this_load = avg_load;
  9666. -       } else if (avg_load < min_load) {
  9667. -           min_load = avg_load;
  9668. -           idlest = group;
  9669. +           this_spare = max_spare_cap;
  9670. +       } else {
  9671. +           if (avg_load < min_load) {
  9672. +               min_load = avg_load;
  9673. +               idlest = group;
  9674. +           }
  9675. +
  9676. +           if (most_spare < max_spare_cap) {
  9677. +               most_spare = max_spare_cap;
  9678. +               most_spare_sg = group;
  9679. +           }
  9680.         }
  9681.     } while (group = group->next, group != sd->groups);
  9682.  
  9683. -   if (fit_group)
  9684. -       return fit_group;
  9685. +   /*
  9686. +    * The cross-over point between using spare capacity or least load
  9687. +    * is too conservative for high utilization tasks on partially
  9688. +    * utilized systems if we require spare_capacity > task_util(p)
  9689. +    * so we allow for some task stuffing by using
  9690. +    * spare_capacity > task_util(p)/2.
  9691. +    *
  9692. +    * Spare capacity can't be used for fork because the utilization has
  9693. +    * not been set yet, we must first select a rq to compute the initial
  9694. +    * utilization.
  9695. +    */
  9696. +   if (sd_flag & SD_BALANCE_FORK)
  9697. +       goto skip_spare;
  9698.  
  9699. -   if (spare_group)
  9700. -       return spare_group;
  9701. +   if (this_spare > task_util(p) / 2 &&
  9702. +       imbalance*this_spare > 100*most_spare)
  9703. +       return NULL;
  9704. +   else if (most_spare > task_util(p) / 2)
  9705. +       return most_spare_sg;
  9706.  
  9707. +skip_spare:
  9708.     if (!idlest || 100*this_load < imbalance*min_load)
  9709.         return NULL;
  9710.     return idlest;
  9711.  }
  9712.  
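The rewritten find_idlest_group() above keeps two candidates per pass, the least-loaded group and the group holding the CPU with the most spare capacity, and decides between them with the cross-over check at the end: spare capacity only counts if it exceeds task_util(p)/2, and the local group wins only if its spare beats the best remote one by the imbalance percentage. A small sketch of that decision, with invented numbers:

/* Illustrative sketch only; all values are invented. */
#include <stdio.h>

int main(void)
{
    unsigned long task_util  = 300;
    unsigned long this_spare = 200;   /* best spare capacity in the local group */
    unsigned long most_spare = 500;   /* best spare capacity in any other group */
    unsigned long imbalance  = 117;   /* 100 + (imbalance_pct - 100) / 2 */

    if (this_spare > task_util / 2 &&
        imbalance * this_spare > 100 * most_spare)
        printf("stay local (find_idlest_group returns NULL)\n");
    else if (most_spare > task_util / 2)
        printf("pick the remote group with the most spare capacity\n");
    else
        printf("fall back to the least-loaded group comparison\n");
    return 0;
}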
  9713.  /*
  9714. - * find_idlest_cpu - find the idlest cpu among the cpus in group.
  9715. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
  9716.   */
  9717.  static int
  9718. -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  9719. +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  9720.  {
  9721.     unsigned long load, min_load = ULONG_MAX;
  9722.     unsigned int min_exit_latency = UINT_MAX;
  9723. @@ -5291,9 +6041,13 @@
  9724.     int shallowest_idle_cpu = -1;
  9725.     int i;
  9726.  
  9727. +   /* Check if we have any choice: */
  9728. +   if (group->group_weight == 1)
  9729. +       return cpumask_first(sched_group_cpus(group));
  9730. +
  9731.     /* Traverse only the allowed CPUs */
  9732.     for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
  9733. -       if (task_fits_spare(p, i)) {
  9734. +       if (idle_cpu(i)) {
  9735.             struct rq *rq = cpu_rq(i);
  9736.             struct cpuidle_state *idle = idle_get_state(rq);
  9737.             if (idle && idle->exit_latency < min_exit_latency) {
  9738. @@ -5305,8 +6059,7 @@
  9739.                 min_exit_latency = idle->exit_latency;
  9740.                 latest_idle_timestamp = rq->idle_stamp;
  9741.                 shallowest_idle_cpu = i;
  9742. -           } else if (idle_cpu(i) &&
  9743. -                  (!idle || idle->exit_latency == min_exit_latency) &&
  9744. +           } else if ((!idle || idle->exit_latency == min_exit_latency) &&
  9745.                    rq->idle_stamp > latest_idle_timestamp) {
  9746.                 /*
  9747.                  * If equal or no active idle state, then
  9748. @@ -5315,15 +6068,8 @@
  9749.                  */
  9750.                 latest_idle_timestamp = rq->idle_stamp;
  9751.                 shallowest_idle_cpu = i;
  9752. -           } else if (shallowest_idle_cpu == -1) {
  9753. -               /*
  9754. -                * If we haven't found an idle CPU yet
  9755. -                * pick a non-idle one that can fit the task as
  9756. -                * fallback.
  9757. -                */
  9758. -               shallowest_idle_cpu = i;
  9759.             }
  9760. -       } else {
  9761. +       } else if (shallowest_idle_cpu == -1) {
  9762.             load = weighted_cpuload(i);
  9763.             if (load < min_load || (load == min_load && i == this_cpu)) {
  9764.                 min_load = load;
  9765. @@ -5333,29 +6079,99 @@
  9766.     }
  9767.  
  9768.     return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
  9769. + }
  9770. +
  9771. +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
  9772. +                 int cpu, int prev_cpu, int sd_flag)
  9773. +{
  9774. +   int new_cpu = cpu;
  9775. +   int wu = sd_flag & SD_BALANCE_WAKE;
  9776. +   int cas_cpu = -1;
  9777. +
  9778. +   if (wu) {
  9779. +       schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts);
  9780. +       schedstat_inc(this_rq()->eas_stats.cas_attempts);
  9781. +   }
  9782. +
  9783. +   if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
  9784. +       return prev_cpu;
  9785. +
  9786. +   while (sd) {
  9787. +       struct sched_group *group;
  9788. +       struct sched_domain *tmp;
  9789. +       int weight;
  9790. +
  9791. +       if (wu)
  9792. +           schedstat_inc(sd->eas_stats.cas_attempts);
  9793. +
  9794. +       if (!(sd->flags & sd_flag)) {
  9795. +           sd = sd->child;
  9796. +           continue;
  9797. +       }
  9798. +
  9799. +       group = find_idlest_group(sd, p, cpu, sd_flag);
  9800. +       if (!group) {
  9801. +           sd = sd->child;
  9802. +           continue;
  9803. +       }
  9804. +
  9805. +       new_cpu = find_idlest_group_cpu(group, p, cpu);
  9806. +       if (new_cpu == cpu) {
  9807. +           /* Now try balancing at a lower domain level of cpu */
  9808. +           sd = sd->child;
  9809. +           continue;
  9810. +       }
  9811. +
  9812. +       /* Now try balancing at a lower domain level of new_cpu */
  9813. +       cpu = cas_cpu = new_cpu;
  9814. +       weight = sd->span_weight;
  9815. +       sd = NULL;
  9816. +       for_each_domain(cpu, tmp) {
  9817. +           if (weight <= tmp->span_weight)
  9818. +               break;
  9819. +           if (tmp->flags & sd_flag)
  9820. +               sd = tmp;
  9821. +       }
  9822. +       /* while loop will break here if sd == NULL */
  9823. +   }
  9824. +
  9825. +   if (wu && (cas_cpu >= 0)) {
  9826. +       schedstat_inc(p->se.statistics.nr_wakeups_cas_count);
  9827. +       schedstat_inc(this_rq()->eas_stats.cas_count);
  9828. +   }
  9829. +
  9830. +   return new_cpu;
  9831.  }
  9832.  
  9833.  /*
  9834.   * Try and locate an idle CPU in the sched_domain.
  9835.   */
  9836. -static int select_idle_sibling(struct task_struct *p, int target)
  9837. +static int select_idle_sibling(struct task_struct *p, int prev, int target)
  9838.  {
  9839.     struct sched_domain *sd;
  9840.     struct sched_group *sg;
  9841. -   int i = task_cpu(p);
  9842. -   int best_idle = -1;
  9843. -   int best_idle_cstate = -1;
  9844. -   int best_idle_capacity = INT_MAX;
  9845. +   int best_idle_cpu = -1;
  9846. +   int best_idle_cstate = INT_MAX;
  9847. +   unsigned long best_idle_capacity = ULONG_MAX;
  9848. +
  9849. +   schedstat_inc(p->se.statistics.nr_wakeups_sis_attempts);
  9850. +   schedstat_inc(this_rq()->eas_stats.sis_attempts);
  9851.  
  9852.     if (!sysctl_sched_cstate_aware) {
  9853. -       if (idle_cpu(target))
  9854. +       if (idle_cpu(target)) {
  9855. +           schedstat_inc(p->se.statistics.nr_wakeups_sis_idle);
  9856. +           schedstat_inc(this_rq()->eas_stats.sis_idle);
  9857.             return target;
  9858. +       }
  9859.  
  9860.         /*
  9861.          * If the prevous cpu is cache affine and idle, don't be stupid.
  9862.          */
  9863. -       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
  9864. -           return i;
  9865. +       if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
  9866. +           schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine);
  9867. +           schedstat_inc(this_rq()->eas_stats.sis_cache_affine);
  9868. +           return prev;
  9869. +       }
  9870.     }
  9871.  
  9872.     /*
  9873. @@ -5365,24 +6181,30 @@
  9874.     for_each_lower_domain(sd) {
  9875.         sg = sd->groups;
  9876.         do {
  9877. +           int i;
  9878.             if (!cpumask_intersects(sched_group_cpus(sg),
  9879.                         tsk_cpus_allowed(p)))
  9880.                 goto next;
  9881.  
  9882.             if (sysctl_sched_cstate_aware) {
  9883.                 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
  9884. -                   struct rq *rq = cpu_rq(i);
  9885. -                   int idle_idx = idle_get_state_idx(rq);
  9886. +                   int idle_idx = idle_get_state_idx(cpu_rq(i));
  9887.                     unsigned long new_usage = boosted_task_util(p);
  9888.                     unsigned long capacity_orig = capacity_orig_of(i);
  9889. +
  9890.                     if (new_usage > capacity_orig || !idle_cpu(i))
  9891.                         goto next;
  9892.  
  9893. -                   if (i == target && new_usage <= capacity_curr_of(target))
  9894. +                   if (i == target && new_usage <= capacity_curr_of(target)) {
  9895. +                       schedstat_inc(p->se.statistics.nr_wakeups_sis_suff_cap);
  9896. +                       schedstat_inc(this_rq()->eas_stats.sis_suff_cap);
  9897. +                       schedstat_inc(sd->eas_stats.sis_suff_cap);
  9898.                         return target;
  9899. +                   }
  9900.  
  9901. -                   if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
  9902. -                       best_idle = i;
  9903. +                   if (idle_idx < best_idle_cstate &&
  9904. +                       capacity_orig <= best_idle_capacity) {
  9905. +                       best_idle_cpu = i;
  9906.                         best_idle_cstate = idle_idx;
  9907.                         best_idle_capacity = capacity_orig;
  9908.                     }
  9909. @@ -5395,6 +6217,9 @@
  9910.  
  9911.                 target = cpumask_first_and(sched_group_cpus(sg),
  9912.                     tsk_cpus_allowed(p));
  9913. +               schedstat_inc(p->se.statistics.nr_wakeups_sis_idle_cpu);
  9914. +               schedstat_inc(this_rq()->eas_stats.sis_idle_cpu);
  9915. +               schedstat_inc(sd->eas_stats.sis_idle_cpu);
  9916.                 goto done;
  9917.             }
  9918.  next:
  9919. @@ -5402,171 +6227,112 @@
  9920.         } while (sg != sd->groups);
  9921.     }
  9922.  
  9923. -   if (best_idle > 0)
  9924. -       target = best_idle;
  9925. +   if (best_idle_cpu >= 0)
  9926. +       target = best_idle_cpu;
  9927.  
  9928.  done:
  9929. +   schedstat_inc(p->se.statistics.nr_wakeups_sis_count);
  9930. +   schedstat_inc(this_rq()->eas_stats.sis_count);
  9931. +
  9932.     return target;
  9933.  }
  9934.  
  9935. -static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
  9936. +/*
  9937. + * cpu_util_wake: Compute cpu utilization with any contributions from
  9938. + * the waking task p removed.  check_for_migration() looks for a better CPU of
  9939. + * rq->curr. For that case we should return cpu util with contributions from
  9940. + * currently running task p removed.
  9941. + */
  9942. +static int cpu_util_wake(int cpu, struct task_struct *p)
  9943.  {
  9944. -   int iter_cpu;
  9945. -   int target_cpu = -1;
  9946. -   int target_util = 0;
  9947. -   int backup_capacity = 0;
  9948. -   int best_idle_cpu = -1;
  9949. -   int best_idle_cstate = INT_MAX;
  9950. -   int backup_cpu = -1;
  9951. -   unsigned long min_util;
  9952. -   unsigned long new_util;
  9953. -
  9954. -   min_util = boosted_task_util(p);
  9955. -   for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
  9956. -       int cur_capacity;
  9957. -       struct rq *rq;
  9958. -       int idle_idx;
  9959. -
  9960. -       /*
  9961. -        * Iterate from higher cpus for boosted tasks.
  9962. -        */
  9963. -       int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
  9964. -
  9965. -       if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
  9966. -           continue;
  9967. -
  9968. -       /*
  9969. -        * p's blocked utilization is still accounted for on prev_cpu
  9970. -        * so prev_cpu will receive a negative bias due to the double
  9971. -        * accounting. However, the blocked utilization may be zero.
  9972. -        */
  9973. -       new_util = cpu_util(i, UTIL_EST) + task_util(p, UTIL_EST);
  9974. -
  9975. -       /*
  9976. -        * Ensure minimum capacity to grant the required boost.
  9977. -        * The target CPU can be already at a capacity level higher
  9978. -        * than the one required to boost the task.
  9979. -        */
  9980. -       new_util = max(min_util, new_util);
  9981. -       if (new_util > capacity_orig_of(i))
  9982. -           continue;
  9983. +   unsigned long util, capacity;
  9984.  
  9985.  #ifdef CONFIG_SCHED_WALT
  9986. -       if (walt_cpu_high_irqload(i))
  9987. -           continue;
  9988. +   /*
  9989. +    * WALT does not decay idle tasks in the same manner
  9990. +    * as PELT, so it makes little sense to subtract task
  9991. +    * utilization from cpu utilization. Instead just use
  9992. +    * cpu_util for this case.
  9993. +    */
  9994. +   if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
  9995. +       p->state == TASK_WAKING)
  9996. +       return cpu_util(cpu);
  9997.  #endif
  9998. +   /* Task has no contribution or is new */
  9999. +   if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
  10000. +       return cpu_util(cpu);
  10001.  
  10002. -       /*
  10003. -        * Unconditionally favoring tasks that prefer idle cpus to
  10004. -        * improve latency.
  10005. -        */
  10006. -       if (idle_cpu(i) && prefer_idle) {
  10007. -           if (best_idle_cpu < 0)
  10008. -               best_idle_cpu = i;
  10009. -           continue;
  10010. -       }
  10011. -
  10012. -       cur_capacity = capacity_curr_of(i);
  10013. -       rq = cpu_rq(i);
  10014. -       idle_idx = idle_get_state_idx(rq);
  10015. +   capacity = capacity_orig_of(cpu);
  10016. +   util = max_t(long, cpu_util(cpu) - task_util(p), 0);
  10017.  
  10018. -       if (new_util < cur_capacity) {
  10019. -           if (cpu_rq(i)->nr_running) {
  10020. -               if (!prefer_idle) {
  10021. -                   /* Find a target cpu with highest
  10022. -                    * utilization.
  10023. -                    */
  10024. -                   if (target_util == 0 ||
  10025. -                       target_util < new_util) {
  10026. -                       target_cpu = i;
  10027. -                       target_util = new_util;
  10028. -                   }
  10029. -               } else {
  10030. -                   /* Find a target cpu with lowest
  10031. -                    * utilization.
  10032. -                    */
  10033. -                   if (target_util == 0 ||
  10034. -                       target_util > new_util) {
  10035. -                       target_cpu = i;
  10036. -                       target_util = new_util;
  10037. -                   }
  10038. -               }
  10039. -           } else if (!prefer_idle) {
  10040. -               if (best_idle_cpu < 0 ||
  10041. -                   (sysctl_sched_cstate_aware &&
  10042. -                       best_idle_cstate > idle_idx)) {
  10043. -                   best_idle_cstate = idle_idx;
  10044. -                   best_idle_cpu = i;
  10045. -               }
  10046. -           }
  10047. -       } else if (backup_capacity == 0 ||
  10048. -               backup_capacity > cur_capacity) {
  10049. -           // Find a backup cpu with least capacity.
  10050. -           backup_capacity = cur_capacity;
  10051. -           backup_cpu = i;
  10052. -       }
  10053. -   }
  10054. +   return (util >= capacity) ? capacity : util;
  10055. +}
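cpu_util_wake() above reports a CPU's utilization with the waking task's own contribution stripped out, clamped to the [0, capacity_orig] range so the double accounting on prev_cpu can never go negative or exceed the CPU's capacity. A minimal sketch of that clamping, with invented values:

/* Illustrative sketch only; all values are invented. */
#include <stdio.h>

static unsigned long util_wake(long cpu_util, long task_util, long capacity)
{
    long util = cpu_util - task_util;  /* strip the waking task's share */

    if (util < 0)
        util = 0;                      /* never report negative utilization */
    return util >= capacity ? capacity : util;
}

int main(void)
{
    printf("%lu\n", util_wake(600, 200, 1024));   /* 400 */
    printf("%lu\n", util_wake(100, 200, 1024));   /* 0: clamped at zero */
    printf("%lu\n", util_wake(1500, 100, 1024));  /* 1024: capped at capacity */
    return 0;
}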
  10056.  
  10057. -   if (prefer_idle && best_idle_cpu >= 0)
  10058. -       target_cpu = best_idle_cpu;
  10059. -   else if (target_cpu < 0)
  10060. -       target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
  10061. +static int start_cpu(bool boosted)
  10062. +{
  10063. +   struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
  10064.  
  10065. -   return target_cpu;
  10066. +   return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
  10067.  }
  10068.  
  10069. -static int energy_aware_wake_cpu(struct task_struct *p, int target)
  10070. +static inline int find_best_target(struct task_struct *p, int *backup_cpu,
  10071. +                  bool boosted, bool prefer_idle)
  10072.  {
  10073. +   unsigned long min_util = boosted_task_util(p);
  10074. +   unsigned long target_capacity = ULONG_MAX;
  10075. +   unsigned long min_wake_util = ULONG_MAX;
  10076. +   unsigned long target_max_spare_cap = 0;
  10077. +   unsigned long best_active_util = ULONG_MAX;
  10078. +   int best_idle_cstate = INT_MAX;
  10079.     struct sched_domain *sd;
  10080. -   struct sched_group *sg, *sg_target;
  10081. -   int target_max_cap = INT_MAX;
  10082. -   int target_cpu = task_cpu(p);
  10083. -   unsigned long min_util;
  10084. -   unsigned long new_util;
  10085. -   int i;
  10086. +   struct sched_group *sg;
  10087. +   int best_active_cpu = -1;
  10088. +   int best_idle_cpu = -1;
  10089. +   int target_cpu = -1;
  10090. +   int cpu, i;
  10091.  
  10092. -   sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
  10093. +   *backup_cpu = -1;
  10094.  
  10095. -   if (!sd)
  10096. -       return target;
  10097. +   schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts);
  10098. +   schedstat_inc(this_rq()->eas_stats.fbt_attempts);
  10099.  
  10100. -   sg = sd->groups;
  10101. -   sg_target = sg;
  10102. +   /* Find start CPU based on boost value */
  10103. +   cpu = start_cpu(boosted);
  10104. +   if (cpu < 0) {
  10105. +       schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu);
  10106. +       schedstat_inc(this_rq()->eas_stats.fbt_no_cpu);
  10107. +       return -1;
  10108. +   }
  10109.  
  10110. -   if (sysctl_sched_is_big_little) {
  10111. +   /* Find SD for the start CPU */
  10112. +   sd = rcu_dereference(per_cpu(sd_ea, cpu));
  10113. +   if (!sd) {
  10114. +       schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd);
  10115. +       schedstat_inc(this_rq()->eas_stats.fbt_no_sd);
  10116. +       return -1;
  10117. +   }
  10118.  
  10119. -       /*
  10120. -        * Find group with sufficient capacity. We only get here if no cpu is
  10121. -        * overutilized. We may end up overutilizing a cpu by adding the task,
  10122. -        * but that should not be any worse than select_idle_sibling().
  10123. -        * load_balance() should sort it out later as we get above the tipping
  10124. -        * point.
  10125. -        */
  10126. -       do {
  10127. -           /* Assuming all cpus are the same in group */
  10128. -           int max_cap_cpu = group_first_cpu(sg);
  10129. +   /* Scan CPUs in all SDs */
  10130. +   sg = sd->groups;
  10131. +   do {
  10132. +       for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
  10133. +           unsigned long capacity_curr = capacity_curr_of(i);
  10134. +           unsigned long capacity_orig = capacity_orig_of(i);
  10135. +           unsigned long wake_util, new_util;
  10136.  
  10137. -           /*
  10138. -            * Assume smaller max capacity means more energy-efficient.
  10139. -            * Ideally we should query the energy model for the right
  10140. -            * answer but it easily ends up in an exhaustive search.
  10141. -            */
  10142. -           if (capacity_of(max_cap_cpu) < target_max_cap &&
  10143. -               task_fits_max(p, max_cap_cpu)) {
  10144. -               sg_target = sg;
  10145. -               target_max_cap = capacity_of(max_cap_cpu);
  10146. -           }
  10147. -       } while (sg = sg->next, sg != sd->groups);
  10148. +           if (!cpu_online(i))
  10149. +               continue;
  10150. +
  10151. +           if (walt_cpu_high_irqload(i))
  10152. +               continue;
  10153.  
  10154. -       /* Find cpu with sufficient capacity */
  10155. -       min_util = boosted_task_util(p);
  10156. -       for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
  10157.             /*
  10158.              * p's blocked utilization is still accounted for on prev_cpu
  10159.              * so prev_cpu will receive a negative bias due to the double
  10160.              * accounting. However, the blocked utilization may be zero.
  10161.              */
  10162. -           new_util = cpu_util(i, UTIL_EST) + task_util(p, UTIL_EST);
  10163. +           wake_util = cpu_util_wake(i, p);
  10164. +           new_util = wake_util + task_util(p);
  10165.  
  10166.             /*
  10167.              * Ensure minimum capacity to grant the required boost.
  10168. @@ -5574,49 +6340,349 @@
  10169.              * than the one required to boost the task.
  10170.              */
  10171.             new_util = max(min_util, new_util);
  10172. -           if (new_util > capacity_orig_of(i))
  10173. +           if (new_util > capacity_orig)
  10174. +               continue;
  10175. +
  10176. +           /*
  10177. +            * Case A) Latency sensitive tasks
  10178. +            *
  10179. +            * Unconditionally favoring tasks that prefer idle CPU to
  10180. +            * improve latency.
  10181. +            *
  10182. +            * Looking for:
  10183. +            * - an idle CPU, whatever its idle_state is, since
  10184. +            *   the first CPUs we explore are more likely to be
  10185. +            *   reserved for latency sensitive tasks.
  10186. +            * - a non idle CPU where the task fits in its current
  10187. +            *   capacity and has the maximum spare capacity.
  10188. +            * - a non idle CPU with lower contention from other
  10189. +            *   tasks and running at the lowest possible OPP.
  10190. +            *
  10191. +            * The last two goals try to favor a non idle CPU
  10192. +            * where the task can run as if it is "almost alone".
  10193. +            * A maximum spare capacity CPU is favoured since
  10194. +            * the task already fits into that CPU's capacity
  10195. +            * without waiting for an OPP chance.
  10196. +            *
  10197. +            * The following code path is the only one in the CPUs
  10198. +            * exploration loop which is always used by
  10199. +            * prefer_idle tasks. It exits the loop with either a
  10200. +            * best_active_cpu or a target_cpu which should
  10201. +            * represent an optimal choice for latency sensitive
  10202. +            * tasks.
  10203. +            */
  10204. +           if (prefer_idle) {
  10205. +
  10206. +               /*
  10207. +                * Case A.1: IDLE CPU
  10208. +                * Return the first IDLE CPU we find.
  10209. +                */
  10210. +               if (idle_cpu(i)) {
  10211. +                   schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle);
  10212. +                   schedstat_inc(this_rq()->eas_stats.fbt_pref_idle);
  10213. +
  10214. +                   trace_sched_find_best_target(p,
  10215. +                           prefer_idle, min_util,
  10216. +                           cpu, best_idle_cpu,
  10217. +                           best_active_cpu, i);
  10218. +
  10219. +                   return i;
  10220. +               }
  10221. +
  10222. +               /*
  10223. +                * Case A.2: Target ACTIVE CPU
  10224. +                * Favor CPUs with max spare capacity.
  10225. +                */
  10226. +               if ((capacity_curr > new_util) &&
  10227. +                   (capacity_orig - new_util > target_max_spare_cap)) {
  10228. +                   target_max_spare_cap = capacity_orig - new_util;
  10229. +                   target_cpu = i;
  10230. +                   continue;
  10231. +               }
  10232. +               if (target_cpu != -1)
  10233. +                   continue;
  10234. +
  10235. +
  10236. +               /*
  10237. +                * Case A.3: Backup ACTIVE CPU
  10238. +                * Favor CPUs with:
  10239. +                * - lower utilization due to other tasks
  10240. +                * - lower utilization with the task in
  10241. +                */
  10242. +               if (wake_util > min_wake_util)
  10243. +                   continue;
  10244. +               if (new_util > best_active_util)
  10245. +                   continue;
  10246. +               min_wake_util = wake_util;
  10247. +               best_active_util = new_util;
  10248. +               best_active_cpu = i;
  10249.                 continue;
  10250. +           }
  10251.  
  10252. -           if (new_util < capacity_curr_of(i)) {
  10253. -               target_cpu = i;
  10254. -               if (cpu_rq(i)->nr_running)
  10255. -                   break;
  10256. +           /*
  10257. +            * Enforce EAS mode
  10258. +            *
  10259. +            * For non latency sensitive tasks, skip CPUs that
  10260. +            * will be overutilized by moving the task there.
  10261. +            *
  10262. +            * The goal here is to remain in EAS mode as long as
  10263. +            * possible at least for !prefer_idle tasks.
  10264. +            */
  10265. +           if ((new_util * capacity_margin) >
  10266. +               (capacity_orig * SCHED_CAPACITY_SCALE))
  10267. +               continue;
  10268. +
  10269. +           /*
  10270. +            * Favor CPUs with smaller capacity for non latency
  10271. +            * sensitive tasks.
  10272. +            */
  10273. +           if (capacity_orig > target_capacity)
  10274. +               continue;
  10275. +
  10276. +           /*
  10277. +            * Case B) Non latency sensitive tasks on IDLE CPUs.
  10278. +            *
  10279. +            * Find an optimal backup IDLE CPU for non latency
  10280. +            * sensitive tasks.
  10281. +            *
  10282. +            * Looking for:
  10283. +            * - minimizing the capacity_orig,
  10284. +            *   i.e. preferring LITTLE CPUs
  10285. +            * - favoring shallowest idle states
  10286. +            *   i.e. avoid waking up deep-idle CPUs
  10287. +            *
  10288. +            * The following code path is used by non latency
  10289. +            * sensitive tasks if IDLE CPUs are available. If at
  10290. +            * least one of such CPUs are available it sets the
  10291. +            * least one such CPU is available, it sets the
  10292. +            * selected.
  10293. +            *
  10294. +            * If idle CPUs are available, favour these CPUs to
  10295. +            * improve performance by spreading tasks.
  10296. +            * Indeed, the energy_diff() computed by the caller
  10297. +            * will take care to ensure the minimization of energy
  10298. +            * consumption without affecting performance.
  10299. +            */
  10300. +           if (idle_cpu(i)) {
  10301. +               int idle_idx = idle_get_state_idx(cpu_rq(i));
  10302. +
  10303. +               /*
  10304. +                * Skip CPUs in deeper idle state, but only
  10305. +                * if they are also less energy efficient.
  10306. +                * IOW, prefer a deep IDLE LITTLE CPU vs a
  10307. +                * shallow idle big CPU.
  10308. +                */
  10309. +               if (sysctl_sched_cstate_aware &&
  10310. +                   best_idle_cstate <= idle_idx)
  10311. +                   continue;
  10312. +
  10313. +               /* Keep track of best idle CPU */
  10314. +               target_capacity = capacity_orig;
  10315. +               best_idle_cstate = idle_idx;
  10316. +               best_idle_cpu = i;
  10317. +               continue;
  10318.             }
  10319.  
  10320. -           /* cpu has capacity at higher OPP, keep it as fallback */
  10321. -           if (target_cpu == task_cpu(p))
  10322. -               target_cpu = i;
  10323. +           /*
  10324. +            * Case C) Non latency sensitive tasks on ACTIVE CPUs.
  10325. +            *
  10326. +            * Pack tasks in the most energy efficient capacities.
  10327. +            *
  10328. +            * This task packing strategy prefers more energy
  10329. +            * efficient CPUs (i.e. pack on smaller maximum
  10330. +            * capacity CPUs) while also trying to spread tasks to
  10331. +            * run them all at the lower OPP.
  10332. +            *
  10333. +            * This assumes for example that it's more energy
  10334. +            * efficient to run two tasks on two CPUs at a lower
  10335. +            * OPP than packing both on a single CPU but running
  10336. +            * that CPU at a higher OPP.
  10337. +            *
  10338. +            * Thus, this case keeps track of the CPU with the
  10339. +            * smallest maximum capacity and highest spare maximum
  10340. +            * capacity.
  10341. +            */
  10342. +
  10343. +           /* Favor CPUs with maximum spare capacity */
  10344. +           if ((capacity_orig - new_util) < target_max_spare_cap)
  10345. +               continue;
  10346. +
  10347. +           target_max_spare_cap = capacity_orig - new_util;
  10348. +           target_capacity = capacity_orig;
  10349. +           target_cpu = i;
  10350.         }
  10351. -   } else {
  10352. -       /*
  10353. -        * Find a cpu with sufficient capacity
  10354. -        */
  10355. -       bool boosted = schedtune_task_boost(p) > 0;
  10356. -       bool prefer_idle = schedtune_prefer_idle(p) > 0;
  10357. -       int tmp_target = find_best_target(p, boosted, prefer_idle);
  10358. -       if (tmp_target >= 0) {
  10359. -           target_cpu = tmp_target;
  10360. -           if ((boosted || prefer_idle) && idle_cpu(target_cpu))
  10361. -               return target_cpu;
  10362. +
  10363. +   } while (sg = sg->next, sg != sd->groups);
  10364. +
  10365. +   /*
  10366. +    * For non latency sensitive tasks, cases B and C in the previous loop,
  10367. +    * we pick the best IDLE CPU only if we were not able to find a target
  10368. +    * ACTIVE CPU.
  10369. +    *
  10370. +    * Policies priorities:
  10371. +    *
  10372. +    * - prefer_idle tasks:
  10373. +    *
  10374. +    *   a) IDLE CPU available, we return immediately
  10375. +    *   b) ACTIVE CPU where task fits and has the largest maximum spare
  10376. +    *      capacity (i.e. target_cpu)
  10377. +    *   c) ACTIVE CPU with less contention due to other tasks
  10378. +    *      (i.e. best_active_cpu)
  10379. +    *
  10380. +    * - NON prefer_idle tasks:
  10381. +    *
  10382. +    *   a) ACTIVE CPU: target_cpu
  10383. +    *   b) IDLE CPU: best_idle_cpu
  10384. +    */
  10385. +   if (target_cpu == -1)
  10386. +       target_cpu = prefer_idle
  10387. +           ? best_active_cpu
  10388. +           : best_idle_cpu;
  10389. +   else
  10390. +       *backup_cpu = prefer_idle
  10391. +       ? best_active_cpu
  10392. +       : best_idle_cpu;
  10393. +
  10394. +   trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
  10395. +                    best_idle_cpu, best_active_cpu,
  10396. +                    target_cpu);
  10397. +
  10398. +   schedstat_inc(p->se.statistics.nr_wakeups_fbt_count);
  10399. +   schedstat_inc(this_rq()->eas_stats.fbt_count);
  10400. +
  10401. +   return target_cpu;
  10402. +}
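A minimal user-space sketch of the target/backup fallback ordering implemented at the end of find_best_target() above; the struct, helper and CPU numbers below are illustrative assumptions, not kernel code:

#include <stdio.h>

/* Illustrative results of the candidate scan (not kernel structures). */
struct fbt_result {
	int target_cpu;		/* best ACTIVE cpu found by the loop */
	int best_active_cpu;	/* least-contended ACTIVE cpu (prefer_idle backup) */
	int best_idle_cpu;	/* shallowest-idle, smallest-capacity IDLE cpu */
};

/*
 * Mirrors the post-loop selection: take the ACTIVE target when one was
 * found, otherwise fall back to the policy-specific alternative; the
 * unused alternative becomes the caller's backup_cpu.
 */
static int pick_cpu(const struct fbt_result *r, int prefer_idle, int *backup_cpu)
{
	if (r->target_cpu == -1)
		return prefer_idle ? r->best_active_cpu : r->best_idle_cpu;

	*backup_cpu = prefer_idle ? r->best_active_cpu : r->best_idle_cpu;
	return r->target_cpu;
}

int main(void)
{
	struct fbt_result r = { .target_cpu = 2, .best_active_cpu = 1, .best_idle_cpu = 4 };
	int backup = -1;

	/* A !prefer_idle task: the ACTIVE target wins, best idle cpu is kept as backup. */
	printf("target=%d backup=%d\n", pick_cpu(&r, 0, &backup), backup);	/* target=2 backup=4 */
	return 0;
}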
  10403. +
  10404. +/*
  10405. + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
  10406. + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
  10407. + *
  10408. + * In that case WAKE_AFFINE doesn't make sense and we'll let
  10409. + * BALANCE_WAKE sort things out.
  10410. + */
  10411. +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
  10412. +{
  10413. +   long min_cap, max_cap;
  10414. +   min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
  10415. +   max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
  10416. +   /* Minimum capacity is close to max, no need to abort wake_affine */
  10417. +   if (max_cap - min_cap < max_cap >> 3)
  10418. +       return 0;
  10419. +
  10420. +   /* Bring task utilization in sync with prev_cpu */
  10421. +   sync_entity_load_avg(&p->se);
  10422. +
  10423. +   return min_cap * 1024 < task_util(p) * capacity_margin;
  10424. +}
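A stand-alone sketch of the two tests in wake_cap() above, with a worked example; the capacity values and the 1280 (roughly 1.25x) capacity_margin are assumed for illustration only:

#include <stdio.h>

static int wake_cap_sketch(long min_cap, long max_cap, long task_util,
			   long capacity_margin)
{
	/* Capacities are within ~12.5% of each other: keep wake_affine. */
	if (max_cap - min_cap < max_cap >> 3)
		return 0;

	/* Task does not fit in min_cap with margin: disable wake_affine. */
	return min_cap * 1024 < task_util * capacity_margin;
}

int main(void)
{
	/* LITTLE cpu (cap 430) vs big cpu (cap 1024), 400-unit and 100-unit tasks. */
	printf("%d\n", wake_cap_sketch(430, 1024, 400, 1280));	/* 1: too big for the LITTLE cpu */
	printf("%d\n", wake_cap_sketch(430, 1024, 100, 1280));	/* 0: fits with margin to spare   */
	return 0;
}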
  10425. +
  10426. +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
  10427. +{
  10428. +   bool boosted, prefer_idle;
  10429. +   struct sched_domain *sd;
  10430. +   int target_cpu;
  10431. +   int backup_cpu;
  10432. +   int next_cpu;
  10433. +
  10434. +   schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
  10435. +   schedstat_inc(this_rq()->eas_stats.secb_attempts);
  10436. +
  10437. +   if (sysctl_sched_sync_hint_enable && sync) {
  10438. +       int cpu = smp_processor_id();
  10439. +
  10440. +       if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
  10441. +           schedstat_inc(p->se.statistics.nr_wakeups_secb_sync);
  10442. +           schedstat_inc(this_rq()->eas_stats.secb_sync);
  10443. +           return cpu;
  10444.         }
  10445.     }
  10446.  
  10447. -   if (target_cpu != task_cpu(p)) {
  10448. +   rcu_read_lock();
  10449. +#ifdef CONFIG_CGROUP_SCHEDTUNE
  10450. +   boosted = schedtune_task_boost(p) > 0;
  10451. +   prefer_idle = schedtune_prefer_idle(p) > 0;
  10452. +#else
  10453. +   boosted = get_sysctl_sched_cfs_boost() > 0;
  10454. +   prefer_idle = 0;
  10455. +#endif
  10456. +
  10459. +   sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
  10460. +   if (!sd) {
  10461. +       target_cpu = prev_cpu;
  10462. +       goto unlock;
  10463. +   }
  10464. +
  10465. +   sync_entity_load_avg(&p->se);
  10466. +
  10467. +   /* Find a cpu with sufficient capacity */
  10468. +   next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);
  10469. +   if (next_cpu == -1) {
  10470. +       target_cpu = prev_cpu;
  10471. +       goto unlock;
  10472. +   }
  10473. +
  10474. +   /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
  10475. +   if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
  10476. +       schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
  10477. +       schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
  10478. +       target_cpu = next_cpu;
  10479. +       goto unlock;
  10480. +   }
  10481. +
  10482. +   target_cpu = prev_cpu;
  10483. +   if (next_cpu != prev_cpu) {
  10484. +       int delta = 0;
  10485.         struct energy_env eenv = {
  10486. -           .util_delta = task_util(p, UTIL_EST),
  10487. -           .src_cpu    = task_cpu(p),
  10488. -           .dst_cpu    = target_cpu,
  10489. -           .task       = p,
  10490. +           .p              = p,
  10491. +           .util_delta     = task_util(p),
  10492. +           /* Task's previous CPU candidate */
  10493. +           .cpu[EAS_CPU_PRV] = {
  10494. +               .cpu_id = prev_cpu,
  10495. +           },
  10496. +           /* Main alternative CPU candidate */
  10497. +           .cpu[EAS_CPU_NXT] = {
  10498. +               .cpu_id = next_cpu,
  10499. +           },
  10500. +           /* Backup alternative CPU candidate */
  10501. +           .cpu[EAS_CPU_BKP] = {
  10502. +               .cpu_id = backup_cpu,
  10503. +           },
  10504.         };
  10505.  
  10506. +
  10507. +#ifdef CONFIG_SCHED_WALT
  10508. +       if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
  10509. +           p->state == TASK_WAKING)
  10510. +           delta = task_util(p);
  10511. +#endif
  10512.         /* Not enough spare capacity on previous cpu */
  10513. -       if (cpu_overutilized(task_cpu(p)))
  10514. -           return target_cpu;
  10515. +       if (__cpu_overutilized(prev_cpu, delta)) {
  10516. +           schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
  10517. +           schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
  10518. +           target_cpu = next_cpu;
  10519. +           goto unlock;
  10520. +       }
  10521.  
  10522. -       if (energy_diff(&eenv) >= 0)
  10523. -           return task_cpu(p);
  10524. +       /* Check if EAS_CPU_NXT is a more energy efficient CPU */
  10525. +       if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
  10526. +           schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
  10527. +           schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
  10528. +           target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
  10529. +           goto unlock;
  10530. +       }
  10531. +
  10532. +       schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
  10533. +       schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
  10534. +       target_cpu = prev_cpu;
  10535. +       goto unlock;
  10536.     }
  10537.  
  10538. +   schedstat_inc(p->se.statistics.nr_wakeups_secb_count);
  10539. +   schedstat_inc(this_rq()->eas_stats.secb_count);
  10540. +
  10541. +unlock:
  10542. +   rcu_read_unlock();
  10543.     return target_cpu;
  10544.  }
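A toy model of the three-candidate comparison performed for the eenv built above; the energy numbers are invented, and in the real code select_energy_cpu_idx() derives them from energy_diff calculations over the sched groups rather than from a table:

#include <stdio.h>

enum { EAS_CPU_PRV, EAS_CPU_NXT, EAS_CPU_BKP, EAS_CPU_CNT };

struct candidate { int cpu_id; long energy; };

/* Return the index of the candidate with the lowest estimated energy;
 * in this toy, ties favour the task's previous CPU (slot 0). */
static int pick_lowest_energy(const struct candidate c[EAS_CPU_CNT])
{
	int best = EAS_CPU_PRV, i;

	for (i = EAS_CPU_NXT; i < EAS_CPU_CNT; i++) {
		if (c[i].cpu_id < 0)
			continue;	/* slot unused, e.g. no backup CPU was found */
		if (c[i].energy < c[best].energy)
			best = i;
	}
	return best;
}

int main(void)
{
	struct candidate c[EAS_CPU_CNT] = {
		{ .cpu_id = 1, .energy = 900 },	/* prev_cpu   */
		{ .cpu_id = 4, .energy = 840 },	/* next_cpu   */
		{ .cpu_id = 2, .energy = 860 },	/* backup_cpu */
	};
	int idx = pick_lowest_energy(c);

	printf("chosen cpu=%d (slot %d)\n", c[idx].cpu_id, idx);	/* cpu=4 (slot 1) */
	return 0;
}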
  10545.  
  10546. @@ -5633,7 +6699,8 @@
  10547.   * preempt must be disabled.
  10548.   */
  10549.  static int
  10550. -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
  10551. +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
  10552. +           int sibling_count_hint)
  10553.  {
  10554.     struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
  10555.     int cpu = smp_processor_id();
  10556. @@ -5641,13 +6708,15 @@
  10557.     int want_affine = 0;
  10558.     int sync = wake_flags & WF_SYNC;
  10559.  
  10560. -   if (p->nr_cpus_allowed == 1)
  10561. -       return prev_cpu;
  10562. +   if (sd_flag & SD_BALANCE_WAKE) {
  10563. +       record_wakee(p);
  10564. +       want_affine = !wake_wide(p, sibling_count_hint) &&
  10565. +                 !wake_cap(p, cpu, prev_cpu) &&
  10566. +                 cpumask_test_cpu(cpu, &p->cpus_allowed);
  10567. +   }
  10568.  
  10569. -   if (sd_flag & SD_BALANCE_WAKE)
  10570. -       want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
  10571. -                 cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
  10572. -                 energy_aware();
  10573. +   if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
  10574. +       return select_energy_cpu_brute(p, prev_cpu, sync);
  10575.  
  10576.     rcu_read_lock();
  10577.     for_each_domain(cpu, tmp) {
  10578. @@ -5672,65 +6741,25 @@
  10579.  
  10580.     if (affine_sd) {
  10581.         sd = NULL; /* Prefer wake_affine over balance flags */
  10582. -       if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
  10583. +       if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
  10584.             new_cpu = cpu;
  10585.     }
  10586.  
  10587. -   if (!sd) {
  10588. -       int sync_used = 0;
  10589. -       bool about_to_idle = (cpu_rq(cpu)->nr_running < 2);
  10590. -
  10591. -       if (sysctl_sched_sync_hint_enable && sync
  10592. -               && about_to_idle) {
  10593. -           cpumask_t search_cpus;
  10594. -           cpumask_and(&search_cpus, tsk_cpus_allowed(p),
  10595. -                   cpu_online_mask);
  10596. -           if (cpumask_test_cpu(cpu, &search_cpus)) {
  10597. -               sync_used = 1;
  10598. -               new_cpu = cpu;
  10599. -           }
  10600. -       }
  10601. -
  10602. -       if (!sync_used) {
  10603. -           if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
  10604. -               new_cpu = energy_aware_wake_cpu(p, prev_cpu);
  10605. -           else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
  10606. -               new_cpu = select_idle_sibling(p, new_cpu);
  10607. -       }
  10608. -
  10609. -   } else while (sd) {
  10610. -       struct sched_group *group;
  10611. -       int weight;
  10612. -
  10613. -       if (!(sd->flags & sd_flag)) {
  10614. -           sd = sd->child;
  10615. -           continue;
  10616. -       }
  10617. -
  10618. -       group = find_idlest_group(sd, p, cpu, sd_flag);
  10619. -       if (!group) {
  10620. -           sd = sd->child;
  10621. -           continue;
  10622. -       }
  10623. +   if (sd && !(sd_flag & SD_BALANCE_FORK)) {
  10624. +       /*
  10625. +        * We're going to need the task's util for capacity_spare_wake
  10626. +        * in find_idlest_group. Sync it up to prev_cpu's
  10627. +        * last_update_time.
  10628. +        */
  10629. +       sync_entity_load_avg(&p->se);
  10630. +   }
  10631.  
  10632. -       new_cpu = find_idlest_cpu(group, p, cpu);
  10633. -       if (new_cpu == -1 || new_cpu == cpu) {
  10634. -           /* Now try balancing at a lower domain level of cpu */
  10635. -           sd = sd->child;
  10636. -           continue;
  10637. -       }
  10638. +   if (!sd) {
  10639. +       if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
  10640. +           new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
  10641.  
  10642. -       /* Now try balancing at a lower domain level of new_cpu */
  10643. -       cpu = new_cpu;
  10644. -       weight = sd->span_weight;
  10645. -       sd = NULL;
  10646. -       for_each_domain(cpu, tmp) {
  10647. -           if (weight <= tmp->span_weight)
  10648. -               break;
  10649. -           if (tmp->flags & sd_flag)
  10650. -               sd = tmp;
  10651. -       }
  10652. -       /* while loop will break here if sd == NULL */
  10653. +   } else {
  10654. +       new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
  10655.     }
  10656.     rcu_read_unlock();
  10657.  
  10658. @@ -5742,7 +6771,7 @@
  10659.   * cfs_rq_of(p) references at time of call are still valid and identify the
  10660.   * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
  10661.   */
  10662. -static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
  10663. +static void migrate_task_rq_fair(struct task_struct *p)
  10664.  {
  10665.     /*
  10666.      * We are supposed to update the task to "current" time, then its up to date
  10667. @@ -5929,7 +6958,7 @@
  10668.  }
  10669.  
  10670.  static struct task_struct *
  10671. -pick_next_task_fair(struct rq *rq, struct task_struct *prev)
  10672. +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  10673.  {
  10674.     struct cfs_rq *cfs_rq = &rq->cfs;
  10675.     struct sched_entity *se;
  10676. @@ -6041,8 +7070,15 @@
  10677.  
  10678.  idle:
  10679.     rq->misfit_task = 0;
  10680. -
  10681. +   /*
  10682. +    * This is OK, because current is on_cpu, which avoids it being picked
  10683. +    * for load-balance and preemption/IRQs are still disabled avoiding
  10684. +    * further scheduler activity on it and we're being very careful to
  10685. +    * re-start the picking loop.
  10686. +    */
  10687. +   lockdep_unpin_lock(&rq->lock, cookie);
  10688.     new_tasks = idle_balance(rq);
  10689. +   lockdep_repin_lock(&rq->lock, cookie);
  10690.     /*
  10691.      * Because idle_balance() releases (and re-acquires) rq->lock, it is
  10692.      * possible for any higher priority task to appear. In that case we
  10693. @@ -6101,7 +7137,7 @@
  10694.          * so we don't do microscopic update in schedule()
  10695.          * and double the fastpath cost.
  10696.          */
  10697. -        rq->skip_clock_update = 1;
  10698. +       rq_clock_skip_update(rq, true);
  10699.     }
  10700.  
  10701.     set_skip_buddy(se);
  10702. @@ -6320,90 +7356,57 @@
  10703.  }
  10704.  
  10705.  #ifdef CONFIG_NUMA_BALANCING
  10706. -/* Returns true if the destination node has incurred more faults */
  10707. -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
  10708. +/*
  10709. + * Returns 1, if task migration degrades locality
  10710. + * Returns 0, if task migration improves locality i.e migration preferred.
  10711. + * Returns -1, if task migration is not affected by locality.
  10712. + */
  10713. +static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
  10714.  {
  10715.     struct numa_group *numa_group = rcu_dereference(p->numa_group);
  10716. +   unsigned long src_faults, dst_faults;
  10717.     int src_nid, dst_nid;
  10718.  
  10719. -   if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
  10720. -       !(env->sd->flags & SD_NUMA)) {
  10721. -       return false;
  10722. -   }
  10723. +   if (!static_branch_likely(&sched_numa_balancing))
  10724. +       return -1;
  10725. +
  10726. +   if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
  10727. +       return -1;
  10728.  
  10729.     src_nid = cpu_to_node(env->src_cpu);
  10730.     dst_nid = cpu_to_node(env->dst_cpu);
  10731.  
  10732.     if (src_nid == dst_nid)
  10733. -       return false;
  10734. -
  10735. -   if (numa_group) {
  10736. -       /* Task is already in the group's interleave set. */
  10737. -       if (node_isset(src_nid, numa_group->active_nodes))
  10738. -           return false;
  10739. -
  10740. -       /* Task is moving into the group's interleave set. */
  10741. -       if (node_isset(dst_nid, numa_group->active_nodes))
  10742. -           return true;
  10743. +       return -1;
  10744.  
  10745. -       return group_faults(p, dst_nid) > group_faults(p, src_nid);
  10746. +   /* Migrating away from the preferred node is always bad. */
  10747. +   if (src_nid == p->numa_preferred_nid) {
  10748. +       if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
  10749. +           return 1;
  10750. +       else
  10751. +           return -1;
  10752.     }
  10753.  
  10754.     /* Encourage migration to the preferred node. */
  10755.     if (dst_nid == p->numa_preferred_nid)
  10756. -       return true;
  10757. -
  10758. -   return task_faults(p, dst_nid) > task_faults(p, src_nid);
  10759. -}
  10760. -
  10761. -
  10762. -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
  10763. -{
  10764. -   struct numa_group *numa_group = rcu_dereference(p->numa_group);
  10765. -   int src_nid, dst_nid;
  10766. -
  10767. -   if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
  10768. -       return false;
  10769. -
  10770. -   if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
  10771. -       return false;
  10772. -
  10773. -   src_nid = cpu_to_node(env->src_cpu);
  10774. -   dst_nid = cpu_to_node(env->dst_cpu);
  10775. -
  10776. -   if (src_nid == dst_nid)
  10777. -       return false;
  10778. +       return 0;
  10779.  
  10780.     if (numa_group) {
  10781. -       /* Task is moving within/into the group's interleave set. */
  10782. -       if (node_isset(dst_nid, numa_group->active_nodes))
  10783. -           return false;
  10784. -
  10785. -       /* Task is moving out of the group's interleave set. */
  10786. -       if (node_isset(src_nid, numa_group->active_nodes))
  10787. -           return true;
  10788. -
  10789. -       return group_faults(p, dst_nid) < group_faults(p, src_nid);
  10790. +       src_faults = group_faults(p, src_nid);
  10791. +       dst_faults = group_faults(p, dst_nid);
  10792. +   } else {
  10793. +       src_faults = task_faults(p, src_nid);
  10794. +       dst_faults = task_faults(p, dst_nid);
  10795.     }
  10796.  
  10797. -   /* Migrating away from the preferred node is always bad. */
  10798. -   if (src_nid == p->numa_preferred_nid)
  10799. -       return true;
  10800. -
  10801. -   return task_faults(p, dst_nid) < task_faults(p, src_nid);
  10802. +   return dst_faults < src_faults;
  10803.  }
  10804.  
  10805.  #else
  10806. -static inline bool migrate_improves_locality(struct task_struct *p,
  10807. +static inline int migrate_degrades_locality(struct task_struct *p,
  10808.                          struct lb_env *env)
  10809.  {
  10810. -   return false;
  10811. -}
  10812. -
  10813. -static inline bool migrate_degrades_locality(struct task_struct *p,
  10814. -                        struct lb_env *env)
  10815. -{
  10816. -   return false;
  10817. +   return -1;
  10818.  }
  10819.  #endif
  10820.  
  10821. @@ -6413,7 +7416,7 @@
  10822.  static
  10823.  int can_migrate_task(struct task_struct *p, struct lb_env *env)
  10824.  {
  10825. -   int tsk_cache_hot = 0;
  10826. +   int tsk_cache_hot;
  10827.  
  10828.     lockdep_assert_held(&env->src_rq->lock);
  10829.  
  10830. @@ -6430,7 +7433,7 @@
  10831.     if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
  10832.         int cpu;
  10833.  
  10834. -       schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
  10835. +       schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
  10836.  
  10837.         env->flags |= LBF_SOME_PINNED;
  10838.  
  10839. @@ -6461,7 +7464,7 @@
  10840.     env->flags &= ~LBF_ALL_PINNED;
  10841.  
  10842.     if (task_running(env->src_rq, p)) {
  10843. -       schedstat_inc(p, se.statistics.nr_failed_migrations_running);
  10844. +       schedstat_inc(p->se.statistics.nr_failed_migrations_running);
  10845.         return 0;
  10846.     }
  10847.  
  10848. @@ -6471,20 +7474,20 @@
  10849.      * 2) task is cache cold, or
  10850.      * 3) too many balance attempts have failed.
  10851.      */
  10852. -   tsk_cache_hot = task_hot(p, env);
  10853. -   if (!tsk_cache_hot)
  10854. -       tsk_cache_hot = migrate_degrades_locality(p, env);
  10855. +   tsk_cache_hot = migrate_degrades_locality(p, env);
  10856. +   if (tsk_cache_hot == -1)
  10857. +       tsk_cache_hot = task_hot(p, env);
  10858.  
  10859. -   if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
  10860. +   if (tsk_cache_hot <= 0 ||
  10861.         env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
  10862. -       if (tsk_cache_hot) {
  10863. -           schedstat_inc(env->sd, lb_hot_gained[env->idle]);
  10864. -           schedstat_inc(p, se.statistics.nr_forced_migrations);
  10865. +       if (tsk_cache_hot == 1) {
  10866. +           schedstat_inc(env->sd->lb_hot_gained[env->idle]);
  10867. +           schedstat_inc(p->se.statistics.nr_forced_migrations);
  10868.         }
  10869.         return 1;
  10870.     }
  10871.  
  10872. -   schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
  10873. +   schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
  10874.     return 0;
  10875.  }
  10876.  
  10877. @@ -6495,8 +7498,8 @@
  10878.  {
  10879.     lockdep_assert_held(&env->src_rq->lock);
  10880.  
  10881. -   deactivate_task(env->src_rq, p, 0);
  10882.     p->on_rq = TASK_ON_RQ_MIGRATING;
  10883. +   deactivate_task(env->src_rq, p, 0);
  10884.     double_lock_balance(env->src_rq, env->dst_rq);
  10885.     set_task_cpu(p, env->dst_cpu);
  10886.     double_unlock_balance(env->src_rq, env->dst_rq);
  10887. @@ -6526,7 +7529,7 @@
  10888.          * so we can safely collect stats here rather than
  10889.          * inside detach_tasks().
  10890.          */
  10891. -       schedstat_inc(env->sd, lb_gained[env->idle]);
  10892. +       schedstat_inc(env->sd->lb_gained[env->idle]);
  10893.         return p;
  10894.     }
  10895.     return NULL;
  10896. @@ -6618,7 +7621,7 @@
  10897.      * so we can safely collect detach_one_task() stats here rather
  10898.      * than inside detach_one_task().
  10899.      */
  10900. -   schedstat_add(env->sd, lb_gained[env->idle], detached);
  10901. +   schedstat_add(env->sd->lb_gained[env->idle], detached);
  10902.  
  10903.     return detached;
  10904.  }
  10905. @@ -6631,8 +7634,8 @@
  10906.     lockdep_assert_held(&rq->lock);
  10907.  
  10908.     BUG_ON(task_rq(p) != rq);
  10909. -   p->on_rq = TASK_ON_RQ_QUEUED;
  10910.     activate_task(rq, p, 0);
  10911. +   p->on_rq = TASK_ON_RQ_QUEUED;
  10912.     check_preempt_curr(rq, p, 0);
  10913.  }
  10914.  
  10915. @@ -6647,7 +7650,7 @@
  10916.     /*
  10917.      * We want to potentially raise target_cpu's OPP.
  10918.      */
  10919. -   update_capacity_of(cpu_of(rq), true);
  10920. +   update_capacity_of(cpu_of(rq));
  10921.     raw_spin_unlock(&rq->lock);
  10922.  }
  10923.  
  10924. @@ -6672,7 +7675,7 @@
  10925.     /*
  10926.      * We want to potentially raise env.dst_cpu's OPP.
  10927.      */
  10928. -   update_capacity_of(env->dst_cpu, true);
  10929. +   update_capacity_of(env->dst_cpu);
  10930.  
  10931.     raw_spin_unlock(&env->dst_rq->lock);
  10932.  }
  10933. @@ -6692,12 +7695,20 @@
  10934.      * list_add_leaf_cfs_rq() for details.
  10935.      */
  10936.     for_each_leaf_cfs_rq(rq, cfs_rq) {
  10937. +       struct sched_entity *se;
  10938. +
  10939.         /* throttled entities do not contribute to load */
  10940.         if (throttled_hierarchy(cfs_rq))
  10941.             continue;
  10942.  
  10943. -       if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
  10944. +       if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
  10945. +                      true))
  10946.             update_tg_load_avg(cfs_rq, 0);
  10947. +
  10948. +       /* Propagate pending load changes to the parent, if any: */
  10949. +       se = cfs_rq->tg->se[cpu];
  10950. +       if (se && !skip_blocked_update(se))
  10951. +           update_load_avg(se, 0);
  10952.     }
  10953.     raw_spin_unlock_irqrestore(&rq->lock, flags);
  10954.  }
  10955. @@ -6757,7 +7768,7 @@
  10956.  
  10957.     raw_spin_lock_irqsave(&rq->lock, flags);
  10958.     update_rq_clock(rq);
  10959. -   update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
  10960. +   update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
  10961.     raw_spin_unlock_irqrestore(&rq->lock, flags);
  10962.  }
  10963.  
  10964. @@ -6908,6 +7919,9 @@
  10965.  
  10966.     cpu_rq(cpu)->cpu_capacity_orig = capacity;
  10967.  
  10968. +   capacity *= arch_scale_max_freq_capacity(sd, cpu);
  10969. +   capacity >>= SCHED_CAPACITY_SHIFT;
  10970. +
  10971.     mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
  10972.  
  10973.     raw_spin_lock_irqsave(&mcc->lock, flags);
  10974. @@ -6937,13 +7951,14 @@
  10975.     cpu_rq(cpu)->cpu_capacity = capacity;
  10976.     sdg->sgc->capacity = capacity;
  10977.     sdg->sgc->max_capacity = capacity;
  10978. +   sdg->sgc->min_capacity = capacity;
  10979.  }
  10980.  
  10981.  void update_group_capacity(struct sched_domain *sd, int cpu)
  10982.  {
  10983.     struct sched_domain *child = sd->child;
  10984.     struct sched_group *group, *sdg = sd->groups;
  10985. -   unsigned long capacity, max_capacity;
  10986. +   unsigned long capacity, max_capacity, min_capacity;
  10987.     unsigned long interval;
  10988.  
  10989.     interval = msecs_to_jiffies(sd->balance_interval);
  10990. @@ -6957,6 +7972,7 @@
  10991.  
  10992.     capacity = 0;
  10993.     max_capacity = 0;
  10994. +   min_capacity = ULONG_MAX;
  10995.  
  10996.     if (child->flags & SD_OVERLAP) {
  10997.         /*
  10998. @@ -6987,6 +8003,7 @@
  10999.             }
  11000.  
  11001.             max_capacity = max(capacity, max_capacity);
  11002. +           min_capacity = min(capacity, min_capacity);
  11003.         }
  11004.     } else  {
  11005.         /*
  11006. @@ -7000,12 +8017,14 @@
  11007.  
  11008.             capacity += sgc->capacity;
  11009.             max_capacity = max(sgc->max_capacity, max_capacity);
  11010. +           min_capacity = min(sgc->min_capacity, min_capacity);
  11011.             group = group->next;
  11012.         } while (group != child->groups);
  11013.     }
  11014.  
  11015.     sdg->sgc->capacity = capacity;
  11016.     sdg->sgc->max_capacity = max_capacity;
  11017. +   sdg->sgc->min_capacity = min_capacity;
  11018.  }
  11019.  
  11020.  /*
  11021. @@ -7112,9 +8131,9 @@
  11022.                             ref->sgc->max_capacity;
  11023.  }
  11024.  
  11025. -static enum group_type group_classify(struct lb_env *env,
  11026. -       struct sched_group *group,
  11027. -       struct sg_lb_stats *sgs)
  11028. +static inline enum
  11029. +group_type group_classify(struct sched_group *group,
  11030. +             struct sg_lb_stats *sgs)
  11031.  {
  11032.     if (sgs->group_no_capacity)
  11033.         return group_overloaded;
  11034. @@ -7128,6 +8147,38 @@
  11035.     return group_other;
  11036.  }
  11037.  
  11038. +#ifdef CONFIG_NO_HZ_COMMON
  11039. +/*
  11040. + * idle load balancing data
  11041. + *  - used by the nohz balance, but we want it available here
  11042. + *    so that we can see which CPUs have no tick.
  11043. + */
  11044. +static struct {
  11045. +   cpumask_var_t idle_cpus_mask;
  11046. +   atomic_t nr_cpus;
  11047. +   unsigned long next_balance;     /* in jiffy units */
  11048. +} nohz ____cacheline_aligned;
  11049. +
  11050. +static inline void update_cpu_stats_if_tickless(struct rq *rq)
  11051. +{
  11052. +   /* only called from update_sg_lb_stats when irqs are disabled */
  11053. +   if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
  11054. +       /* rate limit updates to once-per-jiffy at most */
  11055. +       if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
  11056. +           return;
  11057. +
  11058. +       raw_spin_lock(&rq->lock);
  11059. +       update_rq_clock(rq);
  11060. +       update_idle_cpu_load(rq);
  11061. +       update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
  11062. +       raw_spin_unlock(&rq->lock);
  11063. +   }
  11064. +}
  11065. +
  11066. +#else
  11067. +static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
  11068. +#endif
  11069. +
  11070.  /**
  11071.   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  11072.   * @env: The load balancing environment.
  11073. @@ -7151,6 +8202,12 @@
  11074.     for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
  11075.         struct rq *rq = cpu_rq(i);
  11076.  
  11077. +       /* if we are entering idle and there are CPUs with
  11078. +        * their tick stopped, do an update for them
  11079. +        */
  11080. +       if (env->idle == CPU_NEWLY_IDLE)
  11081. +           update_cpu_stats_if_tickless(rq);
  11082. +
  11083.         /* Bias balancing toward cpus of our domain */
  11084.         if (local_group)
  11085.             load = target_load(i, load_idx);
  11086. @@ -7158,7 +8215,7 @@
  11087.             load = source_load(i, load_idx);
  11088.  
  11089.         sgs->group_load += load;
  11090. -       sgs->group_util += cpu_util(i, UTIL_AVG);
  11091. +       sgs->group_util += cpu_util(i);
  11092.         sgs->sum_nr_running += rq->cfs.h_nr_running;
  11093.  
  11094.         nr_running = rq->nr_running;
  11095. @@ -7193,7 +8250,7 @@
  11096.     sgs->group_weight = group->group_weight;
  11097.  
  11098.     sgs->group_no_capacity = group_is_overloaded(env, sgs);
  11099. -   sgs->group_type = group_classify(env, group, sgs);
  11100. +   sgs->group_type = group_classify(group, sgs);
  11101.  }
  11102.  
  11103.  /**
  11104. @@ -7233,18 +8290,27 @@
  11105.     if (sgs->avg_load <= busiest->avg_load)
  11106.         return false;
  11107.  
  11108. +   if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
  11109. +       goto asym_packing;
  11110. +
  11111.     /*
  11112. -    * Candiate sg has no more than one task per cpu and has higher
  11113. -    * per-cpu capacity. No reason to pull tasks to less capable cpus.
  11114. +    * Candidate sg has no more than one task per CPU and
  11115. +    * has higher per-CPU capacity. Migrating tasks to less
  11116. +    * capable CPUs may harm throughput. Maximize throughput,
  11117. +    * power/energy consequences are not considered.
  11118.      */
  11119.     if (sgs->sum_nr_running <= sgs->group_weight &&
  11120.         group_smaller_cpu_capacity(sds->local, sg))
  11121.         return false;
  11122.  
  11123. +asym_packing:
  11124.     /* This is the busiest node in its class. */
  11125.     if (!(env->sd->flags & SD_ASYM_PACKING))
  11126.         return true;
  11127.  
  11128. +   /* No ASYM_PACKING if target cpu is already busy */
  11129. +   if (env->idle == CPU_NOT_IDLE)
  11130. +       return true;
  11131.     /*
  11132.      * ASYM_PACKING needs to move all the work to the lowest
  11133.      * numbered CPUs in the group, therefore mark all groups
  11134. @@ -7254,7 +8320,8 @@
  11135.         if (!sds->busiest)
  11136.             return true;
  11137.  
  11138. -       if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
  11139. +       /* Prefer to move work from the highest possible cpu */
  11140. +       if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
  11141.             return true;
  11142.     }
  11143.  
  11144. @@ -7291,6 +8358,9 @@
  11145.  }
  11146.  #endif /* CONFIG_NUMA_BALANCING */
  11147.  
  11148. +#define lb_sd_parent(sd) \
  11149. +   (sd->parent && sd->parent->groups != sd->parent->groups->next)
  11150. +
  11151.  /**
  11152.   * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  11153.   * @env: The load balancing environment.
  11154. @@ -7343,7 +8413,7 @@
  11155.             group_has_capacity(env, &sds->local_stat) &&
  11156.             (sgs->sum_nr_running > 1)) {
  11157.             sgs->group_no_capacity = 1;
  11158. -           sgs->group_type = group_overloaded;
  11159. +           sgs->group_type = group_classify(sg, sgs);
  11160.         }
  11161.  
  11162.         /*
  11163. @@ -7373,7 +8443,7 @@
  11164.  
  11165.     env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
  11166.  
  11167. -   if (!env->sd->parent) {
  11168. +   if (!lb_sd_parent(env->sd)) {
  11169.         /* update overload indicator if we are at root domain */
  11170.         if (env->dst_rq->rd->overload != overload)
  11171.             env->dst_rq->rd->overload = overload;
  11172. @@ -7422,6 +8492,9 @@
  11173.     if (!(env->sd->flags & SD_ASYM_PACKING))
  11174.         return 0;
  11175.  
  11176. +   if (env->idle == CPU_NOT_IDLE)
  11177. +       return 0;
  11178. +
  11179.     if (!sds->busiest)
  11180.         return 0;
  11181.  
  11182. @@ -7639,8 +8712,7 @@
  11183.     busiest = &sds.busiest_stat;
  11184.  
  11185.     /* ASYM feature bypasses nice load balance check */
  11186. -   if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
  11187. -       check_asym_packing(env, &sds))
  11188. +   if (check_asym_packing(env, &sds))
  11189.         return sds.busiest;
  11190.  
  11191.     /* There is no busy sibling group to pull tasks from */
  11192. @@ -7658,8 +8730,11 @@
  11193.     if (busiest->group_type == group_imbalanced)
  11194.         goto force_balance;
  11195.  
  11196. -   /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
  11197. -   if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
  11198. +   /*
  11199. +    * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
  11200. +    * capacities from resulting in underutilization due to avg_load.
  11201. +    */
  11202. +   if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
  11203.         busiest->group_no_capacity)
  11204.         goto force_balance;
  11205.  
  11206. @@ -7827,6 +8902,7 @@
  11207.     }
  11208.  
  11209.     if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
  11210. +       ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
  11211.                 env->src_rq->cfs.h_nr_running == 1 &&
  11212.                 cpu_overutilized(env->src_cpu) &&
  11213.                 !cpu_overutilized(env->dst_cpu)) {
  11214. @@ -7881,7 +8957,7 @@
  11215.             int *continue_balancing)
  11216.  {
  11217.     int ld_moved, cur_ld_moved, active_balance = 0;
  11218. -   struct sched_domain *sd_parent = sd->parent;
  11219. +   struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
  11220.     struct sched_group *group;
  11221.     struct rq *busiest;
  11222.     unsigned long flags;
  11223. @@ -7908,7 +8984,7 @@
  11224.  
  11225.     cpumask_copy(cpus, cpu_active_mask);
  11226.  
  11227. -   schedstat_inc(sd, lb_count[idle]);
  11228. +   schedstat_inc(sd->lb_count[idle]);
  11229.  
  11230.  redo:
  11231.     if (!should_we_balance(&env)) {
  11232. @@ -7918,19 +8994,19 @@
  11233.  
  11234.     group = find_busiest_group(&env);
  11235.     if (!group) {
  11236. -       schedstat_inc(sd, lb_nobusyg[idle]);
  11237. +       schedstat_inc(sd->lb_nobusyg[idle]);
  11238.         goto out_balanced;
  11239.     }
  11240.  
  11241.     busiest = find_busiest_queue(&env, group);
  11242.     if (!busiest) {
  11243. -       schedstat_inc(sd, lb_nobusyq[idle]);
  11244. +       schedstat_inc(sd->lb_nobusyq[idle]);
  11245.         goto out_balanced;
  11246.     }
  11247.  
  11248.     BUG_ON(busiest == env.dst_rq);
  11249.  
  11250. -   schedstat_add(sd, lb_imbalance[idle], env.imbalance);
  11251. +   schedstat_add(sd->lb_imbalance[idle], env.imbalance);
  11252.  
  11253.     env.src_cpu = busiest->cpu;
  11254.     env.src_rq = busiest;
  11255. @@ -7948,6 +9024,7 @@
  11256.  
  11257.  more_balance:
  11258.         raw_spin_lock_irqsave(&busiest->lock, flags);
  11259. +       update_rq_clock(busiest);
  11260.  
  11261.         /*
  11262.          * cur_ld_moved - load moved in current iteration
  11263. @@ -7958,7 +9035,7 @@
  11264.          * We want to potentially lower env.src_cpu's OPP.
  11265.          */
  11266.         if (cur_ld_moved)
  11267. -           update_capacity_of(env.src_cpu, true);
  11268. +           update_capacity_of(env.src_cpu);
  11269.  
  11270.         /*
  11271.          * We've detached some tasks from busiest_rq. Every
  11272. @@ -8032,7 +9109,24 @@
  11273.         /* All tasks on this runqueue were pinned by CPU affinity */
  11274.         if (unlikely(env.flags & LBF_ALL_PINNED)) {
  11275.             cpumask_clear_cpu(cpu_of(busiest), cpus);
  11276. -           if (!cpumask_empty(cpus)) {
  11277. +           /*
  11278. +            * dst_cpu is not a valid busiest cpu in the following
  11279. +            * check since load cannot be pulled from dst_cpu to be
  11280. +            * put on dst_cpu.
  11281. +            */
  11282. +           cpumask_clear_cpu(env.dst_cpu, cpus);
  11283. +           /*
  11284. +            * Go back to "redo" iff the load-balance cpumask
  11285. +            * contains other potential busiest cpus for the
  11286. +            * current sched domain.
  11287. +            */
  11288. +           if (cpumask_intersects(cpus, sched_domain_span(env.sd))) {
  11289. +               /*
  11290. +                * Now that the check has passed, reenable
  11291. +                * dst_cpu so that load can be calculated on
  11292. +                * it in the redo path.
  11293. +                */
  11294. +               cpumask_set_cpu(env.dst_cpu, cpus);
  11295.                 env.loop = 0;
  11296.                 env.loop_break = sched_nr_migrate_break;
  11297.                 goto redo;
  11298. @@ -8042,7 +9136,7 @@
  11299.     }
  11300.  
  11301.     if (!ld_moved) {
  11302. -       schedstat_inc(sd, lb_failed[idle]);
  11303. +       schedstat_inc(sd->lb_failed[idle]);
  11304.         /*
  11305.          * Increment the failure counter only on periodic balance.
  11306.          * We do not want newidle balance, which can be very
  11307. @@ -8086,10 +9180,7 @@
  11308.                     &busiest->active_balance_work);
  11309.             }
  11310.  
  11311. -           /*
  11312. -            * We've kicked active balancing, reset the failure
  11313. -            * counter.
  11314. -            */
  11315. +           /* We've kicked active balancing, force task migration. */
  11316.             sd->nr_balance_failed = sd->cache_nice_tries+1;
  11317.         }
  11318.     } else
  11319. @@ -8129,7 +9220,7 @@
  11320.      * we can't migrate them. Let the imbalance flag set so parent level
  11321.      * can try to migrate them.
  11322.      */
  11323. -   schedstat_inc(sd, lb_balanced[idle]);
  11324. +   schedstat_inc(sd->lb_balanced[idle]);
  11325.  
  11326.     sd->nr_balance_failed = 0;
  11327.  
  11328. @@ -8185,8 +9276,6 @@
  11329.     u64 curr_cost = 0;
  11330.     long removed_util = 0;
  11331.  
  11332. -   idle_enter_fair(this_rq);
  11333. -
  11334.     /*
  11335.      * We must set idle_stamp _before_ calling idle_balance(), such that we
  11336.      * measure the duration of idle_balance() as idle time.
  11337. @@ -8283,14 +9372,13 @@
  11338.         pulled_task = -1;
  11339.  
  11340.     if (pulled_task) {
  11341. -       idle_exit_fair(this_rq);
  11342.         this_rq->idle_stamp = 0;
  11343.     } else if (removed_util) {
  11344.         /*
  11345.          * No task pulled and someone has been migrated away.
  11346.          * Good case to trigger an OPP update.
  11347.          */
  11348. -       update_capacity_of(this_cpu, true);
  11349. +       update_capacity_of(this_cpu);
  11350.     }
  11351.  
  11352.     return pulled_task;
  11353. @@ -8308,8 +9396,18 @@
  11354.     int busiest_cpu = cpu_of(busiest_rq);
  11355.     int target_cpu = busiest_rq->push_cpu;
  11356.     struct rq *target_rq = cpu_rq(target_cpu);
  11357. -   struct sched_domain *sd;
  11358. +   struct sched_domain *sd = NULL;
  11359.     struct task_struct *p = NULL;
  11360. +   struct task_struct *push_task = NULL;
  11361. +   int push_task_detached = 0;
  11362. +   struct lb_env env = {
  11363. +       .sd     = sd,
  11364. +       .dst_cpu    = target_cpu,
  11365. +       .dst_rq     = target_rq,
  11366. +       .src_cpu    = busiest_rq->cpu,
  11367. +       .src_rq     = busiest_rq,
  11368. +       .idle       = CPU_IDLE,
  11369. +   };
  11370.  
  11371.     raw_spin_lock_irq(&busiest_rq->lock);
  11372.  
  11373. @@ -8329,6 +9427,17 @@
  11374.      */
  11375.     BUG_ON(busiest_rq == target_rq);
  11376.  
  11377. +   push_task = busiest_rq->push_task;
  11378. +   if (push_task) {
  11379. +       if (task_on_rq_queued(push_task) &&
  11380. +           task_cpu(push_task) == busiest_cpu &&
  11381. +                   cpu_online(target_cpu)) {
  11382. +           detach_task(push_task, &env);
  11383. +           push_task_detached = 1;
  11384. +       }
  11385. +       goto out_unlock;
  11386. +   }
  11387. +
  11388.     /* Search for an sd spanning us and the target CPU. */
  11389.     rcu_read_lock();
  11390.     for_each_domain(target_cpu, sd) {
  11391. @@ -8338,33 +9447,36 @@
  11392.     }
  11393.  
  11394.     if (likely(sd)) {
  11395. -       struct lb_env env = {
  11396. -           .sd     = sd,
  11397. -           .dst_cpu    = target_cpu,
  11398. -           .dst_rq     = target_rq,
  11399. -           .src_cpu    = busiest_rq->cpu,
  11400. -           .src_rq     = busiest_rq,
  11401. -           .idle       = CPU_IDLE,
  11402. -       };
  11403. -
  11404. -       schedstat_inc(sd, alb_count);
  11405. +       env.sd = sd;
  11406. +       schedstat_inc(sd->alb_count);
  11407. +       update_rq_clock(busiest_rq);
  11408.  
  11409.         p = detach_one_task(&env);
  11410.         if (p) {
  11411. -           schedstat_inc(sd, alb_pushed);
  11412. +           schedstat_inc(sd->alb_pushed);
  11413.             /*
  11414.              * We want to potentially lower env.src_cpu's OPP.
  11415.              */
  11416. -           update_capacity_of(env.src_cpu, true);
  11417. +           update_capacity_of(env.src_cpu);
  11418.         }
  11419.         else
  11420. -           schedstat_inc(sd, alb_failed);
  11421. +           schedstat_inc(sd->alb_failed);
  11422.     }
  11423.     rcu_read_unlock();
  11424.  out_unlock:
  11425.     busiest_rq->active_balance = 0;
  11426. +
  11427. +   if (push_task)
  11428. +       busiest_rq->push_task = NULL;
  11429. +
  11430.     raw_spin_unlock(&busiest_rq->lock);
  11431.  
  11432. +   if (push_task) {
  11433. +       if (push_task_detached)
  11434. +           attach_one_task(target_rq, push_task);
  11435. +       put_task_struct(push_task);
  11436. +   }
  11437. +
  11438.     if (p)
  11439.         attach_one_task(target_rq, p);
  11440.  
  11441. @@ -8385,12 +9497,6 @@
  11442.   *   needed, they will kick the idle load balancer, which then does idle
  11443.   *   load balancing for all the idle CPUs.
  11444.   */
  11445. -static struct {
  11446. -   cpumask_var_t idle_cpus_mask;
  11447. -   atomic_t nr_cpus;
  11448. -   unsigned long next_balance;     /* in jiffy units */
  11449. -} nohz ____cacheline_aligned;
  11450. -
  11451.  static inline int find_new_ilb(void)
  11452.  {
  11453.     int ilb = cpumask_first(nohz.idle_cpus_mask);
  11454. @@ -8449,13 +9555,13 @@
  11455.     int cpu = smp_processor_id();
  11456.  
  11457.     rcu_read_lock();
  11458. -   sd = rcu_dereference(per_cpu(sd_busy, cpu));
  11459. +   sd = rcu_dereference(per_cpu(sd_llc, cpu));
  11460.  
  11461.     if (!sd || !sd->nohz_idle)
  11462.         goto unlock;
  11463.     sd->nohz_idle = 0;
  11464.  
  11465. -   atomic_inc(&sd->groups->sgc->nr_busy_cpus);
  11466. +   atomic_inc(&sd->shared->nr_busy_cpus);
  11467.  unlock:
  11468.     rcu_read_unlock();
  11469.  }
  11470. @@ -8466,13 +9572,13 @@
  11471.     int cpu = smp_processor_id();
  11472.  
  11473.     rcu_read_lock();
  11474. -   sd = rcu_dereference(per_cpu(sd_busy, cpu));
  11475. +   sd = rcu_dereference(per_cpu(sd_llc, cpu));
  11476.  
  11477.     if (!sd || sd->nohz_idle)
  11478.         goto unlock;
  11479.     sd->nohz_idle = 1;
  11480.  
  11481. -   atomic_dec(&sd->groups->sgc->nr_busy_cpus);
  11482. +   atomic_dec(&sd->shared->nr_busy_cpus);
  11483.  unlock:
  11484.     rcu_read_unlock();
  11485.  }
  11486. @@ -8711,8 +9817,8 @@
  11487.  static inline bool nohz_kick_needed(struct rq *rq)
  11488.  {
  11489.     unsigned long now = jiffies;
  11490. +   struct sched_domain_shared *sds;
  11491.     struct sched_domain *sd;
  11492. -   struct sched_group_capacity *sgc;
  11493.     int nr_busy, cpu = rq->cpu;
  11494.     bool kick = false;
  11495.  
  11496. @@ -8740,12 +9846,18 @@
  11497.         (!energy_aware() || cpu_overutilized(cpu)))
  11498.         return true;
  11499.  
  11500. -   rcu_read_lock();
  11501. -   sd = rcu_dereference(per_cpu(sd_busy, cpu));
  11502. -   if (sd && !energy_aware()) {
  11503. -       sgc = sd->groups->sgc;
  11504. -       nr_busy = atomic_read(&sgc->nr_busy_cpus);
  11505. +   /* Do idle load balance if there is a misfit task */
  11506. +   if (energy_aware())
  11507. +       return rq->misfit_task;
  11508.  
  11509. +   rcu_read_lock();
  11510. +   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
  11511. +   if (sds) {
  11512. +       /*
  11513. +        * XXX: write a coherent comment on why we do this.
  11514. +        * See also: http://lkml.kernel.org/r/[email protected]
  11515. +        */
  11516. +       nr_busy = atomic_read(&sds->nr_busy_cpus);
  11517.         if (nr_busy > 1) {
  11518.             kick = true;
  11519.             goto unlock;
  11520. @@ -8831,6 +9943,47 @@
  11521.     unthrottle_offline_cfs_rqs(rq);
  11522.  }
  11523.  
  11524. +static inline int
  11525. +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
  11526. +{
  11527. +   int rc = 0;
  11528. +
  11529. +   /* Invoke active balance to force migrate currently running task */
  11530. +   raw_spin_lock(&rq->lock);
  11531. +   if (!rq->active_balance) {
  11532. +       rq->active_balance = 1;
  11533. +       rq->push_cpu = new_cpu;
  11534. +       get_task_struct(p);
  11535. +       rq->push_task = p;
  11536. +       rc = 1;
  11537. +   }
  11538. +   raw_spin_unlock(&rq->lock);
  11539. +
  11540. +   return rc;
  11541. +}
  11542. +
  11543. +void check_for_migration(struct rq *rq, struct task_struct *p)
  11544. +{
  11545. +   int new_cpu;
  11546. +   int active_balance;
  11547. +   int cpu = task_cpu(p);
  11548. +
  11549. +   if (energy_aware() && rq->misfit_task) {
  11550. +       if (rq->curr->state != TASK_RUNNING ||
  11551. +           rq->curr->nr_cpus_allowed == 1)
  11552. +           return;
  11553. +
  11554. +       new_cpu = select_energy_cpu_brute(p, cpu, 0);
  11555. +       if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
  11556. +           active_balance = kick_active_balance(rq, p, new_cpu);
  11557. +           if (active_balance)
  11558. +               stop_one_cpu_nowait(cpu,
  11559. +                       active_load_balance_cpu_stop,
  11560. +                       rq, &rq->active_balance_work);
  11561. +       }
  11562. +   }
  11563. +}
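A toy model of the misfit up-migration decision added in check_for_migration() and kick_active_balance() above; the capacities, CPU ids and the simplified run-queue struct are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct toy_rq { bool active_balance; int push_cpu; };

/* Arm active balance once; the real kernel then queues
 * active_load_balance_cpu_stop() on the source CPU via stop_one_cpu_nowait(). */
static bool kick_active_balance(struct toy_rq *rq, int new_cpu)
{
	if (rq->active_balance)
		return false;		/* a push is already in flight */
	rq->active_balance = true;
	rq->push_cpu = new_cpu;
	return true;
}

int main(void)
{
	long capacity_orig[4] = { 430, 430, 1024, 1024 };	/* LITTLE, LITTLE, big, big */
	int cur_cpu = 0, best_cpu = 2;				/* e.g. result of the EAS picker */
	struct toy_rq rq = { 0 };

	/* Only push the running misfit task if the chosen CPU has more raw capacity. */
	if (capacity_orig[best_cpu] > capacity_orig[cur_cpu] &&
	    kick_active_balance(&rq, best_cpu))
		printf("push misfit task: cpu%d -> cpu%d\n", cur_cpu, rq.push_cpu);
	return 0;
}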
  11564. +
  11565.  #endif /* CONFIG_SMP */
  11566.  
  11567.  /*
  11568. @@ -8846,7 +9999,7 @@
  11569.         entity_tick(cfs_rq, se, queued);
  11570.     }
  11571.  
  11572. -   if (numabalancing_enabled)
  11573. +   if (static_branch_unlikely(&sched_numa_balancing))
  11574.         task_tick_numa(rq, curr);
  11575.  
  11576.  #ifdef CONFIG_SMP
  11577. @@ -8869,31 +10022,17 @@
  11578.  {
  11579.     struct cfs_rq *cfs_rq;
  11580.     struct sched_entity *se = &p->se, *curr;
  11581. -   int this_cpu = smp_processor_id();
  11582.     struct rq *rq = this_rq();
  11583. -   unsigned long flags;
  11584. -
  11585. -   raw_spin_lock_irqsave(&rq->lock, flags);
  11586.  
  11587. +   raw_spin_lock(&rq->lock);
  11588.     update_rq_clock(rq);
  11589.  
  11590.     cfs_rq = task_cfs_rq(current);
  11591.     curr = cfs_rq->curr;
  11592. -
  11593. -   /*
  11594. -    * Not only the cpu but also the task_group of the parent might have
  11595. -    * been changed after parent->se.parent,cfs_rq were copied to
  11596. -    * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
  11597. -    * of child point to valid ones.
  11598. -    */
  11599. -   rcu_read_lock();
  11600. -   __set_task_cpu(p, this_cpu);
  11601. -   rcu_read_unlock();
  11602. -
  11603. -   update_curr(cfs_rq);
  11604. -
  11605. -   if (curr)
  11606. +   if (curr) {
  11607. +       update_curr(cfs_rq);
  11608.         se->vruntime = curr->vruntime;
  11609. +   }
  11610.     place_entity(cfs_rq, se, 1);
  11611.  
  11612.     if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
  11613. @@ -8906,8 +10045,7 @@
  11614.     }
  11615.  
  11616.     se->vruntime -= cfs_rq->min_vruntime;
  11617. -
  11618. -   raw_spin_unlock_irqrestore(&rq->lock, flags);
  11619. +   raw_spin_unlock(&rq->lock);
  11620.  }
  11621.  
  11622.  /*
  11623. @@ -8959,6 +10097,61 @@
  11624.     return false;
  11625.  }
  11626.  
  11627. +#ifdef CONFIG_FAIR_GROUP_SCHED
  11628. +/*
  11629. + * Propagate the changes of the sched_entity across the tg tree to make them
  11630. + * visible to the root
  11631. + */
  11632. +static void propagate_entity_cfs_rq(struct sched_entity *se)
  11633. +{
  11634. +   struct cfs_rq *cfs_rq;
  11635. +
  11636. +   /* Start to propagate at parent */
  11637. +   se = se->parent;
  11638. +
  11639. +   for_each_sched_entity(se) {
  11640. +       cfs_rq = cfs_rq_of(se);
  11641. +
  11642. +       if (cfs_rq_throttled(cfs_rq))
  11643. +           break;
  11644. +
  11645. +       update_load_avg(se, UPDATE_TG);
  11646. +   }
  11647. +}
  11648. +#else
  11649. +static void propagate_entity_cfs_rq(struct sched_entity *se) { }
  11650. +#endif
  11651. +
  11652. +static void detach_entity_cfs_rq(struct sched_entity *se)
  11653. +{
  11654. +   struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11655. +
  11656. +   /* Catch up with the cfs_rq and remove our load when we leave */
  11657. +   update_load_avg(se, 0);
  11658. +   detach_entity_load_avg(cfs_rq, se);
  11659. +   update_tg_load_avg(cfs_rq, false);
  11660. +   propagate_entity_cfs_rq(se);
  11661. +}
  11662. +
  11663. +static void attach_entity_cfs_rq(struct sched_entity *se)
  11664. +{
  11665. +   struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11666. +
  11667. +#ifdef CONFIG_FAIR_GROUP_SCHED
  11668. +   /*
  11669. +    * Since the real-depth could have been changed (only FAIR
  11670. +    * class maintain depth value), reset depth properly.
  11671. +    */
  11672. +   se->depth = se->parent ? se->parent->depth + 1 : 0;
  11673. +#endif
  11674. +
  11675. +   /* Synchronize entity with its cfs_rq */
  11676. +   update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
  11677. +   attach_entity_load_avg(cfs_rq, se);
  11678. +   update_tg_load_avg(cfs_rq, false);
  11679. +   propagate_entity_cfs_rq(se);
  11680. +}
  11681. +
  11682.  static void detach_task_cfs_rq(struct task_struct *p)
  11683.  {
  11684.     struct sched_entity *se = &p->se;
  11685. @@ -8973,8 +10166,7 @@
  11686.         se->vruntime -= cfs_rq->min_vruntime;
  11687.     }
  11688.  
  11689. -   /* Catch up with the cfs_rq and remove our load when we leave */
  11690. -   detach_entity_load_avg(cfs_rq, se);
  11691. +   detach_entity_cfs_rq(se);
  11692.  }
  11693.  
  11694.  static void attach_task_cfs_rq(struct task_struct *p)
  11695. @@ -8982,16 +10174,7 @@
  11696.     struct sched_entity *se = &p->se;
  11697.     struct cfs_rq *cfs_rq = cfs_rq_of(se);
  11698.  
  11699. -#ifdef CONFIG_FAIR_GROUP_SCHED
  11700. -   /*
  11701. -    * Since the real-depth could have been changed (only FAIR
  11702. -    * class maintain depth value), reset depth properly.
  11703. -    */
  11704. -   se->depth = se->parent ? se->parent->depth + 1 : 0;
  11705. -#endif
  11706. -
  11707. -   /* Synchronize task with its cfs_rq */
  11708. -   attach_entity_load_avg(cfs_rq, se);
  11709. +   attach_entity_cfs_rq(se);
  11710.  
  11711.     if (!vruntime_normalized(p))
  11712.         se->vruntime += cfs_rq->min_vruntime;
  11713. @@ -9045,12 +10228,23 @@
  11714.     cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  11715.  #endif
  11716.  #ifdef CONFIG_SMP
  11717. +#ifdef CONFIG_FAIR_GROUP_SCHED
  11718. +   cfs_rq->propagate_avg = 0;
  11719. +#endif
  11720.     atomic_long_set(&cfs_rq->removed_load_avg, 0);
  11721.     atomic_long_set(&cfs_rq->removed_util_avg, 0);
  11722.  #endif
  11723.  }
  11724.  
  11725.  #ifdef CONFIG_FAIR_GROUP_SCHED
  11726. +static void task_set_group_fair(struct task_struct *p)
  11727. +{
  11728. +   struct sched_entity *se = &p->se;
  11729. +
  11730. +   set_task_rq(p, task_cpu(p));
  11731. +   se->depth = se->parent ? se->parent->depth + 1 : 0;
  11732. +}
  11733. +
  11734.  static void task_move_group_fair(struct task_struct *p)
  11735.  {
  11736.     detach_task_cfs_rq(p);
  11737. @@ -9063,6 +10257,19 @@
  11738.     attach_task_cfs_rq(p);
  11739.  }
  11740.  
  11741. +static void task_change_group_fair(struct task_struct *p, int type)
  11742. +{
  11743. +   switch (type) {
  11744. +   case TASK_SET_GROUP:
  11745. +       task_set_group_fair(p);
  11746. +       break;
  11747. +
  11748. +   case TASK_MOVE_GROUP:
  11749. +       task_move_group_fair(p);
  11750. +       break;
  11751. +   }
  11752. +}
  11753. +
  11754.  void free_fair_sched_group(struct task_group *tg)
  11755.  {
  11756.     int i;
  11757. @@ -9085,8 +10292,9 @@
  11758.  
  11759.  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  11760.  {
  11761. -   struct cfs_rq *cfs_rq;
  11762.     struct sched_entity *se;
  11763. +   struct cfs_rq *cfs_rq;
  11764. +   struct rq *rq;
  11765.     int i;
  11766.  
  11767.     tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
  11768. @@ -9101,6 +10309,8 @@
  11769.     init_cfs_bandwidth(tg_cfs_bandwidth(tg));
  11770.  
  11771.     for_each_possible_cpu(i) {
  11772. +       rq = cpu_rq(i);
  11773. +
  11774.         cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
  11775.                       GFP_KERNEL, cpu_to_node(i));
  11776.         if (!cfs_rq)
  11777. @@ -9114,6 +10324,11 @@
  11778.         init_cfs_rq(cfs_rq);
  11779.         init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
  11780.         init_entity_runnable_average(se);
  11781. +
  11782. +       raw_spin_lock_irq(&rq->lock);
  11783. +       post_init_entity_util_avg(se);
  11784. +       sync_throttle(tg, i);
  11785. +       raw_spin_unlock_irq(&rq->lock);
  11786.     }
  11787.  
  11788.     return 1;
  11789. @@ -9202,8 +10417,10 @@
  11790.  
  11791.         /* Possible calls to update_curr() need rq clock */
  11792.         update_rq_clock(rq);
  11793. -       for_each_sched_entity(se)
  11794. -           update_cfs_shares(group_cfs_rq(se));
  11795. +       for_each_sched_entity(se) {
  11796. +           update_load_avg(se, UPDATE_TG);
  11797. +           update_cfs_shares(se);
  11798. +       }
  11799.         raw_spin_unlock_irqrestore(&rq->lock, flags);
  11800.     }
  11801.  
  11802. @@ -9264,6 +10481,7 @@
  11803.  
  11804.     .task_waking        = task_waking_fair,
  11805.     .task_dead      = task_dead_fair,
  11806. +   .set_cpus_allowed   = set_cpus_allowed_common,
  11807.  #endif
  11808.  
  11809.     .set_curr_task          = set_curr_task_fair,
  11810. @@ -9279,7 +10497,7 @@
  11811.     .update_curr        = update_curr_fair,
  11812.  
  11813.  #ifdef CONFIG_FAIR_GROUP_SCHED
  11814. -   .task_move_group    = task_move_group_fair,
  11815. +   .task_change_group  = task_change_group_fair,
  11816.  #endif
  11817.  };
  11818.  
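
The fair.c hunks above replace the bare attach/detach of entity load with detach_entity_cfs_rq()/attach_entity_cfs_rq() helpers that also propagate the change up the task_group hierarchy, so a move is made visible all the way to the root cfs_rq. Below is a minimal standalone sketch of that bookkeeping idea only; the flat "apply one delta to every ancestor" scheme and all names are simplifications for illustration, not the kernel's PELT code.

#include <stdio.h>

/* Toy cfs_rq: just an aggregate load and a parent pointer. */
struct toy_cfs_rq {
    struct toy_cfs_rq *parent;
    long load_avg;
};

struct toy_entity {
    struct toy_cfs_rq *cfs_rq;   /* queue the entity currently lives on */
    long load_avg;
};

/* Walk from a queue to the root, applying the same delta everywhere,
 * so a change made at the leaf is visible at the root. */
static void propagate(struct toy_cfs_rq *cfs_rq, long delta)
{
    for (; cfs_rq; cfs_rq = cfs_rq->parent)
        cfs_rq->load_avg += delta;
}

static void detach_entity(struct toy_entity *se)
{
    propagate(se->cfs_rq, -se->load_avg);
    se->cfs_rq = NULL;
}

static void attach_entity(struct toy_entity *se, struct toy_cfs_rq *cfs_rq)
{
    se->cfs_rq = cfs_rq;
    propagate(cfs_rq, se->load_avg);
}

int main(void)
{
    struct toy_cfs_rq root = { .parent = NULL, .load_avg = 0 };
    struct toy_cfs_rq tg_a = { .parent = &root, .load_avg = 0 };
    struct toy_cfs_rq tg_b = { .parent = &root, .load_avg = 0 };
    struct toy_entity task = { .cfs_rq = NULL, .load_avg = 100 };

    attach_entity(&task, &tg_a);
    detach_entity(&task);            /* e.g. the task changes group */
    attach_entity(&task, &tg_b);

    printf("root=%ld tg_a=%ld tg_b=%ld\n",
           root.load_avg, tg_a.load_avg, tg_b.load_avg);
    return 0;
}

The invariant the kernel's propagate path protects is the same one the toy preserves: the root's aggregate stays equal to the sum of whatever is attached below it.
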
  11819. diff -Nur /home/ninez/android/marlin/kernel/sched/features.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/features.h
  11820. --- /home/ninez/android/marlin/kernel/sched/features.h  2018-08-10 01:54:08.566728454 -0400
  11821. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/features.h  2018-08-15 17:51:31.901600413 -0400
  11822. @@ -49,7 +49,7 @@
  11823.   * Queue remote wakeups on the target CPU and process them
  11824.   * using the scheduler IPI. Reduces rq->lock contention/bounces.
  11825.   */
  11826. -SCHED_FEAT(TTWU_QUEUE, true)
  11827. +SCHED_FEAT(TTWU_QUEUE, false)
  11828.  
  11829.  #ifdef HAVE_RT_PUSH_IPI
  11830.  /*
  11831. @@ -66,48 +66,39 @@
  11832.  
  11833.  SCHED_FEAT(FORCE_SD_OVERLAP, false)
  11834.  SCHED_FEAT(RT_RUNTIME_SHARE, true)
  11835. +SCHED_FEAT(RT_RUNTIME_GREED, false)
  11836.  SCHED_FEAT(LB_MIN, false)
  11837.  
  11838.  SCHED_FEAT(ATTACH_AGE_LOAD, true)
  11839.  
  11840.  /*
  11841. - * Apply the automatic NUMA scheduling policy. Enabled automatically
  11842. - * at runtime if running on a NUMA machine. Can be controlled via
  11843. - * numa_balancing=
  11844. - */
  11845. -#ifdef CONFIG_NUMA_BALANCING
  11846. -SCHED_FEAT(NUMA,   false)
  11847. -
  11848. -/*
  11849. - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
  11850. - * higher number of hinting faults are recorded during active load
  11851. - * balancing.
  11852. - */
  11853. -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  11854. -
  11855. -/*
  11856. - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
  11857. - * lower number of hinting faults have been recorded. As this has
  11858. - * the potential to prevent a task ever migrating to a new node
  11859. - * due to CPU overload it is disabled by default.
  11860. + * Energy aware scheduling. Use platform energy model to guide scheduling
  11861. + * decisions optimizing for energy efficiency.
  11862.   */
  11863. -SCHED_FEAT(NUMA_RESIST_LOWER, false)
  11864. -#endif
  11865. +SCHED_FEAT(ENERGY_AWARE, true)
  11866.  
  11867.  /*
  11868. - * Energy aware scheduling. Use platform energy model to guide scheduling
  11869. - * decisions optimizing for energy efficiency.
  11870. + * Minimum capacity capping. Keep track of minimum capacity factor when
  11871. + * minimum frequency available to a policy is modified.
  11872. + * If enabled, this can be used to inform the scheduler about capacity
  11873. + * restrictions.
  11874.   */
  11875. -SCHED_FEAT(ENERGY_AWARE, false)
  11876. +SCHED_FEAT(MIN_CAPACITY_CAPPING, false)
  11877.  
  11878.  /*
  11879. - * UtilEstimation. Use estimated CPU utiliation.
  11880. + * Enforce the priority of candidates selected by find_best_target()
  11881. + * ON: If the target CPU saves any energy, use that.
  11882. + * OFF: Use whichever of target or backup saves most.
  11883.   */
  11884. -SCHED_FEAT(UTIL_EST, false)
  11885. +SCHED_FEAT(FBT_STRICT_ORDER, false)
  11886.  
  11887.  /*
  11888. - * SchedTune. Use Performance/Energy filtering function to evaluate the
  11889. - * trade off between energy consumption and performance impact when comparing
  11890. - * previous and next candidate CPUs.
  11891. + * Apply schedtune boost hold to tasks of all sched classes.
  11892. + * If enabled, schedtune will hold the boost applied to a CPU
  11893. + * for 50ms regardless of task activation - if the task is
  11894. + * still running 50ms later, the boost hold expires and schedtune
  11895. + * boost will expire immediately once the task stops.
  11896. + * If disabled, this behaviour will only apply to tasks of the
  11897. + * RT class.
  11898.   */
  11899. -SCHED_FEAT(ENERGY_FILTER, true)
  11900. +SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false)
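
For context on how these SCHED_FEAT() lines are consumed: the kernel includes features.h several times with different definitions of the macro, once to build an enum of bit positions and once to build the default bitmask that sched_feat() tests. A standalone imitation of that X-macro pattern with the feature list written inline; the three names and defaults are just examples, and the real kernel's macro names differ.

#include <stdio.h>

/* Inline stand-in for a features.h-style list: one line per feature. */
#define TOY_FEATS(F)                \
    F(TTWU_QUEUE,        0)         \
    F(ENERGY_AWARE,      1)         \
    F(RT_RUNTIME_GREED,  0)

/* First expansion: an enum assigning each feature a bit index. */
#define MK_ENUM(name, on) FEAT_##name,
enum { TOY_FEATS(MK_ENUM) FEAT_NR };

/* Second expansion: the default bitmask of enabled features. */
#define MK_MASK(name, on) | ((on) ? 1u << FEAT_##name : 0u)
static unsigned int toy_features = 0u TOY_FEATS(MK_MASK);

/* Runtime test, analogous in spirit to sched_feat(NAME). */
#define toy_feat(name) (!!(toy_features & (1u << FEAT_##name)))

int main(void)
{
    printf("TTWU_QUEUE=%d ENERGY_AWARE=%d RT_RUNTIME_GREED=%d\n",
           toy_feat(TTWU_QUEUE), toy_feat(ENERGY_AWARE),
           toy_feat(RT_RUNTIME_GREED));

    toy_features |= 1u << FEAT_RT_RUNTIME_GREED;   /* flip one at runtime */
    printf("RT_RUNTIME_GREED now %d\n", toy_feat(RT_RUNTIME_GREED));
    return 0;
}

Flipping a bit at runtime is essentially what writing a feature name to /sys/kernel/debug/sched_features does on a CONFIG_SCHED_DEBUG kernel.
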
  11901. diff -Nur /home/ninez/android/marlin/kernel/sched/idle.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle.c
  11902. --- /home/ninez/android/marlin/kernel/sched/idle.c  2018-08-10 01:54:08.566728454 -0400
  11903. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle.c  2018-08-11 23:57:17.131940887 -0400
  11904. @@ -58,7 +58,8 @@
  11905.     rcu_idle_enter();
  11906.     trace_cpu_idle_rcuidle(0, smp_processor_id());
  11907.     local_irq_enable();
  11908. -   while (!tif_need_resched())
  11909. +   while (!tif_need_resched() &&
  11910. +       (cpu_idle_force_poll || tick_check_broadcast_expired()))
  11911.         cpu_relax();
  11912.     trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
  11913.     rcu_idle_exit();
  11914. @@ -208,6 +209,8 @@
  11915.     goto exit_idle;
  11916.  }
  11917.  
  11918. +DEFINE_PER_CPU(bool, cpu_dead_idle);
  11919. +
  11920.  /*
  11921.   * Generic idle loop implementation
  11922.   *
  11923. @@ -233,8 +236,13 @@
  11924.             check_pgt_cache();
  11925.             rmb();
  11926.  
  11927. -           if (cpu_is_offline(smp_processor_id()))
  11928. +           if (cpu_is_offline(smp_processor_id())) {
  11929. +               rcu_cpu_notify(NULL, CPU_DYING_IDLE,
  11930. +                          (void *)(long)smp_processor_id());
  11931. +               smp_mb(); /* all activity before dead. */
  11932. +               this_cpu_write(cpu_dead_idle, true);
  11933.                 arch_cpu_idle_dead();
  11934. +           }
  11935.  
  11936.             local_irq_disable();
  11937.             arch_cpu_idle_enter();
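
The hotplug change above publishes cpu_dead_idle only after smp_mb(), so every store the dying CPU made is visible to whoever later observes the flag. Below is a userspace C11 analogue of that publish-then-observe ordering, using release/acquire in place of the kernel's full barrier; the names are invented for the sketch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;                  /* "all activity before dead" */
static atomic_bool dead;             /* published last */

static void *dying_cpu(void *arg)
{
    (void)arg;
    payload = 42;                                    /* work done first */
    atomic_store_explicit(&dead, true,
                          memory_order_release);     /* then publish */
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, dying_cpu, NULL);

    /* Observer: once the flag is seen, the earlier write is too. */
    while (!atomic_load_explicit(&dead, memory_order_acquire))
        ;
    printf("payload=%d\n", payload);   /* guaranteed to print 42 */

    pthread_join(t, NULL);
    return 0;
}
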
  11938. diff -Nur /home/ninez/android/marlin/kernel/sched/idle_task.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle_task.c
  11939. --- /home/ninez/android/marlin/kernel/sched/idle_task.c 2018-08-10 01:54:08.566728454 -0400
  11940. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle_task.c 2018-08-26 16:43:11.650539699 -0400
  11941. @@ -9,7 +9,8 @@
  11942.  
  11943.  #ifdef CONFIG_SMP
  11944.  static int
  11945. -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
  11946. +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
  11947. +           int sibling_count_hint)
  11948.  {
  11949.     return task_cpu(p); /* IDLE tasks as never migrated */
  11950.  }
  11951. @@ -24,11 +25,16 @@
  11952.  }
  11953.  
  11954.  static struct task_struct *
  11955. -pick_next_task_idle(struct rq *rq, struct task_struct *prev)
  11956. +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  11957.  {
  11958. +   if (sched_feat(RT_RUNTIME_GREED))
  11959. +       if (try_to_unthrottle_rt_rq(&rq->rt))
  11960. +           return RETRY_TASK;
  11961. +
  11962.     put_prev_task(rq, prev);
  11963.  
  11964. -   schedstat_inc(rq, sched_goidle);
  11965. +   update_idle_core(rq);
  11966. +   schedstat_inc(rq->sched_goidle);
  11967.     return rq->idle;
  11968.  }
  11969.  
  11970. @@ -47,7 +53,6 @@
  11971.  
  11972.  static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
  11973.  {
  11974. -   idle_exit_fair(rq);
  11975.     rq_last_tick_reset(rq);
  11976.  }
  11977.  
  11978. @@ -96,6 +101,7 @@
  11979.  
  11980.  #ifdef CONFIG_SMP
  11981.     .select_task_rq     = select_task_rq_idle,
  11982. +   .set_cpus_allowed   = set_cpus_allowed_common,
  11983.  #endif
  11984.  
  11985.     .set_curr_task          = set_curr_task_idle,
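
With RT_RUNTIME_GREED, pick_next_task_idle() above can return RETRY_TASK after unthrottling the rq's RT queue, which makes the core pick loop start over so the newly runnable RT task is chosen instead of idle. A toy model of that retry protocol, with the scheduling classes reduced to an array of pick callbacks; the sentinel and every name here are made up for the sketch, not the kernel's types.

#include <stdio.h>

static const char retry_sentinel;            /* unique address used as RETRY */
#define RETRY (&retry_sentinel)

static int rt_throttled = 1;                 /* pretend the RT queue is throttled */

static const char *pick_rt(void)
{
    return rt_throttled ? NULL : "rt_task";
}

static const char *pick_idle(void)
{
    if (rt_throttled) {                      /* "greedy" unthrottle before idling */
        rt_throttled = 0;
        return RETRY;
    }
    return "idle_task";
}

static const char *(*classes[])(void) = { pick_rt, pick_idle };

static const char *pick_next_task(void)
{
again:
    for (unsigned i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
        const char *p = classes[i]();

        if (p == RETRY)
            goto again;                      /* a class changed the picture */
        if (p)
            return p;
    }
    return "idle_task";                      /* not reached here, kept for shape */
}

int main(void)
{
    printf("picked: %s\n", pick_next_task());   /* prints rt_task */
    return 0;
}
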
  11986. diff -Nur /home/ninez/android/marlin/kernel/sched/loadavg.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/loadavg.c
  11987. --- /home/ninez/android/marlin/kernel/sched/loadavg.c   2018-08-10 01:54:08.566728454 -0400
  11988. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/loadavg.c   2018-08-11 23:57:17.131940887 -0400
  11989. @@ -168,7 +168,7 @@
  11990.      * If the folding window started, make sure we start writing in the
  11991.      * next idle-delta.
  11992.      */
  11993. -   if (!time_before(jiffies, calc_load_update))
  11994. +   if (!time_before(jiffies, READ_ONCE(calc_load_update)))
  11995.         idx++;
  11996.  
  11997.     return idx & 1;
  11998. @@ -201,8 +201,9 @@
  11999.     struct rq *this_rq = this_rq();
  12000.  
  12001.     /*
  12002. -    * If we're still before the sample window, we're done.
  12003. +    * If we're still before the pending sample window, we're done.
  12004.      */
  12005. +   this_rq->calc_load_update = READ_ONCE(calc_load_update);
  12006.     if (time_before(jiffies, this_rq->calc_load_update))
  12007.         return;
  12008.  
  12009. @@ -211,7 +212,6 @@
  12010.      * accounted through the nohz accounting, so skip the entire deal and
  12011.      * sync up for the next window.
  12012.      */
  12013. -   this_rq->calc_load_update = calc_load_update;
  12014.     if (time_before(jiffies, this_rq->calc_load_update + 10))
  12015.         this_rq->calc_load_update += LOAD_FREQ;
  12016.  }
  12017. @@ -307,13 +307,15 @@
  12018.   */
  12019.  static void calc_global_nohz(void)
  12020.  {
  12021. +   unsigned long sample_window;
  12022.     long delta, active, n;
  12023.  
  12024. -   if (!time_before(jiffies, calc_load_update + 10)) {
  12025. +   sample_window = READ_ONCE(calc_load_update);
  12026. +   if (!time_before(jiffies, sample_window + 10)) {
  12027.         /*
  12028.          * Catch-up, fold however many we are behind still
  12029.          */
  12030. -       delta = jiffies - calc_load_update - 10;
  12031. +       delta = jiffies - sample_window - 10;
  12032.         n = 1 + (delta / LOAD_FREQ);
  12033.  
  12034.         active = atomic_long_read(&calc_load_tasks);
  12035. @@ -323,7 +325,7 @@
  12036.         avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
  12037.         avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
  12038.  
  12039. -       calc_load_update += n * LOAD_FREQ;
  12040. +       WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
  12041.     }
  12042.  
  12043.     /*
  12044. @@ -351,9 +353,11 @@
  12045.   */
  12046.  void calc_global_load(unsigned long ticks)
  12047.  {
  12048. +   unsigned long sample_window;
  12049.     long active, delta;
  12050.  
  12051. -   if (time_before(jiffies, calc_load_update + 10))
  12052. +   sample_window = READ_ONCE(calc_load_update);
  12053. +   if (time_before(jiffies, sample_window + 10))
  12054.         return;
  12055.  
  12056.     /*
  12057. @@ -370,7 +374,7 @@
  12058.     avenrun[1] = calc_load(avenrun[1], EXP_5, active);
  12059.     avenrun[2] = calc_load(avenrun[2], EXP_15, active);
  12060.  
  12061. -   calc_load_update += LOAD_FREQ;
  12062. +   WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
  12063.  
  12064.     /*
  12065.      * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
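
The loadavg.c hunks only change how the calc_load_update window is read and written (a local sample_window plus READ_ONCE/WRITE_ONCE), not the averaging itself: avenrun[] remains a fixed-point exponential moving average folded in every LOAD_FREQ. Below is a standalone rendering of that arithmetic using the conventional 11-bit fixed point and decay constants; treat the simulation in main() as illustrative numbers only.

#include <stdio.h>

#define FSHIFT   11                  /* bits of fixed-point precision */
#define FIXED_1  (1UL << FSHIFT)     /* 1.0 in fixed point */
#define EXP_1    1884                /* FIXED_1 / exp(5s/1min)  */
#define EXP_5    2014                /* FIXED_1 / exp(5s/5min)  */
#define EXP_15   2037                /* FIXED_1 / exp(5s/15min) */

/* One 5-second sample folded into the running average:
 * load = load * exp + active * (1 - exp), all in fixed point. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
    unsigned long newload = load * exp + active * (FIXED_1 - exp);

    if (active >= load)
        newload += FIXED_1 - 1;      /* round up while the load is rising */

    return newload / FIXED_1;
}

int main(void)
{
    /* Simulate 2 runnable tasks for a minute, then an idle minute. */
    unsigned long avenrun[3] = { 0, 0, 0 };
    unsigned long active = 2 * FIXED_1;

    for (int tick = 0; tick < 24; tick++) {     /* 24 x 5s = 2 minutes */
        if (tick == 12)
            active = 0;
        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
        avenrun[2] = calc_load(avenrun[2], EXP_15, active);
    }

    printf("loadavg %.2f %.2f %.2f\n",
           avenrun[0] / (double)FIXED_1,
           avenrun[1] / (double)FIXED_1,
           avenrun[2] / (double)FIXED_1);
    return 0;
}

The catch-up done in calc_global_nohz() after an idle stretch is the same formula applied n intervals at once rather than tick by tick.
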
  12066. diff -Nur /home/ninez/android/marlin/kernel/sched/Makefile /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/Makefile
  12067. --- /home/ninez/android/marlin/kernel/sched/Makefile    2018-08-10 01:54:08.563395055 -0400
  12068. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/Makefile    2018-08-21 23:16:53.820436609 -0400
  12069. @@ -2,15 +2,6 @@
  12070.  CFLAGS_REMOVE_clock.o = -pg
  12071.  endif
  12072.  
  12073. -# KASAN instrumentation is temporarily disabled for energy.o due to the repeated
  12074. -# reports that caused the kernel to not boot as seen in b/31800756. Should a fix
  12075. -# be provided, this line can be removed again. But given that KCOV is also disabled
  12076. -# for this module, it might be worth thinking about whether or not we should also
  12077. -# just turn off KASAN instrumentation entirely here.
  12078. -KASAN_SANITIZE_core.o := n
  12079. -KASAN_SANITIZE_energy.o := n
  12080. -KASAN_SANITIZE_fair.o := n
  12081. -
  12082.  # These files are disabled because they produce non-interesting flaky coverage
  12083.  # that is not a function of syscall inputs. E.g. involuntary context switches.
  12084.  KCOV_INSTRUMENT := n
  12085. @@ -26,7 +17,7 @@
  12086.  
  12087.  obj-y += core.o loadavg.o clock.o cputime.o
  12088.  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
  12089. -obj-y += wait.o completion.o idle.o
  12090. +obj-y += wait.o swait.o swork.o completion.o idle.o
  12091.  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
  12092.  obj-$(CONFIG_SCHED_WALT) += walt.o
  12093.  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
  12094. @@ -34,4 +25,7 @@
  12095.  obj-$(CONFIG_SCHED_DEBUG) += debug.o
  12096.  obj-$(CONFIG_SCHED_TUNE) += tune.o
  12097.  obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
  12098. +obj-$(CONFIG_CPU_FREQ) += cpufreq.o
  12099.  obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
  12100. +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
  12101. +obj-y += boost.o
  12102. diff -Nur /home/ninez/android/marlin/kernel/sched/rt.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/rt.c
  12103. --- /home/ninez/android/marlin/kernel/sched/rt.c    2018-08-10 01:54:08.566728454 -0400
  12104. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/rt.c    2018-08-26 16:43:11.650539699 -0400
  12105. @@ -8,10 +8,9 @@
  12106.  #include <linux/interrupt.h>
  12107.  #include <linux/slab.h>
  12108.  #include <linux/irq_work.h>
  12109. -#include <linux/hrtimer.h>
  12110.  
  12111.  #include "walt.h"
  12112. -#include "tune.h"
  12113. +#include "tune.h"
  12114.  
  12115.  int sched_rr_timeslice = RR_TIMESLICE;
  12116.  
  12117. @@ -69,11 +68,7 @@
  12118.     raw_spin_unlock(&rt_b->rt_runtime_lock);
  12119.  }
  12120.  
  12121. -#ifdef CONFIG_SMP
  12122. -static void push_irq_work_func(struct irq_work *work);
  12123. -#endif
  12124. -
  12125. -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
  12126. +void init_rt_rq(struct rt_rq *rt_rq)
  12127.  {
  12128.     struct rt_prio_array *array;
  12129.     int i;
  12130. @@ -92,13 +87,6 @@
  12131.     rt_rq->rt_nr_migratory = 0;
  12132.     rt_rq->overloaded = 0;
  12133.     plist_head_init(&rt_rq->pushable_tasks);
  12134. -
  12135. -#ifdef HAVE_RT_PUSH_IPI
  12136. -   rt_rq->push_flags = 0;
  12137. -   rt_rq->push_cpu = nr_cpu_ids;
  12138. -   raw_spin_lock_init(&rt_rq->push_lock);
  12139. -   init_irq_work(&rt_rq->push_work, push_irq_work_func);
  12140. -#endif
  12141.  #endif /* CONFIG_SMP */
  12142.     /* We start is dequeued state, because no RT tasks are queued */
  12143.     rt_rq->rt_queued = 0;
  12144. @@ -214,7 +202,7 @@
  12145.         if (!rt_se)
  12146.             goto err_free_rq;
  12147.  
  12148. -       init_rt_rq(rt_rq, cpu_rq(i));
  12149. +       init_rt_rq(rt_rq);
  12150.         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
  12151.         init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
  12152.     }
  12153. @@ -331,7 +319,7 @@
  12154.     rt_rq = &rq_of_rt_rq(rt_rq)->rt;
  12155.  
  12156.     rt_rq->rt_nr_total++;
  12157. -   if (p->nr_cpus_allowed > 1)
  12158. +   if (tsk_nr_cpus_allowed(p) > 1)
  12159.         rt_rq->rt_nr_migratory++;
  12160.  
  12161.     update_rt_migration(rt_rq);
  12162. @@ -348,7 +336,7 @@
  12163.     rt_rq = &rq_of_rt_rq(rt_rq)->rt;
  12164.  
  12165.     rt_rq->rt_nr_total--;
  12166. -   if (p->nr_cpus_allowed > 1)
  12167. +   if (tsk_nr_cpus_allowed(p) > 1)
  12168.         rt_rq->rt_nr_migratory--;
  12169.  
  12170.     update_rt_migration(rt_rq);
  12171. @@ -370,14 +358,12 @@
  12172.     if (!has_pushable_tasks(rq))
  12173.         return;
  12174.  
  12175. -   queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu),
  12176. -       push_rt_tasks);
  12177. +   queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
  12178.  }
  12179.  
  12180.  static inline void queue_pull_task(struct rq *rq)
  12181.  {
  12182. -   queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu),
  12183. -       pull_rt_task);
  12184. +   queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
  12185.  }
  12186.  
  12187.  static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
  12188. @@ -443,7 +429,7 @@
  12189.  
  12190.  static inline int on_rt_rq(struct sched_rt_entity *rt_se)
  12191.  {
  12192. -   return !list_empty(&rt_se->run_list);
  12193. +   return rt_se->on_rq;
  12194.  }
  12195.  
  12196.  #ifdef CONFIG_RT_GROUP_SCHED
  12197. @@ -489,8 +475,8 @@
  12198.     return rt_se->my_q;
  12199.  }
  12200.  
  12201. -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
  12202. -static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
  12203. +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
  12204. +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
  12205.  
  12206.  static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
  12207.  {
  12208. @@ -506,7 +492,7 @@
  12209.         if (!rt_se)
  12210.             enqueue_top_rt_rq(rt_rq);
  12211.         else if (!on_rt_rq(rt_se))
  12212. -           enqueue_rt_entity(rt_se, false);
  12213. +           enqueue_rt_entity(rt_se, 0);
  12214.  
  12215.         if (rt_rq->highest_prio.curr < curr->prio)
  12216.             resched_curr(rq);
  12217. @@ -523,7 +509,7 @@
  12218.     if (!rt_se)
  12219.         dequeue_top_rt_rq(rt_rq);
  12220.     else if (on_rt_rq(rt_se))
  12221. -       dequeue_rt_entity(rt_se);
  12222. +       dequeue_rt_entity(rt_se, 0);
  12223.  }
  12224.  
  12225.  static inline int rt_rq_throttled(struct rt_rq *rt_rq)
  12226. @@ -630,6 +616,22 @@
  12227.  
  12228.  #endif /* CONFIG_RT_GROUP_SCHED */
  12229.  
  12230. +static inline void unthrottle_rt_rq(struct rt_rq *rt_rq)
  12231. +{
  12232. +   rt_rq->rt_time = 0;
  12233. +   rt_rq->rt_throttled = 0;
  12234. +   sched_rt_rq_enqueue(rt_rq);
  12235. +}
  12236. +
  12237. +int try_to_unthrottle_rt_rq(struct rt_rq *rt_rq)
  12238. +{
  12239. +   if (rt_rq_throttled(rt_rq)) {
  12240. +       unthrottle_rt_rq(rt_rq);
  12241. +       return 1;
  12242. +   }
  12243. +   return 0;
  12244. +}
  12245. +
  12246.  bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
  12247.  {
  12248.     struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  12249. @@ -642,11 +644,11 @@
  12250.  /*
  12251.   * We ran out of runtime, see if we can borrow some from our neighbours.
  12252.   */
  12253. -static int do_balance_runtime(struct rt_rq *rt_rq)
  12254. +static void do_balance_runtime(struct rt_rq *rt_rq)
  12255.  {
  12256.     struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  12257.     struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
  12258. -   int i, weight, more = 0;
  12259. +   int i, weight;
  12260.     u64 rt_period;
  12261.  
  12262.     weight = cpumask_weight(rd->span);
  12263. @@ -680,7 +682,6 @@
  12264.                 diff = rt_period - rt_rq->rt_runtime;
  12265.             iter->rt_runtime -= diff;
  12266.             rt_rq->rt_runtime += diff;
  12267. -           more = 1;
  12268.             if (rt_rq->rt_runtime == rt_period) {
  12269.                 raw_spin_unlock(&iter->rt_runtime_lock);
  12270.                 break;
  12271. @@ -690,8 +691,6 @@
  12272.         raw_spin_unlock(&iter->rt_runtime_lock);
  12273.     }
  12274.     raw_spin_unlock(&rt_b->rt_runtime_lock);
  12275. -
  12276. -   return more;
  12277.  }
  12278.  
  12279.  /*
  12280. @@ -803,26 +802,19 @@
  12281.     }
  12282.  }
  12283.  
  12284. -static int balance_runtime(struct rt_rq *rt_rq)
  12285. +static void balance_runtime(struct rt_rq *rt_rq)
  12286.  {
  12287. -   int more = 0;
  12288. -
  12289.     if (!sched_feat(RT_RUNTIME_SHARE))
  12290. -       return more;
  12291. +       return;
  12292.  
  12293.     if (rt_rq->rt_time > rt_rq->rt_runtime) {
  12294.         raw_spin_unlock(&rt_rq->rt_runtime_lock);
  12295. -       more = do_balance_runtime(rt_rq);
  12296. +       do_balance_runtime(rt_rq);
  12297.         raw_spin_lock(&rt_rq->rt_runtime_lock);
  12298.     }
  12299. -
  12300. -   return more;
  12301.  }
  12302.  #else /* !CONFIG_SMP */
  12303. -static inline int balance_runtime(struct rt_rq *rt_rq)
  12304. -{
  12305. -   return 0;
  12306. -}
  12307. +static inline void balance_runtime(struct rt_rq *rt_rq) {}
  12308.  #endif /* CONFIG_SMP */
  12309.  
  12310.  static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
  12311. @@ -848,6 +840,17 @@
  12312.         int enqueue = 0;
  12313.         struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
  12314.         struct rq *rq = rq_of_rt_rq(rt_rq);
  12315. +       int skip;
  12316. +
  12317. +       /*
  12318. +        * When span == cpu_online_mask, taking each rq->lock
  12319. +        * can be time-consuming. Try to avoid it when possible.
  12320. +        */
  12321. +       raw_spin_lock(&rt_rq->rt_runtime_lock);
  12322. +       skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
  12323. +       raw_spin_unlock(&rt_rq->rt_runtime_lock);
  12324. +       if (skip)
  12325. +           continue;
  12326.  
  12327.         raw_spin_lock(&rq->lock);
  12328.         update_rq_clock(rq);
  12329. @@ -865,11 +868,14 @@
  12330.                 enqueue = 1;
  12331.  
  12332.                 /*
  12333. -                * Force a clock update if the CPU was idle,
  12334. -                * lest wakeup -> unthrottle time accumulate.
  12335. +                * When we're idle and a woken (rt) task is
  12336. +                * throttled check_preempt_curr() will set
  12337. +                * skip_update and the time between the wakeup
  12338. +                * and this unthrottle will get accounted as
  12339. +                * 'runtime'.
  12340.                  */
  12341.                 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
  12342. -                   rq->skip_clock_update = -1;
  12343. +                   rq_clock_skip_update(rq, false);
  12344.             }
  12345.             if (rt_rq->rt_time || rt_rq->rt_nr_running)
  12346.                 idle = 0;
  12347. @@ -973,8 +979,22 @@
  12348.          * but accrue some time due to boosting.
  12349.          */
  12350.         if (likely(rt_b->rt_runtime)) {
  12351. +
  12352.             static bool once = false;
  12353.  
  12354. +           if (sched_feat(RT_RUNTIME_GREED)) {
  12355. +               struct rq *rq = rq_of_rt_rq(rt_rq);
  12356. +               /*
  12357. +                * If there are no other tasks able to run
  12358. +                * on this rq, let's be greedy and reset our
  12359. +                * rt_time.
  12360. +                */
  12361. +               if (rq->nr_running == rt_rq->rt_nr_running) {
  12362. +                   rt_rq->rt_time = 0;
  12363. +                   return 0;
  12364. +               }
  12365. +           }
  12366. +
  12367.             rt_rq->rt_throttled = 1;
  12368.  
  12369.             if (!once) {
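
The RT_RUNTIME_GREED check above only skips throttling when nothing but RT tasks is runnable on the rq (rq->nr_running == rt_rq->rt_nr_running); otherwise the queue is throttled as before, and try_to_unthrottle_rt_rq() undoes the throttle from the idle pick path. A compact standalone model of that decision follows; the field names mirror the ones used above, but the budget numbers and the simulation are invented for illustration.

#include <stdio.h>

struct toy_rt_rq {
    unsigned long long rt_time;     /* runtime consumed this period */
    unsigned long long rt_runtime;  /* budget per period */
    int rt_nr_running;
    int rt_throttled;
};

struct toy_rq {
    int nr_running;                 /* runnable tasks of all classes */
    struct toy_rt_rq rt;
};

/* Returns 1 if the RT queue must be throttled now. */
static int rt_runtime_exceeded(struct toy_rq *rq, int greedy)
{
    struct toy_rt_rq *rt_rq = &rq->rt;

    if (rt_rq->rt_time <= rt_rq->rt_runtime)
        return 0;

    /* Greedy mode: if only RT tasks are runnable, throttling would
     * just idle the CPU, so reset the accounting instead. */
    if (greedy && rq->nr_running == rt_rq->rt_nr_running) {
        rt_rq->rt_time = 0;
        return 0;
    }

    rt_rq->rt_throttled = 1;
    return 1;
}

int main(void)
{
    struct toy_rq rq = {
        .nr_running = 2,
        .rt = { .rt_time = 960, .rt_runtime = 950, .rt_nr_running = 2 },
    };

    printf("greedy, RT-only rq : throttled=%d\n",
           rt_runtime_exceeded(&rq, 1));   /* 0: budget reset instead */

    rq.nr_running = 3;                     /* a CFS task shows up */
    rq.rt.rt_time = 960;
    printf("greedy, mixed rq   : throttled=%d\n",
           rt_runtime_exceeded(&rq, 1));   /* 1: must yield the CPU */
    return 0;
}
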
  12370. @@ -999,73 +1019,6 @@
  12371.     return 0;
  12372.  }
  12373.  
  12374. -/* TODO: Make configurable */
  12375. -#define RT_SCHEDTUNE_INTERVAL 50000000ULL
  12376. -
  12377. -static void sched_rt_update_capacity_req(struct rq *rq, bool tick);
  12378. -
  12379. -static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer)
  12380. -{
  12381. -   struct sched_rt_entity *rt_se = container_of(timer,
  12382. -                            struct sched_rt_entity,
  12383. -                            schedtune_timer);
  12384. -   struct task_struct *p = rt_task_of(rt_se);
  12385. -   struct rq *rq = task_rq(p);
  12386. -
  12387. -   raw_spin_lock(&rq->lock);
  12388. -
  12389. -   /*
  12390. -    * Nothing to do if:
  12391. -    * - task has switched runqueues
  12392. -    * - task isn't RT anymore
  12393. -    */
  12394. -   if (rq != task_rq(p) || (p->sched_class != &rt_sched_class))
  12395. -       goto out;
  12396. -
  12397. -   /*
  12398. -    * If task got enqueued back during callback time, it means we raced
  12399. -    * with the enqueue on another cpu, that's Ok, just do nothing as
  12400. -    * enqueue path would have tried to cancel us and we shouldn't run
  12401. -    * Also check the schedtune_enqueued flag as class-switch on a
  12402. -    * sleeping task may have already canceled the timer and done dq
  12403. -    */
  12404. -   if (p->on_rq || rt_se->schedtune_enqueued == false)
  12405. -       goto out;
  12406. -
  12407. -   /*
  12408. -    * RT task is no longer active, cancel boost
  12409. -    */
  12410. -   rt_se->schedtune_enqueued = false;
  12411. -   schedtune_dequeue_task(p, cpu_of(rq));
  12412. -   sched_rt_update_capacity_req(rq, false);
  12413. -out:
  12414. -   raw_spin_unlock(&rq->lock);
  12415. -
  12416. -   /*
  12417. -    * This can free the task_struct if no more references.
  12418. -    */
  12419. -   put_task_struct(p);
  12420. -
  12421. -   return HRTIMER_NORESTART;
  12422. -}
  12423. -
  12424. -void init_rt_schedtune_timer(struct sched_rt_entity *rt_se)
  12425. -{
  12426. -   struct hrtimer *timer = &rt_se->schedtune_timer;
  12427. -
  12428. -   hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  12429. -   timer->function = rt_schedtune_timer;
  12430. -   rt_se->schedtune_enqueued = false;
  12431. -}
  12432. -
  12433. -static void start_schedtune_timer(struct sched_rt_entity *rt_se)
  12434. -{
  12435. -   struct hrtimer *timer = &rt_se->schedtune_timer;
  12436. -
  12437. -   hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL),
  12438. -             HRTIMER_MODE_REL_PINNED);
  12439. -}
  12440. -
  12441.  /*
  12442.   * Update the current task's runtime statistics. Skip current tasks that
  12443.   * are not in our scheduling class.
  12444. @@ -1083,6 +1036,9 @@
  12445.     if (unlikely((s64)delta_exec <= 0))
  12446.         return;
  12447.  
  12448. +   /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
  12449. +   cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
  12450. +
  12451.     schedstat_set(curr->se.statistics.exec_max,
  12452.               max(curr->se.statistics.exec_max, delta_exec));
  12453.  
  12454. @@ -1276,12 +1232,27 @@
  12455.  }
  12456.  
  12457.  static inline
  12458. +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
  12459. +{
  12460. +   struct rt_rq *group_rq = group_rt_rq(rt_se);
  12461. +   struct task_struct *tsk;
  12462. +
  12463. +   if (group_rq)
  12464. +       return group_rq->rr_nr_running;
  12465. +
  12466. +   tsk = rt_task_of(rt_se);
  12467. +
  12468. +   return (tsk->policy == SCHED_RR) ? 1 : 0;
  12469. +}
  12470. +
  12471. +static inline
  12472.  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  12473.  {
  12474.     int prio = rt_se_prio(rt_se);
  12475.  
  12476.     WARN_ON(!rt_prio(prio));
  12477.     rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
  12478. +   rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
  12479.  
  12480.     inc_rt_prio(rt_rq, prio);
  12481.     inc_rt_migration(rt_se, rt_rq);
  12482. @@ -1294,13 +1265,37 @@
  12483.     WARN_ON(!rt_prio(rt_se_prio(rt_se)));
  12484.     WARN_ON(!rt_rq->rt_nr_running);
  12485.     rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
  12486. +   rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
  12487.  
  12488.     dec_rt_prio(rt_rq, rt_se_prio(rt_se));
  12489.     dec_rt_migration(rt_se, rt_rq);
  12490.     dec_rt_group(rt_se, rt_rq);
  12491.  }
  12492.  
  12493. -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
  12494. +/*
  12495. + * Change rt_se->run_list location unless SAVE && !MOVE
  12496. + *
  12497. + * assumes ENQUEUE/DEQUEUE flags match
  12498. + */
  12499. +static inline bool move_entity(unsigned int flags)
  12500. +{
  12501. +   if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
  12502. +       return false;
  12503. +
  12504. +   return true;
  12505. +}
  12506. +
  12507. +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
  12508. +{
  12509. +   list_del_init(&rt_se->run_list);
  12510. +
  12511. +   if (list_empty(array->queue + rt_se_prio(rt_se)))
  12512. +       __clear_bit(rt_se_prio(rt_se), array->bitmap);
  12513. +
  12514. +   rt_se->on_list = 0;
  12515. +}
  12516. +
  12517. +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  12518.  {
  12519.     struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  12520.     struct rt_prio_array *array = &rt_rq->active;
  12521. @@ -1313,26 +1308,37 @@
  12522.      * get throttled and the current group doesn't have any other
  12523.      * active members.
  12524.      */
  12525. -   if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
  12526. +   if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
  12527. +       if (rt_se->on_list)
  12528. +           __delist_rt_entity(rt_se, array);
  12529.         return;
  12530. +   }
  12531.  
  12532. -   if (head)
  12533. -       list_add(&rt_se->run_list, queue);
  12534. -   else
  12535. -       list_add_tail(&rt_se->run_list, queue);
  12536. -   __set_bit(rt_se_prio(rt_se), array->bitmap);
  12537. +   if (move_entity(flags)) {
  12538. +       WARN_ON_ONCE(rt_se->on_list);
  12539. +       if (flags & ENQUEUE_HEAD)
  12540. +           list_add(&rt_se->run_list, queue);
  12541. +       else
  12542. +           list_add_tail(&rt_se->run_list, queue);
  12543. +
  12544. +       __set_bit(rt_se_prio(rt_se), array->bitmap);
  12545. +       rt_se->on_list = 1;
  12546. +   }
  12547. +   rt_se->on_rq = 1;
  12548.  
  12549.     inc_rt_tasks(rt_se, rt_rq);
  12550.  }
  12551.  
  12552. -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
  12553. +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  12554.  {
  12555.     struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  12556.     struct rt_prio_array *array = &rt_rq->active;
  12557.  
  12558. -   list_del_init(&rt_se->run_list);
  12559. -   if (list_empty(array->queue + rt_se_prio(rt_se)))
  12560. -       __clear_bit(rt_se_prio(rt_se), array->bitmap);
  12561. +   if (move_entity(flags)) {
  12562. +       WARN_ON_ONCE(!rt_se->on_list);
  12563. +       __delist_rt_entity(rt_se, array);
  12564. +   }
  12565. +   rt_se->on_rq = 0;
  12566.  
  12567.     dec_rt_tasks(rt_se, rt_rq);
  12568.  }
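
The switch from a bool head to ENQUEUE/DEQUEUE flag bits above hinges on move_entity(): an entity keeps its place in the priority list only for DEQUEUE_SAVE without DEQUEUE_MOVE, with on_list and on_rq now tracked separately. A tiny self-contained check of that flag rule; the bit values below are assumptions chosen for the sketch, only the relationship between them matters.

#include <assert.h>
#include <stdio.h>

/* Distinct bits standing in for the enqueue/dequeue flags. */
#define DEQUEUE_SLEEP  0x01
#define DEQUEUE_SAVE   0x02
#define DEQUEUE_MOVE   0x04

/* Same rule as move_entity() above: keep the run_list position
 * only when we are saving the entity without moving it. */
static int move_entity(unsigned int flags)
{
    return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
}

int main(void)
{
    assert(move_entity(0));                              /* plain dequeue  */
    assert(move_entity(DEQUEUE_SLEEP));                  /* going to sleep */
    assert(!move_entity(DEQUEUE_SAVE));                  /* save in place  */
    assert(move_entity(DEQUEUE_SAVE | DEQUEUE_MOVE));    /* save and move  */
    printf("move_entity() behaves as expected\n");
    return 0;
}
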
  12569. @@ -1341,7 +1347,7 @@
  12570.   * Because the prio of an upper entry depends on the lower
  12571.   * entries, we must remove entries top - down.
  12572.   */
  12573. -static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
  12574. +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
  12575.  {
  12576.     struct sched_rt_entity *back = NULL;
  12577.  
  12578. @@ -1354,35 +1360,64 @@
  12579.  
  12580.     for (rt_se = back; rt_se; rt_se = rt_se->back) {
  12581.         if (on_rt_rq(rt_se))
  12582. -           __dequeue_rt_entity(rt_se);
  12583. +           __dequeue_rt_entity(rt_se, flags);
  12584.     }
  12585.  }
  12586.  
  12587. -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
  12588. +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  12589.  {
  12590.     struct rq *rq = rq_of_rt_se(rt_se);
  12591.  
  12592. -   dequeue_rt_stack(rt_se);
  12593. +   dequeue_rt_stack(rt_se, flags);
  12594.     for_each_sched_rt_entity(rt_se)
  12595. -       __enqueue_rt_entity(rt_se, head);
  12596. +       __enqueue_rt_entity(rt_se, flags);
  12597.     enqueue_top_rt_rq(&rq->rt);
  12598.  }
  12599.  
  12600. -static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  12601. +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
  12602.  {
  12603.     struct rq *rq = rq_of_rt_se(rt_se);
  12604.  
  12605. -   dequeue_rt_stack(rt_se);
  12606. +   dequeue_rt_stack(rt_se, flags);
  12607.  
  12608.     for_each_sched_rt_entity(rt_se) {
  12609.         struct rt_rq *rt_rq = group_rt_rq(rt_se);
  12610.  
  12611.         if (rt_rq && rt_rq->rt_nr_running)
  12612. -           __enqueue_rt_entity(rt_se, false);
  12613. +           __enqueue_rt_entity(rt_se, flags);
  12614.     }
  12615.     enqueue_top_rt_rq(&rq->rt);
  12616.  }
  12617.  
  12618. +static void sched_rt_update_capacity_req(struct rq *rq)
  12619. +{
  12620. +   u64 total, used, age_stamp, avg;
  12621. +   s64 delta;
  12622. +
  12623. +   if (!sched_freq())
  12624. +       return;
  12625. +
  12626. +   sched_avg_update(rq);
  12627. +   /*
  12628. +    * Since we're reading these variables without serialization make sure
  12629. +    * we read them once before doing sanity checks on them.
  12630. +    */
  12631. +   age_stamp = READ_ONCE(rq->age_stamp);
  12632. +   avg = READ_ONCE(rq->rt_avg);
  12633. +   delta = rq_clock(rq) - age_stamp;
  12634. +
  12635. +   if (unlikely(delta < 0))
  12636. +       delta = 0;
  12637. +
  12638. +   total = sched_avg_period() + delta;
  12639. +
  12640. +   used = div_u64(avg, total);
  12641. +   if (unlikely(used > SCHED_CAPACITY_SCALE))
  12642. +       used = SCHED_CAPACITY_SCALE;
  12643. +
  12644. +   set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
  12645. +}
  12646. +
  12647.  /*
  12648.   * Adding/removing a task to/from a priority array:
  12649.   */
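
Stripped of locking, the relocated sched_rt_update_capacity_req() above computes: RT utilization = rt_avg / (averaging period + time since age_stamp), clamped to SCHED_CAPACITY_SCALE, and hands that to the frequency hook. The arithmetic in isolation, with made-up numbers to show the scaling; rt_avg is assumed here to be RT runtime already scaled into capacity units, as the rq accounting implies.

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* rt_avg accumulates runtime scaled into capacity units, so dividing
 * by the window length yields a 0..1024 utilization figure. */
static unsigned long rt_capacity_request(uint64_t rt_avg_scaled,
                                         uint64_t period_ns,
                                         int64_t delta_ns)
{
    uint64_t total, used;

    if (delta_ns < 0)                    /* clock raced ahead of age_stamp */
        delta_ns = 0;

    total = period_ns + (uint64_t)delta_ns;
    used = rt_avg_scaled / total;

    if (used > SCHED_CAPACITY_SCALE)
        used = SCHED_CAPACITY_SCALE;

    return (unsigned long)used;
}

int main(void)
{
    /* 250ms of RT runtime (scaled) over a 1s window -> ~1/4 capacity. */
    uint64_t period = 1000000000ULL;                     /* 1s in ns */
    uint64_t rt_avg = 250000000ULL * SCHED_CAPACITY_SCALE;

    printf("request = %lu / %lu\n",
           rt_capacity_request(rt_avg, period, 0), SCHED_CAPACITY_SCALE);
    return 0;
}
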
  12650. @@ -1391,65 +1426,37 @@
  12651.  {
  12652.     struct sched_rt_entity *rt_se = &p->rt;
  12653.  
  12654. +#ifdef CONFIG_SMP
  12655. +   schedtune_enqueue_task(p, cpu_of(rq));
  12656. +#endif
  12657. +
  12658.     if (flags & ENQUEUE_WAKEUP)
  12659.         rt_se->timeout = 0;
  12660.  
  12661. -   enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
  12662. +   enqueue_rt_entity(rt_se, flags);
  12663.     walt_inc_cumulative_runnable_avg(rq, p);
  12664.  
  12665. -   if (!task_current(rq, p) && p->nr_cpus_allowed > 1) {
  12666. +   if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
  12667.         enqueue_pushable_task(rq, p);
  12668. -   }
  12669. -   if (!schedtune_task_boost(p))
  12670. -       return;
  12671.  
  12672. -   /*
  12673. -    * If schedtune timer is active, that means a boost was already
  12674. -    * done, just cancel the timer so that deboost doesn't happen.
  12675. -    * Otherwise, increase the boost. If an enqueued timer was
  12676. -    * cancelled, put the task reference.
  12677. -    */
  12678. -   if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
  12679. -       put_task_struct(p);
  12680. -
  12681. -   /*
  12682. -    * schedtune_enqueued can be true in the following situation:
  12683. -    * enqueue_task_rt grabs rq lock before timer fires
  12684. -    *    or before its callback acquires rq lock
  12685. -    * schedtune_enqueued can be false if timer callback is running
  12686. -    * and timer just released rq lock, or if the timer finished
  12687. -    * running and canceling the boost
  12688. -    */
  12689. -   if (rt_se->schedtune_enqueued == true)
  12690. -       return;
  12691. -
  12692. -   rt_se->schedtune_enqueued = true;
  12693. -   schedtune_enqueue_task(p, cpu_of(rq));
  12694. -   sched_rt_update_capacity_req(rq, false);
  12695. +   sched_rt_update_capacity_req(rq);
  12696.  }
  12697.  
  12698.  static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
  12699.  {
  12700.     struct sched_rt_entity *rt_se = &p->rt;
  12701.  
  12702. +#ifdef CONFIG_SMP
  12703. +   schedtune_dequeue_task(p, cpu_of(rq));
  12704. +#endif
  12705. +
  12706.     update_curr_rt(rq);
  12707. -   dequeue_rt_entity(rt_se);
  12708. +   dequeue_rt_entity(rt_se, flags);
  12709.     walt_dec_cumulative_runnable_avg(rq, p);
  12710.  
  12711.     dequeue_pushable_task(rq, p);
  12712.  
  12713. -   if (rt_se->schedtune_enqueued == false)
  12714. -       return;
  12715. -
  12716. -   if (flags == DEQUEUE_SLEEP) {
  12717. -       get_task_struct(p);
  12718. -       start_schedtune_timer(rt_se);
  12719. -       return;
  12720. -   }
  12721. -
  12722. -   rt_se->schedtune_enqueued = false;
  12723. -   schedtune_dequeue_task(p, cpu_of(rq));
  12724. -   sched_rt_update_capacity_req(rq, false);
  12725. +   sched_rt_update_capacity_req(rq);
  12726.  }
  12727.  
  12728.  /*
  12729. @@ -1499,20 +1506,6 @@
  12730.     return !!((pc & SOFTIRQ_MASK)>= SOFTIRQ_DISABLE_OFFSET);
  12731.  }
  12732.  
  12733. -static bool is_top_app_cpu(int cpu)
  12734. -{
  12735. -   bool boosted = (schedtune_cpu_boost(cpu) > 0);
  12736. -
  12737. -   return boosted;
  12738. -}
  12739. -
  12740. -static bool is_top_app(struct task_struct *cur)
  12741. -{
  12742. -   bool boosted = (schedtune_task_boost(cur) > 0);
  12743. -
  12744. -   return boosted;
  12745. -}
  12746. -
  12747.  /*
  12748.   * Return whether the task on the given cpu is currently non-preemptible
  12749.   * while handling a potentially long softint, or if the task is likely
  12750. @@ -1527,14 +1520,8 @@
  12751.     struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
  12752.     int task_pc = 0;
  12753.  
  12754. -   if (task) {
  12755. -       if (is_top_app(task))
  12756. -           return true;
  12757. +   if (task)
  12758.         task_pc = task_preempt_count(task);
  12759. -   }
  12760. -
  12761. -   if (is_top_app_cpu(cpu))
  12762. -       return true;
  12763.  
  12764.     if (softirq_masked(task_pc))
  12765.         return true;
  12766. @@ -1544,37 +1531,12 @@
  12767.          task_pc & SOFTIRQ_MASK));
  12768.  }
  12769.  
  12770. -static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p)
  12771. -{
  12772. -   struct sched_rt_entity *rt_se = &p->rt;
  12773. -
  12774. -   BUG_ON(!raw_spin_is_locked(&rq->lock));
  12775. -
  12776. -   if (rt_se->schedtune_enqueued == false)
  12777. -       return;
  12778. -
  12779. -   /*
  12780. -    * Incase of class change cancel any active timers. Otherwise, increase
  12781. -    * the boost. If an enqueued timer was cancelled, put the task ref.
  12782. -    */
  12783. -   if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
  12784. -       put_task_struct(p);
  12785. -
  12786. -   /* schedtune_enqueued is true, deboost it */
  12787. -   rt_se->schedtune_enqueued = false;
  12788. -   schedtune_dequeue_task(p, task_cpu(p));
  12789. -   sched_rt_update_capacity_req(rq, false);
  12790. -}
  12791. -
  12792.  static int
  12793. -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
  12794. +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
  12795. +         int sibling_count_hint)
  12796.  {
  12797.     struct task_struct *curr;
  12798.     struct rq *rq;
  12799. -   bool may_not_preempt;
  12800. -
  12801. -   if (p->nr_cpus_allowed == 1)
  12802. -       goto out;
  12803.  
  12804.     /* For anything but wake ups, just return the task_cpu */
  12805.     if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
  12806. @@ -1586,12 +1548,7 @@
  12807.     curr = READ_ONCE(rq->curr); /* unlocked access */
  12808.  
  12809.     /*
  12810. -    * If the current task on @p's runqueue is a softirq task,
  12811. -    * it may run without preemption for a time that is
  12812. -    * ill-suited for a waiting RT task. Therefore, try to
  12813. -    * wake this RT task on another runqueue.
  12814. -    *
  12815. -    * Also, if the current task on @p's runqueue is an RT task, then
  12816. +    * If the current task on @p's runqueue is an RT task, then
  12817.      * try to see if we can wake this RT task up on another
  12818.      * runqueue. Otherwise simply start this RT task
  12819.      * on its current runqueue.
  12820. @@ -1612,54 +1569,43 @@
  12821.      * This test is optimistic, if we get it wrong the load-balancer
  12822.      * will have to sort it out.
  12823.      */
  12824. -   may_not_preempt = task_may_not_preempt(curr, cpu);
  12825. -   if (curr && (may_not_preempt ||
  12826. -            (unlikely(rt_task(curr)) &&
  12827. -             (curr->nr_cpus_allowed < 2 ||
  12828. -              curr->prio <= p->prio)))) {
  12829. +   if (curr && unlikely(rt_task(curr)) &&
  12830. +       (tsk_nr_cpus_allowed(curr) < 2 ||
  12831. +        curr->prio <= p->prio)) {
  12832.         int target = find_lowest_rq(p);
  12833. +
  12834.         /*
  12835. -        * If cpu is non-preemptible, prefer remote cpu
  12836. -        * even if it's running a higher-prio task.
  12837. -        * Otherwise: Possible race. Don't bother moving it if the
  12838. -        * destination CPU is not running a lower priority task.
  12839. +        * Don't bother moving it if the destination CPU is
  12840. +        * not running a lower priority task.
  12841.          */
  12842.         if (target != -1 &&
  12843. -           (may_not_preempt ||
  12844. -            p->prio < cpu_rq(target)->rt.highest_prio.curr))
  12845. +           p->prio < cpu_rq(target)->rt.highest_prio.curr)
  12846.             cpu = target;
  12847.     }
  12848.     rcu_read_unlock();
  12849.  
  12850.  out:
  12851. -   /*
  12852. -    * If previous CPU was different, make sure to cancel any active
  12853. -    * schedtune timers and deboost.
  12854. -    */
  12855. -   if (task_cpu(p) != cpu) {
  12856. -       unsigned long fl;
  12857. -       struct rq *prq = task_rq(p);
  12858. -
  12859. -       raw_spin_lock_irqsave(&prq->lock, fl);
  12860. -       schedtune_dequeue_rt(prq, p);
  12861. -       raw_spin_unlock_irqrestore(&prq->lock, fl);
  12862. -   }
  12863. -
  12864.     return cpu;
  12865.  }
  12866.  
  12867.  static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  12868.  {
  12869. -   if (rq->curr->nr_cpus_allowed == 1)
  12870. +   /*
  12871. +    * Current can't be migrated, useless to reschedule,
  12872. +    * let's hope p can move out.
  12873. +    */
  12874. +   if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
  12875. +       !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
  12876.         return;
  12877.  
  12878. -   if (p->nr_cpus_allowed != 1
  12879. +   /*
  12880. +    * p is migratable, so let's not schedule it and
  12881. +    * see if it is pushed or pulled somewhere else.
  12882. +    */
  12883. +   if (tsk_nr_cpus_allowed(p) != 1
  12884.         && cpupri_find(&rq->rd->cpupri, p, NULL))
  12885.         return;
  12886.  
  12887. -   if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
  12888. -       return;
  12889. -
  12890.     /*
  12891.      * There appears to be other cpus that can accept
  12892.      * current and none to run 'p', so lets reschedule
  12893. @@ -1699,61 +1645,6 @@
  12894.  #endif
  12895.  }
  12896.  
  12897. -#ifdef CONFIG_SMP
  12898. -
  12899. -static void sched_rt_update_capacity_req(struct rq *rq, bool tick)
  12900. -{
  12901. -   u64 total, used, age_stamp, avg;
  12902. -   s64 delta;
  12903. -   int cpu = cpu_of(rq);
  12904. -
  12905. -   if (!sched_freq())
  12906. -       return;
  12907. -
  12908. -#ifdef CONFIG_SCHED_WALT
  12909. -   if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
  12910. -       unsigned long cpu_utilization = boosted_cpu_util(cpu);
  12911. -       unsigned long capacity_curr = capacity_curr_of(cpu);
  12912. -       int req = 1;
  12913. -
  12914. -       /*
  12915. -        * During a tick, we don't throttle frequency down, just update
  12916. -        * the rt utilization.
  12917. -        */
  12918. -       if (tick && cpu_utilization <= capacity_curr)
  12919. -           req = 0;
  12920. -
  12921. -       set_rt_cpu_capacity(cpu, req, cpu_utilization);
  12922. -
  12923. -       return;
  12924. -   }
  12925. -#endif
  12926. -   sched_avg_update(rq);
  12927. -   /*
  12928. -    * Since we're reading these variables without serialization make sure
  12929. -    * we read them once before doing sanity checks on them.
  12930. -    */
  12931. -   age_stamp = READ_ONCE(rq->age_stamp);
  12932. -   avg = READ_ONCE(rq->rt_avg);
  12933. -   delta = rq_clock(rq) - age_stamp;
  12934. -
  12935. -   if (unlikely(delta < 0))
  12936. -       delta = 0;
  12937. -
  12938. -   total = sched_avg_period() + delta;
  12939. -
  12940. -   used = div_u64(avg, total);
  12941. -   if (unlikely(used > SCHED_CAPACITY_SCALE))
  12942. -       used = SCHED_CAPACITY_SCALE;
  12943. -
  12944. -   set_rt_cpu_capacity(cpu, 1, (unsigned long)(used));
  12945. -}
  12946. -#else
  12947. -static inline void sched_rt_update_capacity_req(struct rq *rq, bool tick)
  12948. -{ }
  12949. -
  12950. -#endif
  12951. -
  12952.  static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
  12953.                            struct rt_rq *rt_rq)
  12954.  {
  12955. @@ -1790,13 +1681,21 @@
  12956.  }
  12957.  
  12958.  static struct task_struct *
  12959. -pick_next_task_rt(struct rq *rq, struct task_struct *prev)
  12960. +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  12961.  {
  12962.     struct task_struct *p;
  12963.     struct rt_rq *rt_rq = &rq->rt;
  12964.  
  12965.     if (need_pull_rt_task(rq, prev)) {
  12966. +       /*
  12967. +        * This is OK, because current is on_cpu, which avoids it being
  12968. +        * picked for load-balance and preemption/IRQs are still
  12969. +        * disabled avoiding further scheduler activity on it and we're
  12970. +        * being very careful to re-start the picking loop.
  12971. +        */
  12972. +       lockdep_unpin_lock(&rq->lock, cookie);
  12973.         pull_rt_task(rq);
  12974. +       lockdep_repin_lock(&rq->lock, cookie);
  12975.         /*
  12976.          * pull_rt_task() can drop (and re-acquire) rq->lock; this
  12977.          * means a dl or stop task can slip in, in which case we need
  12978. @@ -1822,7 +1721,7 @@
  12979.          * This value will be the used as an estimation of the next
  12980.          * activity.
  12981.          */
  12982. -       sched_rt_update_capacity_req(rq, false);
  12983. +       sched_rt_update_capacity_req(rq);
  12984.         return NULL;
  12985.     }
  12986.  
  12987. @@ -1846,7 +1745,7 @@
  12988.      * The previous task needs to be made eligible for pushing
  12989.      * if it is still active
  12990.      */
  12991. -   if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
  12992. +   if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
  12993.         enqueue_pushable_task(rq, p);
  12994.  }
  12995.  
  12996. @@ -1896,7 +1795,7 @@
  12997.     if (unlikely(!lowest_mask))
  12998.         return -1;
  12999.  
  13000. -   if (task->nr_cpus_allowed == 1)
  13001. +   if (tsk_nr_cpus_allowed(task) == 1)
  13002.         return -1; /* No other targets possible */
  13003.  
  13004.     if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
  13005. @@ -1956,9 +1855,7 @@
  13006.     cpu = cpumask_any(lowest_mask);
  13007.     if (cpu < nr_cpu_ids)
  13008.         return cpu;
  13009. -
  13010. -   cpu = -1;
  13011. -   return cpu;
  13012. +   return -1;
  13013.  }
  13014.  
  13015.  /* Will lock the rq it finds */
  13016. @@ -1986,6 +1883,16 @@
  13017.             break;
  13018.         }
  13019.  
  13020. +       if (lowest_rq->rt.highest_prio.curr <= task->prio) {
  13021. +           /*
  13022. +            * Target rq has tasks of equal or higher priority,
  13023. +            * retrying does not release any lock and is unlikely
  13024. +            * to yield a different result.
  13025. +            */
  13026. +           lowest_rq = NULL;
  13027. +           break;
  13028. +       }
  13029. +
  13030.         /* if the prio of this runqueue changed, try again */
  13031.         if (double_lock_balance(rq, lowest_rq)) {
  13032.             /*
  13033. @@ -1998,6 +1905,7 @@
  13034.                      !cpumask_test_cpu(lowest_rq->cpu,
  13035.                                tsk_cpus_allowed(task)) ||
  13036.                      task_running(rq, task) ||
  13037. +                    !rt_task(task) ||
  13038.                      !task_on_rq_queued(task))) {
  13039.  
  13040.                 double_unlock_balance(rq, lowest_rq);
  13041. @@ -2030,7 +1938,7 @@
  13042.  
  13043.     BUG_ON(rq->cpu != task_cpu(p));
  13044.     BUG_ON(task_current(rq, p));
  13045. -   BUG_ON(p->nr_cpus_allowed <= 1);
  13046. +   BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
  13047.  
  13048.     BUG_ON(!task_on_rq_queued(p));
  13049.     BUG_ON(!rt_task(p));
  13050. @@ -2111,7 +2019,9 @@
  13051.     }
  13052.  
  13053.     deactivate_task(rq, next_task, 0);
  13054. +   next_task->on_rq = TASK_ON_RQ_MIGRATING;
  13055.     set_task_cpu(next_task, lowest_rq->cpu);
  13056. +   next_task->on_rq = TASK_ON_RQ_QUEUED;
  13057.     activate_task(lowest_rq, next_task, 0);
  13058.     ret = 1;
  13059.  
  13060. @@ -2133,160 +2043,172 @@
  13061.  }
  13062.  
  13063.  #ifdef HAVE_RT_PUSH_IPI
  13064. +
  13065.  /*
  13066. - * The search for the next cpu always starts at rq->cpu and ends
  13067. - * when we reach rq->cpu again. It will never return rq->cpu.
  13068. - * This returns the next cpu to check, or nr_cpu_ids if the loop
  13069. - * is complete.
  13070. + * When a high priority task schedules out from a CPU and a lower priority
  13071. + * task is scheduled in, a check is made to see if there's any RT tasks
  13072. + * on other CPUs that are waiting to run because a higher priority RT task
  13073. + * is currently running on its CPU. In this case, the CPU with multiple RT
  13074. + * tasks queued on it (overloaded) needs to be notified that a CPU has opened
  13075. + * up that may be able to run one of its non-running queued RT tasks.
  13076. + *
  13077. + * All CPUs with overloaded RT tasks need to be notified as there is currently
  13078. + * no way to know which of these CPUs have the highest priority task waiting
  13079. + * to run. Instead of trying to take a spinlock on each of these CPUs,
  13080. + * which has shown to cause large latency when done on machines with many
  13081. + * CPUs, sending an IPI to the CPUs to have them push off the overloaded
  13082. + * RT tasks waiting to run.
  13083. + *
  13084. + * Just sending an IPI to each of the CPUs is also an issue, as on large
  13085. + * count CPU machines, this can cause an IPI storm on a CPU, especially
  13086. + * if its the only CPU with multiple RT tasks queued, and a large number
  13087. + * of CPUs scheduling a lower priority task at the same time.
  13088. + *
  13089. + * Each root domain has its own irq work function that can iterate over
  13090. + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
  13091. + * tasks must be checked if there's one or many CPUs that are lowering
  13092. + * their priority, there's a single irq work iterator that will try to
  13093. + * push off RT tasks that are waiting to run.
  13094. + *
  13095. + * When a CPU schedules a lower priority task, it will kick off the
  13096. + * irq work iterator that will jump to each CPU with overloaded RT tasks.
  13097. + * As it only takes the first CPU that schedules a lower priority task
  13098. + * to start the process, the rto_start variable is incremented and if
  13099. + * the atomic result is one, then that CPU will try to take the rto_lock.
  13100. + * This prevents high contention on the lock as the process handles all
  13101. + * CPUs scheduling lower priority tasks.
  13102. + *
  13103. + * All CPUs that are scheduling a lower priority task will increment the
  13104. + * rt_loop_next variable. This will make sure that the irq work iterator
  13105. + * checks all RT overloaded CPUs whenever a CPU schedules a new lower
  13106. + * priority task, even if the iterator is in the middle of a scan. Incrementing
  13107. + * the rt_loop_next will cause the iterator to perform another scan.
  13108.   *
  13109. - * rq->rt.push_cpu holds the last cpu returned by this function,
  13110. - * or if this is the first instance, it must hold rq->cpu.
  13111.   */
  13112. -static int rto_next_cpu(struct rq *rq)
  13113. +static int rto_next_cpu(struct root_domain *rd)
  13114.  {
  13115. -   int prev_cpu = rq->rt.push_cpu;
  13116. +   int next;
  13117.     int cpu;
  13118.  
  13119. -   cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
  13120. -
  13121.     /*
  13122. -    * If the previous cpu is less than the rq's CPU, then it already
  13123. -    * passed the end of the mask, and has started from the beginning.
  13124. -    * We end if the next CPU is greater or equal to rq's CPU.
  13125. +    * When starting the IPI RT pushing, the rto_cpu is set to -1,
  13126. +    * rto_next_cpu() will simply return the first CPU found in
  13127. +    * the rto_mask.
  13128. +    *
  13129. +    * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
  13130. +    * will return the next CPU found in the rto_mask.
  13131. +    *
  13132. +    * If there are no more CPUs left in the rto_mask, then a check is made
  13133. +    * against rto_loop and rto_loop_next. rto_loop is only updated with
  13134. +    * the rto_lock held, but any CPU may increment the rto_loop_next
  13135. +    * without any locking.
  13136.      */
  13137. -   if (prev_cpu < rq->cpu) {
  13138. -       if (cpu >= rq->cpu)
  13139. -           return nr_cpu_ids;
  13140. +   for (;;) {
  13141.  
  13142. -   } else if (cpu >= nr_cpu_ids) {
  13143. -       /*
  13144. -        * We passed the end of the mask, start at the beginning.
  13145. -        * If the result is greater or equal to the rq's CPU, then
  13146. -        * the loop is finished.
  13147. -        */
  13148. -       cpu = cpumask_first(rq->rd->rto_mask);
  13149. -       if (cpu >= rq->cpu)
  13150. -           return nr_cpu_ids;
  13151. -   }
  13152. -   rq->rt.push_cpu = cpu;
  13153. +       /* When rto_cpu is -1 this acts like cpumask_first() */
  13154. +       cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
  13155.  
  13156. -   /* Return cpu to let the caller know if the loop is finished or not */
  13157. -   return cpu;
  13158. -}
  13159. +       rd->rto_cpu = cpu;
  13160.  
  13161. -static int find_next_push_cpu(struct rq *rq)
  13162. -{
  13163. -   struct rq *next_rq;
  13164. -   int cpu;
  13165. +       if (cpu < nr_cpu_ids)
  13166. +           return cpu;
  13167.  
  13168. -   while (1) {
  13169. -       cpu = rto_next_cpu(rq);
  13170. -       if (cpu >= nr_cpu_ids)
  13171. -           break;
  13172. -       next_rq = cpu_rq(cpu);
  13173. +       rd->rto_cpu = -1;
  13174. +
  13175. +       /*
  13176. +        * ACQUIRE ensures we see the @rto_mask changes
  13177. +        * made prior to the @next value observed.
  13178. +        *
  13179. +        * Matches WMB in rt_set_overload().
  13180. +        */
  13181. +       next = atomic_read_acquire(&rd->rto_loop_next);
  13182.  
  13183. -       /* Make sure the next rq can push to this rq */
  13184. -       if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
  13185. +       if (rd->rto_loop == next)
  13186.             break;
  13187. +
  13188. +       rd->rto_loop = next;
  13189.     }
  13190.  
  13191. -   return cpu;
  13192. +   return -1;
  13193.  }
  13194.  
  13195. -#define RT_PUSH_IPI_EXECUTING      1
  13196. -#define RT_PUSH_IPI_RESTART        2
  13197. +static inline bool rto_start_trylock(atomic_t *v)
  13198. +{
  13199. +   return !atomic_cmpxchg_acquire(v, 0, 1);
  13200. +}
  13201.  
  13202. -static void tell_cpu_to_push(struct rq *rq)
  13203. +static inline void rto_start_unlock(atomic_t *v)
  13204.  {
  13205. -   int cpu;
  13206. +   atomic_set_release(v, 0);
  13207. +}
  13208.  
  13209. -   if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
  13210. -       raw_spin_lock(&rq->rt.push_lock);
  13211. -       /* Make sure it's still executing */
  13212. -       if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
  13213. -           /*
  13214. -            * Tell the IPI to restart the loop as things have
  13215. -            * changed since it started.
  13216. -            */
  13217. -           rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
  13218. -           raw_spin_unlock(&rq->rt.push_lock);
  13219. -           return;
  13220. -       }
  13221. -       raw_spin_unlock(&rq->rt.push_lock);
  13222. -   }
  13223. +static void tell_cpu_to_push(struct rq *rq)
  13224. +{
  13225. +   int cpu = -1;
  13226.  
  13227. -   /* When here, there's no IPI going around */
  13228. +   /* Keep the loop going if the IPI is currently active */
  13229. +   atomic_inc(&rq->rd->rto_loop_next);
  13230.  
  13231. -   rq->rt.push_cpu = rq->cpu;
  13232. -   cpu = find_next_push_cpu(rq);
  13233. -   if (cpu >= nr_cpu_ids)
  13234. +   /* Only one CPU can initiate a loop at a time */
  13235. +   if (!rto_start_trylock(&rq->rd->rto_loop_start))
  13236.         return;
  13237.  
  13238. -   rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
  13239. +   raw_spin_lock(&rq->rd->rto_lock);
  13240.  
  13241. -   irq_work_queue_on(&rq->rt.push_work, cpu);
  13242. +   /*
  13243. +    * The rto_cpu is updated under the lock, if it has a valid cpu
  13244. +    * then the IPI is still running and will continue due to the
  13245. +    * update to loop_next, and nothing needs to be done here.
  13246. +    * Otherwise it is finishing up and an IPI needs to be sent.
  13247. +    */
  13248. +   if (rq->rd->rto_cpu < 0)
  13249. +       cpu = rto_next_cpu(rq->rd);
  13250. +
  13251. +   raw_spin_unlock(&rq->rd->rto_lock);
  13252. +
  13253. +   rto_start_unlock(&rq->rd->rto_loop_start);
  13254. +
  13255. +   if (cpu >= 0) {
  13256. +       /* Make sure the rd does not get freed while pushing */
  13257. +       sched_get_rd(rq->rd);
  13258. +       irq_work_queue_on(&rq->rd->rto_push_work, cpu);
  13259. +   }
  13260.  }
  13261.  
  13262.  /* Called from hardirq context */
  13263. -static void try_to_push_tasks(void *arg)
  13264. +void rto_push_irq_work_func(struct irq_work *work)
  13265.  {
  13266. -   struct rt_rq *rt_rq = arg;
  13267. -   struct rq *rq, *src_rq;
  13268. -   int this_cpu;
  13269. +   struct root_domain *rd =
  13270. +       container_of(work, struct root_domain, rto_push_work);
  13271. +   struct rq *rq;
  13272.     int cpu;
  13273.  
  13274. -   this_cpu = rt_rq->push_cpu;
  13275. -
  13276. -   /* Paranoid check */
  13277. -   BUG_ON(this_cpu != smp_processor_id());
  13278. +   rq = this_rq();
  13279.  
  13280. -   rq = cpu_rq(this_cpu);
  13281. -   src_rq = rq_of_rt_rq(rt_rq);
  13282. -
  13283. -again:
  13284. +   /*
  13285. +    * We do not need to grab the lock to check for has_pushable_tasks.
  13286. +    * When it gets updated, a check is made if a push is possible.
  13287. +    */
  13288.     if (has_pushable_tasks(rq)) {
  13289.         raw_spin_lock(&rq->lock);
  13290. -       push_rt_task(rq);
  13291. +       push_rt_tasks(rq);
  13292.         raw_spin_unlock(&rq->lock);
  13293.     }
  13294.  
  13295. -   /* Pass the IPI to the next rt overloaded queue */
  13296. -   raw_spin_lock(&rt_rq->push_lock);
  13297. -   /*
  13298. -    * If the source queue changed since the IPI went out,
  13299. -    * we need to restart the search from that CPU again.
  13300. -    */
  13301. -   if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
  13302. -       rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
  13303. -       rt_rq->push_cpu = src_rq->cpu;
  13304. -   }
  13305. +   raw_spin_lock(&rd->rto_lock);
  13306.  
  13307. -   cpu = find_next_push_cpu(src_rq);
  13308. +   /* Pass the IPI to the next rt overloaded queue */
  13309. +   cpu = rto_next_cpu(rd);
  13310.  
  13311. -   if (cpu >= nr_cpu_ids)
  13312. -       rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
  13313. -   raw_spin_unlock(&rt_rq->push_lock);
  13314. +   raw_spin_unlock(&rd->rto_lock);
  13315.  
  13316. -   if (cpu >= nr_cpu_ids)
  13317. +   if (cpu < 0) {
  13318. +       sched_put_rd(rd);
  13319.         return;
  13320. -
  13321. -   /*
  13322. -    * It is possible that a restart caused this CPU to be
  13323. -    * chosen again. Don't bother with an IPI, just see if we
  13324. -    * have more to push.
  13325. -    */
  13326. -   if (unlikely(cpu == rq->cpu))
  13327. -       goto again;
  13328. +   }
  13329.  
  13330.     /* Try the next RT overloaded CPU */
  13331. -   irq_work_queue_on(&rt_rq->push_work, cpu);
  13332. -}
  13333. -
  13334. -static void push_irq_work_func(struct irq_work *work)
  13335. -{
  13336. -   struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
  13337. -
  13338. -   try_to_push_tasks(rt_rq);
  13339. +   irq_work_queue_on(&rd->rto_push_work, cpu);
  13340.  }
  13341.  #endif /* HAVE_RT_PUSH_IPI */
  13342.  
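
A standalone sketch of the bookkeeping described in the comment block above. It is
not part of the patch and is simplified: single-threaded, no rto_lock, and the whole
chain is driven from one loop instead of being handed from CPU to CPU via irq_work.
Only the rto_* names mirror the new root_domain fields; everything else is assumed
for illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 8

static atomic_int rto_loop_next;   /* bumped whenever a CPU schedules a lower-prio task */
static atomic_int rto_loop_start;  /* 0/1 "trylock" so only one CPU kicks off the chain */
static int rto_loop;               /* last rto_loop_next value the iterator serviced */
static int rto_cpu = -1;           /* -1 means the iterator is idle */
static bool rto_mask[NCPUS];       /* which CPUs are "RT overloaded" */

/* Next overloaded CPU to poke, or -1 once the scan (and any rescan) is done. */
static int rto_next_cpu(void)
{
    for (;;) {
        int cpu, next;

        /* with rto_cpu == -1 this starts from the beginning of the mask */
        for (cpu = rto_cpu + 1; cpu < NCPUS; cpu++) {
            if (rto_mask[cpu]) {
                rto_cpu = cpu;
                return cpu;
            }
        }
        rto_cpu = -1;

        /* did anyone lower their priority while we were scanning? */
        next = atomic_load(&rto_loop_next);
        if (rto_loop == next)
            return -1;
        rto_loop = next;           /* yes: do one more full pass */
    }
}

int main(void)
{
    int expected = 0;
    int cpu;

    rto_mask[2] = rto_mask[5] = true;

    /* this CPU just scheduled a lower-priority task */
    atomic_fetch_add(&rto_loop_next, 1);

    /* only the first CPU to get here actually starts the chain */
    if (atomic_compare_exchange_strong(&rto_loop_start, &expected, 1)) {
        while ((cpu = rto_next_cpu()) >= 0)
            printf("would IPI cpu %d to push its waiting RT tasks\n", cpu);
        atomic_store(&rto_loop_start, 0);
    }
    return 0;
}

As in the kernel version, a bump of rto_loop_next during (or, as here, just before)
a scan makes the iterator perform one extra full pass before it reports -1.
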
  13343. @@ -2296,8 +2218,9 @@
  13344.     bool resched = false;
  13345.     struct task_struct *p;
  13346.     struct rq *src_rq;
  13347. +   int rt_overload_count = rt_overloaded(this_rq);
  13348.  
  13349. -   if (likely(!rt_overloaded(this_rq)))
  13350. +   if (likely(!rt_overload_count))
  13351.         return;
  13352.  
  13353.     /*
  13354. @@ -2306,6 +2229,11 @@
  13355.      */
  13356.     smp_rmb();
  13357.  
  13358. +   /* If we are the only overloaded CPU do nothing */
  13359. +   if (rt_overload_count == 1 &&
  13360. +       cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
  13361. +       return;
  13362. +
  13363.  #ifdef HAVE_RT_PUSH_IPI
  13364.     if (sched_feat(RT_PUSH_IPI)) {
  13365.         tell_cpu_to_push(this_rq);
  13366. @@ -2365,7 +2293,9 @@
  13367.             resched = true;
  13368.  
  13369.             deactivate_task(src_rq, p, 0);
  13370. +           p->on_rq = TASK_ON_RQ_MIGRATING;
  13371.             set_task_cpu(p, this_cpu);
  13372. +           p->on_rq = TASK_ON_RQ_QUEUED;
  13373.             activate_task(this_rq, p, 0);
  13374.             /*
  13375.              * We continue with the search, just in
  13376. @@ -2390,53 +2320,13 @@
  13377.  {
  13378.     if (!task_running(rq, p) &&
  13379.         !test_tsk_need_resched(rq->curr) &&
  13380. -       has_pushable_tasks(rq) &&
  13381. -       p->nr_cpus_allowed > 1 &&
  13382. +       tsk_nr_cpus_allowed(p) > 1 &&
  13383.         (dl_task(rq->curr) || rt_task(rq->curr)) &&
  13384. -       (rq->curr->nr_cpus_allowed < 2 ||
  13385. +       (tsk_nr_cpus_allowed(rq->curr) < 2 ||
  13386.          rq->curr->prio <= p->prio))
  13387.         push_rt_tasks(rq);
  13388.  }
  13389.  
  13390. -static void set_cpus_allowed_rt(struct task_struct *p,
  13391. -               const struct cpumask *new_mask)
  13392. -{
  13393. -   struct rq *rq;
  13394. -   int weight;
  13395. -
  13396. -   BUG_ON(!rt_task(p));
  13397. -
  13398. -   if (!task_on_rq_queued(p))
  13399. -       return;
  13400. -
  13401. -   weight = cpumask_weight(new_mask);
  13402. -
  13403. -   /*
  13404. -    * Only update if the process changes its state from whether it
  13405. -    * can migrate or not.
  13406. -    */
  13407. -   if ((p->nr_cpus_allowed > 1) == (weight > 1))
  13408. -       return;
  13409. -
  13410. -   rq = task_rq(p);
  13411. -
  13412. -   /*
  13413. -    * The process used to be able to migrate OR it can now migrate
  13414. -    */
  13415. -   if (weight <= 1) {
  13416. -       if (!task_current(rq, p))
  13417. -           dequeue_pushable_task(rq, p);
  13418. -       BUG_ON(!rq->rt.rt_nr_migratory);
  13419. -       rq->rt.rt_nr_migratory--;
  13420. -   } else {
  13421. -       if (!task_current(rq, p))
  13422. -           enqueue_pushable_task(rq, p);
  13423. -       rq->rt.rt_nr_migratory++;
  13424. -   }
  13425. -
  13426. -   update_rt_migration(&rq->rt);
  13427. -}
  13428. -
  13429.  /* Assumes rq->lock is held */
  13430.  static void rq_online_rt(struct rq *rq)
  13431.  {
  13432. @@ -2466,13 +2356,6 @@
  13433.  static void switched_from_rt(struct rq *rq, struct task_struct *p)
  13434.  {
  13435.     /*
  13436. -    * On class switch from rt, always cancel active schedtune timers,
  13437. -    * this handles the cases where we switch class for a task that is
  13438. -    * already rt-dequeued but has a running timer.
  13439. -    */
  13440. -   schedtune_dequeue_rt(rq, p);
  13441. -
  13442. -   /*
  13443.      * If there are other RT tasks then we will reschedule
  13444.      * and the scheduling of the other RT tasks will handle
  13445.      * the balancing. But if we are the last RT task
  13446. @@ -2512,7 +2395,7 @@
  13447.      */
  13448.     if (task_on_rq_queued(p) && rq->curr != p) {
  13449.  #ifdef CONFIG_SMP
  13450. -       if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
  13451. +       if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
  13452.             queue_push_tasks(rq);
  13453.  #endif /* CONFIG_SMP */
  13454.         if (p->prio < rq->curr->prio)
  13455. @@ -2590,7 +2473,7 @@
  13456.     update_curr_rt(rq);
  13457.  
  13458.     if (rq->rt.rt_nr_running)
  13459. -       sched_rt_update_capacity_req(rq, true);
  13460. +       sched_rt_update_capacity_req(rq);
  13461.  
  13462.     watchdog(rq, p);
  13463.  
  13464. @@ -2654,7 +2537,7 @@
  13465.  #ifdef CONFIG_SMP
  13466.     .select_task_rq     = select_task_rq_rt,
  13467.  
  13468. -   .set_cpus_allowed       = set_cpus_allowed_rt,
  13469. +   .set_cpus_allowed       = set_cpus_allowed_common,
  13470.     .rq_online              = rq_online_rt,
  13471.     .rq_offline             = rq_offline_rt,
  13472.     .task_woken     = task_woken_rt,
  13473. diff -Nur /home/ninez/android/marlin/kernel/sched/sched.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/sched.h
  13474. --- /home/ninez/android/marlin/kernel/sched/sched.h 2018-08-10 01:54:08.566728454 -0400
  13475. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/sched.h 2018-08-26 16:43:11.650539699 -0400
  13476. @@ -1,3 +1,4 @@
  13477. +
  13478.  #include <linux/sched.h>
  13479.  #include <linux/sched/sysctl.h>
  13480.  #include <linux/sched/rt.h>
  13481. @@ -13,6 +14,12 @@
  13482.  #include "cpudeadline.h"
  13483.  #include "cpuacct.h"
  13484.  
  13485. +#ifdef CONFIG_SCHED_DEBUG
  13486. +#define SCHED_WARN_ON(x)   WARN_ONCE(x, #x)
  13487. +#else
  13488. +#define SCHED_WARN_ON(x)   ((void)(x))
  13489. +#endif
  13490. +
  13491.  struct rq;
  13492.  struct cpuidle_state;
  13493.  
  13494. @@ -34,6 +41,12 @@
  13495.  static inline void update_cpu_load_active(struct rq *this_rq) { }
  13496.  #endif
  13497.  
  13498. +#ifdef CONFIG_SCHED_SMT
  13499. +extern void update_idle_core(struct rq *rq);
  13500. +#else
  13501. +static inline void update_idle_core(struct rq *rq) { }
  13502. +#endif
  13503. +
  13504.  /*
  13505.   * Helpers for converting nanosecond timing to jiffy resolution
  13506.   */
  13507. @@ -47,23 +60,30 @@
  13508.   * and does not change the user-interface for setting shares/weights.
  13509.   *
  13510.   * We increase resolution only if we have enough bits to allow this increased
  13511. - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
  13512. - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
  13513. - * increased costs.
  13514. - */
  13515. -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
  13516. -# define SCHED_LOAD_RESOLUTION 10
  13517. -# define scale_load(w)     ((w) << SCHED_LOAD_RESOLUTION)
  13518. -# define scale_load_down(w)    ((w) >> SCHED_LOAD_RESOLUTION)
  13519. + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
  13520. + * pretty high and the returns do not justify the increased costs.
  13521. + *
  13522. + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
  13523. + * increase coverage and consistency always enable it on 64bit platforms.
  13524. + */
  13525. +#ifdef CONFIG_64BIT
  13526. +# define SCHED_LOAD_SHIFT  (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
  13527. +# define scale_load(w)     ((w) << SCHED_FIXEDPOINT_SHIFT)
  13528. +# define scale_load_down(w)    ((w) >> SCHED_FIXEDPOINT_SHIFT)
  13529.  #else
  13530. -# define SCHED_LOAD_RESOLUTION 0
  13531. +# define SCHED_LOAD_SHIFT  (SCHED_FIXEDPOINT_SHIFT)
  13532.  # define scale_load(w)     (w)
  13533.  # define scale_load_down(w)    (w)
  13534.  #endif
  13535.  
  13536. -#define SCHED_LOAD_SHIFT   (10 + SCHED_LOAD_RESOLUTION)
  13537.  #define SCHED_LOAD_SCALE   (1L << SCHED_LOAD_SHIFT)
  13538.  
  13539. +/*
  13540. + * NICE_0's weight (visible to users) and its load (invisible to users) have
  13541. + * independent ranges, but they should be well calibrated. We use scale_load()
  13542. + * and scale_load_down() to convert between them, and the following must be true:
  13543. + * scale_load(sched_prio_to_weight[20]) == NICE_0_LOAD
  13544. + */
  13545.  #define NICE_0_LOAD        SCHED_LOAD_SCALE
  13546.  #define NICE_0_SHIFT       SCHED_LOAD_SHIFT
  13547.  
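
The comment above states the calibration between the user-visible nice-0 weight and
the internal NICE_0_LOAD. A quick standalone check of that invariant with the 64-bit
constants; SCHED_FIXEDPOINT_SHIFT is 10 upstream and the hard-coded 1024 stands in
for sched_prio_to_weight[20], both assumptions of this sketch rather than part of
the patch.

#include <assert.h>
#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT 10
#define SCHED_LOAD_SHIFT       (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
#define SCHED_LOAD_SCALE       (1L << SCHED_LOAD_SHIFT)
#define NICE_0_LOAD            SCHED_LOAD_SCALE
#define scale_load(w)          ((w) << SCHED_FIXEDPOINT_SHIFT)

int main(void)
{
    const long nice0_weight = 1024;   /* sched_prio_to_weight[20] */

    /* the required calibration: 1024 << 10 == 1 << 20 */
    assert(scale_load(nice0_weight) == NICE_0_LOAD);
    printf("scale_load(%ld) = %ld = NICE_0_LOAD\n",
           nice0_weight, scale_load(nice0_weight));
    return 0;
}
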
  13548. @@ -83,6 +103,10 @@
  13549.   */
  13550.  #define RUNTIME_INF    ((u64)~0ULL)
  13551.  
  13552. +static inline int idle_policy(int policy)
  13553. +{
  13554. +   return policy == SCHED_IDLE;
  13555. +}
  13556.  static inline int fair_policy(int policy)
  13557.  {
  13558.     return policy == SCHED_NORMAL || policy == SCHED_BATCH;
  13559. @@ -97,6 +121,11 @@
  13560.  {
  13561.     return policy == SCHED_DEADLINE;
  13562.  }
  13563. +static inline bool valid_policy(int policy)
  13564. +{
  13565. +   return idle_policy(policy) || fair_policy(policy) ||
  13566. +       rt_policy(policy) || dl_policy(policy);
  13567. +}
  13568.  
  13569.  static inline int task_has_rt_policy(struct task_struct *p)
  13570.  {
  13571. @@ -108,11 +137,6 @@
  13572.     return dl_policy(p->policy);
  13573.  }
  13574.  
  13575. -static inline bool dl_time_before(u64 a, u64 b)
  13576. -{
  13577. -   return (s64)(a - b) < 0;
  13578. -}
  13579. -
  13580.  /*
  13581.   * Tells if entity @a should preempt entity @b.
  13582.   */
  13583. @@ -183,6 +207,25 @@
  13584.     u64 bw, total_bw;
  13585.  };
  13586.  
  13587. +static inline
  13588. +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
  13589. +{
  13590. +   dl_b->total_bw -= tsk_bw;
  13591. +}
  13592. +
  13593. +static inline
  13594. +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
  13595. +{
  13596. +   dl_b->total_bw += tsk_bw;
  13597. +}
  13598. +
  13599. +static inline
  13600. +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
  13601. +{
  13602. +   return dl_b->bw != -1 &&
  13603. +          dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
  13604. +}
  13605. +
  13606.  extern struct mutex sched_domains_mutex;
  13607.  
  13608.  #ifdef CONFIG_CGROUP_SCHED
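
A toy, userspace rendering of the admission test that __dl_add()/__dl_clear()/
__dl_overflow() above implement: a new deadline reservation is accepted only if the
total bandwidth, with the old reservation swapped for the new one, still fits under
bw * cpus. The BW_SHIFT fixed-point scale and the example numbers are assumptions;
the kernel derives bandwidths with to_ratio(period, runtime).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20
#define BW_UNIT  (1 << BW_SHIFT)          /* one CPU's worth of bandwidth */

struct dl_bw { int64_t bw; uint64_t total_bw; };

static bool dl_overflow(struct dl_bw *dl_b, int cpus, uint64_t old_bw, uint64_t new_bw)
{
    /* bw == -1 means "no limit configured": never overflows */
    return dl_b->bw != -1 &&
           (uint64_t)dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

int main(void)
{
    /* 95% allowed per CPU, 4 CPUs, 3.5 CPUs worth already reserved */
    struct dl_bw dl_b = { .bw = BW_UNIT * 95 / 100, .total_bw = (uint64_t)BW_UNIT * 7 / 2 };

    printf("admit a 10%% task: %s\n",
           dl_overflow(&dl_b, 4, 0, BW_UNIT / 10) ? "reject" : "ok");
    printf("admit a 40%% task: %s\n",
           dl_overflow(&dl_b, 4, 0, (uint64_t)BW_UNIT * 2 / 5) ? "reject" : "ok");
    return 0;
}
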
  13609. @@ -365,6 +408,7 @@
  13610.     unsigned long runnable_load_avg;
  13611.  #ifdef CONFIG_FAIR_GROUP_SCHED
  13612.     unsigned long tg_load_avg_contrib;
  13613. +   unsigned long propagate_avg;
  13614.  #endif
  13615.     atomic_long_t removed_load_avg, removed_util_avg;
  13616.  #ifndef CONFIG_64BIT
  13617. @@ -422,7 +466,7 @@
  13618.  }
  13619.  
  13620.  /* RT IPI pull logic requires IRQ_WORK */
  13621. -#ifdef CONFIG_IRQ_WORK
  13622. +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
  13623.  # define HAVE_RT_PUSH_IPI
  13624.  #endif
  13625.  
  13626. @@ -430,6 +474,7 @@
  13627.  struct rt_rq {
  13628.     struct rt_prio_array active;
  13629.     unsigned int rt_nr_running;
  13630. +   unsigned int rr_nr_running;
  13631.  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
  13632.     struct {
  13633.         int curr; /* highest queued rt task prio */
  13634. @@ -443,12 +488,6 @@
  13635.     unsigned long rt_nr_total;
  13636.     int overloaded;
  13637.     struct plist_head pushable_tasks;
  13638. -#ifdef HAVE_RT_PUSH_IPI
  13639. -   int push_flags;
  13640. -   int push_cpu;
  13641. -   struct irq_work push_work;
  13642. -   raw_spinlock_t push_lock;
  13643. -#endif
  13644.  #endif /* CONFIG_SMP */
  13645.     int rt_queued;
  13646.  
  13647. @@ -466,6 +505,8 @@
  13648.  #endif
  13649.  };
  13650.  
  13651. +int try_to_unthrottle_rt_rq(struct rt_rq *rt_rq);
  13652. +
  13653.  /* Deadline class' related fields in a runqueue */
  13654.  struct dl_rq {
  13655.     /* runqueue is an rbtree, ordered by deadline */
  13656. @@ -541,6 +582,19 @@
  13657.     struct dl_bw dl_bw;
  13658.     struct cpudl cpudl;
  13659.  
  13660. +#ifdef HAVE_RT_PUSH_IPI
  13661. +   /*
  13662. +    * For IPI pull requests, loop across the rto_mask.
  13663. +    */
  13664. +   struct irq_work rto_push_work;
  13665. +   raw_spinlock_t rto_lock;
  13666. +   /* These are only updated and read within rto_lock */
  13667. +   int rto_loop;
  13668. +   int rto_cpu;
  13669. +   /* These atomics are updated outside of a lock */
  13670. +   atomic_t rto_loop_next;
  13671. +   atomic_t rto_loop_start;
  13672. +#endif
  13673.     /*
  13674.      * The "RT overload" flag: it gets set if a CPU has more than
  13675.      * one runnable RT task.
  13676. @@ -550,10 +604,18 @@
  13677.  
  13678.     /* Maximum cpu capacity in the system. */
  13679.     struct max_cpu_capacity max_cpu_capacity;
  13680. +
  13681. +   /* First cpu with maximum and minimum original capacity */
  13682. +   int max_cap_orig_cpu, min_cap_orig_cpu;
  13683.  };
  13684.  
  13685.  extern struct root_domain def_root_domain;
  13686. +extern void sched_get_rd(struct root_domain *rd);
  13687. +extern void sched_put_rd(struct root_domain *rd);
  13688.  
  13689. +#ifdef HAVE_RT_PUSH_IPI
  13690. +extern void rto_push_irq_work_func(struct irq_work *work);
  13691. +#endif
  13692.  #endif /* CONFIG_SMP */
  13693.  
  13694.  /*
  13695. @@ -587,7 +649,13 @@
  13696.  #ifdef CONFIG_NO_HZ_FULL
  13697.     unsigned long last_sched_tick;
  13698.  #endif
  13699. -   int skip_clock_update;
  13700. +
  13701. +#ifdef CONFIG_CPU_QUIET
  13702. +   /* time-based average load */
  13703. +   u64 nr_last_stamp;
  13704. +   u64 nr_running_integral;
  13705. +   seqcount_t ave_seqcnt;
  13706. +#endif
  13707.  
  13708.     /* capture load from *all* tasks on this cpu: */
  13709.     struct load_weight load;
  13710. @@ -601,6 +669,7 @@
  13711.  #ifdef CONFIG_FAIR_GROUP_SCHED
  13712.     /* list of leaf cfs_rq on this cpu: */
  13713.     struct list_head leaf_cfs_rq_list;
  13714. +   struct list_head *tmp_alone_branch;
  13715.  #endif /* CONFIG_FAIR_GROUP_SCHED */
  13716.  
  13717.     /*
  13718. @@ -615,6 +684,7 @@
  13719.     unsigned long next_balance;
  13720.     struct mm_struct *prev_mm;
  13721.  
  13722. +   unsigned int clock_skip_update;
  13723.     u64 clock;
  13724.     u64 clock_task;
  13725.  
  13726. @@ -633,6 +703,7 @@
  13727.     /* For active balancing */
  13728.     int active_balance;
  13729.     int push_cpu;
  13730. +   struct task_struct *push_task;
  13731.     struct cpu_stop_work active_balance_work;
  13732.     /* cpu of this runqueue: */
  13733.     int cpu;
  13734. @@ -651,24 +722,14 @@
  13735.  #endif
  13736.  
  13737.  #ifdef CONFIG_SCHED_WALT
  13738. -   /*
  13739. -    * max_freq = user or thermal defined maximum
  13740. -    * max_possible_freq = maximum supported by hardware
  13741. -    */
  13742. -   unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
  13743. -   struct cpumask freq_domain_cpumask;
  13744. -
  13745.     u64 cumulative_runnable_avg;
  13746. -   int efficiency; /* Differentiate cpus with different IPC capability */
  13747. -   int load_scale_factor;
  13748. -   int capacity;
  13749. -   int max_possible_capacity;
  13750.     u64 window_start;
  13751.     u64 curr_runnable_sum;
  13752.     u64 prev_runnable_sum;
  13753.     u64 cur_irqload;
  13754.     u64 avg_irqload;
  13755.     u64 irqload_ts;
  13756. +   u64 cum_window_demand;
  13757.  #endif /* CONFIG_SCHED_WALT */
  13758.  
  13759.  
  13760. @@ -710,6 +771,8 @@
  13761.     /* try_to_wake_up() stats */
  13762.     unsigned int ttwu_count;
  13763.     unsigned int ttwu_local;
  13764. +
  13765. +   struct eas_stats eas_stats;
  13766.  #endif
  13767.  
  13768.  #ifdef CONFIG_SMP
  13769. @@ -742,7 +805,7 @@
  13770.  
  13771.  static inline u64 __rq_clock_broken(struct rq *rq)
  13772.  {
  13773. -   return ACCESS_ONCE(rq->clock);
  13774. +   return READ_ONCE(rq->clock);
  13775.  }
  13776.  
  13777.  static inline u64 rq_clock(struct rq *rq)
  13778. @@ -757,6 +820,18 @@
  13779.     return rq->clock_task;
  13780.  }
  13781.  
  13782. +#define RQCF_REQ_SKIP  0x01
  13783. +#define RQCF_ACT_SKIP  0x02
  13784. +
  13785. +static inline void rq_clock_skip_update(struct rq *rq, bool skip)
  13786. +{
  13787. +   lockdep_assert_held(&rq->lock);
  13788. +   if (skip)
  13789. +       rq->clock_skip_update |= RQCF_REQ_SKIP;
  13790. +   else
  13791. +       rq->clock_skip_update &= ~RQCF_REQ_SKIP;
  13792. +}
  13793. +
  13794.  #ifdef CONFIG_NUMA_BALANCING
  13795.  extern void sched_setnuma(struct task_struct *p, int node);
  13796.  extern int migrate_task_to(struct task_struct *p, int cpu);
  13797. @@ -836,8 +911,8 @@
  13798.  DECLARE_PER_CPU(struct sched_domain *, sd_llc);
  13799.  DECLARE_PER_CPU(int, sd_llc_size);
  13800.  DECLARE_PER_CPU(int, sd_llc_id);
  13801. +DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
  13802.  DECLARE_PER_CPU(struct sched_domain *, sd_numa);
  13803. -DECLARE_PER_CPU(struct sched_domain *, sd_busy);
  13804.  DECLARE_PER_CPU(struct sched_domain *, sd_asym);
  13805.  DECLARE_PER_CPU(struct sched_domain *, sd_ea);
  13806.  DECLARE_PER_CPU(struct sched_domain *, sd_scs);
  13807. @@ -850,12 +925,9 @@
  13808.      */
  13809.     unsigned long capacity;
  13810.     unsigned long max_capacity; /* Max per-cpu capacity in group */
  13811. +   unsigned long min_capacity; /* Min per-CPU capacity in group */
  13812.     unsigned long next_update;
  13813.     int imbalance; /* XXX unrelated to capacity but shared group state */
  13814. -   /*
  13815. -    * Number of busy cpus in this group.
  13816. -    */
  13817. -   atomic_t nr_busy_cpus;
  13818.  
  13819.     unsigned long cpumask[0]; /* iteration mask */
  13820.  };
  13821. @@ -866,7 +938,7 @@
  13822.  
  13823.     unsigned int group_weight;
  13824.     struct sched_group_capacity *sgc;
  13825. -   const struct sched_group_energy const *sge;
  13826. +   const struct sched_group_energy *sge;
  13827.  
  13828.     /*
  13829.      * The CPUs this group covers.
  13830. @@ -878,9 +950,6 @@
  13831.     unsigned long cpumask[0];
  13832.  };
  13833.  
  13834. -void set_energy_aware(void);
  13835. -void clear_energy_aware(void);
  13836. -
  13837.  static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
  13838.  {
  13839.     return to_cpumask(sg->cpumask);
  13840. @@ -961,7 +1030,6 @@
  13841.  {
  13842.     return NULL;
  13843.  }
  13844. -
  13845.  #endif /* CONFIG_CGROUP_SCHED */
  13846.  
  13847.  static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  13848. @@ -1022,17 +1090,8 @@
  13849.  #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
  13850.  #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
  13851.  
  13852. -#ifdef CONFIG_NUMA_BALANCING
  13853. -#define sched_feat_numa(x) sched_feat(x)
  13854. -#ifdef CONFIG_SCHED_DEBUG
  13855. -#define numabalancing_enabled sched_feat_numa(NUMA)
  13856. -#else
  13857. -extern bool numabalancing_enabled;
  13858. -#endif /* CONFIG_SCHED_DEBUG */
  13859. -#else
  13860. -#define sched_feat_numa(x) (0)
  13861. -#define numabalancing_enabled (0)
  13862. -#endif /* CONFIG_NUMA_BALANCING */
  13863. +extern struct static_key_false sched_numa_balancing;
  13864. +extern struct static_key_false sched_schedstats;
  13865.  
  13866.  static inline u64 global_rt_period(void)
  13867.  {
  13868. @@ -1074,9 +1133,6 @@
  13869.  #ifndef prepare_arch_switch
  13870.  # define prepare_arch_switch(next) do { } while (0)
  13871.  #endif
  13872. -#ifndef finish_arch_switch
  13873. -# define finish_arch_switch(prev)  do { } while (0)
  13874. -#endif
  13875.  #ifndef finish_arch_post_lock_switch
  13876.  # define finish_arch_post_lock_switch()    do { } while (0)
  13877.  #endif
  13878. @@ -1101,7 +1157,7 @@
  13879.      * We must ensure this doesn't happen until the switch is completely
  13880.      * finished.
  13881.      *
  13882. -    * Pairs with the control dependency and rmb in try_to_wake_up().
  13883. +    * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
  13884.      */
  13885.     smp_store_release(&prev->on_cpu, 0);
  13886.  #endif
  13887. @@ -1139,59 +1195,45 @@
  13888.  #define WEIGHT_IDLEPRIO                3
  13889.  #define WMULT_IDLEPRIO         1431655765
  13890.  
  13891. -/*
  13892. - * Nice levels are multiplicative, with a gentle 10% change for every
  13893. - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
  13894. - * nice 1, it will get ~10% less CPU time than another CPU-bound task
  13895. - * that remained on nice 0.
  13896. - *
  13897. - * The "10% effect" is relative and cumulative: from _any_ nice level,
  13898. - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
  13899. - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
  13900. - * If a task goes up by ~10% and another task goes down by ~10% then
  13901. - * the relative distance between them is ~25%.)
  13902. - */
  13903. -static const int prio_to_weight[40] = {
  13904. - /* -20 */     88761,     71755,     56483,     46273,     36291,
  13905. - /* -15 */     29154,     23254,     18705,     14949,     11916,
  13906. - /* -10 */      9548,      7620,      6100,      4904,      3906,
  13907. - /*  -5 */      3121,      2501,      1991,      1586,      1277,
  13908. - /*   0 */      1024,       820,       655,       526,       423,
  13909. - /*   5 */       335,       272,       215,       172,       137,
  13910. - /*  10 */       110,        87,        70,        56,        45,
  13911. - /*  15 */        36,        29,        23,        18,        15,
  13912. -};
  13913. +extern const int sched_prio_to_weight[40];
  13914. +extern const u32 sched_prio_to_wmult[40];
  13915.  
  13916.  /*
  13917. - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
  13918. + * {de,en}queue flags:
  13919.   *
  13920. - * In cases where the weight does not change often, we can use the
  13921. - * precalculated inverse to speed up arithmetics by turning divisions
  13922. - * into multiplications:
  13923. - */
  13924. -static const u32 prio_to_wmult[40] = {
  13925. - /* -20 */     48388,     59856,     76040,     92818,    118348,
  13926. - /* -15 */    147320,    184698,    229616,    287308,    360437,
  13927. - /* -10 */    449829,    563644,    704093,    875809,   1099582,
  13928. - /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
  13929. - /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
  13930. - /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
  13931. - /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
  13932. - /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
  13933. -};
  13934. + * DEQUEUE_SLEEP  - task is no longer runnable
  13935. + * ENQUEUE_WAKEUP - task just became runnable
  13936. + *
  13937. + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
  13938. + *                are in a known state which allows modification. Such pairs
  13939. + *                should preserve as much state as possible.
  13940. + *
  13941. + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
  13942. + *        in the runqueue.
  13943. + *
  13944. + * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
  13945. + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
  13946. + * ENQUEUE_WAKING    - sched_class::task_waking was called
  13947. + *
  13948. + */
  13949.  
  13950. -#define ENQUEUE_WAKEUP     1
  13951. -#define ENQUEUE_HEAD       2
  13952. +#define DEQUEUE_SLEEP      0x01
  13953. +#define DEQUEUE_SAVE       0x02 /* matches ENQUEUE_RESTORE */
  13954. +#define DEQUEUE_MOVE       0x04 /* matches ENQUEUE_MOVE */
  13955. +#define DEQUEUE_IDLE       0x80 /* The last dequeue before IDLE */
  13956. +
  13957. +#define ENQUEUE_WAKEUP     0x01
  13958. +#define ENQUEUE_RESTORE        0x02
  13959. +#define ENQUEUE_MOVE       0x04
  13960. +
  13961. +#define ENQUEUE_HEAD       0x08
  13962. +#define ENQUEUE_REPLENISH  0x10
  13963.  #ifdef CONFIG_SMP
  13964. -#define ENQUEUE_WAKING     4   /* sched_class::task_waking was called */
  13965. +#define ENQUEUE_WAKING     0x20
  13966.  #else
  13967. -#define ENQUEUE_WAKING     0
  13968. +#define ENQUEUE_WAKING     0x00
  13969.  #endif
  13970. -#define ENQUEUE_REPLENISH  0x08
  13971. -#define ENQUEUE_RESTORE    0x10
  13972. -#define ENQUEUE_WAKEUP_NEW 0x20
  13973. -
  13974. -#define DEQUEUE_SLEEP      1
  13975. +#define ENQUEUE_WAKEUP_NEW 0x40
  13976.  
  13977.  #define RETRY_TASK     ((void *)-1UL)
  13978.  
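
The flag comments above pair DEQUEUE_SAVE with ENQUEUE_RESTORE and DEQUEUE_MOVE with
ENQUEUE_MOVE. A standalone sanity check that the values copied from the hunk really
do line up (illustration only, not code from the patch):

#include <assert.h>

#define DEQUEUE_SLEEP     0x01
#define DEQUEUE_SAVE      0x02   /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE      0x04   /* matches ENQUEUE_MOVE */

#define ENQUEUE_WAKEUP    0x01
#define ENQUEUE_RESTORE   0x02
#define ENQUEUE_MOVE      0x04

int main(void)
{
    assert(DEQUEUE_SAVE == ENQUEUE_RESTORE);
    assert(DEQUEUE_MOVE == ENQUEUE_MOVE);
    return 0;
}
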
  13979. @@ -1214,12 +1256,14 @@
  13980.      * tasks.
  13981.      */
  13982.     struct task_struct * (*pick_next_task) (struct rq *rq,
  13983. -                       struct task_struct *prev);
  13984. +                       struct task_struct *prev,
  13985. +                       struct pin_cookie cookie);
  13986.     void (*put_prev_task) (struct rq *rq, struct task_struct *p);
  13987.  
  13988.  #ifdef CONFIG_SMP
  13989. -   int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
  13990. -   void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
  13991. +   int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
  13992. +                  int sibling_count_hint);
  13993. +   void (*migrate_task_rq)(struct task_struct *p);
  13994.  
  13995.     void (*task_waking) (struct task_struct *task);
  13996.     void (*task_woken) (struct rq *this_rq, struct task_struct *task);
  13997. @@ -1251,8 +1295,11 @@
  13998.  
  13999.     void (*update_curr) (struct rq *rq);
  14000.  
  14001. +#define TASK_SET_GROUP  0
  14002. +#define TASK_MOVE_GROUP    1
  14003. +
  14004.  #ifdef CONFIG_FAIR_GROUP_SCHED
  14005. -   void (*task_move_group) (struct task_struct *p);
  14006. +   void (*task_change_group)(struct task_struct *p, int type);
  14007.  #endif
  14008.  };
  14009.  
  14010. @@ -1261,6 +1308,11 @@
  14011.     prev->sched_class->put_prev_task(rq, prev);
  14012.  }
  14013.  
  14014. +static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
  14015. +{
  14016. +   curr->sched_class->set_curr_task(rq);
  14017. +}
  14018. +
  14019.  #define sched_class_highest (&stop_sched_class)
  14020.  #define for_each_class(class) \
  14021.     for (class = sched_class_highest; class; class = class->next)
  14022. @@ -1279,13 +1331,7 @@
  14023.  
  14024.  extern void trigger_load_balance(struct rq *rq);
  14025.  
  14026. -extern void idle_enter_fair(struct rq *this_rq);
  14027. -extern void idle_exit_fair(struct rq *this_rq);
  14028. -
  14029. -#else
  14030. -
  14031. -static inline void idle_enter_fair(struct rq *rq) { }
  14032. -static inline void idle_exit_fair(struct rq *rq) { }
  14033. +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
  14034.  
  14035.  #endif
  14036.  
  14037. @@ -1298,7 +1344,7 @@
  14038.  
  14039.  static inline struct cpuidle_state *idle_get_state(struct rq *rq)
  14040.  {
  14041. -   WARN_ON(!rcu_read_lock_held());
  14042. +   SCHED_WARN_ON(!rcu_read_lock_held());
  14043.     return rq->idle_state;
  14044.  }
  14045.  
  14046. @@ -1340,7 +1386,6 @@
  14047.  extern void init_sched_dl_class(void);
  14048.  extern void init_sched_rt_class(void);
  14049.  extern void init_sched_fair_class(void);
  14050. -extern void init_sched_dl_class(void);
  14051.  
  14052.  extern void resched_curr(struct rq *rq);
  14053.  extern void resched_cpu(int cpu);
  14054. @@ -1350,14 +1395,14 @@
  14055.  
  14056.  extern struct dl_bandwidth def_dl_bandwidth;
  14057.  extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
  14058. -extern void init_rt_schedtune_timer(struct sched_rt_entity *rt_se);
  14059.  extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
  14060.  
  14061.  unsigned long to_ratio(u64 period, u64 runtime);
  14062.  
  14063.  extern void init_entity_runnable_average(struct sched_entity *se);
  14064. +extern void post_init_entity_util_avg(struct sched_entity *se);
  14065.  
  14066. -static inline void add_nr_running(struct rq *rq, unsigned count)
  14067. +static inline void __add_nr_running(struct rq *rq, unsigned count)
  14068.  {
  14069.     unsigned prev_nr = rq->nr_running;
  14070.  
  14071. @@ -1385,11 +1430,48 @@
  14072.     }
  14073.  }
  14074.  
  14075. -static inline void sub_nr_running(struct rq *rq, unsigned count)
  14076. +static inline void __sub_nr_running(struct rq *rq, unsigned count)
  14077.  {
  14078.     rq->nr_running -= count;
  14079.  }
  14080.  
  14081. +#ifdef CONFIG_CPU_QUIET
  14082. +#define NR_AVE_SCALE(x)        ((x) << FSHIFT)
  14083. +static inline u64 do_nr_running_integral(struct rq *rq)
  14084. +{
  14085. +   s64 nr, deltax;
  14086. +   u64 nr_running_integral = rq->nr_running_integral;
  14087. +
  14088. +   deltax = rq->clock_task - rq->nr_last_stamp;
  14089. +   nr = NR_AVE_SCALE(rq->nr_running);
  14090. +
  14091. +   nr_running_integral += nr * deltax;
  14092. +
  14093. +   return nr_running_integral;
  14094. +}
  14095. +
  14096. +static inline void add_nr_running(struct rq *rq, unsigned count)
  14097. +{
  14098. +   write_seqcount_begin(&rq->ave_seqcnt);
  14099. +   rq->nr_running_integral = do_nr_running_integral(rq);
  14100. +   rq->nr_last_stamp = rq->clock_task;
  14101. +   __add_nr_running(rq, count);
  14102. +   write_seqcount_end(&rq->ave_seqcnt);
  14103. +}
  14104. +
  14105. +static inline void sub_nr_running(struct rq *rq, unsigned count)
  14106. +{
  14107. +   write_seqcount_begin(&rq->ave_seqcnt);
  14108. +   rq->nr_running_integral = do_nr_running_integral(rq);
  14109. +   rq->nr_last_stamp = rq->clock_task;
  14110. +   __sub_nr_running(rq, count);
  14111. +   write_seqcount_end(&rq->ave_seqcnt);
  14112. +}
  14113. +#else
  14114. +#define add_nr_running __add_nr_running
  14115. +#define sub_nr_running __sub_nr_running
  14116. +#endif
  14117. +
  14118.  static inline void rq_last_tick_reset(struct rq *rq)
  14119.  {
  14120.  #ifdef CONFIG_NO_HZ_FULL
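
A sketch of how the CPU_QUIET fields above become a time-based average: the integral
accumulates nr_running * time in fixed point, and a reader divides the growth of the
integral by the elapsed time. The field names and NR_AVE_SCALE mirror the hunk; the
FSHIFT value, the sampling helper and the numbers are assumptions for the example.

#include <stdint.h>
#include <stdio.h>

#define FSHIFT 11
#define NR_AVE_SCALE(x) ((uint64_t)(x) << FSHIFT)

struct rq_sample {
    uint64_t nr_running_integral;   /* sum of nr_running * dt, fixed point */
    uint64_t nr_last_stamp;         /* clock at the last update */
    unsigned int nr_running;
};

/* fold the elapsed interval into the integral, as add_nr_running() above does */
static void fold(struct rq_sample *rq, uint64_t now)
{
    rq->nr_running_integral += NR_AVE_SCALE(rq->nr_running) * (now - rq->nr_last_stamp);
    rq->nr_last_stamp = now;
}

int main(void)
{
    struct rq_sample rq = { 0, 0, 0 };

    fold(&rq, 0);   rq.nr_running = 2;   /* 2 runnable tasks from t=0   */
    fold(&rq, 100); rq.nr_running = 4;   /* 4 runnable tasks from t=100 */
    fold(&rq, 200);

    /* average over [0, 200): (2*100 + 4*100) / 200 = 3.0 */
    printf("time-based average nr_running = %.2f\n",
           (double)rq.nr_running_integral / NR_AVE_SCALE(1) / 200.0);
    return 0;
}
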
  14121. @@ -1451,6 +1533,26 @@
  14122.  }
  14123.  #endif
  14124.  
  14125. +#ifndef arch_scale_max_freq_capacity
  14126. +static __always_inline
  14127. +unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
  14128. +{
  14129. +   return SCHED_CAPACITY_SCALE;
  14130. +}
  14131. +#endif
  14132. +
  14133. +#ifndef arch_scale_min_freq_capacity
  14134. +static __always_inline
  14135. +unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
  14136. +{
  14137. +   /*
  14138. +    * Multiplied with any capacity value, this scale factor will return
  14139. +    * 0, which represents an un-capped state
  14140. +    */
  14141. +   return 0;
  14142. +}
  14143. +#endif
  14144. +
  14145.  #ifndef arch_scale_cpu_capacity
  14146.  static __always_inline
  14147.  unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
  14148. @@ -1473,33 +1575,9 @@
  14149.     return cpu_rq(cpu)->cpu_capacity_orig;
  14150.  }
  14151.  
  14152. -/* Force usage of PELT signal, i.e. util_avg */
  14153. -#define UTIL_AVG true
  14154. -/* Use estimated utilization when possible, i.e. UTIL_EST feature enabled */
  14155. -#define UTIL_EST false
  14156. -static inline bool use_util_est(void)
  14157. -{
  14158. -   return sched_feat(UTIL_EST);
  14159. -}
  14160. -
  14161.  extern unsigned int sysctl_sched_use_walt_cpu_util;
  14162.  extern unsigned int walt_ravg_window;
  14163. -extern unsigned int walt_disabled;
  14164. -
  14165. -static inline unsigned long task_util(struct task_struct *p, bool use_pelt)
  14166. -{
  14167. -
  14168. -#ifdef CONFIG_SCHED_WALT
  14169. -   if (!walt_disabled && sysctl_sched_use_walt_task_util) {
  14170. -       unsigned long demand = p->ravg.demand;
  14171. -       return (demand << 10) / walt_ravg_window;
  14172. -   }
  14173. -#endif
  14174. -   if (use_util_est() && !use_pelt)
  14175. -       return p->se.avg.util_est;
  14176. -   return p->se.avg.util_avg;
  14177. -}
  14178. -
  14179. +extern bool walt_disabled;
  14180.  
  14181.  /*
  14182.   * cpu_util returns the amount of capacity of a CPU that is used by CFS
  14183. @@ -1527,18 +1605,15 @@
  14184.   * capacity_orig) as it useful for predicting the capacity required after task
  14185.   * migrations (scheduler-driven DVFS).
  14186.   */
  14187. -static inline unsigned long __cpu_util(int cpu, int delta, bool use_pelt)
  14188. +static inline unsigned long __cpu_util(int cpu, int delta)
  14189.  {
  14190.     unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
  14191.     unsigned long capacity = capacity_orig_of(cpu);
  14192.  
  14193. -   if (use_util_est() && !use_pelt)
  14194. -       util = max(util, cpu_rq(cpu)->cfs.avg.util_est);
  14195. -
  14196.  #ifdef CONFIG_SCHED_WALT
  14197.     if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
  14198. -       util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) /
  14199. -           walt_ravg_window;
  14200. +       util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
  14201. +                walt_ravg_window >> SCHED_LOAD_SHIFT);
  14202.  #endif
  14203.     delta += util;
  14204.     if (delta < 0)
  14205. @@ -1547,9 +1622,22 @@
  14206.     return (delta >= capacity) ? capacity : delta;
  14207.  }
  14208.  
  14209. -static inline unsigned long cpu_util(int cpu, bool use_pelt)
  14210. +static inline unsigned long cpu_util(int cpu)
  14211.  {
  14212. -   return __cpu_util(cpu, 0, use_pelt);
  14213. +   return __cpu_util(cpu, 0);
  14214. +}
  14215. +
  14216. +static inline unsigned long cpu_util_freq(int cpu)
  14217. +{
  14218. +   unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
  14219. +   unsigned long capacity = capacity_orig_of(cpu);
  14220. +
  14221. +#ifdef CONFIG_SCHED_WALT
  14222. +   if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
  14223. +       util = div64_u64(cpu_rq(cpu)->prev_runnable_sum,
  14224. +                walt_ravg_window >> SCHED_LOAD_SHIFT);
  14225. +#endif
  14226. +   return (util >= capacity) ? capacity : util;
  14227.  }
  14228.  
  14229.  #endif
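
A toy illustration (assumed numbers, plain C, not the kernel functions themselves)
of the clamping the reworked __cpu_util()/cpu_util_freq() above apply: utilization
plus an optional delta is clipped to the CPU's original capacity and never drops
below zero.

#include <stdio.h>

static unsigned long clamp_util(long util, long delta, unsigned long capacity)
{
    long sum = util + delta;

    if (sum < 0)                       /* a negative delta cannot go below idle */
        return 0;
    return (unsigned long)sum >= capacity ? capacity : (unsigned long)sum;
}

int main(void)
{
    /* e.g. a CPU with capacity_orig 1024 currently running 700 units of CFS util */
    printf("%lu\n", clamp_util(700,  200, 1024));   /* 900: fits under capacity  */
    printf("%lu\n", clamp_util(700,  500, 1024));   /* 1024: clipped at capacity */
    printf("%lu\n", clamp_util(700, -900, 1024));   /* 0: clipped at zero        */
    return 0;
}
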
  14230. @@ -1564,6 +1652,10 @@
  14231.     return static_key_false(&__sched_freq);
  14232.  }
  14233.  
  14234. +/*
  14235. + * sched_capacity_reqs expects capacity requests to be normalised.
  14236. + * The sum of all capacity requests should fall in the range 0-1024.
  14237. + */
  14238.  DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
  14239.  void update_cpu_capacity_request(int cpu, bool request);
  14240.  
  14241. @@ -1572,32 +1664,45 @@
  14242.  {
  14243.     struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
  14244.  
  14245. -   if (scr->cfs == capacity)
  14246. -       return;
  14247. -   scr->cfs = capacity;
  14248. -   update_cpu_capacity_request(cpu, request);
  14249. +#ifdef CONFIG_SCHED_WALT
  14250. +       if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
  14251. +       int rtdl = scr->rt + scr->dl;
  14252. +       /*
  14253. +        * WALT tracks the utilization of a CPU considering the load
  14254. +        * generated by all the scheduling classes.
  14255. +        * Since the following call to:
  14256. +        *    update_cpu_capacity
  14257. +        * is already adding the RT and DL utilizations let's remove
  14258. +        * these contributions from the WALT signal.
  14259. +        */
  14260. +       if (capacity > rtdl)
  14261. +           capacity -= rtdl;
  14262. +       else
  14263. +           capacity = 0;
  14264. +   }
  14265. +#endif
  14266. +   if (scr->cfs != capacity) {
  14267. +       scr->cfs = capacity;
  14268. +       update_cpu_capacity_request(cpu, request);
  14269. +   }
  14270.  }
  14271.  
  14272.  static inline void set_rt_cpu_capacity(int cpu, bool request,
  14273.                        unsigned long capacity)
  14274.  {
  14275. -   struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
  14276. -
  14277. -   if (scr->rt == capacity)
  14278. -       return;
  14279. -   scr->rt = capacity;
  14280. -   update_cpu_capacity_request(cpu, request);
  14281. +   if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
  14282. +       per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
  14283. +       update_cpu_capacity_request(cpu, request);
  14284. +   }
  14285.  }
  14286.  
  14287.  static inline void set_dl_cpu_capacity(int cpu, bool request,
  14288.                        unsigned long capacity)
  14289.  {
  14290. -   struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
  14291. -
  14292. -   if (scr->dl == capacity)
  14293. -       return;
  14294. -   scr->dl = capacity;
  14295. -   update_cpu_capacity_request(cpu, request);
  14296. +   if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) {
  14297. +       per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity;
  14298. +       update_cpu_capacity_request(cpu, request);
  14299. +   }
  14300.  }
  14301.  #else
  14302.  static inline bool sched_freq(void) { return false; }
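
A sketch (made-up numbers) of the WALT adjustment performed in set_cfs_cpu_capacity()
above: WALT's signal already includes RT and DL load, so their contributions are
stripped before the CFS request is recorded, clamping at zero.

#include <stdio.h>

static unsigned long cfs_request(unsigned long walt_capacity,
                                 unsigned long rt, unsigned long dl)
{
    unsigned long rtdl = rt + dl;

    return walt_capacity > rtdl ? walt_capacity - rtdl : 0;
}

int main(void)
{
    printf("%lu\n", cfs_request(600, 100, 50));   /* 450: RT+DL removed       */
    printf("%lu\n", cfs_request(120, 100, 50));   /* 0: never goes negative   */
    return 0;
}
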
  14303. @@ -1621,8 +1726,33 @@
  14304.  static inline void sched_avg_update(struct rq *rq) { }
  14305.  #endif
  14306.  
  14307. -extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
  14308. -extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
  14309. +struct rq_flags {
  14310. +   unsigned long flags;
  14311. +   struct pin_cookie cookie;
  14312. +};
  14313. +
  14314. +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
  14315. +   __acquires(rq->lock);
  14316. +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
  14317. +   __acquires(p->pi_lock)
  14318. +   __acquires(rq->lock);
  14319. +
  14320. +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
  14321. +   __releases(rq->lock)
  14322. +{
  14323. +   lockdep_unpin_lock(&rq->lock, rf->cookie);
  14324. +   raw_spin_unlock(&rq->lock);
  14325. +}
  14326. +
  14327. +static inline void
  14328. +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  14329. +   __releases(rq->lock)
  14330. +   __releases(p->pi_lock)
  14331. +{
  14332. +   lockdep_unpin_lock(&rq->lock, rf->cookie);
  14333. +   raw_spin_unlock(&rq->lock);
  14334. +   raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
  14335. +}
  14336.  
  14337.  #ifdef CONFIG_SMP
  14338.  #ifdef CONFIG_PREEMPT
  14339. @@ -1811,8 +1941,8 @@
  14340.  extern void print_rt_stats(struct seq_file *m, int cpu);
  14341.  
  14342.  extern void init_cfs_rq(struct cfs_rq *cfs_rq);
  14343. -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
  14344. -extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
  14345. +extern void init_rt_rq(struct rt_rq *rt_rq);
  14346. +extern void init_dl_rq(struct dl_rq *dl_rq);
  14347.  
  14348.  extern void cfs_bandwidth_usage_inc(void);
  14349.  extern void cfs_bandwidth_usage_dec(void);
  14350. @@ -1878,6 +2008,69 @@
  14351.  #endif /* CONFIG_64BIT */
  14352.  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
  14353.  
  14354. +#ifdef CONFIG_CPU_FREQ
  14355. +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
  14356. +
  14357. +/**
  14358. + * cpufreq_update_util - Take a note about CPU utilization changes.
  14359. + * @rq: Runqueue to carry out the update for.
  14360. + * @flags: Update reason flags.
  14361. + *
  14362. + * This function is called by the scheduler on the CPU whose utilization is
  14363. + * being updated.
  14364. + *
  14365. + * It can only be called from RCU-sched read-side critical sections.
  14366. + *
  14367. + * The way cpufreq is currently arranged requires it to evaluate the CPU
  14368. + * performance state (frequency/voltage) on a regular basis to prevent it from
  14369. + * being stuck in a completely inadequate performance level for too long.
  14370. + * That is not guaranteed to happen if the updates are only triggered from CFS,
  14371. + * though, because they may not be coming in if RT or deadline tasks are active
  14372. + * all the time (or there are RT and DL tasks only).
  14373. + *
  14374. + * As a workaround for that issue, this function is called by the RT and DL
  14375. + * sched classes to trigger extra cpufreq updates to prevent it from stalling,
  14376. + * but that really is a band-aid.  Going forward it should be replaced with
  14377. + * solutions targeted more specifically at RT and DL tasks.
  14378. + */
  14379. +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
  14380. +{
  14381. +        struct update_util_data *data;
  14382. +
  14383. +        data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
  14384. +        if (data)
  14385. +                data->func(data, rq_clock(rq), flags);
  14386. +}
  14387. +
  14388. +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
  14389. +{
  14390. +        if (cpu_of(rq) == smp_processor_id())
  14391. +                cpufreq_update_util(rq, flags);
  14392. +}
  14393. +#else
  14394. +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
  14395. +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
  14396. +#endif /* CONFIG_CPU_FREQ */
  14397. +
  14398. +#ifdef CONFIG_SCHED_WALT
  14399. +
  14400. +static inline bool
  14401. +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
  14402. +{
  14403. +   return cpu_of(rq) == task_cpu(p) &&
  14404. +          (p->on_rq || p->last_sleep_ts >= rq->window_start);
  14405. +}
  14406. +
  14407. +#endif /* CONFIG_SCHED_WALT */
  14408. +
  14409. +#ifdef arch_scale_freq_capacity
  14410. +#ifndef arch_scale_freq_invariant
  14411. +#define arch_scale_freq_invariant()     (true)
  14412. +#endif
  14413. +#else /* arch_scale_freq_capacity */
  14414. +#define arch_scale_freq_invariant()     (false)
  14415. +#endif
  14416. +
  14417.  /*
  14418.   * task_may_not_preempt - check whether a task may not be preemptible soon
  14419.   */
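
A standalone sketch of the hook pattern that the cpufreq_update_util() comment above
describes: a governor registers a per-CPU callback and the scheduler invokes it with
a timestamp and reason flags whenever utilization changes. The plain pointer (instead
of an RCU-protected per-CPU slot) and the helper names are simplifications assumed
for the example.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct update_util_data {
    void (*func)(struct update_util_data *data, uint64_t time, unsigned int flags);
};

static struct update_util_data *cpu_hook;   /* one CPU's slot, for brevity */

static void governor_cb(struct update_util_data *data, uint64_t time, unsigned int flags)
{
    (void)data;
    printf("governor poked at t=%llu, flags=%#x\n",
           (unsigned long long)time, flags);
}

/* what cpufreq_update_util() boils down to once the RCU details are dropped */
static void update_util(uint64_t now, unsigned int flags)
{
    struct update_util_data *data = cpu_hook;

    if (data)
        data->func(data, now, flags);
}

int main(void)
{
    static struct update_util_data gov = { .func = governor_cb };

    cpu_hook = &gov;            /* the governor registers its hook */
    update_util(1000, 0x1);     /* the scheduler reports a utilization change */
    return 0;
}
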
  14420. diff -Nur /home/ninez/android/marlin/kernel/sched/stats.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.c
  14421. --- /home/ninez/android/marlin/kernel/sched/stats.c 2018-08-10 01:54:08.566728454 -0400
  14422. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.c 2018-08-11 23:57:17.131940887 -0400
  14423. @@ -12,6 +12,26 @@
  14424.   */
  14425.  #define SCHEDSTAT_VERSION 15
  14426.  
  14427. +static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
  14428. +{
  14429. +   /* eas-specific runqueue stats */
  14430. +   seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
  14431. +       stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine,
  14432. +       stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count);
  14433. +
  14434. +   seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
  14435. +       stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt,
  14436. +       stats->secb_insuff_cap, stats->secb_no_nrg_sav,
  14437. +       stats->secb_nrg_sav, stats->secb_count);
  14438. +
  14439. +   seq_printf(seq, "%llu %llu %llu %llu %llu ",
  14440. +       stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd,
  14441. +       stats->fbt_pref_idle, stats->fbt_count);
  14442. +
  14443. +   seq_printf(seq, "%llu %llu\n",
  14444. +       stats->cas_attempts, stats->cas_count);
  14445. +}
  14446. +
  14447.  static int show_schedstat(struct seq_file *seq, void *v)
  14448.  {
  14449.     int cpu;
  14450. @@ -44,6 +64,7 @@
  14451.  
  14452.         seq_printf(seq, "\n");
  14453.  
  14454. +       show_easstat(seq, &rq->eas_stats);
  14455.  #ifdef CONFIG_SMP
  14456.         /* domain-specific stats */
  14457.         rcu_read_lock();
  14458. @@ -72,6 +93,8 @@
  14459.                 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
  14460.                 sd->ttwu_wake_remote, sd->ttwu_move_affine,
  14461.                 sd->ttwu_move_balance);
  14462. +
  14463. +           show_easstat(seq, &sd->eas_stats);
  14464.         }
  14465.         rcu_read_unlock();
  14466.  #endif
  14467. diff -Nur /home/ninez/android/marlin/kernel/sched/stats.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.h
  14468. --- /home/ninez/android/marlin/kernel/sched/stats.h 2018-08-10 01:54:08.566728454 -0400
  14469. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.h 2018-08-26 16:43:11.650539699 -0400
  14470. @@ -29,9 +29,13 @@
  14471.     if (rq)
  14472.         rq->rq_sched_info.run_delay += delta;
  14473.  }
  14474. -# define schedstat_inc(rq, field)  do { (rq)->field++; } while (0)
  14475. -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
  14476. -# define schedstat_set(var, val)   do { var = (val); } while (0)
  14477. +#define schedstat_enabled()        static_branch_unlikely(&sched_schedstats)
  14478. +#define schedstat_inc(var)     do { if (schedstat_enabled()) { var++; } } while (0)
  14479. +#define schedstat_add(var, amt)        do { if (schedstat_enabled()) { var += (amt); } } while (0)
  14480. +#define schedstat_set(var, val)        do { if (schedstat_enabled()) { var = (val); } } while (0)
  14481. +#define schedstat_val(var)     (var)
  14482. +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
  14483. +
  14484.  #else /* !CONFIG_SCHEDSTATS */
  14485.  static inline void
  14486.  rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
  14487. @@ -42,10 +46,13 @@
  14488.  static inline void
  14489.  rq_sched_info_depart(struct rq *rq, unsigned long long delta)
  14490.  {}
  14491. -# define schedstat_inc(rq, field)  do { } while (0)
  14492. -# define schedstat_add(rq, field, amt) do { } while (0)
  14493. -# define schedstat_set(var, val)   do { } while (0)
  14494. -#endif
  14495. +#define schedstat_enabled()        0
  14496. +#define schedstat_inc(var)     do { } while (0)
  14497. +#define schedstat_add(var, amt)        do { } while (0)
  14498. +#define schedstat_set(var, val)        do { } while (0)
  14499. +#define schedstat_val(var)     0
  14500. +#define schedstat_val_or_zero(var) 0
  14501. +#endif /* CONFIG_SCHEDSTATS */
  14502.  
  14503.  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
  14504.  static inline void sched_info_reset_dequeued(struct task_struct *t)
  14505. @@ -174,7 +181,8 @@
  14506.  {
  14507.     struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
  14508.  
  14509. -   if (!cputimer->running)
  14510. +   /* Check if cputimer isn't running. This is accessed without locking. */
  14511. +   if (!READ_ONCE(cputimer->running))
  14512.         return false;
  14513.  
  14514.     /*
  14515. @@ -215,9 +223,7 @@
  14516.     if (!cputimer_running(tsk))
  14517.         return;
  14518.  
  14519. -   raw_spin_lock(&cputimer->lock);
  14520. -   cputimer->cputime.utime += cputime;
  14521. -   raw_spin_unlock(&cputimer->lock);
  14522. +   atomic64_add(cputime, &cputimer->cputime_atomic.utime);
  14523.  }
  14524.  
  14525.  /**
  14526. @@ -238,9 +244,7 @@
  14527.     if (!cputimer_running(tsk))
  14528.         return;
  14529.  
  14530. -   raw_spin_lock(&cputimer->lock);
  14531. -   cputimer->cputime.stime += cputime;
  14532. -   raw_spin_unlock(&cputimer->lock);
  14533. +   atomic64_add(cputime, &cputimer->cputime_atomic.stime);
  14534.  }
  14535.  
  14536.  /**
  14537. @@ -261,7 +265,5 @@
  14538.     if (!cputimer_running(tsk))
  14539.         return;
  14540.  
  14541. -   raw_spin_lock(&cputimer->lock);
  14542. -   cputimer->cputime.sum_exec_runtime += ns;
  14543. -   raw_spin_unlock(&cputimer->lock);
  14544. +   atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
  14545.  }
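
For the schedstat_*() macros reworked earlier in this stats.h diff, a compact
userspace sketch of the new single-argument, runtime-gated form. A plain bool stands
in for static_branch_unlikely(&sched_schedstats), and the struct and field below are
invented for the example.

#include <stdbool.h>
#include <stdio.h>

static bool schedstats_on = true;           /* stand-in for the static key */

#define schedstat_enabled()      (schedstats_on)
#define schedstat_inc(var)       do { if (schedstat_enabled()) { (var)++; } } while (0)
#define schedstat_add(var, amt)  do { if (schedstat_enabled()) { (var) += (amt); } } while (0)

struct fake_rq { unsigned long yld_count; };

int main(void)
{
    struct fake_rq rq = { 0 };

    schedstat_inc(rq.yld_count);        /* new style: pass the counter itself */
    schedstat_add(rq.yld_count, 3);

    schedstats_on = false;              /* with the switch off, updates are skipped */
    schedstat_inc(rq.yld_count);

    printf("yld_count = %lu\n", rq.yld_count);   /* prints 4 */
    return 0;
}
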
  14546. diff -Nur /home/ninez/android/marlin/kernel/sched/stop_task.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stop_task.c
  14547. --- /home/ninez/android/marlin/kernel/sched/stop_task.c 2018-08-10 01:54:08.566728454 -0400
  14548. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stop_task.c 2018-08-21 23:22:44.643944617 -0400
  14549. @@ -12,7 +12,8 @@
  14550.  
  14551.  #ifdef CONFIG_SMP
  14552.  static int
  14553. -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
  14554. +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
  14555. +           int sibling_count_hint)
  14556.  {
  14557.     return task_cpu(p); /* stop tasks as never migrate */
  14558.  }
  14559. @@ -25,7 +26,7 @@
  14560.  }
  14561.  
  14562.  static struct task_struct *
  14563. -pick_next_task_stop(struct rq *rq, struct task_struct *prev)
  14564. +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
  14565.  {
  14566.     struct task_struct *stop = rq->stop;
  14567.  
  14568. @@ -126,6 +127,7 @@
  14569.  
  14570.  #ifdef CONFIG_SMP
  14571.     .select_task_rq     = select_task_rq_stop,
  14572. +   .set_cpus_allowed   = set_cpus_allowed_common,
  14573.  #endif
  14574.  
  14575.     .set_curr_task          = set_curr_task_stop,
  14576. diff -Nur /home/ninez/android/marlin/kernel/sched/swait.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swait.c
  14577. --- /home/ninez/android/marlin/kernel/sched/swait.c 1969-12-31 19:00:00.000000000 -0500
  14578. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swait.c 2018-08-13 18:40:12.199646700 -0400
  14579. @@ -0,0 +1,134 @@
  14580. +#include <linux/sched.h>
  14581. +#include <linux/swait.h>
  14582. +
  14583. +void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
  14584. +                struct lock_class_key *key)
  14585. +{
  14586. +   raw_spin_lock_init(&q->lock);
  14587. +   lockdep_set_class_and_name(&q->lock, key, name);
  14588. +   INIT_LIST_HEAD(&q->task_list);
  14589. +}
  14590. +EXPORT_SYMBOL(__init_swait_queue_head);
  14591. +
  14592. +/*
  14593. + * The thing about the wake_up_state() return value; I think we can ignore it.
  14594. + *
  14595. + * If for some reason it would return 0, that means the previously waiting
  14596. + * task is already running, so it will observe condition true (or has already).
  14597. + */
  14598. +void swake_up_locked(struct swait_queue_head *q)
  14599. +{
  14600. +   struct swait_queue *curr;
  14601. +
  14602. +   if (list_empty(&q->task_list))
  14603. +       return;
  14604. +
  14605. +   curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
  14606. +   wake_up_process(curr->task);
  14607. +   list_del_init(&curr->task_list);
  14608. +}
  14609. +EXPORT_SYMBOL(swake_up_locked);
  14610. +
  14611. +void swake_up_all_locked(struct swait_queue_head *q)
  14612. +{
  14613. +   struct swait_queue *curr;
  14614. +   int wakes = 0;
  14615. +
  14616. +   while (!list_empty(&q->task_list)) {
  14617. +
  14618. +       curr = list_first_entry(&q->task_list, typeof(*curr),
  14619. +                   task_list);
  14620. +       wake_up_process(curr->task);
  14621. +       list_del_init(&curr->task_list);
  14622. +       wakes++;
  14623. +   }
  14624. +   //WARN_ON(wakes > 2);
  14625. +}
  14626. +EXPORT_SYMBOL(swake_up_all_locked);
  14627. +
  14628. +void swake_up(struct swait_queue_head *q)
  14629. +{
  14630. +   unsigned long flags;
  14631. +
  14632. +   raw_spin_lock_irqsave(&q->lock, flags);
  14633. +   swake_up_locked(q);
  14634. +   raw_spin_unlock_irqrestore(&q->lock, flags);
  14635. +}
  14636. +EXPORT_SYMBOL(swake_up);
  14637. +
  14638. +/*
  14639. + * Does not allow usage from IRQ disabled, since we must be able to
  14640. + * release IRQs to guarantee bounded hold time.
  14641. + */
  14642. +void swake_up_all(struct swait_queue_head *q)
  14643. +{
  14644. +   struct swait_queue *curr;
  14645. +   LIST_HEAD(tmp);
  14646. +
  14647. +   raw_spin_lock_irq(&q->lock);
  14648. +   list_splice_init(&q->task_list, &tmp);
  14649. +   while (!list_empty(&tmp)) {
  14650. +       curr = list_first_entry(&tmp, typeof(*curr), task_list);
  14651. +
  14652. +       wake_up_state(curr->task, TASK_NORMAL);
  14653. +       list_del_init(&curr->task_list);
  14654. +
  14655. +       if (list_empty(&tmp))
  14656. +           break;
  14657. +
  14658. +       raw_spin_unlock_irq(&q->lock);
  14659. +       raw_spin_lock_irq(&q->lock);
  14660. +   }
  14661. +   raw_spin_unlock_irq(&q->lock);
  14662. +}
  14663. +EXPORT_SYMBOL(swake_up_all);
  14664. +
  14665. +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
  14666. +{
  14667. +   wait->task = current;
  14668. +   if (list_empty(&wait->task_list))
  14669. +       list_add(&wait->task_list, &q->task_list);
  14670. +}
  14671. +
  14672. +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
  14673. +{
  14674. +   unsigned long flags;
  14675. +
  14676. +   raw_spin_lock_irqsave(&q->lock, flags);
  14677. +   __prepare_to_swait(q, wait);
  14678. +   set_current_state(state);
  14679. +   raw_spin_unlock_irqrestore(&q->lock, flags);
  14680. +}
  14681. +EXPORT_SYMBOL(prepare_to_swait);
  14682. +
  14683. +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
  14684. +{
  14685. +   if (signal_pending_state(state, current))
  14686. +       return -ERESTARTSYS;
  14687. +
  14688. +   prepare_to_swait(q, wait, state);
  14689. +
  14690. +   return 0;
  14691. +}
  14692. +EXPORT_SYMBOL(prepare_to_swait_event);
  14693. +
  14694. +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
  14695. +{
  14696. +   __set_current_state(TASK_RUNNING);
  14697. +   if (!list_empty(&wait->task_list))
  14698. +       list_del_init(&wait->task_list);
  14699. +}
  14700. +
  14701. +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
  14702. +{
  14703. +   unsigned long flags;
  14704. +
  14705. +   __set_current_state(TASK_RUNNING);
  14706. +
  14707. +   if (!list_empty_careful(&wait->task_list)) {
  14708. +       raw_spin_lock_irqsave(&q->lock, flags);
  14709. +       list_del_init(&wait->task_list);
  14710. +       raw_spin_unlock_irqrestore(&q->lock, flags);
  14711. +   }
  14712. +}
  14713. +EXPORT_SYMBOL(finish_swait);
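
The simple-waitqueue primitives added above pair with the swait_event_*() macros from linux/swait.h (the header is assumed to come with the same backport; it is not part of this diff). A minimal sketch, with my_wq/my_pending/my_waiter/my_poke being hypothetical names:

    #include <linux/swait.h>
    #include <linux/kthread.h>

    static DECLARE_SWAIT_QUEUE_HEAD(my_wq);  /* or init_swait_queue_head() at runtime */
    static bool my_pending;

    static int my_waiter(void *unused)
    {
            while (!kthread_should_stop()) {
                    /* sleeps until the condition is true or the kthread is stopped */
                    swait_event_interruptible(my_wq, READ_ONCE(my_pending) ||
                                                     kthread_should_stop());
                    WRITE_ONCE(my_pending, false);
                    /* ... handle the event ... */
            }
            return 0;
    }

    /* safe from IRQ context; wakes at most one waiter, keeping hold times bounded */
    static void my_poke(void)
    {
            WRITE_ONCE(my_pending, true);
            swake_up(&my_wq);
    }

swake_up_all(), by contrast, must not be called with IRQs disabled, as noted in the comment above it.
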
  14714. diff -Nur /home/ninez/android/marlin/kernel/sched/swork.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swork.c
  14715. --- /home/ninez/android/marlin/kernel/sched/swork.c 1969-12-31 19:00:00.000000000 -0500
  14716. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swork.c 2018-08-12 21:14:08.273505429 -0400
  14717. @@ -0,0 +1,172 @@
  14718. +/*
  14719. + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner [email protected]
  14720. + *
  14721. + * Provides a framework for enqueuing callbacks from irq context
  14722. + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
  14723. + */
  14724. +
  14725. +#include <linux/swait.h>
  14726. +#include <linux/swork.h>
  14727. +#include <linux/kthread.h>
  14728. +#include <linux/slab.h>
  14729. +#include <linux/spinlock.h>
  14730. +
  14731. +#define SWORK_EVENT_PENDING     (1 << 0)
  14732. +
  14733. +static DEFINE_MUTEX(worker_mutex);
  14734. +static struct sworker *glob_worker;
  14735. +
  14736. +struct sworker {
  14737. +   struct list_head events;
  14738. +   struct swait_queue_head wq;
  14739. +
  14740. +   raw_spinlock_t lock;
  14741. +
  14742. +   struct task_struct *task;
  14743. +   int refs;
  14744. +};
  14745. +
  14746. +static bool swork_readable(struct sworker *worker)
  14747. +{
  14748. +   bool r;
  14749. +
  14750. +   if (kthread_should_stop())
  14751. +       return true;
  14752. +
  14753. +   raw_spin_lock_irq(&worker->lock);
  14754. +   r = !list_empty(&worker->events);
  14755. +   raw_spin_unlock_irq(&worker->lock);
  14756. +
  14757. +   return r;
  14758. +}
  14759. +
  14760. +static int swork_kthread(void *arg)
  14761. +{
  14762. +   struct sworker *worker = arg;
  14763. +
  14764. +   for (;;) {
  14765. +       swait_event_interruptible(worker->wq,
  14766. +                   swork_readable(worker));
  14767. +       if (kthread_should_stop())
  14768. +           break;
  14769. +
  14770. +       raw_spin_lock_irq(&worker->lock);
  14771. +       while (!list_empty(&worker->events)) {
  14772. +           struct swork_event *sev;
  14773. +
  14774. +           sev = list_first_entry(&worker->events,
  14775. +                   struct swork_event, item);
  14776. +           list_del(&sev->item);
  14777. +           raw_spin_unlock_irq(&worker->lock);
  14778. +
  14779. +           WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
  14780. +                            &sev->flags));
  14781. +           sev->func(sev);
  14782. +           raw_spin_lock_irq(&worker->lock);
  14783. +       }
  14784. +       raw_spin_unlock_irq(&worker->lock);
  14785. +   }
  14786. +   return 0;
  14787. +}
  14788. +
  14789. +static struct sworker *swork_create(void)
  14790. +{
  14791. +   struct sworker *worker;
  14792. +
  14793. +   worker = kzalloc(sizeof(*worker), GFP_KERNEL);
  14794. +   if (!worker)
  14795. +       return ERR_PTR(-ENOMEM);
  14796. +
  14797. +   INIT_LIST_HEAD(&worker->events);
  14798. +   raw_spin_lock_init(&worker->lock);
  14799. +   init_swait_queue_head(&worker->wq);
  14800. +
  14801. +   worker->task = kthread_run(swork_kthread, worker, "kswork");
  14802. +   if (IS_ERR(worker->task)) {
  14803. +       kfree(worker);
  14804. +       return ERR_PTR(-ENOMEM);
  14805. +   }
  14806. +
  14807. +   return worker;
  14808. +}
  14809. +
  14810. +static void swork_destroy(struct sworker *worker)
  14811. +{
  14812. +   kthread_stop(worker->task);
  14813. +
  14814. +   WARN_ON(!list_empty(&worker->events));
  14815. +   kfree(worker);
  14816. +}
  14817. +
  14818. +/**
  14819. + * swork_queue - queue swork
  14820. + *
  14821. + * Returns %false if @work was already on a queue, %true otherwise.
  14822. + *
  14823. + * The work is queued and processed on a random CPU
  14824. + */
  14825. +bool swork_queue(struct swork_event *sev)
  14826. +{
  14827. +   unsigned long flags;
  14828. +
  14829. +   if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
  14830. +       return false;
  14831. +
  14832. +   raw_spin_lock_irqsave(&glob_worker->lock, flags);
  14833. +   list_add_tail(&sev->item, &glob_worker->events);
  14834. +   raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
  14835. +
  14836. +   swake_up(&glob_worker->wq);
  14837. +   return true;
  14838. +}
  14839. +EXPORT_SYMBOL_GPL(swork_queue);
  14840. +
  14841. +/**
  14842. + * swork_get - get an instance of the sworker
  14843. + *
   14844. + * Returns a negative error code if the initialization of the worker did not
  14845. + * work, %0 otherwise.
  14846. + *
  14847. + */
  14848. +int swork_get(void)
  14849. +{
  14850. +   struct sworker *worker;
  14851. +
  14852. +   mutex_lock(&worker_mutex);
  14853. +   if (!glob_worker) {
  14854. +       worker = swork_create();
  14855. +       if (IS_ERR(worker)) {
  14856. +           mutex_unlock(&worker_mutex);
  14857. +           return -ENOMEM;
  14858. +       }
  14859. +
  14860. +       glob_worker = worker;
  14861. +   }
  14862. +
  14863. +   glob_worker->refs++;
  14864. +   mutex_unlock(&worker_mutex);
  14865. +
  14866. +   return 0;
  14867. +}
  14868. +EXPORT_SYMBOL_GPL(swork_get);
  14869. +
  14870. +/**
  14871. + * swork_put - puts an instance of the sworker
  14872. + *
  14873. + * Will destroy the sworker thread. This function must not be called until all
  14874. + * queued events have been completed.
  14875. + */
  14876. +void swork_put(void)
  14877. +{
  14878. +   mutex_lock(&worker_mutex);
  14879. +
  14880. +   glob_worker->refs--;
  14881. +   if (glob_worker->refs > 0)
  14882. +       goto out;
  14883. +
  14884. +   swork_destroy(glob_worker);
  14885. +   glob_worker = NULL;
  14886. +out:
  14887. +   mutex_unlock(&worker_mutex);
  14888. +}
  14889. +EXPORT_SYMBOL_GPL(swork_put);
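
A hedged sketch of how a driver could use the swork framework above to defer work from hard-IRQ context into the kswork kthread (where sleeping is allowed). The .func field mirrors what swork_kthread() dereferences above; my_deferred/my_probe/my_irq/my_remove are hypothetical, and the backported linux/swork.h may also offer an INIT_SWORK() helper instead of the static initializer used here:

    #include <linux/swork.h>
    #include <linux/interrupt.h>

    static void my_deferred(struct swork_event *sev)
    {
            /* runs in the "kswork" kthread: may sleep, take mutexes, etc. */
    }

    static struct swork_event my_event = {
            .func = my_deferred,
    };

    static int my_probe(void)
    {
            /* creates the global kswork thread on first use (refcounted) */
            return swork_get();
    }

    static irqreturn_t my_irq(int irq, void *dev_id)
    {
            /* returns false if the event was already pending */
            swork_queue(&my_event);
            return IRQ_HANDLED;
    }

    static void my_remove(void)
    {
            /* only after all queued events have completed, per the comment above */
            swork_put();
    }
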
  14890. diff -Nur /home/ninez/android/marlin/kernel/sched/tune.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.c
  14891. --- /home/ninez/android/marlin/kernel/sched/tune.c  2018-08-10 01:54:08.566728454 -0400
  14892. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.c  2018-08-14 15:53:43.604124856 -0400
  14893. @@ -12,13 +12,25 @@
  14894.  #include "tune.h"
  14895.  
  14896.  #ifdef CONFIG_CGROUP_SCHEDTUNE
  14897. -static bool schedtune_initialized = false;
  14898. +bool schedtune_initialized = false;
  14899.  #endif
  14900.  
  14901. -unsigned int sysctl_sched_cfs_boost __read_mostly;
  14902. +extern struct rq *lock_rq_of(struct task_struct *p, struct rq_flags *rf);
  14903. +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
  14904.  
  14905. -static struct reciprocal_value schedtune_spc_rdiv;
  14906. -extern struct target_nrg schedtune_target_nrg;
  14907. +int sysctl_sched_cfs_boost __read_mostly;
  14908. +
  14909. +/* We hold schedtune boost in effect for at least this long */
  14910. +#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL
  14911. +
  14912. +extern struct reciprocal_value schedtune_spc_rdiv;
  14913. +struct target_nrg schedtune_target_nrg;
  14914. +
  14915. +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
  14916. +static DEFINE_MUTEX(stune_boost_mutex);
  14917. +static struct schedtune *getSchedtune(char *st_name);
  14918. +static int dynamic_boost_write(struct schedtune *st, int boost);
  14919. +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
  14920.  
  14921.  /* Performance Boost region (B) threshold params */
  14922.  static int perf_boost_idx;
  14923. @@ -130,6 +142,14 @@
  14924.     /* Hint to bias scheduling of tasks on that SchedTune CGroup
  14925.      * towards idle CPUs */
  14926.     int prefer_idle;
  14927. +
  14928. +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
  14929. +   /*
  14930. +    * This tracks the default boost value and is used to restore
  14931. +    * the value when Dynamic SchedTune Boost is reset.
  14932. +    */
  14933. +   int boost_default;
  14934. +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
  14935.  };
  14936.  
  14937.  static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
  14938. @@ -162,6 +182,9 @@
  14939.     .perf_boost_idx = 0,
  14940.     .perf_constrain_idx = 0,
  14941.     .prefer_idle = 0,
  14942. +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
  14943. +   .boost_default = 0,
  14944. +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
  14945.  };
  14946.  
  14947.  int
  14948. @@ -206,7 +229,8 @@
  14949.   *    implementation especially for the computation of the per-CPU boost
  14950.   *    value
  14951.   */
  14952. -#define BOOSTGROUPS_COUNT 5
  14953. +
  14954. +#define BOOSTGROUPS_COUNT 7
  14955.  
  14956.  /* Array of configured boostgroups */
  14957.  static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
  14958. @@ -226,45 +250,68 @@
  14959.     /* Maximum boost value for all RUNNABLE tasks on a CPU */
  14960.     bool idle;
  14961.     int boost_max;
  14962. +   u64 boost_ts;
  14963.     struct {
  14964.         /* The boost for tasks on that boost group */
  14965.         int boost;
  14966.         /* Count of RUNNABLE tasks on that boost group */
  14967.         unsigned tasks;
  14968. +       /* Timestamp of boost activation */
  14969. +       u64 ts;
  14970.     } group[BOOSTGROUPS_COUNT];
  14971.     /* CPU's boost group locking */
  14972.     raw_spinlock_t lock;
  14973.  };
  14974.  
  14975.  /* Boost groups affecting each CPU in the system */
  14976. -DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
  14977. +static DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
  14978. +
  14979. +static inline bool schedtune_boost_timeout(u64 now, u64 ts)
  14980. +{
  14981. +   return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
  14982. +}
  14983. +
  14984. +static inline bool
  14985. +schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now)
  14986. +{
  14987. +   if (bg->group[idx].tasks)
  14988. +       return true;
  14989. +
  14990. +   return !schedtune_boost_timeout(now, bg->group[idx].ts);
  14991. +}
  14992.  
  14993.  static void
  14994. -schedtune_cpu_update(int cpu)
  14995. +schedtune_cpu_update(int cpu, u64 now)
  14996.  {
  14997.     struct boost_groups *bg;
  14998. -   int boost_max;
  14999. +   u64 boost_ts = now;
  15000. +   int boost_max = INT_MIN;
  15001.     int idx;
  15002.  
  15003.     bg = &per_cpu(cpu_boost_groups, cpu);
  15004.  
  15005. -   /* The root boost group is always active */
  15006. -   boost_max = bg->group[0].boost;
  15007. -   for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
  15008. +   for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) {
  15009.         /*
  15010.          * A boost group affects a CPU only if it has
  15011. -        * RUNNABLE tasks on that CPU
   15012. +        * RUNNABLE tasks on that CPU or it has a hold
  15013. +        * in effect from a previous task.
  15014.          */
  15015. -       if (bg->group[idx].tasks == 0)
  15016. +       if (!schedtune_boost_group_active(idx, bg, now))
  15017. +           continue;
  15018. +
  15019. +       /* this boost group is active */
  15020. +       if (boost_max > bg->group[idx].boost)
  15021.             continue;
  15022.  
  15023. -       boost_max = max(boost_max, bg->group[idx].boost);
  15024. +       boost_max = bg->group[idx].boost;
  15025. +       boost_ts =  bg->group[idx].ts;
  15026.     }
  15027. -   /* Ensures boost_max is non-negative when all cgroup boost values
  15028. -    * are neagtive. Avoids under-accounting of cpu capacity which may cause
  15029. -    * task stacking and frequency spikes.*/
  15030. -   boost_max = max(boost_max, 0);
  15031. +
  15032. +   /* If there are no active boost groups on the CPU, set no boost  */
  15033. +   if (boost_max == INT_MIN)
  15034. +       boost_max = 0;
  15035.     bg->boost_max = boost_max;
  15036. +   bg->boost_ts = boost_ts;
  15037.  }
  15038.  
  15039.  static int
  15040. @@ -274,6 +321,7 @@
  15041.     int cur_boost_max;
  15042.     int old_boost;
  15043.     int cpu;
  15044. +   u64 now;
  15045.  
  15046.     /* Update per CPU boost groups */
  15047.     for_each_possible_cpu(cpu) {
  15048. @@ -290,16 +338,22 @@
  15049.         /* Update the boost value of this boost group */
  15050.         bg->group[idx].boost = boost;
  15051.  
  15052. -       /* Check if this update increase current max */
  15053. -       if (boost > cur_boost_max && bg->group[idx].tasks) {
  15054. +       now = sched_clock_cpu(cpu);
  15055. +       /*
   15056. +        * Check if this update increases the current max.
  15057. +        */
  15058. +       if (boost > cur_boost_max &&
  15059. +           schedtune_boost_group_active(idx, bg, now)) {
  15060.             bg->boost_max = boost;
  15061. +           bg->boost_ts = bg->group[idx].ts;
  15062. +
  15063.             trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
  15064.             continue;
  15065.         }
  15066.  
  15067.         /* Check if this update has decreased current max */
  15068.         if (cur_boost_max == old_boost && old_boost > boost) {
  15069. -           schedtune_cpu_update(cpu);
  15070. +           schedtune_cpu_update(cpu, now);
  15071.             trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
  15072.             continue;
  15073.         }
  15074. @@ -313,21 +367,38 @@
  15075.  #define ENQUEUE_TASK  1
  15076.  #define DEQUEUE_TASK -1
  15077.  
  15078. +static inline bool
  15079. +schedtune_update_timestamp(struct task_struct *p)
  15080. +{
  15081. +   if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
  15082. +       return true;
  15083. +
  15084. +   return task_has_rt_policy(p);
  15085. +}
  15086. +
  15087.  static inline void
  15088.  schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
  15089.  {
  15090.     struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
  15091.     int tasks = bg->group[idx].tasks + task_count;
  15092. +   u64 now;
  15093.  
  15094.     /* Update boosted tasks count while avoiding to make it negative */
  15095.     bg->group[idx].tasks = max(0, tasks);
  15096. +   /* Update timeout on enqueue */
  15097. +   if (task_count > 0) {
  15098. +       now = sched_clock_cpu(cpu);
  15099. +       if (schedtune_update_timestamp(p))
  15100. +           bg->group[idx].ts = now;
  15101. +
  15102. +       /* Boost group activation or deactivation on that RQ */
  15103. +       if (bg->group[idx].tasks == 1)
  15104. +           schedtune_cpu_update(cpu, now);
  15105. +   }
  15106.  
  15107.     trace_sched_tune_tasks_update(p, cpu, tasks, idx,
  15108. -           bg->group[idx].boost, bg->boost_max);
  15109. -
  15110. -   /* Boost group activation or deactivation on that RQ */
  15111. -   if (tasks == 1 || tasks == 0)
  15112. -       schedtune_cpu_update(cpu);
  15113. +           bg->group[idx].boost, bg->boost_max,
  15114. +           bg->group[idx].ts);
  15115.  }
  15116.  
  15117.  /*
  15118. @@ -381,12 +452,13 @@
  15119.  {
  15120.     struct task_struct *task;
  15121.     struct boost_groups *bg;
  15122. -   unsigned long irq_flags;
  15123. +   struct rq_flags irq_flags;
  15124.     unsigned int cpu;
  15125.     struct rq *rq;
  15126.     int src_bg; /* Source boost group index */
  15127.     int dst_bg; /* Destination boost group index */
  15128.     int tasks;
  15129. +   u64 now;
  15130.  
  15131.     if (!unlikely(schedtune_initialized))
  15132.         return 0;
  15133. @@ -431,18 +503,19 @@
  15134.          * current boost group.
  15135.          */
  15136.  
  15137. +       now = sched_clock_cpu(cpu);
  15138. +
  15139.         /* Move task from src to dst boost group */
  15140.         tasks = bg->group[src_bg].tasks - 1;
  15141.         bg->group[src_bg].tasks = max(0, tasks);
  15142.         bg->group[dst_bg].tasks += 1;
  15143. +       bg->group[dst_bg].ts = now;
  15144. +
  15145. +       /* update next time someone asks */
  15146. +       bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
  15147.  
  15148.         raw_spin_unlock(&bg->lock);
  15149.         unlock_rq_of(rq, task, &irq_flags);
  15150. -
  15151. -       /* Update CPU boost group */
  15152. -       if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
  15153. -           schedtune_cpu_update(task_cpu(task));
  15154. -
  15155.     }
  15156.  
  15157.     return 0;
  15158. @@ -501,7 +574,7 @@
  15159.  void schedtune_exit_task(struct task_struct *tsk)
  15160.  {
  15161.     struct schedtune *st;
  15162. -   unsigned long irq_flags;
  15163. +   struct rq_flags irq_flags;
  15164.     unsigned int cpu;
  15165.     struct rq *rq;
  15166.     int idx;
  15167. @@ -524,8 +597,15 @@
  15168.  int schedtune_cpu_boost(int cpu)
  15169.  {
  15170.     struct boost_groups *bg;
  15171. +   u64 now;
  15172.  
  15173.     bg = &per_cpu(cpu_boost_groups, cpu);
  15174. +   now = sched_clock_cpu(cpu);
  15175. +
  15176. +   /* check to see if we have a hold in effect */
  15177. +   if (schedtune_boost_timeout(now, bg->boost_ts))
  15178. +       schedtune_cpu_update(cpu, now);
  15179. +
  15180.     return bg->boost_max;
  15181.  }
  15182.  
  15183. @@ -534,6 +614,9 @@
  15184.     struct schedtune *st;
  15185.     int task_boost;
  15186.  
  15187. +   if (!unlikely(schedtune_initialized))
  15188. +       return 0;
  15189. +
  15190.     /* Get task boost value */
  15191.     rcu_read_lock();
  15192.     st = task_schedtune(p);
  15193. @@ -548,6 +631,9 @@
  15194.     struct schedtune *st;
  15195.     int prefer_idle;
  15196.  
  15197. +   if (!unlikely(schedtune_initialized))
  15198. +       return 0;
  15199. +
  15200.     /* Get prefer_idle value */
  15201.     rcu_read_lock();
  15202.     st = task_schedtune(p);
  15203. @@ -606,6 +692,9 @@
  15204.     st->perf_constrain_idx = threshold_idx;
  15205.  
  15206.     st->boost = boost;
  15207. +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
  15208. +   st->boost_default = boost;
  15209. +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
  15210.     if (css == &root_schedtune.css) {
  15211.         sysctl_sched_cfs_boost = boost;
  15212.         perf_boost_idx  = threshold_idx;
  15213. @@ -615,11 +704,11 @@
  15214.     /* Update CPU boost */
  15215.     schedtune_boostgroup_update(st->idx, st->boost);
  15216.  
  15217. -   trace_sched_tune_config(st->boost,
  15218. -           threshold_gains[st->perf_boost_idx].nrg_gain,
  15219. -           threshold_gains[st->perf_boost_idx].cap_gain,
  15220. -           threshold_gains[st->perf_constrain_idx].nrg_gain,
  15221. -           threshold_gains[st->perf_constrain_idx].cap_gain);
  15222. +// trace_sched_tune_config(st->boost,
  15223. +//         threshold_gains[st->perf_boost_idx].nrg_gain,
  15224. +//         threshold_gains[st->perf_boost_idx].cap_gain,
  15225. +//         threshold_gains[st->perf_constrain_idx].nrg_gain,
  15226. +//         threshold_gains[st->perf_constrain_idx].cap_gain);
  15227.  
  15228.     return 0;
  15229.  }
  15230. @@ -652,6 +741,8 @@
  15231.         bg = &per_cpu(cpu_boost_groups, cpu);
  15232.         bg->group[st->idx].boost = 0;
  15233.         bg->group[st->idx].tasks = 0;
  15234. +       bg->group[st->idx].ts = 0;
  15235. +       raw_spin_lock_init(&bg->lock);
  15236.     }
  15237.  
  15238.     return 0;
  15239. @@ -747,6 +838,114 @@
  15240.     schedtune_initialized = true;
  15241.  }
  15242.  
  15243. +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
  15244. +static struct schedtune *getSchedtune(char *st_name)
  15245. +{
  15246. +   int idx;
  15247. +
  15248. +   for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
  15249. +       char name_buf[NAME_MAX + 1];
  15250. +       struct schedtune *st = allocated_group[idx];
  15251. +
  15252. +       if (!st) {
  15253. +           pr_warn("SCHEDTUNE: Could not find %s\n", st_name);
  15254. +           break;
  15255. +       }
  15256. +
  15257. +       cgroup_name(st->css.cgroup, name_buf, sizeof(name_buf));
  15258. +       if (strncmp(name_buf, st_name, strlen(st_name)) == 0)
  15259. +           return st;
  15260. +   }
  15261. +
  15262. +   return NULL;
  15263. +}
  15264. +
  15265. +static int dynamic_boost_write(struct schedtune *st, int boost)
  15266. +{
  15267. +   int ret;
  15268. +   /* Backup boost_default */
  15269. +   int boost_default_backup = st->boost_default;
  15270. +
  15271. +   ret = boost_write(&st->css, NULL, boost);
  15272. +
  15273. +   /* Restore boost_default */
  15274. +   st->boost_default = boost_default_backup;
  15275. +
  15276. +   return ret;
  15277. +}
  15278. +
  15279. +int do_stune_boost(char *st_name, int boost)
  15280. +{
  15281. +   int ret = 0;
  15282. +   struct schedtune *st = getSchedtune(st_name);
  15283. +
  15284. +   if (!st)
  15285. +       return -EINVAL;
  15286. +
  15287. +   mutex_lock(&stune_boost_mutex);
  15288. +
  15289. +   /* Boost if new value is greater than current */
  15290. +   if (boost > st->boost)
  15291. +       ret = dynamic_boost_write(st, boost);
  15292. +
  15293. +   mutex_unlock(&stune_boost_mutex);
  15294. +
  15295. +   return ret;
  15296. +}
  15297. +
  15298. +int do_stune_unboost(char *st_name, int boost)
  15299. +{
  15300. +   int ret = 0;
  15301. +   struct schedtune *st = getSchedtune(st_name);
  15302. +
  15303. +   if (!st)
  15304. +       return -EINVAL;
  15305. +
  15306. +   mutex_lock(&stune_boost_mutex);
  15307. +
  15308. +   /* Unboost if new value is less than current */
  15309. +   if (boost < st->boost)
  15310. +       ret = dynamic_boost_write(st, boost);
  15311. +
  15312. +   mutex_unlock(&stune_boost_mutex);
  15313. +
  15314. +   return ret;
  15315. +}
  15316. +
  15317. +int set_stune_boost(char *st_name, int boost)
  15318. +{
  15319. +   int ret = 0;
  15320. +   struct schedtune *st = getSchedtune(st_name);
  15321. +
  15322. +   if (!st)
  15323. +       return -EINVAL;
  15324. +
  15325. +   mutex_lock(&stune_boost_mutex);
  15326. +
   15327. +   /* Set boost regardless of whether the new value is greater than the current one */
  15328. +   ret = dynamic_boost_write(st, boost);
  15329. +
  15330. +   mutex_unlock(&stune_boost_mutex);
  15331. +
  15332. +   return ret;
  15333. +}
  15334. +
  15335. +int reset_stune_boost(char *st_name)
  15336. +{
  15337. +   int ret = 0;
  15338. +   struct schedtune *st = getSchedtune(st_name);
  15339. +
  15340. +   if (!st)
  15341. +       return -EINVAL;
  15342. +
  15343. +   mutex_lock(&stune_boost_mutex);
  15344. +   ret = dynamic_boost_write(st, st->boost_default);
  15345. +   mutex_unlock(&stune_boost_mutex);
  15346. +
  15347. +   return ret;
  15348. +}
  15349. +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
  15350. +
  15351.  #else /* CONFIG_CGROUP_SCHEDTUNE */
  15352.  
  15353.  int
  15354. @@ -894,79 +1093,6 @@
  15355.     }
  15356.  }
  15357.  
  15358. -static long
  15359. -schedtune_margin(unsigned long signal, long boost)
  15360. -{
  15361. -   long long margin = 0;
  15362. -
  15363. -   /*
  15364. -    * Signal proportional compensation (SPC)
  15365. -    *
  15366. -    * The Boost (B) value is used to compute a Margin (M) which is
  15367. -    * proportional to the complement of the original Signal (S):
  15368. -    *   M = B * (SCHED_CAPACITY_SCALE - S)
  15369. -    * The obtained M could be used by the caller to "boost" S.
  15370. -    */
  15371. -   if (boost >= 0) {
  15372. -       margin  = SCHED_CAPACITY_SCALE - signal;
  15373. -       margin *= boost;
  15374. -   } else
  15375. -       margin = -signal * boost;
  15376. -
  15377. -   margin  = reciprocal_divide(margin, schedtune_spc_rdiv);
  15378. -
  15379. -   if (boost < 0)
  15380. -       margin *= -1;
  15381. -   return margin;
  15382. -}
  15383. -
  15384. -static inline int
  15385. -schedtune_cpu_margin(unsigned long util, int cpu)
  15386. -{
  15387. -   int boost = schedtune_cpu_boost(cpu);
  15388. -
  15389. -   if (boost == 0)
  15390. -       return 0;
  15391. -
  15392. -   return schedtune_margin(util, boost);
  15393. -}
  15394. -
  15395. -static inline long
  15396. -schedtune_task_margin(struct task_struct *task)
  15397. -{
  15398. -   int boost = schedtune_task_boost(task);
  15399. -   unsigned long util;
  15400. -   long margin;
  15401. -
  15402. -   if (boost == 0)
  15403. -       return 0;
  15404. -
  15405. -   util = task_util(task, UTIL_AVG);
  15406. -   margin = schedtune_margin(util, boost);
  15407. -
  15408. -   return margin;
  15409. -}
  15410. -
  15411. -unsigned long boosted_cpu_util(int cpu)
  15412. -{
  15413. -   unsigned long util = cpu_util(cpu, UTIL_EST);
  15414. -   long margin = schedtune_cpu_margin(util, cpu);
  15415. -
  15416. -   trace_sched_boost_cpu(cpu, util, margin);
  15417. -
  15418. -   return util + margin;
  15419. -}
  15420. -
  15421. -unsigned long boosted_task_util(struct task_struct *task)
  15422. -{
  15423. -   unsigned long util = task_util(task, UTIL_EST);
  15424. -   long margin = schedtune_task_margin(task);
  15425. -
  15426. -   trace_sched_boost_task(task, util, margin);
  15427. -
  15428. -   return util + margin;
  15429. -}
  15430. -
  15431.  /*
  15432.   * Initialize the constants required to compute normalized energy.
  15433.   * The values of these constants depends on the EM data for the specific
  15434. @@ -1033,3 +1159,4 @@
  15435.  }
  15436.  postcore_initcall(schedtune_init);
  15437.  
  15438. +
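
The CONFIG_DYNAMIC_STUNE_BOOST helpers added above (do_stune_boost / do_stune_unboost / set_stune_boost / reset_stune_boost) are plain kernel-side entry points keyed by cgroup name. A minimal sketch of a caller; the prototypes are copied from the definitions above, while the call sites and the "top-app" group name (the usual Android schedtune group) are assumptions, not something this patch hooks up by itself:

    /* prototypes as defined in tune.c above */
    extern int do_stune_boost(char *st_name, int boost);
    extern int reset_stune_boost(char *st_name);

    static void burst_begin(void)
    {
            /* raise the group to at least 15; ignored if it is already boosted higher */
            do_stune_boost("top-app", 15);
    }

    static void burst_end(void)
    {
            /* return to the boost_default value captured in boost_write() */
            reset_stune_boost("top-app");
    }

Because dynamic_boost_write() restores boost_default after each write, a later reset always lands back on the value last set through the regular cgroup interface.
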
  15439. diff -Nur /home/ninez/android/marlin/kernel/sched/tune.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.h
  15440. --- /home/ninez/android/marlin/kernel/sched/tune.h  2018-08-10 01:54:08.566728454 -0400
  15441. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.h  2018-08-23 19:57:44.817608733 -0400
  15442. @@ -24,6 +24,9 @@
  15443.  void schedtune_enqueue_task(struct task_struct *p, int cpu);
  15444.  void schedtune_dequeue_task(struct task_struct *p, int cpu);
  15445.  
  15446. +int schedtune_accept_deltas(int nrg_delta, int cap_delta,
  15447. +               struct task_struct *task);
  15448. +
  15449.  #else /* CONFIG_CGROUP_SCHEDTUNE */
  15450.  
  15451.  #define schedtune_cpu_boost(cpu)  get_sysctl_sched_cfs_boost()
  15452. @@ -39,13 +42,6 @@
  15453.  int schedtune_accept_deltas(int nrg_delta, int cap_delta,
  15454.                 struct task_struct *task);
  15455.  
  15456. -#ifdef CONFIG_SMP
  15457. -unsigned long boosted_cpu_util(int cpu);
  15458. -#else
  15459. -#define boosted_cpu_util(cpu) cpu_util(cpu, UTIL_EST);
  15460. -#endif
  15461. -unsigned long boosted_task_util(struct task_struct *task);
  15462. -
  15463.  #else /* CONFIG_SCHED_TUNE */
  15464.  
  15465.  #define schedtune_cpu_boost(cpu)  0
  15466. @@ -58,7 +54,4 @@
  15467.  
  15468.  #define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
  15469.  
  15470. -#define boosted_cpu_util(cpu) cpu_util(cpu, UTIL_EST);
  15471. -#define boosted_task_util(cpu) task_util(cpu, UTIL_EST);
  15472. -
  15473.  #endif /* CONFIG_SCHED_TUNE */
  15474. diff -Nur /home/ninez/android/marlin/kernel/sched/wait.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/wait.c
  15475. --- /home/ninez/android/marlin/kernel/sched/wait.c  2018-08-10 01:54:08.566728454 -0400
  15476. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/wait.c  2018-08-11 23:57:17.131940887 -0400
  15477. @@ -9,6 +9,7 @@
  15478.  #include <linux/mm.h>
  15479.  #include <linux/wait.h>
  15480.  #include <linux/hash.h>
  15481. +#include <linux/kthread.h>
  15482.  
  15483.  void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
  15484.  {
  15485. @@ -297,6 +298,10 @@
  15486.  }
  15487.  EXPORT_SYMBOL(autoremove_wake_function);
  15488.  
  15489. +static inline bool is_kthread_should_stop(void)
  15490. +{
  15491. +   return (current->flags & PF_KTHREAD) && kthread_should_stop();
  15492. +}
  15493.  
  15494.  /*
  15495.   * DEFINE_WAIT_FUNC(wait, woken_wake_func);
  15496. @@ -326,7 +331,7 @@
  15497.      * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
  15498.      * also observe all state before the wakeup.
  15499.      */
  15500. -   if (!(wait->flags & WQ_FLAG_WOKEN))
  15501. +   if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
  15502.         timeout = schedule_timeout(timeout);
  15503.     __set_current_state(TASK_RUNNING);
  15504.  
  15505. @@ -336,7 +341,7 @@
  15506.      * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
  15507.      * an event.
  15508.      */
  15509. -   set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
  15510. +   smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
  15511.  
  15512.     return timeout;
  15513.  }
  15514. @@ -349,7 +354,7 @@
  15515.      * doesn't imply write barrier and the users expects write
  15516.      * barrier semantics on wakeup functions.  The following
  15517.      * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
  15518. -    * and is paired with set_mb() in wait_woken().
  15519. +    * and is paired with smp_store_mb() in wait_woken().
  15520.      */
  15521.     smp_wmb(); /* C */
  15522.     wait->flags |= WQ_FLAG_WOKEN;
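
For context on the wait_woken() tweaks above: the canonical caller pattern (already hinted at by the DEFINE_WAIT_FUNC comment in this file) looks roughly like the sketch below; my_waitq/my_cond/my_kthread are hypothetical. The is_kthread_should_stop() check added above means a kthread being stopped falls straight through wait_woken() instead of sleeping until the next wakeup:

    #include <linux/wait.h>
    #include <linux/kthread.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
    static bool my_cond;

    static int my_kthread(void *unused)
    {
            DEFINE_WAIT_FUNC(wait, woken_wake_function);

            add_wait_queue(&my_waitq, &wait);
            while (!kthread_should_stop()) {
                    if (!READ_ONCE(my_cond)) {
                            wait_woken(&wait, TASK_INTERRUPTIBLE,
                                       MAX_SCHEDULE_TIMEOUT);
                            continue;
                    }
                    WRITE_ONCE(my_cond, false);
                    /* ... consume the event ... */
            }
            remove_wait_queue(&my_waitq, &wait);
            return 0;
    }

    /* producer side: WRITE_ONCE(my_cond, true); wake_up(&my_waitq); */
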
  15523. diff -Nur /home/ninez/android/marlin/kernel/sched/walt.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.c
  15524. --- /home/ninez/android/marlin/kernel/sched/walt.c  2018-08-10 01:54:08.566728454 -0400
  15525. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.c  2018-08-11 23:57:17.131940887 -0400
  15526. @@ -20,7 +20,6 @@
  15527.   */
  15528.  
  15529.  #include <linux/syscore_ops.h>
  15530. -#include <linux/cpufreq.h>
  15531.  #include <trace/events/sched.h>
  15532.  #include "sched.h"
  15533.  #include "walt.h"
  15534. @@ -42,57 +41,49 @@
  15535.  
  15536.  unsigned int sysctl_sched_walt_init_task_load_pct = 15;
  15537.  
  15538. -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
  15539. -unsigned int __read_mostly walt_disabled = 0;
  15540. -
  15541. -static unsigned int max_possible_efficiency = 1024;
  15542. -static unsigned int min_possible_efficiency = 1024;
  15543. +/* true -> use PELT based load stats, false -> use window-based load stats */
  15544. +bool __read_mostly walt_disabled = false;
  15545.  
  15546.  /*
  15547. - * Maximum possible frequency across all cpus. Task demand and cpu
  15548. - * capacity (cpu_power) metrics are scaled in reference to it.
  15549. + * Window size (in ns). Adjust for the tick size so that the window
  15550. + * rollover occurs just before the tick boundary.
  15551.   */
  15552. -static unsigned int max_possible_freq = 1;
  15553. -
  15554. -/*
  15555. - * Minimum possible max_freq across all cpus. This will be same as
  15556. - * max_possible_freq on homogeneous systems and could be different from
  15557. - * max_possible_freq on heterogenous systems. min_max_freq is used to derive
  15558. - * capacity (cpu_power) of cpus.
  15559. - */
  15560. -static unsigned int min_max_freq = 1;
  15561. -
  15562. -static unsigned int max_capacity = 1024;
  15563. -static unsigned int min_capacity = 1024;
  15564. -static unsigned int max_load_scale_factor = 1024;
  15565. -static unsigned int max_possible_capacity = 1024;
  15566. -
  15567. -/* Mask of all CPUs that have  max_possible_capacity */
  15568. -static cpumask_t mpc_mask = CPU_MASK_ALL;
  15569. -
  15570. -/* Window size (in ns) */
  15571. -__read_mostly unsigned int walt_ravg_window = 20000000;
  15572. -
  15573. -/* Min window size (in ns) = 10ms */
  15574. -#define MIN_SCHED_RAVG_WINDOW 10000000
  15575. -
  15576. -/* Max window size (in ns) = 1s */
  15577. -#define MAX_SCHED_RAVG_WINDOW 1000000000
  15578. +__read_mostly unsigned int walt_ravg_window =
  15579. +                       (20000000 / TICK_NSEC) * TICK_NSEC;
  15580. +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
  15581. +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
  15582.  
  15583.  static unsigned int sync_cpu;
  15584.  static ktime_t ktime_last;
  15585. -static bool walt_ktime_suspended;
  15586. +static __read_mostly bool walt_ktime_suspended;
  15587.  
  15588.  static unsigned int task_load(struct task_struct *p)
  15589.  {
  15590.     return p->ravg.demand;
  15591.  }
  15592.  
  15593. +static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
  15594. +{
  15595. +   rq->cum_window_demand += delta;
  15596. +   if (unlikely((s64)rq->cum_window_demand < 0))
  15597. +       rq->cum_window_demand = 0;
  15598. +}
  15599. +
  15600.  void
  15601.  walt_inc_cumulative_runnable_avg(struct rq *rq,
  15602.                  struct task_struct *p)
  15603.  {
  15604.     rq->cumulative_runnable_avg += p->ravg.demand;
  15605. +
  15606. +   /*
  15607. +    * Add a task's contribution to the cumulative window demand when
  15608. +    *
  15609. +    * (1) task is enqueued with on_rq = 1 i.e migration,
  15610. +    *     prio/cgroup/class change.
  15611. +    * (2) task is waking for the first time in this window.
  15612. +    */
  15613. +   if (p->on_rq || (p->last_sleep_ts < rq->window_start))
  15614. +       fixup_cum_window_demand(rq, p->ravg.demand);
  15615.  }
  15616.  
  15617.  void
  15618. @@ -101,16 +92,28 @@
  15619.  {
  15620.     rq->cumulative_runnable_avg -= p->ravg.demand;
  15621.     BUG_ON((s64)rq->cumulative_runnable_avg < 0);
  15622. +
  15623. +   /*
  15624. +    * on_rq will be 1 for sleeping tasks. So check if the task
  15625. +    * is migrating or dequeuing in RUNNING state to change the
  15626. +    * prio/cgroup/class.
  15627. +    */
  15628. +   if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
  15629. +       fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
  15630.  }
  15631.  
  15632.  static void
  15633.  fixup_cumulative_runnable_avg(struct rq *rq,
  15634. -                 struct task_struct *p, s64 task_load_delta)
  15635. +                 struct task_struct *p, u64 new_task_load)
  15636.  {
  15637. +   s64 task_load_delta = (s64)new_task_load - task_load(p);
  15638. +
  15639.     rq->cumulative_runnable_avg += task_load_delta;
  15640.     if ((s64)rq->cumulative_runnable_avg < 0)
  15641.         panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
  15642.             task_load_delta, task_load(p));
  15643. +
  15644. +   fixup_cum_window_demand(rq, task_load_delta);
  15645.  }
  15646.  
  15647.  u64 walt_ktime_clock(void)
  15648. @@ -169,16 +172,33 @@
  15649.  
  15650.  static int __init set_walt_ravg_window(char *str)
  15651.  {
  15652. +   unsigned int adj_window;
  15653. +   bool no_walt = walt_disabled;
  15654. +
  15655.     get_option(&str, &walt_ravg_window);
  15656.  
  15657. -   walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
  15658. -               walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
  15659. +   /* Adjust for CONFIG_HZ */
  15660. +   adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
  15661. +
  15662. +   /* Warn if we're a bit too far away from the expected window size */
  15663. +   WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
  15664. +        "tick-adjusted window size %u, original was %u\n", adj_window,
  15665. +        walt_ravg_window);
  15666. +
  15667. +   walt_ravg_window = adj_window;
  15668. +
  15669. +   walt_disabled = walt_disabled ||
  15670. +           (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
  15671. +            walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
  15672. +
  15673. +   WARN(!no_walt && walt_disabled,
  15674. +        "invalid window size, disabling WALT\n");
  15675. +
  15676.     return 0;
  15677.  }
  15678.  
  15679.  early_param("walt_ravg_window", set_walt_ravg_window);
  15680.  
  15681. -extern u64 arch_counter_get_cntpct(void);
  15682.  static void
  15683.  update_window_start(struct rq *rq, u64 wallclock)
  15684.  {
  15685. @@ -188,10 +208,8 @@
  15686.     delta = wallclock - rq->window_start;
  15687.     /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
  15688.     if (delta < 0) {
  15689. -       if (arch_counter_get_cntpct() == 0)
  15690. -           delta = 0;
  15691. -       else
  15692. -           BUG_ON(1);
  15693. +       delta = 0;
  15694. +       WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
  15695.     }
  15696.  
  15697.     if (delta < walt_ravg_window)
  15698. @@ -199,26 +217,20 @@
  15699.  
  15700.     nr_windows = div64_u64(delta, walt_ravg_window);
  15701.     rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
  15702. +
  15703. +   rq->cum_window_demand = rq->cumulative_runnable_avg;
  15704.  }
  15705.  
  15706. +/*
  15707. + * Translate absolute delta time accounted on a CPU
  15708. + * to a scale where 1024 is the capacity of the most
  15709. + * capable CPU running at FMAX
  15710. + */
  15711.  static u64 scale_exec_time(u64 delta, struct rq *rq)
  15712.  {
  15713. -   unsigned int cur_freq = rq->cur_freq;
  15714. -   int sf;
  15715. -
  15716. -   if (unlikely(cur_freq > max_possible_freq))
  15717. -       cur_freq = rq->max_possible_freq;
  15718. +   unsigned long capcurr = capacity_curr_of(cpu_of(rq));
  15719.  
  15720. -   /* round up div64 */
  15721. -   delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
  15722. -             max_possible_freq);
  15723. -
  15724. -   sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
  15725. -
  15726. -   delta *= sf;
  15727. -   delta >>= 10;
  15728. -
  15729. -   return delta;
  15730. +   return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
  15731.  }
  15732.  
  15733.  static int cpu_is_waiting_on_io(struct rq *rq)
  15734. @@ -595,10 +607,20 @@
  15735.      * A throttled deadline sched class task gets dequeued without
  15736.      * changing p->on_rq. Since the dequeue decrements hmp stats
  15737.      * avoid decrementing it here again.
  15738. +    *
  15739. +    * When window is rolled over, the cumulative window demand
  15740. +    * is reset to the cumulative runnable average (contribution from
  15741. +    * the tasks on the runqueue). If the current task is dequeued
   15742. +    * already, its demand is not included in the cumulative runnable
  15743. +    * average. So add the task demand separately to cumulative window
  15744. +    * demand.
  15745.      */
  15746. -   if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
  15747. -                       !p->dl.dl_throttled))
  15748. -       fixup_cumulative_runnable_avg(rq, p, demand);
  15749. +   if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
  15750. +       if (task_on_rq_queued(p))
  15751. +           fixup_cumulative_runnable_avg(rq, p, demand);
  15752. +       else if (rq->curr == p)
  15753. +           fixup_cum_window_demand(rq, demand);
  15754. +   }
  15755.  
  15756.     p->ravg.demand = demand;
  15757.  
  15758. @@ -741,33 +763,6 @@
  15759.     p->ravg.mark_start = wallclock;
  15760.  }
  15761.  
  15762. -unsigned long __weak arch_get_cpu_efficiency(int cpu)
  15763. -{
  15764. -   return SCHED_LOAD_SCALE;
  15765. -}
  15766. -
  15767. -void walt_init_cpu_efficiency(void)
  15768. -{
  15769. -   int i, efficiency;
  15770. -   unsigned int max = 0, min = UINT_MAX;
  15771. -
  15772. -   for_each_possible_cpu(i) {
  15773. -       efficiency = arch_get_cpu_efficiency(i);
  15774. -       cpu_rq(i)->efficiency = efficiency;
  15775. -
  15776. -       if (efficiency > max)
  15777. -           max = efficiency;
  15778. -       if (efficiency < min)
  15779. -           min = efficiency;
  15780. -   }
  15781. -
  15782. -   if (max)
  15783. -       max_possible_efficiency = max;
  15784. -
  15785. -   if (min)
  15786. -       min_possible_efficiency = min;
  15787. -}
  15788. -
  15789.  static void reset_task_stats(struct task_struct *p)
  15790.  {
  15791.     u32 sum = 0;
  15792. @@ -799,11 +794,11 @@
  15793.     int cpu = cpu_of(rq);
  15794.     struct rq *sync_rq = cpu_rq(sync_cpu);
  15795.  
  15796. -   if (rq->window_start)
  15797. +   if (likely(rq->window_start))
  15798.         return;
  15799.  
  15800.     if (cpu == sync_cpu) {
  15801. -       rq->window_start = walt_ktime_clock();
  15802. +       rq->window_start = 1;
  15803.     } else {
  15804.         raw_spin_unlock(&rq->lock);
  15805.         double_rq_lock(rq, sync_rq);
  15806. @@ -846,6 +841,17 @@
  15807.  
  15808.     walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
  15809.  
  15810. +   /*
  15811. +    * When a task is migrating during the wakeup, adjust
  15812. +    * the task's contribution towards cumulative window
  15813. +    * demand.
  15814. +    */
  15815. +   if (p->state == TASK_WAKING &&
  15816. +       p->last_sleep_ts >= src_rq->window_start) {
  15817. +       fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
  15818. +       fixup_cum_window_demand(dest_rq, p->ravg.demand);
  15819. +   }
  15820. +
  15821.     if (p->ravg.curr_window) {
  15822.         src_rq->curr_runnable_sum -= p->ravg.curr_window;
  15823.         dest_rq->curr_runnable_sum += p->ravg.curr_window;
  15824. @@ -872,283 +878,6 @@
  15825.         double_rq_unlock(src_rq, dest_rq);
  15826.  }
  15827.  
  15828. -/* Keep track of max/min capacity possible across CPUs "currently" */
  15829. -static void __update_min_max_capacity(void)
  15830. -{
  15831. -   int i;
  15832. -   int max = 0, min = INT_MAX;
  15833. -
  15834. -   for_each_online_cpu(i) {
  15835. -       if (cpu_rq(i)->capacity > max)
  15836. -           max = cpu_rq(i)->capacity;
  15837. -       if (cpu_rq(i)->capacity < min)
  15838. -           min = cpu_rq(i)->capacity;
  15839. -   }
  15840. -
  15841. -   max_capacity = max;
  15842. -   min_capacity = min;
  15843. -}
  15844. -
  15845. -static void update_min_max_capacity(void)
  15846. -{
  15847. -   unsigned long flags;
  15848. -   int i;
  15849. -
  15850. -   local_irq_save(flags);
  15851. -   for_each_possible_cpu(i)
  15852. -       raw_spin_lock(&cpu_rq(i)->lock);
  15853. -
  15854. -   __update_min_max_capacity();
  15855. -
  15856. -   for_each_possible_cpu(i)
  15857. -       raw_spin_unlock(&cpu_rq(i)->lock);
  15858. -   local_irq_restore(flags);
  15859. -}
  15860. -
  15861. -/*
  15862. - * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
  15863. - * least efficient cpu gets capacity of 1024
  15864. - */
  15865. -static unsigned long capacity_scale_cpu_efficiency(int cpu)
  15866. -{
  15867. -   return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
  15868. -}
  15869. -
  15870. -/*
  15871. - * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
  15872. - * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
  15873. - */
  15874. -static unsigned long capacity_scale_cpu_freq(int cpu)
  15875. -{
  15876. -   return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
  15877. -}
  15878. -
  15879. -/*
  15880. - * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
  15881. - * that "most" efficient cpu gets a load_scale_factor of 1
  15882. - */
  15883. -static unsigned long load_scale_cpu_efficiency(int cpu)
  15884. -{
  15885. -   return DIV_ROUND_UP(1024 * max_possible_efficiency,
  15886. -               cpu_rq(cpu)->efficiency);
  15887. -}
  15888. -
  15889. -/*
  15890. - * Return load_scale_factor of a cpu in reference to cpu with best max_freq
  15891. - * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
  15892. - * of 1.
  15893. - */
  15894. -static unsigned long load_scale_cpu_freq(int cpu)
  15895. -{
  15896. -   return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
  15897. -}
  15898. -
  15899. -static int compute_capacity(int cpu)
  15900. -{
  15901. -   int capacity = 1024;
  15902. -
  15903. -   capacity *= capacity_scale_cpu_efficiency(cpu);
  15904. -   capacity >>= 10;
  15905. -
  15906. -   capacity *= capacity_scale_cpu_freq(cpu);
  15907. -   capacity >>= 10;
  15908. -
  15909. -   return capacity;
  15910. -}
  15911. -
  15912. -static int compute_load_scale_factor(int cpu)
  15913. -{
  15914. -   int load_scale = 1024;
  15915. -
  15916. -   /*
  15917. -    * load_scale_factor accounts for the fact that task load
  15918. -    * is in reference to "best" performing cpu. Task's load will need to be
  15919. -    * scaled (up) by a factor to determine suitability to be placed on a
  15920. -    * (little) cpu.
  15921. -    */
  15922. -   load_scale *= load_scale_cpu_efficiency(cpu);
  15923. -   load_scale >>= 10;
  15924. -
  15925. -   load_scale *= load_scale_cpu_freq(cpu);
  15926. -   load_scale >>= 10;
  15927. -
  15928. -   return load_scale;
  15929. -}
  15930. -
  15931. -static int cpufreq_notifier_policy(struct notifier_block *nb,
  15932. -       unsigned long val, void *data)
  15933. -{
  15934. -   struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
  15935. -   int i, update_max = 0;
  15936. -   u64 highest_mpc = 0, highest_mplsf = 0;
  15937. -   const struct cpumask *cpus = policy->related_cpus;
  15938. -   unsigned int orig_min_max_freq = min_max_freq;
  15939. -   unsigned int orig_max_possible_freq = max_possible_freq;
  15940. -   /* Initialized to policy->max in case policy->related_cpus is empty! */
  15941. -   unsigned int orig_max_freq = policy->max;
  15942. -
  15943. -   if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
  15944. -                       val != CPUFREQ_CREATE_POLICY)
  15945. -       return 0;
  15946. -
  15947. -   if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
  15948. -       update_min_max_capacity();
  15949. -       return 0;
  15950. -   }
  15951. -
  15952. -   for_each_cpu(i, policy->related_cpus) {
  15953. -       cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
  15954. -                policy->related_cpus);
  15955. -       orig_max_freq = cpu_rq(i)->max_freq;
  15956. -       cpu_rq(i)->min_freq = policy->min;
  15957. -       cpu_rq(i)->max_freq = policy->max;
  15958. -       cpu_rq(i)->cur_freq = policy->cur;
  15959. -       cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
  15960. -   }
  15961. -
  15962. -   max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
  15963. -   if (min_max_freq == 1)
  15964. -       min_max_freq = UINT_MAX;
  15965. -   min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
  15966. -   BUG_ON(!min_max_freq);
  15967. -   BUG_ON(!policy->max);
  15968. -
  15969. -   /* Changes to policy other than max_freq don't require any updates */
  15970. -   if (orig_max_freq == policy->max)
  15971. -       return 0;
  15972. -
  15973. -   /*
  15974. -    * A changed min_max_freq or max_possible_freq (possible during bootup)
  15975. -    * needs to trigger re-computation of load_scale_factor and capacity for
  15976. -    * all possible cpus (even those offline). It also needs to trigger
  15977. -    * re-computation of nr_big_task count on all online cpus.
  15978. -    *
  15979. -    * A changed rq->max_freq otoh needs to trigger re-computation of
  15980. -    * load_scale_factor and capacity for just the cluster of cpus involved.
  15981. -    * Since small task definition depends on max_load_scale_factor, a
  15982. -    * changed load_scale_factor of one cluster could influence
  15983. -    * classification of tasks in another cluster. Hence a changed
  15984. -    * rq->max_freq will need to trigger re-computation of nr_big_task
  15985. -    * count on all online cpus.
  15986. -    *
  15987. -    * While it should be sufficient for nr_big_tasks to be
  15988. -    * re-computed for only online cpus, we have inadequate context
  15989. -    * information here (in policy notifier) with regard to hotplug-safety
  15990. -    * context in which notification is issued. As a result, we can't use
  15991. -    * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
  15992. -    * fixed up to issue notification always in hotplug-safe context,
  15993. -    * re-compute nr_big_task for all possible cpus.
  15994. -    */
  15995. -
  15996. -   if (orig_min_max_freq != min_max_freq ||
  15997. -       orig_max_possible_freq != max_possible_freq) {
  15998. -           cpus = cpu_possible_mask;
  15999. -           update_max = 1;
  16000. -   }
  16001. -
  16002. -   /*
  16003. -    * Changed load_scale_factor can trigger reclassification of tasks as
  16004. -    * big or small. Make this change "atomic" so that tasks are accounted
  16005. -    * properly due to changed load_scale_factor
  16006. -    */
  16007. -   for_each_cpu(i, cpus) {
  16008. -       struct rq *rq = cpu_rq(i);
  16009. -
  16010. -       rq->capacity = compute_capacity(i);
  16011. -       rq->load_scale_factor = compute_load_scale_factor(i);
  16012. -
  16013. -       if (update_max) {
  16014. -           u64 mpc, mplsf;
  16015. -
  16016. -           mpc = div_u64(((u64) rq->capacity) *
  16017. -               rq->max_possible_freq, rq->max_freq);
  16018. -           rq->max_possible_capacity = (int) mpc;
  16019. -
  16020. -           mplsf = div_u64(((u64) rq->load_scale_factor) *
  16021. -               rq->max_possible_freq, rq->max_freq);
  16022. -
  16023. -           if (mpc > highest_mpc) {
  16024. -               highest_mpc = mpc;
  16025. -               cpumask_clear(&mpc_mask);
  16026. -               cpumask_set_cpu(i, &mpc_mask);
  16027. -           } else if (mpc == highest_mpc) {
  16028. -               cpumask_set_cpu(i, &mpc_mask);
  16029. -           }
  16030. -
  16031. -           if (mplsf > highest_mplsf)
  16032. -               highest_mplsf = mplsf;
  16033. -       }
  16034. -   }
  16035. -
  16036. -   if (update_max) {
  16037. -       max_possible_capacity = highest_mpc;
  16038. -       max_load_scale_factor = highest_mplsf;
  16039. -   }
  16040. -
  16041. -   __update_min_max_capacity();
  16042. -
  16043. -   return 0;
  16044. -}
  16045. -
  16046. -static int cpufreq_notifier_trans(struct notifier_block *nb,
  16047. -       unsigned long val, void *data)
  16048. -{
  16049. -   struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
  16050. -   unsigned int cpu = freq->cpu, new_freq = freq->new;
  16051. -   unsigned long flags;
  16052. -   int i;
  16053. -
  16054. -   if (val != CPUFREQ_POSTCHANGE)
  16055. -       return 0;
  16056. -
  16057. -   BUG_ON(!new_freq);
  16058. -
  16059. -   if (cpu_rq(cpu)->cur_freq == new_freq)
  16060. -       return 0;
  16061. -
  16062. -   for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
  16063. -       struct rq *rq = cpu_rq(i);
  16064. -
  16065. -       raw_spin_lock_irqsave(&rq->lock, flags);
  16066. -       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
  16067. -                     walt_ktime_clock(), 0);
  16068. -       rq->cur_freq = new_freq;
  16069. -       raw_spin_unlock_irqrestore(&rq->lock, flags);
  16070. -   }
  16071. -
  16072. -   return 0;
  16073. -}
  16074. -
  16075. -static struct notifier_block notifier_policy_block = {
  16076. -   .notifier_call = cpufreq_notifier_policy
  16077. -};
  16078. -
  16079. -static struct notifier_block notifier_trans_block = {
  16080. -   .notifier_call = cpufreq_notifier_trans
  16081. -};
  16082. -
  16083. -static int register_sched_callback(void)
  16084. -{
  16085. -   int ret;
  16086. -
  16087. -   ret = cpufreq_register_notifier(&notifier_policy_block,
  16088. -                       CPUFREQ_POLICY_NOTIFIER);
  16089. -
  16090. -   if (!ret)
  16091. -       ret = cpufreq_register_notifier(&notifier_trans_block,
  16092. -                       CPUFREQ_TRANSITION_NOTIFIER);
  16093. -
  16094. -   return 0;
  16095. -}
  16096. -
  16097. -/*
  16098. - * cpufreq callbacks can be registered at core_initcall or later time.
  16099. - * Any registration done prior to that is "forgotten" by cpufreq. See
  16100. - * initialization of variable init_cpufreq_transition_notifier_list_called
  16101. - * for further information.
  16102. - */
  16103. -core_initcall(register_sched_callback);
  16104. -
  16105.  void walt_init_new_task_load(struct task_struct *p)
  16106.  {
  16107.     int i;
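
A quick worked example of the tick alignment applied to walt_ravg_window above, assuming CONFIG_HZ=300 (so TICK_NSEC = 3333333 ns); with HZ=100 the 20 ms default is already an exact multiple and is left unchanged:

    /* walt_ravg_window = (20000000 / TICK_NSEC) * TICK_NSEC
     *                  = (20000000 / 3333333) * 3333333
     *                  = 6 * 3333333
     *                  = 19999998 ns   (2 ns short of the 20 ms default)
     */

The same rounding is applied by set_walt_ravg_window() to any value passed on the command line, and the WARN there fires only if the adjustment shrinks the window by more than a millisecond.
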
  16108. diff -Nur /home/ninez/android/marlin/kernel/sched/walt.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.h
  16109. --- /home/ninez/android/marlin/kernel/sched/walt.h  2018-08-10 01:54:08.566728454 -0400
  16110. +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.h  2018-08-11 23:57:17.131940887 -0400
  16111. @@ -55,8 +55,10 @@
  16112.  static inline void walt_init_cpu_efficiency(void) { }
  16113.  static inline u64 walt_ktime_clock(void) { return 0; }
  16114.  
  16115. +#define walt_cpu_high_irqload(cpu) false
  16116. +
  16117.  #endif /* CONFIG_SCHED_WALT */
  16118.  
  16119. -extern unsigned int walt_disabled;
  16120. +extern bool walt_disabled;
  16121.  
  16122.  #endif