- diff -Nur /home/ninez/android/marlin/kernel/sched/auto_group.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/auto_group.c
- --- /home/ninez/android/marlin/kernel/sched/auto_group.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/auto_group.c 2018-08-11 23:57:17.128607487 -0400
- @@ -214,7 +214,7 @@
- ag = autogroup_task_get(p);
- down_write(&ag->lock);
- - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
- + err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
- if (!err)
- ag->nice = nice;
- up_write(&ag->lock);
- diff -Nur /home/ninez/android/marlin/kernel/sched/boost.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/boost.c
- --- /home/ninez/android/marlin/kernel/sched/boost.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/boost.c 2018-08-14 15:53:43.604124856 -0400
- @@ -0,0 +1,68 @@
- +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License version 2 and
- + * only version 2 as published by the Free Software Foundation.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + */
- +
- +#include "sched.h"
- +#include <linux/sched.h>
- +
- +/*
- + * Scheduler boost is a mechanism to temporarily place tasks on CPUs
- + * with higher capacity than those where a task would have normally
- + * ended up with their load characteristics. Any entity enabling
- + * boost is responsible for disabling it as well.
- + */
- +
- +unsigned int sysctl_sched_boost;
- +
- +static bool verify_boost_params(int old_val, int new_val)
- +{
- + /*
- + * Boost can only be turned on or off. There is no possibility of
- + * switching from one boost type to another or of setting the same
- + * kind of boost several times.
- + */
- + return !(!!old_val == !!new_val);
- +}
- +
- +int sched_boost_handler(struct ctl_table *table, int write,
- + void __user *buffer, size_t *lenp,
- + loff_t *ppos)
- +{
- + int ret;
- + unsigned int *data = (unsigned int *)table->data;
- + unsigned int old_val;
- + unsigned int dsb_top_app_boost = 30;
- + unsigned int dsb_top_app_floor = 0;
- +
- + // Backup current sysctl_sched_boost value
- + old_val = *data;
- +
- + // Set new sysctl_sched_boost value
- + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- +
- + if (ret || !write)
- + goto done;
- +
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- + if (verify_boost_params(old_val, *data)) {
- + if (*data > 0)
- + do_stune_boost("top-app", dsb_top_app_boost);
- + else
- + do_stune_unboost("top-app", dsb_top_app_floor);
- + } else {
- + *data = old_val;
- + ret = -EINVAL;
- + }
- +#endif // CONFIG_DYNAMIC_STUNE_BOOST
- +
- +done:
- + return ret;
- +}
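A minimal userspace sketch of the on/off rule enforced by verify_boost_params() in the new boost.c above; the main() harness and the printed values are illustrative only and not part of the patch:

#include <stdbool.h>
#include <stdio.h>

static bool verify_boost_params(int old_val, int new_val)
{
	/* A transition is valid only when the boolean on/off state flips. */
	return !(!!old_val == !!new_val);
}

int main(void)
{
	printf("0->1: %d\n", verify_boost_params(0, 1)); /* 1: enabling allowed  */
	printf("1->0: %d\n", verify_boost_params(1, 0)); /* 1: disabling allowed */
	printf("0->0: %d\n", verify_boost_params(0, 0)); /* 0: no state change   */
	printf("1->2: %d\n", verify_boost_params(1, 2)); /* 0: already boosted   */
	return 0;
}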
- diff -Nur /home/ninez/android/marlin/kernel/sched/completion.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/completion.c
- --- /home/ninez/android/marlin/kernel/sched/completion.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/completion.c 2018-08-12 21:13:57.906629665 -0400
- @@ -30,10 +30,10 @@
- {
- unsigned long flags;
- - spin_lock_irqsave(&x->wait.lock, flags);
- + raw_spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
- - __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- - spin_unlock_irqrestore(&x->wait.lock, flags);
- + swake_up_locked(&x->wait);
- + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
- }
- EXPORT_SYMBOL(complete);
- @@ -50,10 +50,10 @@
- {
- unsigned long flags;
- - spin_lock_irqsave(&x->wait.lock, flags);
- + raw_spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
- - __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- - spin_unlock_irqrestore(&x->wait.lock, flags);
- + swake_up_all_locked(&x->wait);
- + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
- }
- EXPORT_SYMBOL(complete_all);
- @@ -62,20 +62,20 @@
- long (*action)(long), long timeout, int state)
- {
- if (!x->done) {
- - DECLARE_WAITQUEUE(wait, current);
- + DECLARE_SWAITQUEUE(wait);
- - __add_wait_queue_tail_exclusive(&x->wait, &wait);
- + __prepare_to_swait(&x->wait, &wait);
- do {
- if (signal_pending_state(state, current)) {
- timeout = -ERESTARTSYS;
- break;
- }
- __set_current_state(state);
- - spin_unlock_irq(&x->wait.lock);
- + raw_spin_unlock_irq(&x->wait.lock);
- timeout = action(timeout);
- - spin_lock_irq(&x->wait.lock);
- + raw_spin_lock_irq(&x->wait.lock);
- } while (!x->done && timeout);
- - __remove_wait_queue(&x->wait, &wait);
- + __finish_swait(&x->wait, &wait);
- if (!x->done)
- return timeout;
- }
- @@ -89,9 +89,9 @@
- {
- might_sleep();
- - spin_lock_irq(&x->wait.lock);
- + raw_spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, action, timeout, state);
- - spin_unlock_irq(&x->wait.lock);
- + raw_spin_unlock_irq(&x->wait.lock);
- return timeout;
- }
- @@ -267,12 +267,21 @@
- unsigned long flags;
- int ret = 1;
- - spin_lock_irqsave(&x->wait.lock, flags);
- + /*
- + * Since x->done will need to be locked only
- + * in the non-blocking case, we check x->done
- + * first without taking the lock so we can
- + * return early in the blocking case.
- + */
- + if (!READ_ONCE(x->done))
- + return 0;
- +
- + raw_spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- else
- x->done--;
- - spin_unlock_irqrestore(&x->wait.lock, flags);
- + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
- }
- EXPORT_SYMBOL(try_wait_for_completion);
- @@ -287,13 +296,21 @@
- */
- bool completion_done(struct completion *x)
- {
- - unsigned long flags;
- - int ret = 1;
- + if (!READ_ONCE(x->done))
- + return false;
- - spin_lock_irqsave(&x->wait.lock, flags);
- - if (!x->done)
- - ret = 0;
- - spin_unlock_irqrestore(&x->wait.lock, flags);
- - return ret;
- + /*
- + * If ->done, we need to wait for complete() to release ->wait.lock
- + * otherwise we can end up freeing the completion before complete()
- + * is done referencing it.
- + *
- + * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
- + * the loads of ->done and ->wait.lock such that we cannot observe
- + * the lock before complete() acquires it while observing the ->done
- + * after it's acquired the lock.
- + */
- + smp_rmb();
- + raw_spin_unlock_wait(&x->wait.lock);
- + return true;
- }
- EXPORT_SYMBOL(completion_done);
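The completion.c hunks above move ->wait onto a raw-spinlock-protected simple waitqueue (swait) and, in try_wait_for_completion()/completion_done(), check ->done with READ_ONCE() before ever taking ->wait.lock, so callers that would only block return early without the lock. A rough pthread/C11 analogue of that fast-path pattern (the struct and function names here are made up; this is not the kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct fake_completion {
	atomic_uint done;           /* stands in for x->done      */
	pthread_mutex_t lock;       /* stands in for x->wait.lock */
};

/* Analogue of try_wait_for_completion(): consume one pending
 * completion if there is one, otherwise report false. */
bool try_wait(struct fake_completion *x)
{
	bool ret = true;

	/* Lockless fast path, mirroring the READ_ONCE(x->done) check. */
	if (!atomic_load(&x->done))
		return false;

	pthread_mutex_lock(&x->lock);
	if (!atomic_load(&x->done))
		ret = false;
	else
		atomic_fetch_sub(&x->done, 1);
	pthread_mutex_unlock(&x->lock);

	return ret;
}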
- diff -Nur /home/ninez/android/marlin/kernel/sched/core.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/core.c
- --- /home/ninez/android/marlin/kernel/sched/core.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/core.c 2018-08-26 16:43:11.647206295 -0400
- @@ -94,7 +94,6 @@
- #define CREATE_TRACE_POINTS
- #include <trace/events/sched.h>
- #include "walt.h"
- -#include "tune.h"
- DEFINE_MUTEX(sched_domains_mutex);
- DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
- @@ -105,7 +104,9 @@
- {
- s64 delta;
- - if (rq->skip_clock_update > 0)
- + lockdep_assert_held(&rq->lock);
- +
- + if (rq->clock_skip_update & RQCF_ACT_SKIP)
- return;
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
- @@ -168,14 +169,12 @@
- static void sched_feat_disable(int i)
- {
- - if (static_key_enabled(&sched_feat_keys[i]))
- - static_key_slow_dec(&sched_feat_keys[i]);
- + static_key_disable(&sched_feat_keys[i]);
- }
- static void sched_feat_enable(int i)
- {
- - if (!static_key_enabled(&sched_feat_keys[i]))
- - static_key_slow_inc(&sched_feat_keys[i]);
- + static_key_enable(&sched_feat_keys[i]);
- }
- #else
- static void sched_feat_disable(int i) { };
- @@ -290,10 +289,40 @@
- */
- int sysctl_sched_rt_runtime = 950000;
- +/* cpus with isolated domains */
- +cpumask_var_t cpu_isolated_map;
- +
- +struct rq *
- +lock_rq_of(struct task_struct *p, struct rq_flags *rf)
- +{
- + return task_rq_lock(p, rf);
- +}
- +
- +void
- +unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
- +{
- + task_rq_unlock(rq, p, rf);
- +}
- +
- +/*
- + * this_rq_lock - lock this runqueue and disable interrupts.
- + */
- +static struct rq *this_rq_lock(void)
- + __acquires(rq->lock)
- +{
- + struct rq *rq;
- +
- + local_irq_disable();
- + rq = this_rq();
- + raw_spin_lock(&rq->lock);
- +
- + return rq;
- +}
- +
- /*
- * __task_rq_lock - lock the rq @p resides on.
- */
- -static inline struct rq *__task_rq_lock(struct task_struct *p)
- +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(rq->lock)
- {
- struct rq *rq;
- @@ -303,8 +332,10 @@
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- + rf->cookie = lockdep_pin_lock(&rq->lock);
- return rq;
- + }
- raw_spin_unlock(&rq->lock);
- while (unlikely(task_on_rq_migrating(p)))
- @@ -315,68 +346,44 @@
- /*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
- -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
- {
- struct rq *rq;
- for (;;) {
- - raw_spin_lock_irqsave(&p->pi_lock, *flags);
- + raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- + /*
- + * move_queued_task() task_rq_lock()
- + *
- + * ACQUIRE (rq->lock)
- + * [S] ->on_rq = MIGRATING [L] rq = task_rq()
- + * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
- + * [S] ->cpu = new_cpu [L] task_rq()
- + * [L] ->on_rq
- + * RELEASE (rq->lock)
- + *
- + * If we observe the old cpu in task_rq_lock, the acquire of
- + * the old rq->lock will fully serialize against the stores.
- + *
- + * If we observe the new cpu in task_rq_lock, the acquire will
- + * pair with the WMB to ensure we must then also see migrating.
- + */
- + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- + rf->cookie = lockdep_pin_lock(&rq->lock);
- return rq;
- + }
- raw_spin_unlock(&rq->lock);
- - raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
- + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
- }
- -struct rq *
- -lock_rq_of(struct task_struct *p, unsigned long *flags)
- -{
- - return task_rq_lock(p, flags);
- -}
- -
- -static void __task_rq_unlock(struct rq *rq)
- - __releases(rq->lock)
- -{
- - raw_spin_unlock(&rq->lock);
- -}
- -
- -static inline void
- -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- - __releases(rq->lock)
- - __releases(p->pi_lock)
- -{
- - raw_spin_unlock(&rq->lock);
- - raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
- -}
- -
- -void
- -unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
- -{
- - task_rq_unlock(rq, p, flags);
- -}
- -
- -/*
- - * this_rq_lock - lock this runqueue and disable interrupts.
- - */
- -static struct rq *this_rq_lock(void)
- - __acquires(rq->lock)
- -{
- - struct rq *rq;
- -
- - local_irq_disable();
- - rq = this_rq();
- - raw_spin_lock(&rq->lock);
- -
- - return rq;
- -}
- -
- #ifdef CONFIG_SCHED_HRTICK
- /*
- * Use HR-timers to deliver accurate preemption points.
- @@ -531,15 +538,19 @@
- /*
- * cmpxchg based fetch_or, macro so it works for different integer types
- */
- -#define fetch_or(ptr, val) \
- -({ typeof(*(ptr)) __old, __val = *(ptr); \
- - for (;;) { \
- - __old = cmpxchg((ptr), __val, __val | (val)); \
- - if (__old == __val) \
- - break; \
- - __val = __old; \
- - } \
- - __old; \
- +#define fetch_or(ptr, mask) \
- + ({ \
- + typeof(ptr) _ptr = (ptr); \
- + typeof(mask) _mask = (mask); \
- + typeof(*_ptr) _old, _val = *_ptr; \
- + \
- + for (;;) { \
- + _old = cmpxchg(_ptr, _val, _val | _mask); \
- + if (_old == _val) \
- + break; \
- + _val = _old; \
- + } \
- + _old; \
- })
- #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
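The rewritten fetch_or() above captures its arguments once and then loops on cmpxchg() until the OR takes effect, returning the value that was there before. A userspace C11 sketch of the same loop (illustrative only; plain userspace code would normally just call atomic_fetch_or()):

#include <stdatomic.h>
#include <stdio.h>

/* cmpxchg-style retry loop equivalent to an atomic fetch-OR. */
static unsigned int fetch_or_u32(atomic_uint *ptr, unsigned int mask)
{
	unsigned int val = atomic_load(ptr);

	/* On failure the CAS reloads 'val' with the freshly observed value. */
	while (!atomic_compare_exchange_weak(ptr, &val, val | mask))
		;

	return val;	/* the old value, like fetch_or() */
}

int main(void)
{
	atomic_uint flags = 0x1;
	unsigned int old = fetch_or_u32(&flags, 0x4);

	printf("old=0x%x new=0x%x\n", old, atomic_load(&flags)); /* old=0x1 new=0x5 */
	return 0;
}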
- @@ -593,6 +604,58 @@
- #endif
- #endif
- +void wake_q_add(struct wake_q_head *head, struct task_struct *task)
- +{
- + struct wake_q_node *node = &task->wake_q;
- +
- + /*
- + * Atomically grab the task, if ->wake_q is !nil already it means
- + * it's already queued (either by us or someone else) and will get the
- + * wakeup due to that.
- + *
- + * This cmpxchg() implies a full barrier, which pairs with the write
- + * barrier implied by the wakeup in wake_up_list().
- + */
- + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
- + return;
- +
- + head->count++;
- +
- + get_task_struct(task);
- +
- + /*
- + * The head is context local, there can be no concurrency.
- + */
- + *head->lastp = node;
- + head->lastp = &node->next;
- +}
- +
- +static int
- +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
- + int sibling_count_hint);
- +
- +void wake_up_q(struct wake_q_head *head)
- +{
- + struct wake_q_node *node = head->first;
- +
- + while (node != WAKE_Q_TAIL) {
- + struct task_struct *task;
- +
- + task = container_of(node, struct task_struct, wake_q);
- + BUG_ON(!task);
- + /* task can safely be re-inserted now */
- + node = node->next;
- + task->wake_q.next = NULL;
- +
- + /*
- + * try_to_wake_up() implies a wmb() to pair with the queueing
- + * in wake_q_add() so as not to miss wakeups.
- + */
- + try_to_wake_up(task, TASK_NORMAL, 0, head->count);
- + put_task_struct(task);
- + }
- +}
- +
- /*
- * resched_curr - mark rq's current task 'to be rescheduled now'.
- *
- @@ -629,9 +692,9 @@
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
- - raw_spin_lock_irqsave(&rq->lock, flags);
- - if (cpu_online(cpu) || cpu == smp_processor_id())
- - resched_curr(rq);
- + if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- + return;
- + resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -745,6 +808,23 @@
- bool sched_can_stop_tick(void)
- {
- /*
- + * FIFO realtime policy runs the highest priority task. Other runnable
- + * tasks are of a lower priority. The scheduler tick does nothing.
- + */
- + if (current->policy == SCHED_FIFO)
- + return true;
- +
- + /*
- + * Round-robin realtime tasks time slice with other tasks at the same
- + * realtime priority. Is this task the only one at this priority?
- + */
- + if (current->policy == SCHED_RR) {
- + struct sched_rt_entity *rt_se = &current->rt;
- +
- + return rt_se->run_list.prev == rt_se->run_list.next;
- + }
- +
- + /*
- * More than one running task need preemption.
- * nr_running update is assumed to be visible
- * after IPI is sent from wakers.
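The SCHED_RR branch added above leans on a property of the kernel's circular doubly-linked run lists: an entry whose prev and next both point at the same node is the only entry on its queue, so the tick can stop. A self-contained illustration with the list helpers reimplemented locally (not taken from the kernel):

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* The run_list.prev == run_list.next test used by sched_can_stop_tick(). */
static bool only_entry(const struct list_head *e)
{
	return e->prev == e->next;
}

int main(void)
{
	struct list_head queue, a, b;

	list_init(&queue);
	list_add_tail(&a, &queue);
	printf("one RR task:  %d\n", only_entry(&a)); /* 1: tick can stop    */
	list_add_tail(&b, &queue);
	printf("two RR tasks: %d\n", only_entry(&a)); /* 0: keep the tick on */
	return 0;
}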
- @@ -844,27 +924,29 @@
- /*
- * SCHED_IDLE tasks get minimal weight:
- */
- - if (p->policy == SCHED_IDLE) {
- + if (idle_policy(p->policy)) {
- load->weight = scale_load(WEIGHT_IDLEPRIO);
- load->inv_weight = WMULT_IDLEPRIO;
- return;
- }
- - load->weight = scale_load(prio_to_weight[prio]);
- - load->inv_weight = prio_to_wmult[prio];
- + load->weight = scale_load(sched_prio_to_weight[prio]);
- + load->inv_weight = sched_prio_to_wmult[prio];
- }
- -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
- +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
- {
- update_rq_clock(rq);
- - sched_info_queued(rq, p);
- + if (!(flags & ENQUEUE_RESTORE))
- + sched_info_queued(rq, p);
- p->sched_class->enqueue_task(rq, p, flags);
- }
- -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
- +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
- {
- update_rq_clock(rq);
- - sched_info_dequeued(rq, p);
- + if (!(flags & DEQUEUE_SAVE))
- + sched_info_dequeued(rq, p);
- p->sched_class->dequeue_task(rq, p, flags);
- }
- @@ -1069,10 +1151,37 @@
- * this case, we can save a useless back to back clock update.
- */
- if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
- - rq->skip_clock_update = 1;
- + rq_clock_skip_update(rq, true);
- }
- #ifdef CONFIG_SMP
- +
- +static inline bool is_per_cpu_kthread(struct task_struct *p)
- +{
- + if (!(p->flags & PF_KTHREAD))
- + return false;
- +
- + if (p->nr_cpus_allowed != 1)
- + return false;
- +
- + return true;
- +}
- +
- +/*
- + * Per-CPU kthreads are allowed to run on !active && online CPUs, see
- + * __set_cpus_allowed_ptr() and select_fallback_rq().
- + */
- +static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
- +{
- + if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- + return false;
- +
- + if (is_per_cpu_kthread(p))
- + return cpu_online(cpu);
- +
- + return cpu_active(cpu);
- +}
- +
- /*
- * This is how migration works:
- *
- @@ -1092,14 +1201,12 @@
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
- -static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
- +static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
- {
- - struct rq *rq = task_rq(p);
- -
- lockdep_assert_held(&rq->lock);
- - dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- + dequeue_task(rq, p, 0);
- double_lock_balance(rq, cpu_rq(new_cpu));
- set_task_cpu(p, new_cpu);
- double_unlock_balance(rq, cpu_rq(new_cpu));
- @@ -1109,8 +1216,8 @@
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- - p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- check_preempt_curr(rq, p, 0);
- return rq;
- @@ -1129,41 +1236,16 @@
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- - *
- - * Returns non-zero if task was successfully migrated.
- */
- -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
- +static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
- {
- - struct rq *rq;
- - int ret = 0;
- -
- - if (unlikely(!cpu_active(dest_cpu)))
- - return ret;
- -
- - rq = cpu_rq(src_cpu);
- -
- - raw_spin_lock(&p->pi_lock);
- - raw_spin_lock(&rq->lock);
- - /* Already moved. */
- - if (task_cpu(p) != src_cpu)
- - goto done;
- -
- /* Affinity changed (again). */
- - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- - goto fail;
- + if (!is_cpu_allowed(p, dest_cpu))
- + return rq;
- - /*
- - * If we're not on a rq, the next wake-up will ensure we're
- - * placed properly.
- - */
- - if (task_on_rq_queued(p))
- - rq = move_queued_task(p, dest_cpu);
- -done:
- - ret = 1;
- -fail:
- - raw_spin_unlock(&rq->lock);
- - raw_spin_unlock(&p->pi_lock);
- - return ret;
- + rq = move_queued_task(rq, p, dest_cpu);
- +
- + return rq;
- }
- /*
- @@ -1174,6 +1256,8 @@
- static int migration_cpu_stop(void *data)
- {
- struct migration_arg *arg = data;
- + struct task_struct *p = arg->task;
- + struct rq *rq = this_rq();
- /*
- * The original target cpu might have gone down and we might
- @@ -1186,20 +1270,77 @@
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- - __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- +
- + raw_spin_lock(&p->pi_lock);
- + raw_spin_lock(&rq->lock);
- + /*
- + * If task_rq(p) != rq, it cannot be migrated here, because we're
- + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
- + * we're holding p->pi_lock.
- + */
- + if (task_rq(p) == rq && task_on_rq_queued(p))
- + rq = __migrate_task(rq, p, arg->dest_cpu);
- + raw_spin_unlock(&rq->lock);
- + raw_spin_unlock(&p->pi_lock);
- +
- local_irq_enable();
- return 0;
- }
- +/*
- + * sched_class::set_cpus_allowed must do the below, but is not required to
- + * actually call this function.
- + */
- +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
- +{
- + cpumask_copy(&p->cpus_allowed, new_mask);
- + p->nr_cpus_allowed = cpumask_weight(new_mask);
- +}
- +
- +static const struct cpumask *get_adjusted_cpumask(const struct task_struct *p,
- + const struct cpumask *req_mask)
- +{
- + /* Force all performance-critical kthreads onto the big cluster */
- + if (p->flags & PF_PERF_CRITICAL)
- + return cpu_perf_mask;
- +
- + /* Force all trivial, unbound kthreads onto the little cluster */
- + if (p->flags & PF_KTHREAD && p->pid != 1 &&
- + cpumask_equal(req_mask, cpu_all_mask))
- + return cpu_lp_mask;
- +
- + return req_mask;
- +}
- +
- void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
- {
- + struct rq *rq = task_rq(p);
- + bool queued, running;
- +
- + new_mask = get_adjusted_cpumask(p, new_mask);
- +
- lockdep_assert_held(&p->pi_lock);
- - if (p->sched_class->set_cpus_allowed)
- - p->sched_class->set_cpus_allowed(p, new_mask);
- + queued = task_on_rq_queued(p);
- + running = task_current(rq, p);
- - cpumask_copy(&p->cpus_allowed, new_mask);
- - p->nr_cpus_allowed = cpumask_weight(new_mask);
- + if (queued) {
- + /*
- + * Because __kthread_bind() calls this on blocked tasks without
- + * holding rq->lock.
- + */
- + lockdep_assert_held(&rq->lock);
- + dequeue_task(rq, p, DEQUEUE_SAVE);
- + }
- + if (running)
- + put_prev_task(rq, p);
- +
- + p->sched_class->set_cpus_allowed(p, new_mask);
- +
- + if (queued)
- + enqueue_task(rq, p, ENQUEUE_RESTORE);
- + if (running)
- + set_curr_task(rq, p);
- }
- /*
- @@ -1214,12 +1355,23 @@
- static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
- {
- - unsigned long flags;
- - struct rq *rq;
- + const struct cpumask *cpu_valid_mask = cpu_active_mask;
- unsigned int dest_cpu;
- + struct rq_flags rf;
- + struct rq *rq;
- int ret = 0;
- - rq = task_rq_lock(p, &flags);
- + new_mask = get_adjusted_cpumask(p, new_mask);
- +
- + rq = task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- +
- + if (p->flags & PF_KTHREAD) {
- + /*
- + * Kernel threads are allowed on online && !active CPUs
- + */
- + cpu_valid_mask = cpu_online_mask;
- + }
- /*
- * Must re-check here, to close a race against __kthread_bind(),
- @@ -1233,29 +1385,46 @@
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
- - if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- + if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
- ret = -EINVAL;
- goto out;
- }
- do_set_cpus_allowed(p, new_mask);
- + if (p->flags & PF_KTHREAD) {
- + /*
- + * For kernel threads that do indeed end up on online &&
- + * !active we want to ensure they are strict per-cpu threads.
- + */
- + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
- + !cpumask_intersects(new_mask, cpu_active_mask) &&
- + p->nr_cpus_allowed != 1);
- + }
- +
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
- - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- - } else if (task_on_rq_queued(p))
- - rq = move_queued_task(p, dest_cpu);
- + } else if (task_on_rq_queued(p)) {
- + /*
- + * OK, since we're going to drop the lock immediately
- + * afterwards anyway.
- + */
- + lockdep_unpin_lock(&rq->lock, rf.cookie);
- + rq = move_queued_task(rq, p, dest_cpu);
- + lockdep_repin_lock(&rq->lock, rf.cookie);
- + }
- out:
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return ret;
- }
- @@ -1274,7 +1443,16 @@
- * ttwu() will sort out the placement.
- */
- WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- - !(task_preempt_count(p) & PREEMPT_ACTIVE));
- + !p->on_rq);
- +
- + /*
- + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
- + * because schedstat_wait_{start,end} rebase migrating task's wait_start
- + * time relying on p->on_rq.
- + */
- + WARN_ON_ONCE(p->state == TASK_RUNNING &&
- + p->sched_class == &fair_sched_class &&
- + (p->on_rq && !task_on_rq_migrating(p)));
- #ifdef CONFIG_LOCKDEP
- /*
- @@ -1296,7 +1474,7 @@
- if (task_cpu(p) != new_cpu) {
- if (p->sched_class->migrate_task_rq)
- - p->sched_class->migrate_task_rq(p, new_cpu);
- + p->sched_class->migrate_task_rq(p);
- p->se.nr_migrations++;
- perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
- @@ -1314,9 +1492,13 @@
- src_rq = task_rq(p);
- dst_rq = cpu_rq(cpu);
- + p->on_rq = TASK_ON_RQ_MIGRATING;
- deactivate_task(src_rq, p, 0);
- + p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, cpu);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(dst_rq, p, 0);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- check_preempt_curr(dst_rq, p, 0);
- } else {
- /*
- @@ -1339,12 +1521,16 @@
- struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
- + if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
- + return -EAGAIN;
- +
- src_rq = cpu_rq(arg->src_cpu);
- dst_rq = cpu_rq(arg->dst_cpu);
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
- +
- if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
- @@ -1426,8 +1612,8 @@
- */
- unsigned long wait_task_inactive(struct task_struct *p, long match_state)
- {
- - unsigned long flags;
- int running, queued;
- + struct rq_flags rf;
- unsigned long ncsw;
- struct rq *rq;
- @@ -1462,14 +1648,14 @@
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- trace_sched_wait_task(p);
- running = task_running(rq, p);
- queued = task_on_rq_queued(p);
- ncsw = 0;
- if (!match_state || p->state == match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- /*
- * If it changed from the expected state, bail out now.
- @@ -1543,6 +1729,25 @@
- /*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
- + *
- + * A few notes on cpu_active vs cpu_online:
- + *
- + * - cpu_active must be a subset of cpu_online
- + *
- + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
- + * see __set_cpus_allowed_ptr(). At this point the newly online
- + * cpu isn't yet part of the sched domains, and balancing will not
- + * see it.
- + *
- + * - on cpu-down we clear cpu_active() to mask the sched domains and
- + * avoid the load balancer to place new tasks on the to be removed
- + * cpu. Existing tasks will remain running there and will be taken
- + * off.
- + *
- + * This means that fallback selection must not select !active CPUs.
- + * And can assume that any active CPU must be online. Conversely
- + * select_task_rq() below may allow selection of !active CPUs in order
- + * to satisfy the above rules.
- */
- static int select_fallback_rq(int cpu, struct task_struct *p)
- {
- @@ -1561,8 +1766,6 @@
- /* Look for allowed, online CPU in same node. */
- for_each_cpu(dest_cpu, nodemask) {
- - if (!cpu_online(dest_cpu))
- - continue;
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- @@ -1573,20 +1776,21 @@
- for (;;) {
- /* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
- - if (!cpu_online(dest_cpu))
- - continue;
- - if (!cpu_active(dest_cpu))
- + if (!is_cpu_allowed(p, dest_cpu))
- continue;
- +
- goto out;
- }
- + /* No more Mr. Nice Guy. */
- switch (state) {
- case cpuset:
- - /* No more Mr. Nice Guy. */
- - cpuset_cpus_allowed_fallback(p);
- - state = possible;
- - break;
- -
- + if (IS_ENABLED(CONFIG_CPUSETS)) {
- + cpuset_cpus_allowed_fallback(p);
- + state = possible;
- + break;
- + }
- + /* fall-through */
- case possible:
- do_set_cpus_allowed(p, cpu_possible_mask);
- state = fail;
- @@ -1618,9 +1822,14 @@
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
- */
- static inline
- -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
- +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
- + int sibling_count_hint)
- {
- - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
- + lockdep_assert_held(&p->pi_lock);
- +
- + if (tsk_nr_cpus_allowed(p) > 1)
- + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
- + sibling_count_hint);
- /*
- * In order not to call set_task_cpu() on a blocking task we need
- @@ -1658,23 +1867,25 @@
- static void
- ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
- {
- -#ifdef CONFIG_SCHEDSTATS
- - struct rq *rq = this_rq();
- + struct rq *rq;
- -#ifdef CONFIG_SMP
- - int this_cpu = smp_processor_id();
- + if (!schedstat_enabled())
- + return;
- +
- + rq = this_rq();
- - if (cpu == this_cpu) {
- - schedstat_inc(rq, ttwu_local);
- - schedstat_inc(p, se.statistics.nr_wakeups_local);
- +#ifdef CONFIG_SMP
- + if (cpu == rq->cpu) {
- + schedstat_inc(rq->ttwu_local);
- + schedstat_inc(p->se.statistics.nr_wakeups_local);
- } else {
- struct sched_domain *sd;
- - schedstat_inc(p, se.statistics.nr_wakeups_remote);
- + schedstat_inc(p->se.statistics.nr_wakeups_remote);
- rcu_read_lock();
- - for_each_domain(this_cpu, sd) {
- + for_each_domain(rq->cpu, sd) {
- if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- - schedstat_inc(sd, ttwu_wake_remote);
- + schedstat_inc(sd->ttwu_wake_remote);
- break;
- }
- }
- @@ -1682,34 +1893,27 @@
- }
- if (wake_flags & WF_MIGRATED)
- - schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- -
- + schedstat_inc(p->se.statistics.nr_wakeups_migrate);
- #endif /* CONFIG_SMP */
- - schedstat_inc(rq, ttwu_count);
- - schedstat_inc(p, se.statistics.nr_wakeups);
- + schedstat_inc(rq->ttwu_count);
- + schedstat_inc(p->se.statistics.nr_wakeups);
- if (wake_flags & WF_SYNC)
- - schedstat_inc(p, se.statistics.nr_wakeups_sync);
- -
- -#endif /* CONFIG_SCHEDSTATS */
- + schedstat_inc(p->se.statistics.nr_wakeups_sync);
- }
- static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
- {
- activate_task(rq, p, en_flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
- -
- - /* if a worker is waking up, notify workqueue */
- - if (p->flags & PF_WQ_WORKER)
- - wq_worker_waking_up(p, cpu_of(rq));
- }
- /*
- * Mark the task runnable and perform wakeup-preemption.
- */
- -static void
- -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
- +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- + struct pin_cookie cookie)
- {
- check_preempt_curr(rq, p, wake_flags);
- p->state = TASK_RUNNING;
- @@ -1718,9 +1922,12 @@
- #ifdef CONFIG_SMP
- if (p->sched_class->task_woken) {
- /*
- - * XXX can drop rq->lock; most likely ok.
- + * Our task @p is fully woken up and running; so it's safe to
- + * drop the rq->lock, hereafter rq is only used for statistics.
- */
- + lockdep_unpin_lock(&rq->lock, cookie);
- p->sched_class->task_woken(rq, p);
- + lockdep_repin_lock(&rq->lock, cookie);
- }
- if (rq->idle_stamp) {
- @@ -1738,15 +1945,18 @@
- }
- static void
- -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
- +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- + struct pin_cookie cookie)
- {
- + lockdep_assert_held(&rq->lock);
- +
- #ifdef CONFIG_SMP
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
- #endif
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
- - ttwu_do_wakeup(rq, p, wake_flags);
- + ttwu_do_wakeup(rq, p, wake_flags, cookie);
- }
- /*
- @@ -1757,17 +1967,18 @@
- */
- static int ttwu_remote(struct task_struct *p, int wake_flags)
- {
- + struct rq_flags rf;
- struct rq *rq;
- int ret = 0;
- - rq = __task_rq_lock(p);
- + rq = __task_rq_lock(p, &rf);
- if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
- - ttwu_do_wakeup(rq, p, wake_flags);
- + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
- ret = 1;
- }
- - __task_rq_unlock(rq);
- + __task_rq_unlock(rq, &rf);
- return ret;
- }
- @@ -1777,6 +1988,7 @@
- {
- struct rq *rq = this_rq();
- struct llist_node *llist = llist_del_all(&rq->wake_list);
- + struct pin_cookie cookie;
- struct task_struct *p;
- unsigned long flags;
- @@ -1784,13 +1996,15 @@
- return;
- raw_spin_lock_irqsave(&rq->lock, flags);
- + cookie = lockdep_pin_lock(&rq->lock);
- while (llist) {
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
- - ttwu_do_activate(rq, p, 0);
- + ttwu_do_activate(rq, p, 0, cookie);
- }
- + lockdep_unpin_lock(&rq->lock, cookie);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -1877,6 +2091,7 @@
- static void ttwu_queue(struct task_struct *p, int cpu)
- {
- struct rq *rq = cpu_rq(cpu);
- + struct pin_cookie cookie;
- #if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- @@ -1887,15 +2102,110 @@
- #endif
- raw_spin_lock(&rq->lock);
- - ttwu_do_activate(rq, p, 0);
- + cookie = lockdep_pin_lock(&rq->lock);
- + ttwu_do_activate(rq, p, 0, cookie);
- + lockdep_unpin_lock(&rq->lock, cookie);
- raw_spin_unlock(&rq->lock);
- }
- +/*
- + * Notes on Program-Order guarantees on SMP systems.
- + *
- + * MIGRATION
- + *
- + * The basic program-order guarantee on SMP systems is that when a task [t]
- + * migrates, all its activity on its old cpu [c0] happens-before any subsequent
- + * execution on its new cpu [c1].
- + *
- + * For migration (of runnable tasks) this is provided by the following means:
- + *
- + * A) UNLOCK of the rq(c0)->lock scheduling out task t
- + * B) migration for t is required to synchronize *both* rq(c0)->lock and
- + * rq(c1)->lock (if not at the same time, then in that order).
- + * C) LOCK of the rq(c1)->lock scheduling in task
- + *
- + * Transitivity guarantees that B happens after A and C after B.
- + * Note: we only require RCpc transitivity.
- + * Note: the cpu doing B need not be c0 or c1
- + *
- + * Example:
- + *
- + * CPU0 CPU1 CPU2
- + *
- + * LOCK rq(0)->lock
- + * sched-out X
- + * sched-in Y
- + * UNLOCK rq(0)->lock
- + *
- + * LOCK rq(0)->lock // orders against CPU0
- + * dequeue X
- + * UNLOCK rq(0)->lock
- + *
- + * LOCK rq(1)->lock
- + * enqueue X
- + * UNLOCK rq(1)->lock
- + *
- + * LOCK rq(1)->lock // orders against CPU2
- + * sched-out Z
- + * sched-in X
- + * UNLOCK rq(1)->lock
- + *
- + *
- + * BLOCKING -- aka. SLEEP + WAKEUP
- + *
- + * For blocking we (obviously) need to provide the same guarantee as for
- + * migration. However the means are completely different as there is no lock
- + * chain to provide order. Instead we do:
- + *
- + * 1) smp_store_release(X->on_cpu, 0)
- + * 2) smp_cond_load_acquire(!X->on_cpu)
- + *
- + * Example:
- + *
- + * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
- + *
- + * LOCK rq(0)->lock LOCK X->pi_lock
- + * dequeue X
- + * sched-out X
- + * smp_store_release(X->on_cpu, 0);
- + *
- + * smp_cond_load_acquire(&X->on_cpu, !VAL);
- + * X->state = WAKING
- + * set_task_cpu(X,2)
- + *
- + * LOCK rq(2)->lock
- + * enqueue X
- + * X->state = RUNNING
- + * UNLOCK rq(2)->lock
- + *
- + * LOCK rq(2)->lock // orders against CPU1
- + * sched-out Z
- + * sched-in X
- + * UNLOCK rq(2)->lock
- + *
- + * UNLOCK X->pi_lock
- + * UNLOCK rq(0)->lock
- + *
- + *
- + * However; for wakeups there is a second guarantee we must provide, namely we
- + * must observe the state that led to our wakeup. That is, not only must our
- + * task observe its own prior state, it must also observe the stores prior to
- + * its wakeup.
- + *
- + * This means that any means of doing remote wakeups must order the CPU doing
- + * the wakeup against the CPU the task is going to end up running on. This,
- + * however, is already required for the regular Program-Order guarantee above,
- + * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
- + *
- + */
- +
- /**
- * try_to_wake_up - wake up a thread
- * @p: the thread to be awakened
- * @state: the mask of task states that can be woken
- * @wake_flags: wake modifier flags (WF_*)
- + * @sibling_count_hint: A hint at the number of threads that are being woken up
- + * in this event.
- *
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- @@ -1907,7 +2217,8 @@
- * or @state didn't match @p's state.
- */
- static int
- -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
- +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
- + int sibling_count_hint)
- {
- unsigned long flags;
- int cpu, success = 0;
- @@ -1959,15 +2270,34 @@
- #ifdef CONFIG_SMP
- /*
- - * If the owning (remote) cpu is still in the middle of schedule() with
- - * this task as prev, wait until its done referencing the task.
- + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
- + * possible to, falsely, observe p->on_cpu == 0.
- + *
- + * One must be running (->on_cpu == 1) in order to remove oneself
- + * from the runqueue.
- + *
- + * [S] ->on_cpu = 1; [L] ->on_rq
- + * UNLOCK rq->lock
- + * RMB
- + * LOCK rq->lock
- + * [S] ->on_rq = 0; [L] ->on_cpu
- + *
- + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
- + * from the consecutive calls to schedule(); the first switching to our
- + * task, the second putting it to sleep.
- */
- - while (p->on_cpu)
- - cpu_relax();
- + smp_rmb();
- +
- /*
- - * Pairs with the smp_wmb() in finish_lock_switch().
- + * If the owning (remote) cpu is still in the middle of schedule() with
- + * this task as prev, wait until it's done referencing the task.
- + *
- + * Pairs with the smp_store_release() in finish_lock_switch().
- + *
- + * This ensures that tasks getting woken will be fully ordered against
- + * their previous state and preserve Program Order.
- */
- - smp_rmb();
- + smp_cond_load_acquire(&p->on_cpu, !VAL);
- rq = cpu_rq(task_cpu(p));
- @@ -1983,8 +2313,8 @@
- if (p->sched_class->task_waking)
- p->sched_class->task_waking(p);
- - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
- -
- + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
- + sibling_count_hint);
- if (task_cpu(p) != cpu) {
- wake_flags |= WF_MIGRATED;
- set_task_cpu(p, cpu);
- @@ -2002,47 +2332,6 @@
- }
- /**
- - * try_to_wake_up_local - try to wake up a local task with rq lock held
- - * @p: the thread to be awakened
- - *
- - * Put @p on the run-queue if it's not already there. The caller must
- - * ensure that this_rq() is locked, @p is bound to this_rq() and not
- - * the current task.
- - */
- -static void try_to_wake_up_local(struct task_struct *p)
- -{
- - struct rq *rq = task_rq(p);
- -
- - if (WARN_ON_ONCE(rq != this_rq()) ||
- - WARN_ON_ONCE(p == current))
- - return;
- -
- - lockdep_assert_held(&rq->lock);
- -
- - if (!raw_spin_trylock(&p->pi_lock)) {
- - raw_spin_unlock(&rq->lock);
- - raw_spin_lock(&p->pi_lock);
- - raw_spin_lock(&rq->lock);
- - }
- -
- - if (!(p->state & TASK_NORMAL))
- - goto out;
- -
- - if (!task_on_rq_queued(p)) {
- - u64 wallclock = walt_ktime_clock();
- -
- - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- - walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
- - ttwu_activate(rq, p, ENQUEUE_WAKEUP);
- - }
- -
- - ttwu_do_wakeup(rq, p, 0);
- - ttwu_stat(p, smp_processor_id(), 0);
- -out:
- - raw_spin_unlock(&p->pi_lock);
- -}
- -
- -/**
- * wake_up_process - Wake up a specific process
- * @p: The process to be woken up.
- *
- @@ -2056,34 +2345,13 @@
- */
- int wake_up_process(struct task_struct *p)
- {
- - WARN_ON(task_is_stopped_or_traced(p));
- - return try_to_wake_up(p, TASK_NORMAL, 0);
- + return try_to_wake_up(p, TASK_NORMAL, 0, 1);
- }
- EXPORT_SYMBOL(wake_up_process);
- -/**
- - * wake_up_process_no_notif - Wake up a specific process without notifying
- - * governor
- - * @p: The process to be woken up.
- - *
- - * Attempt to wake up the nominated process and move it to the set of runnable
- - * processes.
- - *
- - * Return: 1 if the process was woken up, 0 if it was already running.
- - *
- - * It may be assumed that this function implies a write memory barrier before
- - * changing the task state if and only if any tasks are woken up.
- - */
- -int wake_up_process_no_notif(struct task_struct *p)
- -{
- - WARN_ON(task_is_stopped_or_traced(p));
- - return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER);
- -}
- -EXPORT_SYMBOL(wake_up_process_no_notif);
- -
- int wake_up_state(struct task_struct *p, unsigned int state)
- {
- - return try_to_wake_up(p, state, 0);
- + return try_to_wake_up(p, state, 0, 1);
- }
- /*
- @@ -2120,6 +2388,10 @@
- p->se.prev_sum_exec_runtime = 0;
- p->se.nr_migrations = 0;
- p->se.vruntime = 0;
- +#ifdef CONFIG_SCHED_WALT
- + p->last_sleep_ts = 0;
- +#endif
- +
- INIT_LIST_HEAD(&p->se.group_node);
- walt_init_new_task_load(p);
- @@ -2128,19 +2400,19 @@
- #endif
- #ifdef CONFIG_SCHEDSTATS
- + /* Even if schedstat is disabled, there should not be garbage */
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
- #endif
- -#ifdef CONFIG_CPU_FREQ_STAT
- - cpufreq_task_stats_init(p);
- -#endif
- -
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- __dl_clear_params(p);
- - init_rt_schedtune_timer(&p->rt);
- INIT_LIST_HEAD(&p->rt.run_list);
- + p->rt.timeout = 0;
- + p->rt.time_slice = sched_rr_timeslice;
- + p->rt.on_rq = 0;
- + p->rt.on_list = 0;
- #ifdef CONFIG_PREEMPT_NOTIFIERS
- INIT_HLIST_HEAD(&p->preempt_notifiers);
- @@ -2171,31 +2443,88 @@
- #endif /* CONFIG_NUMA_BALANCING */
- }
- +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
- +
- #ifdef CONFIG_NUMA_BALANCING
- -#ifdef CONFIG_SCHED_DEBUG
- +
- void set_numabalancing_state(bool enabled)
- {
- if (enabled)
- - sched_feat_set("NUMA");
- + static_branch_enable(&sched_numa_balancing);
- else
- - sched_feat_set("NO_NUMA");
- + static_branch_disable(&sched_numa_balancing);
- }
- -#else
- -__read_mostly bool numabalancing_enabled;
- -void set_numabalancing_state(bool enabled)
- +#ifdef CONFIG_PROC_SYSCTL
- +int sysctl_numa_balancing(struct ctl_table *table, int write,
- + void __user *buffer, size_t *lenp, loff_t *ppos)
- {
- - numabalancing_enabled = enabled;
- + struct ctl_table t;
- + int err;
- + int state = static_branch_likely(&sched_numa_balancing);
- +
- + if (write && !capable(CAP_SYS_ADMIN))
- + return -EPERM;
- +
- + t = *table;
- + t.data = &state;
- + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
- + if (err < 0)
- + return err;
- + if (write)
- + set_numabalancing_state(state);
- + return err;
- }
- -#endif /* CONFIG_SCHED_DEBUG */
- +#endif
- +#endif
- +
- +DEFINE_STATIC_KEY_FALSE(sched_schedstats);
- +
- +#ifdef CONFIG_SCHEDSTATS
- +static void set_schedstats(bool enabled)
- +{
- + if (enabled)
- + static_branch_enable(&sched_schedstats);
- + else
- + static_branch_disable(&sched_schedstats);
- +}
- +
- +void force_schedstat_enabled(void)
- +{
- + if (!schedstat_enabled()) {
- + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
- + static_branch_enable(&sched_schedstats);
- + }
- +}
- +
- +static int __init setup_schedstats(char *str)
- +{
- + int ret = 0;
- + if (!str)
- + goto out;
- +
- + if (!strcmp(str, "enable")) {
- + set_schedstats(true);
- + ret = 1;
- + } else if (!strcmp(str, "disable")) {
- + set_schedstats(false);
- + ret = 1;
- + }
- +out:
- + if (!ret)
- + pr_warn("Unable to parse schedstats=\n");
- +
- + return ret;
- +}
- +__setup("schedstats=", setup_schedstats);
- #ifdef CONFIG_PROC_SYSCTL
- -int sysctl_numa_balancing(struct ctl_table *table, int write,
- +int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
- {
- struct ctl_table t;
- int err;
- - int state = numabalancing_enabled;
- + int state = static_branch_likely(&sched_schedstats);
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
- @@ -2206,7 +2535,7 @@
- if (err < 0)
- return err;
- if (write)
- - set_numabalancing_state(state);
- + set_schedstats(state);
- return err;
- }
- #endif
- @@ -2220,12 +2549,11 @@
- unsigned long flags;
- int cpu = get_cpu();
- - __sched_fork(clone_flags, p);
- -
- #ifdef CONFIG_CPU_FREQ_STAT
- - cpufreq_task_stats_alloc(p);
- + cpufreq_task_stats_init(p);
- #endif
- + __sched_fork(clone_flags, p);
- /*
- * We mark the process as running here. This guarantees that
- * nobody will actually run it, and a signal or other external
- @@ -2268,8 +2596,7 @@
- p->sched_class = &fair_sched_class;
- }
- - if (p->sched_class->task_fork)
- - p->sched_class->task_fork(p);
- + init_entity_runnable_average(&p->se);
- /*
- * The child is not yet in the pid-hash so no cgroup attach races,
- @@ -2279,7 +2606,13 @@
- * Silence PROVE_RCU.
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- - set_task_cpu(p, cpu);
- + /*
- + * We're setting the cpu for the first time, we don't migrate,
- + * so use __set_task_cpu().
- + */
- + __set_task_cpu(p, cpu);
- + if (p->sched_class->task_fork)
- + p->sched_class->task_fork(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
- @@ -2318,8 +2651,8 @@
- #ifdef CONFIG_SMP
- inline struct dl_bw *dl_bw_of(int i)
- {
- - rcu_lockdep_assert(rcu_read_lock_sched_held(),
- - "sched RCU must be held");
- + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- + "sched RCU must be held");
- return &cpu_rq(i)->rd->dl_bw;
- }
- @@ -2328,8 +2661,8 @@
- struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
- - rcu_lockdep_assert(rcu_read_lock_sched_held(),
- - "sched RCU must be held");
- + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- + "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask)
- cpus++;
- @@ -2347,25 +2680,6 @@
- }
- #endif
- -static inline
- -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
- -{
- - dl_b->total_bw -= tsk_bw;
- -}
- -
- -static inline
- -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
- -{
- - dl_b->total_bw += tsk_bw;
- -}
- -
- -static inline
- -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
- -{
- - return dl_b->bw != -1 &&
- - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
- -}
- -
- /*
- * We must be sure that accepting a new task (or allowing changing the
- * parameters of an existing one) is consistent with the bandwidth
- @@ -2387,7 +2701,8 @@
- u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
- - if (new_bw == p->dl.dl_bw)
- + /* !deadline task may carry old deadline bandwidth */
- + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
- return 0;
- /*
- @@ -2426,45 +2741,76 @@
- */
- void wake_up_new_task(struct task_struct *p)
- {
- - unsigned long flags;
- + struct rq_flags rf;
- struct rq *rq;
- - raw_spin_lock_irqsave(&p->pi_lock, flags);
- + /* Initialize new task's runnable average */
- + init_entity_runnable_average(&p->se);
- + raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- walt_init_new_task_load(p);
- - /* Initialize new task's runnable average */
- - init_entity_runnable_average(&p->se);
- + p->state = TASK_RUNNING;
- #ifdef CONFIG_SMP
- /*
- * Fork balancing, do it here and not earlier because:
- * - cpus_allowed can change in the fork path
- * - any previously selected cpu might disappear through hotplug
- + *
- + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
- + * as we're not fully set-up yet.
- */
- - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
- + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
- #endif
- + rq = __task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- + post_init_entity_util_avg(&p->se);
- - rq = __task_rq_lock(p);
- walt_mark_task_starting(p);
- +
- activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
- p->on_rq = TASK_ON_RQ_QUEUED;
- trace_sched_wakeup_new(p);
- check_preempt_curr(rq, p, WF_FORK);
- #ifdef CONFIG_SMP
- - if (p->sched_class->task_woken)
- + if (p->sched_class->task_woken) {
- + /*
- + * Nothing relies on rq->lock after this, so its fine to
- + * drop it.
- + */
- + lockdep_unpin_lock(&rq->lock, rf.cookie);
- p->sched_class->task_woken(rq, p);
- + lockdep_repin_lock(&rq->lock, rf.cookie);
- + }
- #endif
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- }
- #ifdef CONFIG_PREEMPT_NOTIFIERS
- +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
- +
- +void preempt_notifier_inc(void)
- +{
- + static_key_slow_inc(&preempt_notifier_key);
- +}
- +EXPORT_SYMBOL_GPL(preempt_notifier_inc);
- +
- +void preempt_notifier_dec(void)
- +{
- + static_key_slow_dec(&preempt_notifier_key);
- +}
- +EXPORT_SYMBOL_GPL(preempt_notifier_dec);
- +
- /**
- * preempt_notifier_register - tell me when current is being preempted & rescheduled
- * @notifier: notifier struct to register
- */
- void preempt_notifier_register(struct preempt_notifier *notifier)
- {
- + if (!static_key_false(&preempt_notifier_key))
- + WARN(1, "registering preempt_notifier while notifiers disabled\n");
- +
- + hlist_add_head(&notifier->link, &current->preempt_notifiers);
- }
- EXPORT_SYMBOL_GPL(preempt_notifier_register);
- @@ -2473,7 +2819,7 @@
- * preempt_notifier_unregister - no longer interested in preemption notifications
- * @notifier: notifier struct to unregister
- *
- - * This is safe to call from within a preemption notifier.
- + * This is *not* safe to call from within a preemption notifier.
- */
- void preempt_notifier_unregister(struct preempt_notifier *notifier)
- {
- @@ -2481,7 +2827,7 @@
- }
- EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
- -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
- +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
- {
- struct preempt_notifier *notifier;
- @@ -2489,9 +2835,15 @@
- notifier->ops->sched_in(notifier, raw_smp_processor_id());
- }
- +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
- +{
- + if (static_key_false(&preempt_notifier_key))
- + __fire_sched_in_preempt_notifiers(curr);
- +}
- +
- static void
- -fire_sched_out_preempt_notifiers(struct task_struct *curr,
- - struct task_struct *next)
- +__fire_sched_out_preempt_notifiers(struct task_struct *curr,
- + struct task_struct *next)
- {
- struct preempt_notifier *notifier;
- @@ -2499,13 +2851,21 @@
- notifier->ops->sched_out(notifier, next);
- }
- +static __always_inline void
- +fire_sched_out_preempt_notifiers(struct task_struct *curr,
- + struct task_struct *next)
- +{
- + if (static_key_false(&preempt_notifier_key))
- + __fire_sched_out_preempt_notifiers(curr, next);
- +}
- +
- #else /* !CONFIG_PREEMPT_NOTIFIERS */
- -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
- +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
- {
- }
- -static void
- +static inline void
- fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
- {
- @@ -2530,7 +2890,6 @@
- prepare_task_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
- {
- - trace_sched_switch(prev, next);
- sched_info_switch(rq, prev, next);
- perf_event_task_sched_out(prev, next);
- fire_sched_out_preempt_notifiers(prev, next);
- @@ -2545,7 +2904,6 @@
- /**
- * finish_task_switch - clean up after a task-switch
- - * @rq: runqueue associated with task-switch
- * @prev: the thread we just switched away from.
- *
- * finish_task_switch must be called after the context switch, paired
- @@ -2557,13 +2915,35 @@
- * so, we finish that here outside of the runqueue lock. (Doing it
- * with the lock held can cause deadlocks; see schedule() for
- * details.)
- + *
- + * The context switch have flipped the stack from under us and restored the
- + * local variables which were saved when this task called schedule() in the
- + * past. prev == current is still correct but we need to recalculate this_rq
- + * because prev may have moved to another CPU.
- */
- -static void finish_task_switch(struct rq *rq, struct task_struct *prev)
- +static struct rq *finish_task_switch(struct task_struct *prev)
- __releases(rq->lock)
- {
- + struct rq *rq = this_rq();
- struct mm_struct *mm = rq->prev_mm;
- long prev_state;
- + /*
- + * The previous task will have left us with a preempt_count of 2
- + * because it left us after:
- + *
- + * schedule()
- + * preempt_disable(); // 1
- + * __schedule()
- + * raw_spin_lock_irq(&rq->lock) // 2
- + *
- + * Also, see FORK_PREEMPT_COUNT.
- + */
- + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
- + "corrupted preempt_count: %s/%d/0x%x\n",
- + current->comm, current->pid, preempt_count()))
- + preempt_count_set(FORK_PREEMPT_COUNT);
- +
- rq->prev_mm = NULL;
- /*
- @@ -2579,7 +2959,6 @@
- */
- prev_state = prev->state;
- vtime_task_switch(prev);
- - finish_arch_switch(prev);
- perf_event_task_sched_in(prev, current);
- finish_lock_switch(rq, prev);
- finish_arch_post_lock_switch();
- @@ -2596,10 +2975,15 @@
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
- +
- + /* Task is done with its stack. */
- + put_task_stack(prev);
- +
- put_task_struct(prev);
- }
- tick_nohz_task_switch(current);
- + return rq;
- }
- #ifdef CONFIG_SMP
- @@ -2646,27 +3030,31 @@
- asmlinkage __visible void schedule_tail(struct task_struct *prev)
- __releases(rq->lock)
- {
- - struct rq *rq = this_rq();
- -
- - finish_task_switch(rq, prev);
- + struct rq *rq;
- /*
- - * FIXME: do we need to worry about rq being invalidated by the
- - * task_switch?
- + * New tasks start with FORK_PREEMPT_COUNT, see there and
- + * finish_task_switch() for details.
- + *
- + * finish_task_switch() will drop rq->lock() and lower preempt_count
- + * and the preempt_enable() will end up enabling preemption (on
- + * PREEMPT_COUNT kernels).
- */
- +
- + rq = finish_task_switch(prev);
- balance_callback(rq);
- + preempt_enable();
- if (current->set_child_tid)
- put_user(task_pid_vnr(current), current->set_child_tid);
- }
- /*
- - * context_switch - switch to the new MM and the new
- - * thread's register state.
- + * context_switch - switch to the new MM and the new thread's register state.
- */
- -static inline void
- +static inline struct rq *
- context_switch(struct rq *rq, struct task_struct *prev,
- - struct task_struct *next)
- + struct task_struct *next, struct pin_cookie cookie)
- {
- struct mm_struct *mm, *oldmm;
- @@ -2698,19 +3086,15 @@
- * of the scheduler it's an obvious special-case), so we
- * do an early lockdep release here:
- */
- + lockdep_unpin_lock(&rq->lock, cookie);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- context_tracking_task_switch(prev, next);
- /* Here we just switch the register state and the stack. */
- switch_to(prev, next, prev);
- -
- barrier();
- - /*
- - * this_rq must be evaluated again because prev may have moved
- - * CPUs since it called schedule(), thus the 'rq' on its stack
- - * frame will be invalid.
- - */
- - finish_task_switch(this_rq(), prev);
- +
- + return finish_task_switch(prev);
- }
- /*
- @@ -2775,6 +3159,36 @@
- return atomic_read(&this->nr_iowait);
- }
- +#ifdef CONFIG_CPU_QUIET
- +u64 nr_running_integral(unsigned int cpu)
- +{
- + unsigned int seqcnt;
- + u64 integral;
- + struct rq *q;
- +
- + if (cpu >= nr_cpu_ids)
- + return 0;
- +
- + q = cpu_rq(cpu);
- +
- + /*
- + * Update average to avoid reading a stale value if there were
- + * no run-queue changes for a long time. On the other hand if
- + * the changes are happening right now, just read current value
- + * directly.
- + */
- +
- + seqcnt = read_seqcount_begin(&q->ave_seqcnt);
- + integral = do_nr_running_integral(q);
- + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
- + read_seqcount_begin(&q->ave_seqcnt);
- + integral = q->nr_running_integral;
- + }
- +
- + return integral;
- +}
- +#endif
- +
- void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
- {
- struct rq *rq = this_rq();
- @@ -2795,7 +3209,7 @@
- int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
- + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
- if (dest_cpu == smp_processor_id())
- goto unlock;
- @@ -2825,7 +3239,7 @@
- */
- unsigned long long task_sched_runtime(struct task_struct *p)
- {
- - unsigned long flags;
- + struct rq_flags rf;
- struct rq *rq;
- u64 ns;
- @@ -2845,7 +3259,7 @@
- return p->se.sum_exec_runtime;
- #endif
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- /*
- * Must be ->curr _and_ ->on_rq. If dequeued, we would
- * project cycles that may never be accounted to this
- @@ -2856,7 +3270,7 @@
- p->sched_class->update_curr(rq);
- }
- ns = p->se.sum_exec_runtime;
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return ns;
- }
- @@ -2879,6 +3293,7 @@
- return total += scr->dl;
- }
- +unsigned long boosted_cpu_util(int cpu);
- static void sched_freq_tick_pelt(int cpu)
- {
- unsigned long cpu_utilization = boosted_cpu_util(cpu);
- @@ -2889,47 +3304,45 @@
- if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
- return;
- - if (!use_util_est())
- - cpu_utilization = capacity_max;
- -
- /*
- * To make free room for a task that is building up its "real"
- * utilization and to harm its performance the least, request
- * a jump to a higher OPP as soon as the margin of free capacity
- * is impacted (specified by capacity_margin).
- + * Remember CPU utilization in sched_capacity_reqs should be normalised.
- */
- + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
- set_cfs_cpu_capacity(cpu, true, cpu_utilization);
- }
- #ifdef CONFIG_SCHED_WALT
- static void sched_freq_tick_walt(int cpu)
- {
- - unsigned long cpu_utilization = cpu_util(cpu, UTIL_EST);
- + unsigned long cpu_utilization = cpu_util_freq(cpu);
- unsigned long capacity_curr = capacity_curr_of(cpu);
- if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
- return sched_freq_tick_pelt(cpu);
- /*
- - * Add a margin to the WALT utilization.
- + * Add a margin to the WALT utilization to check if we will need to
- + * increase frequency.
- * NOTE: WALT tracks a single CPU signal for all the scheduling
- * classes, thus this margin is going to be added to the DL class as
- * well, which is something we do not do in sched_freq_tick_pelt case.
- - *
- - * TODO:
- - * Here we're adding margin, but we're also adding margin in cpufreq.
- - * There shouldn't be a double addition.
- */
- - cpu_utilization = add_capacity_margin(cpu_utilization);
- - if (cpu_utilization <= capacity_curr)
- + if (add_capacity_margin(cpu_utilization) <= capacity_curr)
- return;
- /*
- * It is likely that the load is growing so we
- * keep the added margin in our request as an
- * extra boost.
- + * Remember CPU utilization in sched_capacity_reqs should be normalised.
- */
- + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
- set_cfs_cpu_capacity(cpu, true, cpu_utilization);
- +
- }
- #define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
- #else
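- A quick worked example of the normalisation added to both tick paths above (the
- numbers are hypothetical and chosen only to show the arithmetic):
-
-     /* little CPU: capacity_orig_of(cpu) == 512, SCHED_CAPACITY_SCALE == 1024 */
-     raw cpu_utilization = 300
-     normalised          = 300 * 1024 / 512 = 600
-
- so the value handed to set_cfs_cpu_capacity() is always on the common 0..1024
- scale expected by sched_capacity_reqs, regardless of the CPU's native capacity.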
- @@ -2938,16 +3351,9 @@
- static void sched_freq_tick(int cpu)
- {
- - unsigned long capacity_orig, capacity_curr;
- -
- if (!sched_freq())
- return;
- - capacity_orig = capacity_orig_of(cpu);
- - capacity_curr = capacity_curr_of(cpu);
- - if (capacity_curr == capacity_orig)
- - return;
- -
- _sched_freq_tick(cpu);
- }
- #else
- @@ -2968,11 +3374,11 @@
- raw_spin_lock(&rq->lock);
- walt_set_window_start(rq);
- + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- + walt_ktime_clock(), 0);
- update_rq_clock(rq);
- curr->sched_class->task_tick(rq, curr, 0);
- update_cpu_load_active(rq);
- - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- - walt_ktime_clock(), 0);
- calc_global_load_tick(rq);
- sched_freq_tick(cpu);
- raw_spin_unlock(&rq->lock);
- @@ -3115,25 +3521,23 @@
- if (task_stack_end_corrupted(prev))
- panic("corrupted stack end detected inside scheduler\n");
- #endif
- - /*
- - * Test if we are atomic. Since do_exit() needs to call into
- - * schedule() atomically, we ignore that path. Otherwise whine
- - * if we are scheduling when we should not.
- - */
- - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
- +
- + if (unlikely(in_atomic_preempt_off())) {
- __schedule_bug(prev);
- + preempt_count_set(PREEMPT_DISABLED);
- + }
- rcu_sleep_check();
- profile_hit(SCHED_PROFILING, __builtin_return_address(0));
- - schedstat_inc(this_rq(), sched_count);
- + schedstat_inc(this_rq()->sched_count);
- }
- /*
- * Pick up the highest-prio task:
- */
- static inline struct task_struct *
- -pick_next_task(struct rq *rq, struct task_struct *prev)
- +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- const struct sched_class *class = &fair_sched_class;
- struct task_struct *p;
- @@ -3144,20 +3548,21 @@
- */
- if (likely(prev->sched_class == class &&
- rq->nr_running == rq->cfs.h_nr_running)) {
- - p = fair_sched_class.pick_next_task(rq, prev);
- + p = fair_sched_class.pick_next_task(rq, prev, cookie);
- if (unlikely(p == RETRY_TASK))
- goto again;
- /* assumes fair_sched_class->next == idle_sched_class */
- if (unlikely(!p))
- - p = idle_sched_class.pick_next_task(rq, prev);
- + p = idle_sched_class.pick_next_task(rq, prev, cookie);
- - return p;
- + if (likely(p != RETRY_TASK))
- + return p;
- }
- again:
- for_each_class(class) {
- - p = class->pick_next_task(rq, prev);
- + p = class->pick_next_task(rq, prev, cookie);
- if (p) {
- if (unlikely(p == RETRY_TASK))
- goto again;
- @@ -3204,20 +3609,20 @@
- * - explicit schedule() call
- * - return from syscall or exception to user-space
- * - return from interrupt-handler to user-space
- + *
- + * WARNING: must be called with preemption disabled!
- */
- -static void __sched __schedule(void)
- +static void __sched notrace __schedule(bool preempt)
- {
- struct task_struct *prev, *next;
- unsigned long *switch_count;
- + struct pin_cookie cookie;
- struct rq *rq;
- int cpu;
- u64 wallclock;
- -need_resched:
- - preempt_disable();
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
- - rcu_note_context_switch(cpu);
- prev = rq->curr;
- schedule_debug(prev);
- @@ -3225,77 +3630,105 @@
- if (sched_feat(HRTICK))
- hrtick_clear(rq);
- + local_irq_disable();
- + rcu_note_context_switch();
- +
- /*
- * Make sure that signal_pending_state()->signal_pending() below
- * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
- * done by the caller to avoid the race with signal_wake_up().
- */
- smp_mb__before_spinlock();
- - raw_spin_lock_irq(&rq->lock);
- + raw_spin_lock(&rq->lock);
- + cookie = lockdep_pin_lock(&rq->lock);
- +
- + rq->clock_skip_update <<= 1; /* promote REQ to ACT */
- switch_count = &prev->nivcsw;
- - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- + if (!preempt && prev->state) {
- if (unlikely(signal_pending_state(prev->state, prev))) {
- prev->state = TASK_RUNNING;
- } else {
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
- prev->on_rq = 0;
- - /*
- - * If a worker went to sleep, notify and ask workqueue
- - * whether it wants to wake up a task to maintain
- - * concurrency.
- - */
- - if (prev->flags & PF_WQ_WORKER) {
- - struct task_struct *to_wakeup;
- -
- - to_wakeup = wq_worker_sleeping(prev, cpu);
- - if (to_wakeup)
- - try_to_wake_up_local(to_wakeup);
- - }
- }
- switch_count = &prev->nvcsw;
- }
- - if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
- + if (task_on_rq_queued(prev))
- update_rq_clock(rq);
- - next = pick_next_task(rq, prev);
- + next = pick_next_task(rq, prev, cookie);
- wallclock = walt_ktime_clock();
- walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
- walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
- clear_tsk_need_resched(prev);
- clear_preempt_need_resched();
- - rq->skip_clock_update = 0;
- + rq->clock_skip_update = 0;
- if (likely(prev != next)) {
- +#ifdef CONFIG_SCHED_WALT
- + if (!prev->on_rq)
- + prev->last_sleep_ts = wallclock;
- +#endif
- rq->nr_switches++;
- rq->curr = next;
- ++*switch_count;
- - context_switch(rq, prev, next); /* unlocks the rq */
- - /*
- - * The context switch have flipped the stack from under us
- - * and restored the local variables which were saved when
- - * this task called schedule() in the past. prev == current
- - * is still correct, but it can be moved to another cpu/rq.
- - */
- - cpu = smp_processor_id();
- - rq = cpu_rq(cpu);
- - } else
- + //trace_sched_switch(preempt, prev, next);
- + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
- + cpu = cpu_of(rq);
- + } else {
- + lockdep_unpin_lock(&rq->lock, cookie);
- raw_spin_unlock_irq(&rq->lock);
- + }
- balance_callback(rq);
- +}
- - sched_preempt_enable_no_resched();
- - if (need_resched())
- - goto need_resched;
- +void __noreturn do_task_dead(void)
- +{
- + /*
- + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
- + * when the following two conditions become true.
- + * - There is race condition of mmap_sem (It is acquired by
- + * exit_mm()), and
- + * - SMI occurs before setting TASK_RUNINNG.
- + * (or hypervisor of virtual machine switches to other guest)
- + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
- + *
- + * To avoid it, we have to wait for releasing tsk->pi_lock which
- + * is held by try_to_wake_up()
- + */
- + smp_mb();
- +	raw_spin_unlock_wait(&current->pi_lock);
- +
- + /* causes final put_task_struct in finish_task_switch(). */
- + __set_current_state(TASK_DEAD);
- + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
- + __schedule(false);
- + BUG();
- + /* Avoid "noreturn function does return". */
- + for (;;)
- + cpu_relax(); /* For when BUG is null */
- }
- static inline void sched_submit_work(struct task_struct *tsk)
- {
- - if (!tsk->state || tsk_is_pi_blocked(tsk))
- + if (!tsk->state)
- + return;
- + /*
- + * If a worker went to sleep, notify and ask workqueue whether
- + * it wants to wake up a task to maintain concurrency.
- + */
- + if (tsk->flags & PF_WQ_WORKER)
- + wq_worker_sleeping(tsk);
- +
- +
- + if (tsk_is_pi_blocked(tsk))
- return;
- +
- /*
- * If we are going to sleep and we have plugged IO queued,
- * make sure to submit it to avoid deadlocks.
- @@ -3304,12 +3737,23 @@
- blk_schedule_flush_plug(tsk);
- }
- +static void sched_update_worker(struct task_struct *tsk)
- +{
- + if (tsk->flags & PF_WQ_WORKER)
- + wq_worker_running(tsk);
- +}
- +
- asmlinkage __visible void __sched schedule(void)
- {
- struct task_struct *tsk = current;
- sched_submit_work(tsk);
- - __schedule();
- + do {
- + preempt_disable();
- + __schedule(false);
- + sched_preempt_enable_no_resched();
- + } while (need_resched());
- + sched_update_worker(tsk);
- }
- EXPORT_SYMBOL(schedule);
- @@ -3344,6 +3788,20 @@
- preempt_disable();
- }
- +static void preempt_schedule_common(void)
- +{
- + do {
- + preempt_disable_notrace();
- + __schedule(true);
- + preempt_enable_no_resched_notrace();
- +
- + /*
- + * Check again in case we missed a preemption opportunity
- + * between schedule and now.
- + */
- + } while (need_resched());
- +}
- +
- #ifdef CONFIG_PREEMPT
- /*
- * this is the entry point to schedule() from in-kernel preemption
- @@ -3359,24 +3817,13 @@
- if (likely(!preemptible()))
- return;
- - do {
- - __preempt_count_add(PREEMPT_ACTIVE);
- - __schedule();
- - __preempt_count_sub(PREEMPT_ACTIVE);
- -
- - /*
- - * Check again in case we missed a preemption opportunity
- - * between schedule and now.
- - */
- - barrier();
- - } while (need_resched());
- + preempt_schedule_common();
- }
- NOKPROBE_SYMBOL(preempt_schedule);
- EXPORT_SYMBOL(preempt_schedule);
- -#ifdef CONFIG_CONTEXT_TRACKING
- /**
- - * preempt_schedule_context - preempt_schedule called by tracing
- + * preempt_schedule_notrace - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- @@ -3389,7 +3836,7 @@
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
- -asmlinkage __visible void __sched notrace preempt_schedule_context(void)
- +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
- {
- enum ctx_state prev_ctx;
- @@ -3397,22 +3844,20 @@
- return;
- do {
- - __preempt_count_add(PREEMPT_ACTIVE);
- + preempt_disable_notrace();
- /*
- * Needs preempt disabled in case user_exit() is traced
- * and the tracer calls preempt_enable_notrace() causing
- * an infinite recursion.
- */
- prev_ctx = exception_enter();
- - __schedule();
- + __schedule(true);
- exception_exit(prev_ctx);
- - __preempt_count_sub(PREEMPT_ACTIVE);
- - barrier();
- + preempt_enable_no_resched_notrace();
- } while (need_resched());
- }
- -EXPORT_SYMBOL_GPL(preempt_schedule_context);
- -#endif /* CONFIG_CONTEXT_TRACKING */
- +EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
- #endif /* CONFIG_PREEMPT */
- @@ -3432,17 +3877,11 @@
- prev_state = exception_enter();
- do {
- - __preempt_count_add(PREEMPT_ACTIVE);
- + preempt_disable();
- local_irq_enable();
- - __schedule();
- + __schedule(true);
- local_irq_disable();
- - __preempt_count_sub(PREEMPT_ACTIVE);
- -
- - /*
- - * Check again in case we missed a preemption opportunity
- - * between schedule and now.
- - */
- - barrier();
- + sched_preempt_enable_no_resched();
- } while (need_resched());
- exception_exit(prev_state);
- @@ -3451,7 +3890,7 @@
- int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
- void *key)
- {
- - return try_to_wake_up(curr->private, mode, wake_flags);
- + return try_to_wake_up(curr->private, mode, wake_flags, 1);
- }
- EXPORT_SYMBOL(default_wake_function);
- @@ -3470,13 +3909,15 @@
- */
- void rt_mutex_setprio(struct task_struct *p, int prio)
- {
- - int oldprio, queued, running, enqueue_flag = 0;
- - struct rq *rq;
- + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
- const struct sched_class *prev_class;
- + struct rq_flags rf;
- + struct rq *rq;
- BUG_ON(prio > MAX_PRIO);
- - rq = __task_rq_lock(p);
- + rq = __task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- /*
- * Idle task boosting is a nono in general. There is one
- @@ -3498,11 +3939,15 @@
- trace_sched_pi_setprio(p, prio);
- oldprio = p->prio;
- +
- + if (oldprio == prio)
- + queue_flag &= ~DEQUEUE_MOVE;
- +
- prev_class = p->sched_class;
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- - dequeue_task(rq, p, 0);
- + dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
- @@ -3520,8 +3965,7 @@
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.dl_boosted = 1;
- - p->dl.dl_throttled = 0;
- - enqueue_flag = ENQUEUE_REPLENISH;
- + queue_flag |= ENQUEUE_REPLENISH;
- } else
- p->dl.dl_boosted = 0;
- p->sched_class = &dl_sched_class;
- @@ -3529,7 +3973,7 @@
- if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
- if (oldprio < prio)
- - enqueue_flag = ENQUEUE_HEAD;
- + queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
- } else {
- if (dl_prio(oldprio))
- @@ -3541,15 +3985,15 @@
- p->prio = prio;
- - if (running)
- - p->sched_class->set_curr_task(rq);
- if (queued)
- - enqueue_task(rq, p, enqueue_flag);
- + enqueue_task(rq, p, queue_flag);
- + if (running)
- + set_curr_task(rq, p);
- check_class_changed(rq, p, prev_class, oldprio);
- out_unlock:
- preempt_disable(); /* avoid rq from going away on us */
- - __task_rq_unlock(rq);
- + __task_rq_unlock(rq, &rf);
- balance_callback(rq);
- preempt_enable();
- @@ -3558,8 +4002,9 @@
- void set_user_nice(struct task_struct *p, long nice)
- {
- - int old_prio, delta, queued;
- - unsigned long flags;
- + bool queued, running;
- + int old_prio, delta;
- + struct rq_flags rf;
- struct rq *rq;
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
- @@ -3568,7 +4013,9 @@
- * We have to be careful, if called from sys_setpriority(),
- * the task might be in the middle of scheduling on another CPU.
- */
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- +
- /*
- * The RT priorities are set via sched_setscheduler(), but we still
- * allow the 'normal' nice value to be set - but as expected
- @@ -3580,8 +4027,11 @@
- goto out_unlock;
- }
- queued = task_on_rq_queued(p);
- + running = task_current(rq, p);
- if (queued)
- - dequeue_task(rq, p, 0);
- + dequeue_task(rq, p, DEQUEUE_SAVE);
- + if (running)
- + put_prev_task(rq, p);
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
- @@ -3590,7 +4040,7 @@
- delta = p->prio - old_prio;
- if (queued) {
- - enqueue_task(rq, p, 0);
- + enqueue_task(rq, p, ENQUEUE_RESTORE);
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- @@ -3598,8 +4048,10 @@
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_curr(rq);
- }
- + if (running)
- + set_curr_task(rq, p);
- out_unlock:
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- }
- EXPORT_SYMBOL(set_user_nice);
- @@ -3874,18 +4326,33 @@
- return match;
- }
- +static bool dl_param_changed(struct task_struct *p,
- + const struct sched_attr *attr)
- +{
- + struct sched_dl_entity *dl_se = &p->dl;
- +
- + if (dl_se->dl_runtime != attr->sched_runtime ||
- + dl_se->dl_deadline != attr->sched_deadline ||
- + dl_se->dl_period != attr->sched_period ||
- + dl_se->flags != attr->sched_flags)
- + return true;
- +
- + return false;
- +}
- +
- static int __sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- - bool user)
- + bool user, bool pi)
- {
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
- - unsigned long flags;
- const struct sched_class *prev_class;
- - struct rq *rq;
- + struct rq_flags rf;
- int reset_on_fork;
- + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
- + struct rq *rq;
- /* may grab non-irq protected spin_locks */
- BUG_ON(in_interrupt());
- @@ -3897,10 +4364,7 @@
- } else {
- reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
- - if (policy != SCHED_DEADLINE &&
- - policy != SCHED_FIFO && policy != SCHED_RR &&
- - policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- - policy != SCHED_IDLE)
- + if (!valid_policy(policy))
- return -EINVAL;
- }
- @@ -3956,7 +4420,7 @@
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
- + if (idle_policy(p->policy) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
- @@ -3983,13 +4447,14 @@
- * To be able to change p->policy safely, the appropriate
- * runqueue lock must be held.
- */
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- /*
- * Changing the policy of the stop threads its a very bad idea
- */
- if (p == rq->stop) {
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return -EINVAL;
- }
- @@ -4002,11 +4467,11 @@
- goto change;
- if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
- goto change;
- - if (dl_policy(policy))
- + if (dl_policy(policy) && dl_param_changed(p, attr))
- goto change;
- p->sched_reset_on_fork = reset_on_fork;
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return 0;
- }
- change:
- @@ -4020,7 +4485,7 @@
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
- !task_group_is_autogroup(task_group(p))) {
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return -EPERM;
- }
- #endif
- @@ -4035,7 +4500,7 @@
- */
- if (!cpumask_subset(span, &p->cpus_allowed) ||
- rq->rd->dl_bw.bw == 0) {
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return -EPERM;
- }
- }
- @@ -4045,7 +4510,7 @@
- /* recheck policy now with rq lock held */
- if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
- policy = oldpolicy = -1;
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- goto recheck;
- }
- @@ -4055,52 +4520,55 @@
- * is available.
- */
- if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return -EBUSY;
- }
- p->sched_reset_on_fork = reset_on_fork;
- oldprio = p->prio;
- - /*
- - * Take priority boosted tasks into account. If the new
- - * effective priority is unchanged, we just store the new
- - * normal parameters and do not touch the scheduler class and
- - * the runqueue. This will be done when the task deboost
- - * itself.
- - */
- - new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- - if (new_effective_prio == oldprio) {
- - __setscheduler_params(p, attr);
- - task_rq_unlock(rq, p, &flags);
- - return 0;
- + if (pi) {
- + /*
- + * Take priority boosted tasks into account. If the new
- + * effective priority is unchanged, we just store the new
- + * normal parameters and do not touch the scheduler class and
- + * the runqueue. This will be done when the task deboost
- + * itself.
- + */
- + new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- + if (new_effective_prio == oldprio)
- + queue_flags &= ~DEQUEUE_MOVE;
- }
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- - dequeue_task(rq, p, 0);
- + dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
- prev_class = p->sched_class;
- - __setscheduler(rq, p, attr, true);
- + __setscheduler(rq, p, attr, pi);
- - if (running)
- - p->sched_class->set_curr_task(rq);
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- - enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
- + if (oldprio < p->prio)
- + queue_flags |= ENQUEUE_HEAD;
- +
- + enqueue_task(rq, p, queue_flags);
- }
- + if (running)
- + set_curr_task(rq, p);
- check_class_changed(rq, p, prev_class, oldprio);
- preempt_disable(); /* avoid rq from going away on us */
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- - rt_mutex_adjust_pi(p);
- + if (pi)
- + rt_mutex_adjust_pi(p);
- /*
- * Run balance callbacks after we've adjusted the PI chain.
- @@ -4127,7 +4595,7 @@
- attr.sched_policy = policy;
- }
- - return __sched_setscheduler(p, &attr, check);
- + return __sched_setscheduler(p, &attr, check, true);
- }
- /**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- @@ -4148,7 +4616,7 @@
- int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
- {
- - return __sched_setscheduler(p, attr, true);
- + return __sched_setscheduler(p, attr, true, true);
- }
- EXPORT_SYMBOL_GPL(sched_setattr);
- @@ -4170,6 +4638,7 @@
- {
- return _sched_setscheduler(p, policy, param, false);
- }
- +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
- static int
- do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
- @@ -4683,7 +5152,7 @@
- {
- struct rq *rq = this_rq_lock();
- - schedstat_inc(rq, yld_count);
- + schedstat_inc(rq->yld_count);
- current->sched_class->yield_task(rq);
- /*
- @@ -4700,22 +5169,17 @@
- return 0;
- }
- -static void __cond_resched(void)
- -{
- - __preempt_count_add(PREEMPT_ACTIVE);
- - __schedule();
- - __preempt_count_sub(PREEMPT_ACTIVE);
- -}
- -
- +#ifndef CONFIG_PREEMPT
- int __sched _cond_resched(void)
- {
- - if (should_resched()) {
- - __cond_resched();
- + if (should_resched(0)) {
- + preempt_schedule_common();
- return 1;
- }
- return 0;
- }
- EXPORT_SYMBOL(_cond_resched);
- +#endif
- /*
- * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
- @@ -4727,7 +5191,7 @@
- */
- int __cond_resched_lock(spinlock_t *lock)
- {
- - int resched = should_resched();
- + int resched = should_resched(PREEMPT_LOCK_OFFSET);
- int ret = 0;
- lockdep_assert_held(lock);
- @@ -4735,7 +5199,7 @@
- if (spin_needbreak(lock) || resched) {
- spin_unlock(lock);
- if (resched)
- - __cond_resched();
- + preempt_schedule_common();
- else
- cpu_relax();
- ret = 1;
- @@ -4749,9 +5213,9 @@
- {
- BUG_ON(!in_softirq());
- - if (should_resched()) {
- + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
- local_bh_enable();
- - __cond_resched();
- + preempt_schedule_common();
- local_bh_disable();
- return 1;
- }
- @@ -4841,7 +5305,7 @@
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
- if (yielded) {
- - schedstat_inc(rq, yld_count);
- + schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- @@ -4866,36 +5330,26 @@
- * This task is about to go to sleep on IO. Increment rq->nr_iowait so
- * that process accounting knows that this is a task in IO wait state.
- */
- -void __sched io_schedule(void)
- -{
- - struct rq *rq = raw_rq();
- -
- - delayacct_blkio_start();
- - atomic_inc(&rq->nr_iowait);
- - blk_flush_plug(current);
- - current->in_iowait = 1;
- - schedule();
- - current->in_iowait = 0;
- - atomic_dec(&rq->nr_iowait);
- - delayacct_blkio_end();
- -}
- -EXPORT_SYMBOL(io_schedule);
- -
- long __sched io_schedule_timeout(long timeout)
- {
- - struct rq *rq = raw_rq();
- + int old_iowait = current->in_iowait;
- + struct rq *rq;
- long ret;
- + current->in_iowait = 1;
- + blk_schedule_flush_plug(current);
- +
- delayacct_blkio_start();
- + rq = raw_rq();
- atomic_inc(&rq->nr_iowait);
- - blk_flush_plug(current);
- - current->in_iowait = 1;
- ret = schedule_timeout(timeout);
- - current->in_iowait = 0;
- + current->in_iowait = old_iowait;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
- +
- return ret;
- }
- +EXPORT_SYMBOL(io_schedule_timeout);
- /**
- * sys_sched_get_priority_max - return maximum RT priority.
- @@ -4966,10 +5420,10 @@
- {
- struct task_struct *p;
- unsigned int time_slice;
- - unsigned long flags;
- + struct rq_flags rf;
- + struct timespec t;
- struct rq *rq;
- int retval;
- - struct timespec t;
- if (pid < 0)
- return -EINVAL;
- @@ -4984,11 +5438,11 @@
- if (retval)
- goto out_unlock;
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- rcu_read_unlock();
- jiffies_to_timespec(time_slice, &t);
- @@ -5006,9 +5460,12 @@
- {
- unsigned long free = 0;
- int ppid;
- - unsigned state;
- + unsigned long state = p->state;
- - state = p->state ? __ffs(p->state) + 1 : 0;
- + if (!try_get_task_stack(p))
- + return;
- + if (state)
- + state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
- #if BITS_PER_LONG == 32
- @@ -5025,8 +5482,10 @@
- #ifdef CONFIG_DEBUG_STACK_USAGE
- free = stack_not_used(p);
- #endif
- + ppid = 0;
- rcu_read_lock();
- - ppid = task_pid_nr(rcu_dereference(p->real_parent));
- + if (pid_alive(p))
- + ppid = task_pid_nr(rcu_dereference(p->real_parent));
- rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
- @@ -5034,6 +5493,7 @@
- print_worker_info(KERN_INFO, p);
- show_stack(p, NULL);
- + put_task_stack(p);
- }
- void show_state_filter(unsigned long state_filter)
- @@ -5095,7 +5555,6 @@
- raw_spin_lock(&rq->lock);
- __sched_fork(0, idle);
- -
- idle->state = TASK_RUNNING;
- idle->se.exec_start = sched_clock();
- @@ -5163,26 +5622,26 @@
- */
- void sched_setnuma(struct task_struct *p, int nid)
- {
- - struct rq *rq;
- - unsigned long flags;
- bool queued, running;
- + struct rq_flags rf;
- + struct rq *rq;
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- - dequeue_task(rq, p, 0);
- + dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
- p->numa_preferred_nid = nid;
- - if (running)
- - p->sched_class->set_curr_task(rq);
- if (queued)
- - enqueue_task(rq, p, 0);
- - task_rq_unlock(rq, p, &flags);
- + enqueue_task(rq, p, ENQUEUE_RESTORE);
- + if (running)
- + set_curr_task(rq, p);
- + task_rq_unlock(rq, p, &rf);
- }
- #endif /* CONFIG_NUMA_BALANCING */
- @@ -5242,10 +5701,11 @@
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
- -static void migrate_tasks(unsigned int dead_cpu)
- +static void migrate_tasks(struct rq *dead_rq)
- {
- - struct rq *rq = cpu_rq(dead_cpu);
- + struct rq *rq = dead_rq;
- struct task_struct *next, *stop = rq->stop;
- + struct pin_cookie cookie;
- int dest_cpu;
- /*
- @@ -5266,7 +5726,7 @@
- */
- update_rq_clock(rq);
- - for ( ; ; ) {
- + for (;;) {
- /*
- * There's this thread running, bail when that's the only
- * remaining thread.
- @@ -5274,17 +5734,48 @@
- if (rq->nr_running == 1)
- break;
- - next = pick_next_task(rq, &fake_task);
- + /*
- + * pick_next_task assumes pinned rq->lock.
- + */
- + cookie = lockdep_pin_lock(&rq->lock);
- + next = pick_next_task(rq, &fake_task, cookie);
- BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
- - /* Find suitable destination for @next, with force if needed. */
- - dest_cpu = select_fallback_rq(dead_cpu, next);
- + /*
- + * Rules for changing task_struct::cpus_allowed are holding
- + * both pi_lock and rq->lock, such that holding either
- + * stabilizes the mask.
- + *
- + * Drop rq->lock is not quite as disastrous as it usually is
- + * because !cpu_active at this point, which means load-balance
- + * will not interfere. Also, stop-machine.
- + */
- + lockdep_unpin_lock(&rq->lock, cookie);
- raw_spin_unlock(&rq->lock);
- + raw_spin_lock(&next->pi_lock);
- + raw_spin_lock(&rq->lock);
- +
- + /*
- + * Since we're inside stop-machine, _nothing_ should have
- + * changed the task, WARN if weird stuff happened, because in
- + * that case the above rq->lock drop is a fail too.
- + */
- + if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
- + raw_spin_unlock(&next->pi_lock);
- + continue;
- + }
- - __migrate_task(next, dead_cpu, dest_cpu);
- + /* Find suitable destination for @next, with force if needed. */
- + dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- - raw_spin_lock(&rq->lock);
- + rq = __migrate_task(rq, next, dest_cpu);
- + if (rq != dead_rq) {
- + raw_spin_unlock(&rq->lock);
- + rq = dead_rq;
- + raw_spin_lock(&rq->lock);
- + }
- + raw_spin_unlock(&next->pi_lock);
- }
- rq->stop = stop;
- @@ -5517,8 +6008,7 @@
- /* may be called multiple times per register */
- static void unregister_sched_domain_sysctl(void)
- {
- - if (sd_sysctl_header)
- - unregister_sysctl_table(sd_sysctl_header);
- + unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
- @@ -5603,7 +6093,7 @@
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- - migrate_tasks(cpu);
- + migrate_tasks(rq);
- BUG_ON(rq->nr_running != 1); /* the migration thread */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- break;
- @@ -5629,7 +6119,7 @@
- .priority = CPU_PRI_MIGRATION,
- };
- -static void __cpuinit set_cpu_rq_start_time(void)
- +static void set_cpu_rq_start_time(void)
- {
- int cpu = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
- @@ -5745,9 +6235,6 @@
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- - if (sd->parent)
- - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- - " has parent");
- return -1;
- }
- @@ -5840,8 +6327,12 @@
- static int sd_degenerate(struct sched_domain *sd)
- {
- - if (cpumask_weight(sched_domain_span(sd)) == 1)
- - return 1;
- + if (cpumask_weight(sched_domain_span(sd)) == 1) {
- + if (sd->groups->sge)
- + sd->flags &= ~SD_LOAD_BALANCE;
- + else
- + return 1;
- + }
- /* Following flags need at least 2 groups */
- if (sd->flags & (SD_LOAD_BALANCE |
- @@ -5849,6 +6340,7 @@
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- + SD_ASYM_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN |
- SD_SHARE_CAP_STATES)) {
- @@ -5880,11 +6372,16 @@
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- + SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN |
- SD_SHARE_CAP_STATES);
- + if (parent->groups->sge) {
- + parent->flags &= ~SD_LOAD_BALANCE;
- + return 0;
- + }
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
- @@ -5944,6 +6441,19 @@
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
- }
- +void sched_get_rd(struct root_domain *rd)
- +{
- + atomic_inc(&rd->refcount);
- +}
- +
- +void sched_put_rd(struct root_domain *rd)
- +{
- + if (!atomic_dec_and_test(&rd->refcount))
- + return;
- +
- + call_rcu_sched(&rd->rcu, free_rootdomain);
- +}
- +
- static int init_rootdomain(struct root_domain *rd)
- {
- memset(rd, 0, sizeof(*rd));
- @@ -5957,6 +6467,12 @@
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
- goto free_dlo_mask;
- +#ifdef HAVE_RT_PUSH_IPI
- + rd->rto_cpu = -1;
- + raw_spin_lock_init(&rd->rto_lock);
- + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
- +#endif
- +
- init_dl_bw(&rd->dl_bw);
- if (cpudl_init(&rd->cpudl) != 0)
- goto free_dlo_mask;
- @@ -5965,6 +6481,9 @@
- goto free_rto_mask;
- init_max_cpu_capacity(&rd->max_cpu_capacity);
- +
- + rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
- +
- return 0;
- free_rto_mask:
- @@ -6027,10 +6546,8 @@
- } while (sg != first);
- }
- -static void free_sched_domain(struct rcu_head *rcu)
- +static void destroy_sched_domain(struct sched_domain *sd)
- {
- - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- -
- /*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
- @@ -6041,18 +6558,26 @@
- kfree(sd->groups->sgc);
- kfree(sd->groups);
- }
- + if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
- + kfree(sd->shared);
- kfree(sd);
- }
- -static void destroy_sched_domain(struct sched_domain *sd, int cpu)
- +static void destroy_sched_domains_rcu(struct rcu_head *rcu)
- {
- - call_rcu(&sd->rcu, free_sched_domain);
- + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- +
- + while (sd) {
- + struct sched_domain *parent = sd->parent;
- + destroy_sched_domain(sd);
- + sd = parent;
- + }
- }
- -static void destroy_sched_domains(struct sched_domain *sd, int cpu)
- +static void destroy_sched_domains(struct sched_domain *sd)
- {
- - for (; sd; sd = sd->parent)
- - destroy_sched_domain(sd, cpu);
- + if (sd)
- + call_rcu(&sd->rcu, destroy_sched_domains_rcu);
- }
- /*
- @@ -6067,16 +6592,17 @@
- DEFINE_PER_CPU(struct sched_domain *, sd_llc);
- DEFINE_PER_CPU(int, sd_llc_size);
- DEFINE_PER_CPU(int, sd_llc_id);
- +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
- DEFINE_PER_CPU(struct sched_domain *, sd_numa);
- -DEFINE_PER_CPU(struct sched_domain *, sd_busy);
- DEFINE_PER_CPU(struct sched_domain *, sd_asym);
- DEFINE_PER_CPU(struct sched_domain *, sd_ea);
- DEFINE_PER_CPU(struct sched_domain *, sd_scs);
- static void update_top_cache_domain(int cpu)
- {
- + struct sched_domain_shared *sds = NULL;
- struct sched_domain *sd;
- - struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
- + struct sched_domain *ea_sd = NULL;
- int id = cpu;
- int size = 1;
- @@ -6084,13 +6610,13 @@
- if (sd) {
- id = cpumask_first(sched_domain_span(sd));
- size = cpumask_weight(sched_domain_span(sd));
- - busy_sd = sd->parent; /* sd_busy */
- + sds = sd->shared;
- }
- - rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
- rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
- per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
- + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
- sd = lowest_flag_domain(cpu, SD_NUMA);
- rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
- @@ -6137,7 +6663,7 @@
- */
- if (parent->flags & SD_PREFER_SIBLING)
- tmp->flags |= SD_PREFER_SIBLING;
- - destroy_sched_domain(parent, cpu);
- + destroy_sched_domain(parent);
- } else
- tmp = tmp->parent;
- }
- @@ -6145,7 +6671,7 @@
- if (sd && sd_degenerate(sd)) {
- tmp = sd;
- sd = sd->parent;
- - destroy_sched_domain(tmp, cpu);
- + destroy_sched_domain(tmp);
- if (sd)
- sd->child = NULL;
- }
- @@ -6155,14 +6681,11 @@
- rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- - destroy_sched_domains(tmp, cpu);
- + destroy_sched_domains(tmp);
- update_top_cache_domain(cpu);
- }
- -/* cpus with isolated domains */
- -static cpumask_var_t cpu_isolated_map;
- -
- /* Setup the mask of cpus configured for isolated domains */
- static int __init isolated_cpu_setup(char *str)
- {
- @@ -6288,6 +6811,7 @@
- */
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
- + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
- /*
- * Make sure the first group of this domain contains the
- @@ -6413,7 +6937,6 @@
- return;
- update_group_capacity(sd, cpu);
- - atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
- }
- /*
- @@ -6476,28 +6999,6 @@
- sd->groups->sge = fn(cpu);
- }
- -#ifdef CONFIG_SCHED_DEBUG
- -void set_energy_aware()
- -{
- - sched_feat_set("ENERGY_AWARE");
- -}
- -void clear_energy_aware()
- -{
- - sched_feat_set("NO_ENERGY_AWARE");
- -}
- -#else
- -struct static_key __read_mostly __energy_aware = STATIC_KEY_INIT_FALSE;
- -
- -void set_energy_aware()
- -{
- - static_key_slow_inc(&__energy_aware);
- -}
- -void clear_energy_aware()
- -{
- - static_key_slow_dec(&__energy_aware);
- -}
- -#endif /* CONFIG_SCHED_DEBUG */
- -
- /*
- * Initializers for schedule domains
- * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
- @@ -6583,6 +7084,9 @@
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
- *per_cpu_ptr(sdd->sd, cpu) = NULL;
- + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
- + *per_cpu_ptr(sdd->sds, cpu) = NULL;
- +
- if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
- *per_cpu_ptr(sdd->sg, cpu) = NULL;
- @@ -6600,13 +7104,20 @@
- /*
- * SD_flags allowed in topology descriptions.
- *
- - * SD_SHARE_CPUCAPACITY - describes SMT topologies
- - * SD_SHARE_PKG_RESOURCES - describes shared caches
- - * SD_NUMA - describes NUMA topologies
- - * SD_SHARE_POWERDOMAIN - describes shared power domain
- - * SD_SHARE_CAP_STATES - describes shared capacity states
- + * These flags are purely descriptive of the topology and do not prescribe
- + * behaviour. Behaviour is artificial and mapped in the below sd_init()
- + * function:
- + *
- + * SD_SHARE_CPUCAPACITY - describes SMT topologies
- + * SD_SHARE_PKG_RESOURCES - describes shared caches
- + * SD_NUMA - describes NUMA topologies
- + * SD_SHARE_POWERDOMAIN - describes shared power domain
- + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
- + * SD_SHARE_CAP_STATES - describes shared capacity states
- + *
- + * Odd one out, which beside describing the topology has a quirk also
- + * prescribes the desired behaviour that goes along with it:
- *
- - * Odd one out:
- * SD_ASYM_PACKING - describes SMT quirks
- */
- #define TOPOLOGY_SD_FLAGS \
- @@ -6614,14 +7125,18 @@
- SD_SHARE_PKG_RESOURCES | \
- SD_NUMA | \
- SD_ASYM_PACKING | \
- + SD_ASYM_CPUCAPACITY | \
- SD_SHARE_POWERDOMAIN | \
- SD_SHARE_CAP_STATES)
- static struct sched_domain *
- -sd_init(struct sched_domain_topology_level *tl, int cpu)
- +sd_init(struct sched_domain_topology_level *tl,
- + const struct cpumask *cpu_map,
- + struct sched_domain *child, int cpu)
- {
- - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- - int sd_weight, sd_flags = 0;
- + struct sd_data *sdd = &tl->data;
- + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
- + int sd_id, sd_weight, sd_flags = 0;
- #ifdef CONFIG_NUMA
- /*
- @@ -6670,15 +7185,26 @@
- .smt_gain = 0,
- .max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
- + .child = child,
- #ifdef CONFIG_SCHED_DEBUG
- .name = tl->name,
- #endif
- };
- + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- + sd_id = cpumask_first(sched_domain_span(sd));
- +
- /*
- * Convert topological properties into behaviour.
- */
- + if (sd->flags & SD_ASYM_CPUCAPACITY) {
- + struct sched_domain *t = sd;
- +
- + for_each_lower_domain(t)
- + t->flags |= SD_BALANCE_WAKE;
- + }
- +
- if (sd->flags & SD_SHARE_CPUCAPACITY) {
- sd->flags |= SD_PREFER_SIBLING;
- sd->imbalance_pct = 110;
- @@ -6710,7 +7236,17 @@
- sd->idle_idx = 1;
- }
- - sd->private = &tl->data;
- + /*
- + * For all levels sharing cache; connect a sched_domain_shared
- + * instance.
- + */
- + if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- + sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- + atomic_inc(&sd->shared->ref);
- + atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- + }
- +
- + sd->private = sdd;
- return sd;
- }
- @@ -6729,7 +7265,8 @@
- { NULL, },
- };
- -struct sched_domain_topology_level *sched_domain_topology = default_topology;
- +static struct sched_domain_topology_level *sched_domain_topology =
- + default_topology;
- #define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->mask; tl++)
- @@ -6992,6 +7529,10 @@
- if (!sdd->sd)
- return -ENOMEM;
- + sdd->sds = alloc_percpu(struct sched_domain_shared *);
- + if (!sdd->sds)
- + return -ENOMEM;
- +
- sdd->sg = alloc_percpu(struct sched_group *);
- if (!sdd->sg)
- return -ENOMEM;
- @@ -7002,6 +7543,7 @@
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
- + struct sched_domain_shared *sds;
- struct sched_group *sg;
- struct sched_group_capacity *sgc;
- @@ -7012,6 +7554,13 @@
- *per_cpu_ptr(sdd->sd, j) = sd;
- + sds = kzalloc_node(sizeof(struct sched_domain_shared),
- + GFP_KERNEL, cpu_to_node(j));
- + if (!sds)
- + return -ENOMEM;
- +
- + *per_cpu_ptr(sdd->sds, j) = sds;
- +
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sg)
- @@ -7051,6 +7600,8 @@
- kfree(*per_cpu_ptr(sdd->sd, j));
- }
- + if (sdd->sds)
- + kfree(*per_cpu_ptr(sdd->sds, j));
- if (sdd->sg)
- kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgc)
- @@ -7058,6 +7609,8 @@
- }
- free_percpu(sdd->sd);
- sdd->sd = NULL;
- + free_percpu(sdd->sds);
- + sdd->sds = NULL;
- free_percpu(sdd->sg);
- sdd->sg = NULL;
- free_percpu(sdd->sgc);
- @@ -7069,16 +7622,12 @@
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int cpu)
- {
- - struct sched_domain *sd = sd_init(tl, cpu);
- - if (!sd)
- - return child;
- + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
- - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- if (child) {
- sd->level = child->level + 1;
- sched_domain_level_max = max(sched_domain_level_max, sd->level);
- child->parent = sd;
- - sd->child = child;
- if (!cpumask_subset(sched_domain_span(child),
- sched_domain_span(sd))) {
- @@ -7109,7 +7658,6 @@
- enum s_alloc alloc_state;
- struct sched_domain *sd;
- struct s_data d;
- - struct rq *rq = NULL;
- int i, ret = -ENOMEM;
- alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
- @@ -7127,8 +7675,6 @@
- *per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
- sd->flags |= SD_OVERLAP;
- - if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- - break;
- }
- }
- @@ -7163,8 +7709,19 @@
- /* Attach the domains */
- rcu_read_lock();
- for_each_cpu(i, cpu_map) {
- - rq = cpu_rq(i);
- + int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
- + int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
- +
- + if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
- + cpu_rq(max_cpu)->cpu_capacity_orig))
- + WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
- +
- + if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
- + cpu_rq(min_cpu)->cpu_capacity_orig))
- + WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
- +
- sd = *per_cpu_ptr(d.sd, i);
- +
- cpu_attach_domain(sd, d.rd, i);
- }
- rcu_read_unlock();
- @@ -7385,17 +7942,16 @@
- * operation in the resume sequence, just build a single sched
- * domain, ignoring cpusets.
- */
- - num_cpus_frozen--;
- - if (likely(num_cpus_frozen)) {
- - partition_sched_domains(1, NULL, NULL);
- + partition_sched_domains(1, NULL, NULL);
- + if (--num_cpus_frozen)
- break;
- - }
- /*
- * This is the last CPU online operation. So fall through and
- * restore the original sched domains by considering the
- * cpuset configurations.
- */
- + cpuset_force_rebuild();
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- @@ -7428,7 +7984,6 @@
- {
- cpumask_var_t non_isolated_cpus;
- - walt_init_cpu_efficiency();
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- @@ -7490,6 +8045,7 @@
- #endif
- DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
- +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
- void __init sched_init(void)
- {
- @@ -7528,6 +8084,8 @@
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (void *)ptr;
- ptr += cpumask_size();
- + per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
- + cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
- #endif /* CONFIG_CPUMASK_OFFSTACK */
- }
- @@ -7553,6 +8111,7 @@
- INIT_LIST_HEAD(&root_task_group.children);
- INIT_LIST_HEAD(&root_task_group.siblings);
- autogroup_init(&init_task);
- +
- #endif /* CONFIG_CGROUP_SCHED */
- for_each_possible_cpu(i) {
- @@ -7564,11 +8123,12 @@
- rq->calc_load_active = 0;
- rq->calc_load_update = jiffies + LOAD_FREQ;
- init_cfs_rq(&rq->cfs);
- - init_rt_rq(&rq->rt, rq);
- - init_dl_rq(&rq->dl, rq);
- + init_rt_rq(&rq->rt);
- + init_dl_rq(&rq->dl);
- #ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
- + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- /*
- * How much cpu bandwidth does root_task_group get?
- *
- @@ -7610,6 +8170,7 @@
- rq->active_balance = 0;
- rq->next_balance = jiffies;
- rq->push_cpu = 0;
- + rq->push_task = NULL;
- rq->cpu = i;
- rq->online = 0;
- rq->idle_stamp = 0;
- @@ -7695,15 +8256,34 @@
- void __might_sleep(const char *file, int line, int preempt_offset)
- {
- + /*
- + * Blocking primitives will set (and therefore destroy) current->state,
- + * since we will exit with TASK_RUNNING make sure we enter with it,
- + * otherwise we will destroy state.
- + */
- + if (WARN_ONCE(current->state != TASK_RUNNING,
- + "do not call blocking ops when !TASK_RUNNING; "
- + "state=%lx set at [<%p>] %pS\n",
- + current->state,
- + (void *)current->task_state_change,
- + (void *)current->task_state_change))
- + __set_current_state(TASK_RUNNING);
- +
- + ___might_sleep(file, line, preempt_offset);
- +}
- +EXPORT_SYMBOL(__might_sleep);
- +
- +void ___might_sleep(const char *file, int line, int preempt_offset)
- +{
- static unsigned long prev_jiffy; /* ratelimiting */
- rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- - !is_idle_task(current)) || oops_in_progress)
- - return;
- - if (system_state != SYSTEM_RUNNING &&
- - (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
- + !is_idle_task(current)) ||
- + system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
- + oops_in_progress)
- return;
- +
- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
- return;
- prev_jiffy = jiffies;
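- The WARN_ONCE added to __might_sleep() above targets the classic bug of calling a
- blocking primitive after the task has already published a sleeping state. A
- minimal sketch of the pattern it catches (condition_ready() and some_lock are
- hypothetical names):
-
-     set_current_state(TASK_INTERRUPTIBLE);
-     if (!condition_ready())
-         mutex_lock(&some_lock);   /* may sleep and clobber TASK_INTERRUPTIBLE,
-                                      so the later schedule() might not sleep at all */
-     schedule();
-
- On a hit the task state is forced back to TASK_RUNNING so the box keeps running
- after the one-time report.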
- @@ -7716,6 +8296,9 @@
- in_atomic(), irqs_disabled(),
- current->pid, current->comm);
- + if (task_stack_end_corrupted(current))
- + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
- +
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
- @@ -7728,36 +8311,16 @@
- #endif
- dump_stack();
- }
- -EXPORT_SYMBOL(__might_sleep);
- +EXPORT_SYMBOL(___might_sleep);
- #endif
- #ifdef CONFIG_MAGIC_SYSRQ
- -static void normalize_task(struct rq *rq, struct task_struct *p)
- +void normalize_rt_tasks(void)
- {
- - const struct sched_class *prev_class = p->sched_class;
- + struct task_struct *g, *p;
- struct sched_attr attr = {
- .sched_policy = SCHED_NORMAL,
- };
- - int old_prio = p->prio;
- - int queued;
- -
- - queued = task_on_rq_queued(p);
- - if (queued)
- - dequeue_task(rq, p, 0);
- - __setscheduler(rq, p, &attr, false);
- - if (queued) {
- - enqueue_task(rq, p, 0);
- - resched_curr(rq);
- - }
- -
- - check_class_changed(rq, p, prev_class, old_prio);
- -}
- -
- -void normalize_rt_tasks(void)
- -{
- - struct task_struct *g, *p;
- - unsigned long flags;
- - struct rq *rq;
- read_lock(&tasklist_lock);
- for_each_process_thread(g, p) {
- @@ -7767,12 +8330,10 @@
- if (p->flags & PF_KTHREAD)
- continue;
- - p->se.exec_start = 0;
- -#ifdef CONFIG_SCHEDSTATS
- - p->se.statistics.wait_start = 0;
- - p->se.statistics.sleep_start = 0;
- - p->se.statistics.block_start = 0;
- -#endif
- + p->se.exec_start = 0;
- + schedstat_set(p->se.statistics.wait_start, 0);
- + schedstat_set(p->se.statistics.sleep_start, 0);
- + schedstat_set(p->se.statistics.block_start, 0);
- if (!dl_task(p) && !rt_task(p)) {
- /*
- @@ -7784,9 +8345,7 @@
- continue;
- }
- - rq = task_rq_lock(p, &flags);
- - normalize_task(rq, p);
- - task_rq_unlock(rq, p, &flags);
- + __sched_setscheduler(p, &attr, false, false);
- }
- read_unlock(&tasklist_lock);
- }
- @@ -7920,27 +8479,9 @@
- spin_unlock_irqrestore(&task_group_lock, flags);
- }
- -/* change task's runqueue when it moves between groups.
- - * The caller of this function should have put the task in its new group
- - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- - * reflect its new group.
- - */
- -void sched_move_task(struct task_struct *tsk)
- +static void sched_change_group(struct task_struct *tsk, int type)
- {
- struct task_group *tg;
- - int queued, running;
- - unsigned long flags;
- - struct rq *rq;
- -
- - rq = task_rq_lock(tsk, &flags);
- -
- - running = task_current(rq, tsk);
- - queued = task_on_rq_queued(tsk);
- -
- - if (queued)
- - dequeue_task(rq, tsk, 0);
- - if (unlikely(running))
- - put_prev_task(rq, tsk);
- /*
- * All callers are synchronized by task_rq_lock(); we do not use RCU
- @@ -7953,18 +8494,45 @@
- tsk->sched_task_group = tg;
- #ifdef CONFIG_FAIR_GROUP_SCHED
- - if (tsk->sched_class->task_move_group)
- - tsk->sched_class->task_move_group(tsk);
- + if (tsk->sched_class->task_change_group)
- + tsk->sched_class->task_change_group(tsk, type);
- else
- #endif
- set_task_rq(tsk, task_cpu(tsk));
- +}
- +
- +/*
- + * Change task's runqueue when it moves between groups.
- + *
- + * The caller of this function should have put the task in its new group by
- + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
- + * its new group.
- + */
- +void sched_move_task(struct task_struct *tsk)
- +{
- + int queued, running;
- + struct rq_flags rf;
- + struct rq *rq;
- + rq = task_rq_lock(tsk, &rf);
- + update_rq_clock(rq);
- +
- + running = task_current(rq, tsk);
- + queued = task_on_rq_queued(tsk);
- +
- + if (queued)
- + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- - tsk->sched_class->set_curr_task(rq);
- + put_prev_task(rq, tsk);
- +
- + sched_change_group(tsk, TASK_MOVE_GROUP);
- +
- if (queued)
- - enqueue_task(rq, tsk, 0);
- + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
- + if (unlikely(running))
- + set_curr_task(rq, tsk);
- - task_rq_unlock(rq, tsk, &flags);
- + task_rq_unlock(rq, tsk, &rf);
- }
- #endif /* CONFIG_CGROUP_SCHED */
- @@ -8077,6 +8645,17 @@
- {
- int i, err = 0;
- + /*
- + * Disallowing the root group RT runtime is BAD, it would disallow the
- + * kernel creating (and or operating) RT threads.
- + */
- + if (tg == &root_task_group && rt_runtime == 0)
- + return -EINVAL;
- +
- + /* No period doesn't make any sense. */
- + if (rt_period == 0)
- + return -EINVAL;
- +
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- err = __rt_schedulable(tg, rt_period, rt_runtime);
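- The two new checks guard the group RT throttle, which is simply the ratio of the
- two knobs. A worked example with the mainline defaults (treat the exact default
- values as an assumption for this tree):
-
-     cap = rt_runtime / rt_period = 950000us / 1000000us = 0.95   /* ~95% of each period */
-     rt_runtime == 0  ->  cap = 0%   (would starve the kernel's own RT threads)
-     rt_period  == 0  ->  cap undefined
-
- which is why a zero runtime on the root group and a zero period on any group now
- fail with -EINVAL before any bandwidth recomputation happens.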
- @@ -8126,16 +8705,13 @@
- return rt_runtime_us;
- }
- -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
- +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
- {
- u64 rt_runtime, rt_period;
- - rt_period = (u64)rt_period_us * NSEC_PER_USEC;
- + rt_period = rt_period_us * NSEC_PER_USEC;
- rt_runtime = tg->rt_bandwidth.rt_runtime;
- - if (rt_period == 0)
- - return -EINVAL;
- -
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
- }
- @@ -8382,9 +8958,21 @@
- sched_offline_group(tg);
- }
- +/*
- + * This is called before wake_up_new_task(), therefore we really only
- + * have to set its group bits, all the other stuff does not apply.
- + */
- static void cpu_cgroup_fork(struct task_struct *task)
- {
- - sched_move_task(task);
- + struct rq_flags rf;
- + struct rq *rq;
- +
- + rq = task_rq_lock(task, &rf);
- +
- + update_rq_clock(rq);
- + sched_change_group(task, TASK_SET_GROUP);
- +
- + task_rq_unlock(rq, task, &rf);
- }
- static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
- @@ -8765,3 +9353,44 @@
- pr_info("Task dump for CPU %d:\n", cpu);
- sched_show_task(cpu_curr(cpu));
- }
- +
- +/*
- + * Nice levels are multiplicative, with a gentle 10% change for every
- + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- + * nice 1, it will get ~10% less CPU time than another CPU-bound task
- + * that remained on nice 0.
- + *
- + * The "10% effect" is relative and cumulative: from _any_ nice level,
- + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- + * If a task goes up by ~10% and another task goes down by ~10% then
- + * the relative distance between them is ~25%.)
- + */
- +const int sched_prio_to_weight[40] = {
- + /* -20 */ 88761, 71755, 56483, 46273, 36291,
- + /* -15 */ 29154, 23254, 18705, 14949, 11916,
- + /* -10 */ 9548, 7620, 6100, 4904, 3906,
- + /* -5 */ 3121, 2501, 1991, 1586, 1277,
- + /* 0 */ 1024, 820, 655, 526, 423,
- + /* 5 */ 335, 272, 215, 172, 137,
- + /* 10 */ 110, 87, 70, 56, 45,
- + /* 15 */ 36, 29, 23, 18, 15,
- +};
- +
- +/*
- + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
- + *
- + * In cases where the weight does not change often, we can use the
- + * precalculated inverse to speed up arithmetics by turning divisions
- + * into multiplications:
- + */
- +const u32 sched_prio_to_wmult[40] = {
- + /* -20 */ 48388, 59856, 76040, 92818, 118348,
- + /* -15 */ 147320, 184698, 229616, 287308, 360437,
- + /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
- +};
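- A short worked example of how the two tables are meant to be read (the weights
- are taken directly from the arrays above; the closing formula follows the usual
- CFS delta scaling and should be read as a sketch, not a quote of this tree):
-
-     /* two always-runnable tasks sharing one CPU */
-     share(nice 0) = 1024 / (1024 + 820) ~= 55.5%
-     share(nice 1) =  820 / (1024 + 820) ~= 44.5%   /* the documented ~10% step,
-                                                       ~25% relative distance */
-
-     /* sched_prio_to_wmult[] holds 2^32 / weight, so dividing by a weight becomes */
-     delta_vruntime ~= (delta_exec * NICE_0_LOAD * sched_prio_to_wmult[nice + 20]) >> 32
-
- which is the "divisions into multiplications" trick the comment above refers to.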
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpudeadline.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.c
- --- /home/ninez/android/marlin/kernel/sched/cpudeadline.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.c 2018-08-11 23:57:17.128607487 -0400
- @@ -31,11 +31,6 @@
- return (i << 1) + 2;
- }
- -static inline int dl_time_before(u64 a, u64 b)
- -{
- - return (s64)(a - b) < 0;
- -}
- -
- static void cpudl_exchange(struct cpudl *cp, int a, int b)
- {
- int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
- @@ -107,7 +102,9 @@
- int best_cpu = -1;
- const struct sched_dl_entity *dl_se = &p->dl;
- - if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
- + if (later_mask &&
- + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed) &&
- + cpumask_and(later_mask, later_mask, cpu_active_mask)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
- @@ -186,6 +183,26 @@
- }
- /*
- + * cpudl_set_freecpu - Set the cpudl.free_cpus
- + * @cp: the cpudl max-heap context
- + * @cpu: rd attached cpu
- + */
- +void cpudl_set_freecpu(struct cpudl *cp, int cpu)
- +{
- + cpumask_set_cpu(cpu, cp->free_cpus);
- +}
- +
- +/*
- + * cpudl_clear_freecpu - Clear the cpudl.free_cpus
- + * @cp: the cpudl max-heap context
- + * @cpu: rd attached cpu
- + */
- +void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
- +{
- + cpumask_clear_cpu(cpu, cp->free_cpus);
- +}
- +
- +/*
- * cpudl_init - initialize the cpudl structure
- * @cp: the cpudl max-heap context
- */
- @@ -203,7 +220,7 @@
- if (!cp->elements)
- return -ENOMEM;
- - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
- + if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
- kfree(cp->elements);
- return -ENOMEM;
- }
- @@ -211,8 +228,6 @@
- for_each_possible_cpu(i)
- cp->elements[i].idx = IDX_INVALID;
- - cpumask_setall(cp->free_cpus);
- -
- return 0;
- }
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpudeadline.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.h
- --- /home/ninez/android/marlin/kernel/sched/cpudeadline.h 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.h 2018-08-11 23:57:17.128607487 -0400
- @@ -2,6 +2,7 @@
- #define _LINUX_CPUDL_H
- #include <linux/sched.h>
- +#include <linux/sched/deadline.h>
- #define IDX_INVALID -1
- @@ -24,6 +25,8 @@
- struct cpumask *later_mask);
- void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
- int cpudl_init(struct cpudl *cp);
- +void cpudl_set_freecpu(struct cpudl *cp, int cpu);
- +void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
- void cpudl_cleanup(struct cpudl *cp);
- #else
- #define cpudl_set(cp, cpu, dl) do { } while (0)
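- The two new free_cpus helpers are only declared here; their intended consumers are
- the deadline class's root-domain online/offline callbacks. A reduced sketch of how
- mainline wires them up (these call sites are an assumption and are not part of the
- hunks in this diff):
-
-     static void rq_online_dl(struct rq *rq)
-     {
-             cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);    /* CPU joins the free set */
-     }
-
-     static void rq_offline_dl(struct rq *rq)
-     {
-             cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);  /* CPU leaves the free set */
-     }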
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq.c
- --- /home/ninez/android/marlin/kernel/sched/cpufreq.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq.c 2018-08-11 23:57:17.128607487 -0400
- @@ -0,0 +1,63 @@
- +/*
- + * Scheduler code and data structures related to cpufreq.
- + *
- + * Copyright (C) 2016, Intel Corporation
- + * Author: Rafael J. Wysocki <[email protected]>
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License version 2 as
- + * published by the Free Software Foundation.
- + */
- +
- +#include "sched.h"
- +
- +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
- +
- +/**
- + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
- + * @cpu: The CPU to set the pointer for.
- + * @data: New pointer value.
- + * @func: Callback function to set for the CPU.
- + *
- + * Set and publish the update_util_data pointer for the given CPU.
- + *
- + * The update_util_data pointer of @cpu is set to @data and the callback
- + * function pointer in the target struct update_util_data is set to @func.
- + * That function will be called by cpufreq_update_util() from RCU-sched
- + * read-side critical sections, so it must not sleep. @data will always be
- + * passed to it as the first argument which allows the function to get to the
- + * target update_util_data structure and its container.
- + *
- + * The update_util_data pointer of @cpu must be NULL when this function is
- + * called or it will WARN() and return with no effect.
- + */
- +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
- + void (*func)(struct update_util_data *data, u64 time,
- + unsigned int flags))
- +{
- + if (WARN_ON(!data || !func))
- + return;
- +
- + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
- + return;
- +
- + data->func = func;
- + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
- +}
- +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
- +
- +/**
- + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
- + * @cpu: The CPU to clear the pointer for.
- + *
- + * Clear the update_util_data pointer for the given CPU.
- + *
- + * Callers must use RCU-sched callbacks to free any memory that might be
- + * accessed via the old update_util_data pointer or invoke synchronize_sched()
- + * right after this function to avoid use-after-free.
- + */
- +void cpufreq_remove_update_util_hook(int cpu)
- +{
- + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
- +}
- +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
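- The two exported helpers above are the whole consumer-facing API of this new file: a governor publishes one callback per CPU and later tears it down. A minimal sketch of such a consumer, with illustrative names (my_gov_data, my_update_util) that are not part of the patch:
- struct my_gov_data {
-         struct update_util_data update_util;
-         unsigned int last_freq;
- };
- 
- static DEFINE_PER_CPU(struct my_gov_data, my_gov_data);
- 
- /* Runs in scheduler context under RCU-sched: must not sleep. */
- static void my_update_util(struct update_util_data *data, u64 time,
-                            unsigned int flags)
- {
-         struct my_gov_data *gd = container_of(data, struct my_gov_data,
-                                               update_util);
-         (void)gd;       /* compute and request a frequency here */
- }
- 
- static void my_gov_start_cpu(int cpu)
- {
-         cpufreq_add_update_util_hook(cpu, &per_cpu(my_gov_data, cpu).update_util,
-                                      my_update_util);
- }
- 
- static void my_gov_stop_cpu(int cpu)
- {
-         cpufreq_remove_update_util_hook(cpu);
-         synchronize_sched();    /* flush in-flight callbacks before freeing */
- }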
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq_sched.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_sched.c
- --- /home/ninez/android/marlin/kernel/sched/cpufreq_sched.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_sched.c 2018-08-11 23:57:17.128607487 -0400
- @@ -32,6 +32,12 @@
- static DEFINE_PER_CPU(unsigned long, enabled);
- DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
- +struct gov_tunables {
- + struct gov_attr_set attr_set;
- + unsigned int up_throttle_nsec;
- + unsigned int down_throttle_nsec;
- +};
- +
- /**
- * gov_data - per-policy data internal to the governor
- * @up_throttle: next throttling period expiry if increasing OPP
- @@ -53,8 +59,8 @@
- struct gov_data {
- ktime_t up_throttle;
- ktime_t down_throttle;
- - unsigned int up_throttle_nsec;
- - unsigned int down_throttle_nsec;
- + struct gov_tunables *tunables;
- + struct list_head tunables_hook;
- struct task_struct *task;
- struct irq_work irq_work;
- unsigned int requested_freq;
- @@ -71,8 +77,10 @@
- __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
- - gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
- - gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
- + gd->up_throttle = ktime_add_ns(ktime_get(),
- + gd->tunables->up_throttle_nsec);
- + gd->down_throttle = ktime_add_ns(ktime_get(),
- + gd->tunables->down_throttle_nsec);
- up_write(&policy->rwsem);
- }
- @@ -194,7 +202,7 @@
- }
- /* Convert the new maximum capacity request into a cpu frequency */
- - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
- + freq_new = capacity * policy->cpuinfo.max_freq >> SCHED_CAPACITY_SHIFT;
- if (cpufreq_frequency_table_target(policy, policy->freq_table,
- freq_new, CPUFREQ_RELATION_L,
- &index_new))
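- The one-line change above computes the target frequency from the hardware maximum (policy->cpuinfo.max_freq) instead of the possibly user-clamped policy->max, so a temporary policy cap no longer skews the capacity-to-frequency conversion. A small worked example with illustrative numbers (fmax = 2150400 kHz, capacity 512 out of 1024):
- /* Illustration only: capacity is on the 0..1024 SCHED_CAPACITY scale,
-  * frequencies are in kHz. */
- static unsigned int capacity_to_freq(unsigned long capacity,
-                                      unsigned int cpuinfo_max_freq)
- {
-         return capacity * cpuinfo_max_freq >> 10;       /* SCHED_CAPACITY_SHIFT */
- }
- 
- /* capacity_to_freq(512, 2150400) == 1075200, i.e. half of fmax, even if
-  * policy->max is temporarily clamped below that. */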
- @@ -227,6 +235,18 @@
- cpufreq_cpu_put(policy);
- }
- +#ifdef CONFIG_SCHED_WALT
- +static inline unsigned long
- +requested_capacity(struct sched_capacity_reqs *scr)
- +{
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
- + return scr->cfs;
- + return scr->cfs + scr->rt;
- +}
- +#else
- +#define requested_capacity(scr) (scr->cfs + scr->rt)
- +#endif
- +
- void update_cpu_capacity_request(int cpu, bool request)
- {
- unsigned long new_capacity;
- @@ -237,25 +257,10 @@
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- -#ifdef CONFIG_SCHED_WALT
- - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
- - /*
- - * Same WALT signal is set at different places, take the max
- - * reported utilization
- - */
- - new_capacity = max(scr->cfs, scr->rt);
- - new_capacity = max(new_capacity, scr->dl);
- - } else {
- - /*
- - * For PELT, utilization is aggregated
- - */
- - new_capacity = scr->cfs + scr->rt + scr->dl;
- - }
- -#else
- - new_capacity = scr->cfs + scr->rt + scr->dl;
- -#endif
- + new_capacity = requested_capacity(scr);
- new_capacity = new_capacity * capacity_margin
- / SCHED_CAPACITY_SCALE;
- + new_capacity += scr->dl;
- if (new_capacity == scr->total)
- return;
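- With requested_capacity() factored out, the update path above applies the capacity_margin headroom only to the CFS/RT portion and then adds the deadline reservation on top. A worked example with assumed values (capacity_margin = 1280, i.e. 1.25x headroom):
- /*
-  * requested_capacity(scr) = scr->cfs + scr->rt = 300 + 100 = 400
-  * new_capacity = 400 * 1280 / 1024 = 500
-  * new_capacity += scr->dl (say 60) -> 560
-  *
-  * The deadline contribution is a hard reservation, so it is added after
-  * the margin rather than being inflated by it.
-  */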
- @@ -277,12 +282,70 @@
- static_key_slow_dec(&__sched_freq);
- }
- -static struct attribute_group sched_attr_group_gov_pol;
- -static struct attribute_group *get_sysfs_attr(void)
- +/* Tunables */
- +static struct gov_tunables *global_tunables;
- +
- +static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set)
- +{
- + return container_of(attr_set, struct gov_tunables, attr_set);
- +}
- +
- +static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
- +{
- + struct gov_tunables *tunables = to_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->up_throttle_nsec);
- +}
- +
- +static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- {
- - return &sched_attr_group_gov_pol;
- + struct gov_tunables *tunables = to_tunables(attr_set);
- + int ret;
- + long unsigned int val;
- +
- + ret = kstrtoul(buf, 0, &val);
- + if (ret < 0)
- + return ret;
- + tunables->up_throttle_nsec = val;
- + return count;
- }
- +static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
- +{
- + struct gov_tunables *tunables = to_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->down_throttle_nsec);
- +}
- +
- +static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- +{
- + struct gov_tunables *tunables = to_tunables(attr_set);
- + int ret;
- + long unsigned int val;
- +
- + ret = kstrtoul(buf, 0, &val);
- + if (ret < 0)
- + return ret;
- + tunables->down_throttle_nsec = val;
- + return count;
- +}
- +
- +static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec);
- +static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec);
- +
- +static struct attribute *schedfreq_attributes[] = {
- + &up_throttle_nsec.attr,
- + &down_throttle_nsec.attr,
- + NULL
- +};
- +
- +static struct kobj_type tunables_ktype = {
- + .default_attrs = schedfreq_attributes,
- + .sysfs_ops = &governor_sysfs_ops,
- +};
- +
- static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
- {
- struct gov_data *gd;
- @@ -297,20 +360,40 @@
- if (!gd)
- return -ENOMEM;
- - gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
- - policy->cpuinfo.transition_latency :
- - THROTTLE_UP_NSEC;
- - gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
- - pr_debug("%s: throttle threshold = %u [ns]\n",
- - __func__, gd->up_throttle_nsec);
- -
- policy->governor_data = gd;
- - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
- - if (rc) {
- - pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
- - goto err;
- - }
- + if (!global_tunables) {
- + gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
- + if (!gd->tunables)
- + goto free_gd;
- +
- + gd->tunables->up_throttle_nsec =
- + policy->cpuinfo.transition_latency ?
- + policy->cpuinfo.transition_latency :
- + THROTTLE_UP_NSEC;
- + gd->tunables->down_throttle_nsec =
- + THROTTLE_DOWN_NSEC;
- +
- + rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,
- + &tunables_ktype,
- + get_governor_parent_kobj(policy),
- + "%s", cpufreq_gov_sched.name);
- + if (rc)
- + goto free_tunables;
- +
- + gov_attr_set_init(&gd->tunables->attr_set,
- + &gd->tunables_hook);
- +
- + pr_debug("%s: throttle_threshold = %u [ns]\n",
- + __func__, gd->tunables->up_throttle_nsec);
- +
- + if (!have_governor_per_policy())
- + global_tunables = gd->tunables;
- + } else {
- + gd->tunables = global_tunables;
- + gov_attr_set_get(&global_tunables->attr_set,
- + &gd->tunables_hook);
- + }
- if (cpufreq_driver_is_slow()) {
- cpufreq_driver_slow = true;
- @@ -320,7 +403,7 @@
- if (IS_ERR_OR_NULL(gd->task)) {
- pr_err("%s: failed to create kschedfreq thread\n",
- __func__);
- - goto err;
- + goto free_tunables;
- }
- get_task_struct(gd->task);
- kthread_bind_mask(gd->task, policy->related_cpus);
- @@ -332,7 +415,9 @@
- return 0;
- -err:
- +free_tunables:
- + kfree(gd->tunables);
- +free_gd:
- policy->governor_data = NULL;
- kfree(gd);
- return -ENOMEM;
- @@ -340,6 +425,7 @@
- static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
- {
- + unsigned int count;
- struct gov_data *gd = policy->governor_data;
- if (!gd)
- @@ -351,7 +437,12 @@
- put_task_struct(gd->task);
- }
- - sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr());
- + count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook);
- + if (!count) {
- + if (!have_governor_per_policy())
- + global_tunables = NULL;
- + kfree(gd->tunables);
- + }
- policy->governor_data = NULL;
- @@ -413,88 +504,6 @@
- return 0;
- }
- -/* Tunables */
- -static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
- -{
- - return sprintf(buf, "%u\n", gd->up_throttle_nsec);
- -}
- -
- -static ssize_t store_up_throttle_nsec(struct gov_data *gd,
- - const char *buf, size_t count)
- -{
- - int ret;
- - long unsigned int val;
- -
- - ret = kstrtoul(buf, 0, &val);
- - if (ret < 0)
- - return ret;
- - gd->up_throttle_nsec = val;
- - return count;
- -}
- -
- -static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
- -{
- - return sprintf(buf, "%u\n", gd->down_throttle_nsec);
- -}
- -
- -static ssize_t store_down_throttle_nsec(struct gov_data *gd,
- - const char *buf, size_t count)
- -{
- - int ret;
- - long unsigned int val;
- -
- - ret = kstrtoul(buf, 0, &val);
- - if (ret < 0)
- - return ret;
- - gd->down_throttle_nsec = val;
- - return count;
- -}
- -
- -/*
- - * Create show/store routines
- - * - sys: One governor instance for complete SYSTEM
- - * - pol: One governor instance per struct cpufreq_policy
- - */
- -#define show_gov_pol_sys(file_name) \
- -static ssize_t show_##file_name##_gov_pol \
- -(struct cpufreq_policy *policy, char *buf) \
- -{ \
- - return show_##file_name(policy->governor_data, buf); \
- -}
- -
- -#define store_gov_pol_sys(file_name) \
- -static ssize_t store_##file_name##_gov_pol \
- -(struct cpufreq_policy *policy, const char *buf, size_t count) \
- -{ \
- - return store_##file_name(policy->governor_data, buf, count); \
- -}
- -
- -#define gov_pol_attr_rw(_name) \
- - static struct freq_attr _name##_gov_pol = \
- - __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
- -
- -#define show_store_gov_pol_sys(file_name) \
- - show_gov_pol_sys(file_name); \
- - store_gov_pol_sys(file_name)
- -#define tunable_handlers(file_name) \
- - show_gov_pol_sys(file_name); \
- - store_gov_pol_sys(file_name); \
- - gov_pol_attr_rw(file_name)
- -
- -tunable_handlers(down_throttle_nsec);
- -tunable_handlers(up_throttle_nsec);
- -
- -/* Per policy governor instance */
- -static struct attribute *sched_attributes_gov_pol[] = {
- - &up_throttle_nsec_gov_pol.attr,
- - &down_throttle_nsec_gov_pol.attr,
- - NULL,
- -};
- -
- -static struct attribute_group sched_attr_group_gov_pol = {
- - .attrs = sched_attributes_gov_pol,
- - .name = "sched",
- -};
- #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
- static
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq_schedutil.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_schedutil.c
- --- /home/ninez/android/marlin/kernel/sched/cpufreq_schedutil.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_schedutil.c 2018-08-21 13:56:47.913412345 -0400
- @@ -0,0 +1,874 @@
- +/*
- + * CPUFreq governor based on scheduler-provided CPU utilization data.
- + *
- + * Copyright (C) 2016, Intel Corporation
- + * Author: Rafael J. Wysocki <[email protected]>
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License version 2 as
- + * published by the Free Software Foundation.
- + */
- +
- +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
- +
- +#include <linux/cpufreq.h>
- +#include <linux/kthread.h>
- +#include <linux/slab.h>
- +#include <trace/events/power.h>
- +
- +#include "sched.h"
- +#include "tune.h"
- +
- +unsigned long boosted_cpu_util(int cpu);
- +
- +/* Stub out fast switch routines present on mainline to reduce the backport
- + * overhead. */
- +#define cpufreq_driver_fast_switch(x, y) 0
- +#define cpufreq_enable_fast_switch(x)
- +#define cpufreq_disable_fast_switch(x)
- +#define LATENCY_MULTIPLIER (1000)
- +#define SUGOV_KTHREAD_PRIORITY 80
- +
- +struct sugov_tunables {
- + struct gov_attr_set attr_set;
- + unsigned int up_rate_limit_us;
- + unsigned int down_rate_limit_us;
- + bool iowait_boost_enable;
- +};
- +
- +struct sugov_policy {
- + struct cpufreq_policy *policy;
- +
- + struct sugov_tunables *tunables;
- + struct list_head tunables_hook;
- +
- + raw_spinlock_t update_lock; /* For shared policies */
- + u64 last_freq_update_time;
- + s64 min_rate_limit_ns;
- + s64 up_rate_delay_ns;
- + s64 down_rate_delay_ns;
- + unsigned int next_freq;
- + unsigned int cached_raw_freq;
- +
- + /* The next fields are only needed if fast switch cannot be used. */
- + struct irq_work irq_work;
- + struct kthread_work work;
- + struct mutex work_lock;
- + struct kthread_worker worker;
- + struct task_struct *thread;
- + bool work_in_progress;
- +
- + bool need_freq_update;
- +};
- +
- +struct sugov_cpu {
- + struct update_util_data update_util;
- + struct sugov_policy *sg_policy;
- +
- + bool iowait_boost_pending;
- + unsigned int iowait_boost;
- + unsigned int iowait_boost_max;
- + u64 last_update;
- +
- + /* The fields below are only needed when sharing a policy. */
- + unsigned long util;
- + unsigned long max;
- + unsigned int flags;
- +
- + /* The field below is for single-CPU policies only. */
- +#ifdef CONFIG_NO_HZ_COMMON
- + unsigned long saved_idle_calls;
- +#endif
- +};
- +
- +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
- +
- +/************************ Governor internals ***********************/
- +
- +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
- +{
- + s64 delta_ns;
- +
- + if (unlikely(sg_policy->need_freq_update))
- + return true;
- +
- + delta_ns = time - sg_policy->last_freq_update_time;
- +
- + /* No need to recalculate next freq for min_rate_limit_us at least */
- + return delta_ns >= sg_policy->min_rate_limit_ns;
- +}
- +
- +static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
- + unsigned int next_freq)
- +{
- + s64 delta_ns;
- +
- + delta_ns = time - sg_policy->last_freq_update_time;
- +
- + if (next_freq > sg_policy->next_freq &&
- + delta_ns < sg_policy->up_rate_delay_ns)
- + return true;
- +
- + if (next_freq < sg_policy->next_freq &&
- + delta_ns < sg_policy->down_rate_delay_ns)
- + return true;
- +
- + return false;
- +}
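- Together, these two checks mean a frequency is recomputed at most once per min_rate_limit_ns, and a computed value is then only committed if it also clears the direction-specific limit. A short illustration with assumed tunable values (up_rate_limit_us = 500, down_rate_limit_us = 20000):
- /*
-  * up_rate_delay_ns   =   500 * NSEC_PER_USEC =   500000 ns
-  * down_rate_delay_ns = 20000 * NSEC_PER_USEC = 20000000 ns
-  * min_rate_limit_ns  = min(500000, 20000000) =   500000 ns
-  *
-  * An increase may therefore be committed every 0.5 ms, while a decrease
-  * is held back until 20 ms have passed since the last frequency change.
-  */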
- +
- +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
- + unsigned int next_freq)
- +{
- + struct cpufreq_policy *policy = sg_policy->policy;
- +
- + if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) {
- + /* Reset cached freq as next_freq isn't changed */
- + sg_policy->cached_raw_freq = 0;
- + return;
- + }
- +
- + if (sg_policy->next_freq == next_freq)
- + return;
- +
- + sg_policy->next_freq = next_freq;
- + sg_policy->last_freq_update_time = time;
- +
- + if (policy->fast_switch_enabled) {
- + next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- + if (next_freq == CPUFREQ_ENTRY_INVALID)
- + return;
- +
- + policy->cur = next_freq;
- + trace_cpu_frequency(next_freq, smp_processor_id());
- + } else if (!sg_policy->work_in_progress) {
- + sg_policy->work_in_progress = true;
- + irq_work_queue(&sg_policy->irq_work);
- + }
- +}
- +
- +/**
- + * get_next_freq - Compute a new frequency for a given cpufreq policy.
- + * @sg_policy: schedutil policy object to compute the new frequency for.
- + * @util: Current CPU utilization.
- + * @max: CPU capacity.
- + *
- + * If the utilization is frequency-invariant, choose the new frequency to be
- + * proportional to it, that is
- + *
- + * next_freq = C * max_freq * util / max
- + *
- + * Otherwise, approximate the would-be frequency-invariant utilization by
- + * util_raw * (curr_freq / max_freq) which leads to
- + *
- + * next_freq = C * curr_freq * util_raw / max
- + *
- + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
- + *
- + * The lowest driver-supported frequency which is equal or greater than the raw
- + * next_freq (as calculated above) is returned, subject to policy min/max and
- + * cpufreq driver limitations.
- + */
- +static unsigned int get_next_freq(struct sugov_policy *sg_policy,
- + unsigned long util, unsigned long max)
- +{
- + struct cpufreq_policy *policy = sg_policy->policy;
- + unsigned int freq = arch_scale_freq_invariant() ?
- + policy->cpuinfo.max_freq : policy->cur;
- +
- + freq = (freq + (freq >> 2)) * util / max;
- +
- + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
- + return sg_policy->next_freq;
- +
- + sg_policy->need_freq_update = false;
- + sg_policy->cached_raw_freq = freq;
- + return cpufreq_driver_resolve_freq(policy, freq);
- +}
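- A worked instance of the formula documented above, with assumed numbers (frequency-invariant utilization, cpuinfo.max_freq = 2000000 kHz, util = 512, max = 1024):
- /*
-  * freq = (2000000 + 2000000 / 4) * 512 / 1024
-  *      = 2500000 * 512 / 1024
-  *      = 1250000 kHz
-  *
-  * i.e. 1.25 * fmax * (util / max); once util exceeds 0.8 * max the raw
-  * request reaches fmax, and cpufreq_driver_resolve_freq() then rounds the
-  * result up to the next driver-supported step.
-  */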
- +
- +static inline bool use_pelt(void)
- +{
- +#ifdef CONFIG_SCHED_WALT
- + return (!sysctl_sched_use_walt_cpu_util || walt_disabled);
- +#else
- + return true;
- +#endif
- +}
- +
- +static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
- +{
- + int cpu = smp_processor_id();
- + struct rq *rq = cpu_rq(cpu);
- + unsigned long max_cap, rt;
- + s64 delta;
- +
- + max_cap = arch_scale_cpu_capacity(NULL, cpu);
- +
- + sched_avg_update(rq);
- + delta = time - rq->age_stamp;
- + if (unlikely(delta < 0))
- + delta = 0;
- + rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
- + rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
- +
- + *util = boosted_cpu_util(cpu);
- + if (use_pelt())
- + *util = *util + rt;
- +
- + *util = min(*util, max_cap);
- + *max = max_cap;
- +}
- +
- +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
- + unsigned int flags)
- +{
- + struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- +
- + if (!sg_policy->tunables->iowait_boost_enable)
- + return;
- +
- + /* Clear iowait_boost if the CPU appears to have been idle. */
- + if (sg_cpu->iowait_boost) {
- + s64 delta_ns = time - sg_cpu->last_update;
- +
- + if (delta_ns > TICK_NSEC) {
- + sg_cpu->iowait_boost = 0;
- + sg_cpu->iowait_boost_pending = false;
- + }
- + }
- +
- + if (flags & SCHED_CPUFREQ_IOWAIT) {
- + if (sg_cpu->iowait_boost_pending)
- + return;
- +
- + sg_cpu->iowait_boost_pending = true;
- +
- + if (sg_cpu->iowait_boost) {
- + sg_cpu->iowait_boost <<= 1;
- + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
- + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
- + } else {
- + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
- + }
- + }
- +}
- +
- +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
- + unsigned long *max)
- +{
- + unsigned int boost_util, boost_max;
- +
- + if (!sg_cpu->iowait_boost)
- + return;
- +
- + if (sg_cpu->iowait_boost_pending) {
- + sg_cpu->iowait_boost_pending = false;
- + } else {
- + sg_cpu->iowait_boost >>= 1;
- + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
- + sg_cpu->iowait_boost = 0;
- + return;
- + }
- + }
- +
- + boost_util = sg_cpu->iowait_boost;
- + boost_max = sg_cpu->iowait_boost_max;
- +
- + if (*util * boost_max < *max * boost_util) {
- + *util = boost_util;
- + *max = boost_max;
- + }
- +}
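- The two helpers above implement an exponential ramp: each update flagged SCHED_CPUFREQ_IOWAIT doubles the boost starting from policy->min, and every boost consumed without a fresh iowait wakeup halves it until it drops below policy->min. A sketch of the resulting sequence with assumed limits (policy->min = 300000 kHz, cpuinfo.max_freq = 2000000 kHz):
- /*
-  * iowait wakeups: 300000 -> 600000 -> 1200000 -> 2000000 (clamped to max)
-  * quiet updates:  2000000 -> 1000000 -> 500000 -> cleared (250000 < min)
-  *
-  * The boost only matters when boost_util / boost_max exceeds util / max,
-  * in which case the boosted pair is handed to get_next_freq() instead.
-  */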
- +
- +#ifdef CONFIG_NO_HZ_COMMON
- +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
- +{
- + unsigned long idle_calls = tick_nohz_get_idle_calls();
- + bool ret = idle_calls == sg_cpu->saved_idle_calls;
- +
- + sg_cpu->saved_idle_calls = idle_calls;
- + return ret;
- +}
- +#else
- +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
- +#endif /* CONFIG_NO_HZ_COMMON */
- +
- +static void sugov_update_single(struct update_util_data *hook, u64 time,
- + unsigned int flags)
- +{
- + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
- + struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- + struct cpufreq_policy *policy = sg_policy->policy;
- + unsigned long util, max;
- + unsigned int next_f;
- + bool busy;
- +
- + sugov_set_iowait_boost(sg_cpu, time, flags);
- + sg_cpu->last_update = time;
- +
- + /*
- + * On slow-switch systems, a single-policy request cannot proceed while
- + * an update is already in progress, short of taking update_lock here.

- + */
- + if (sg_policy->work_in_progress)
- + return;
- +
- + if (!sugov_should_update_freq(sg_policy, time))
- + return;
- +
- + busy = sugov_cpu_is_busy(sg_cpu);
- +
- + if (flags & SCHED_CPUFREQ_DL) {
- + next_f = policy->cpuinfo.max_freq;
- + } else {
- + sugov_get_util(&util, &max, time);
- + sugov_iowait_boost(sg_cpu, &util, &max);
- + next_f = get_next_freq(sg_policy, util, max);
- + /*
- + * Do not reduce the frequency if the CPU has not been idle
- + * recently, as the reduction is likely to be premature then.
- + */
- + if (busy && next_f < sg_policy->next_freq) {
- + next_f = sg_policy->next_freq;
- +
- + /* Reset cached freq as next_freq has changed */
- + sg_policy->cached_raw_freq = 0;
- + }
- + }
- + sugov_update_commit(sg_policy, time, next_f);
- +}
- +
- +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
- +{
- + struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- + struct cpufreq_policy *policy = sg_policy->policy;
- + unsigned long util = 0, max = 1;
- + unsigned int j;
- +
- + for_each_cpu(j, policy->cpus) {
- + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- + unsigned long j_util, j_max;
- + s64 delta_ns;
- +
- + /*
- + * If the CPU utilization was last updated before the previous
- + * frequency update and the time elapsed between the last update
- + * of the CPU utilization and the last frequency update is long
- + * enough, don't take the CPU into account as it probably is
- + * idle now (and clear iowait_boost for it).
- + */
- + delta_ns = time - j_sg_cpu->last_update;
- + if (delta_ns > TICK_NSEC) {
- + j_sg_cpu->iowait_boost = 0;
- + j_sg_cpu->iowait_boost_pending = false;
- + continue;
- + }
- + if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
- + return policy->cpuinfo.max_freq;
- +
- + j_util = j_sg_cpu->util;
- + j_max = j_sg_cpu->max;
- + if (j_util * max > j_max * util) {
- + util = j_util;
- + max = j_max;
- + }
- +
- + sugov_iowait_boost(j_sg_cpu, &util, &max);
- + }
- +
- + return get_next_freq(sg_policy, util, max);
- +}
- +
- +static void sugov_update_shared(struct update_util_data *hook, u64 time,
- + unsigned int flags)
- +{
- + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
- + struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- + unsigned long util, max;
- + unsigned int next_f;
- +
- + sugov_get_util(&util, &max, time);
- +
- + raw_spin_lock(&sg_policy->update_lock);
- +
- + sg_cpu->util = util;
- + sg_cpu->max = max;
- + sg_cpu->flags = flags;
- +
- + sugov_set_iowait_boost(sg_cpu, time, flags);
- + sg_cpu->last_update = time;
- +
- + if (sugov_should_update_freq(sg_policy, time)) {
- + if (flags & SCHED_CPUFREQ_DL)
- + next_f = sg_policy->policy->cpuinfo.max_freq;
- + else
- + next_f = sugov_next_freq_shared(sg_cpu, time);
- +
- + sugov_update_commit(sg_policy, time, next_f);
- + }
- +
- + raw_spin_unlock(&sg_policy->update_lock);
- +}
- +
- +static void sugov_work(struct kthread_work *work)
- +{
- + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
- + unsigned int freq;
- + unsigned long flags;
- +
- + /*
- + * Hold sg_policy->update_lock briefly to handle the case where
- + * sg_policy->next_freq is read here and then updated by
- + * sugov_update_shared() just before work_in_progress is set to false
- + * here; otherwise we may miss queueing the new update.
- + *
- + * Note: if a work item is queued after the update_lock is released,
- + * sugov_work() will just be called again by the kthread_work code; the
- + * request will be processed before the sugov thread sleeps.
- + */
- + raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
- + freq = sg_policy->next_freq;
- + sg_policy->work_in_progress = false;
- + raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
- +
- + mutex_lock(&sg_policy->work_lock);
- + __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
- + mutex_unlock(&sg_policy->work_lock);
- +}
- +
- +static void sugov_irq_work(struct irq_work *irq_work)
- +{
- + struct sugov_policy *sg_policy;
- +
- + sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- +
- + /*
- + * For RT and deadline tasks, the schedutil governor shoots the
- + * frequency to maximum. Special care must be taken to ensure that this
- + * kthread doesn't result in the same behavior.
- + *
- + * This is (mostly) guaranteed by the work_in_progress flag. The flag is
- + * updated only at the end of the sugov_work() function and before that
- + * the schedutil governor rejects all other frequency scaling requests.
- + *
- + * There is a very rare case though, where the RT thread yields right
- + * after the work_in_progress flag is cleared. The effects of that are
- + * neglected for now.
- + */
- + kthread_queue_work(&sg_policy->worker, &sg_policy->work);
- +}
- +
- +/************************** sysfs interface ************************/
- +
- +static struct sugov_tunables *global_tunables;
- +static DEFINE_MUTEX(global_tunables_lock);
- +
- +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
- +{
- + return container_of(attr_set, struct sugov_tunables, attr_set);
- +}
- +
- +static DEFINE_MUTEX(min_rate_lock);
- +
- +static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
- +{
- + mutex_lock(&min_rate_lock);
- + sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
- + sg_policy->down_rate_delay_ns);
- + mutex_unlock(&min_rate_lock);
- +}
- +
- +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
- +}
- +
- +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
- +}
- +
- +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- + struct sugov_policy *sg_policy;
- + unsigned int rate_limit_us;
- +
- + if (kstrtouint(buf, 10, &rate_limit_us))
- + return -EINVAL;
- +
- + tunables->up_rate_limit_us = rate_limit_us;
- +
- + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
- + sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
- + update_min_rate_limit_us(sg_policy);
- + }
- +
- + return count;
- +}
- +
- +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- + struct sugov_policy *sg_policy;
- + unsigned int rate_limit_us;
- +
- + if (kstrtouint(buf, 10, &rate_limit_us))
- + return -EINVAL;
- +
- + tunables->down_rate_limit_us = rate_limit_us;
- +
- + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
- + sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
- + update_min_rate_limit_us(sg_policy);
- + }
- +
- + return count;
- +}
- +
- +static ssize_t iowait_boost_enable_show(struct gov_attr_set *attr_set,
- + char *buf)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->iowait_boost_enable);
- +}
- +
- +static ssize_t iowait_boost_enable_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- + bool enable;
- +
- + if (kstrtobool(buf, &enable))
- + return -EINVAL;
- +
- + tunables->iowait_boost_enable = enable;
- +
- + return count;
- +}
- +
- +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
- +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
- +static struct governor_attr iowait_boost_enable = __ATTR_RW(iowait_boost_enable);
- +
- +static struct attribute *sugov_attributes[] = {
- + &up_rate_limit_us.attr,
- + &down_rate_limit_us.attr,
- + &iowait_boost_enable.attr,
- + NULL
- +};
- +
- +static struct kobj_type sugov_tunables_ktype = {
- + .default_attrs = sugov_attributes,
- + .sysfs_ops = &governor_sysfs_ops,
- +};
- +
- +/********************** cpufreq governor interface *********************/
- +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
- +static
- +#endif
- +struct cpufreq_governor cpufreq_gov_schedutil;
- +
- +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy;
- +
- + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
- + if (!sg_policy)
- + return NULL;
- +
- + sg_policy->policy = policy;
- + raw_spin_lock_init(&sg_policy->update_lock);
- + return sg_policy;
- +}
- +
- +static void sugov_policy_free(struct sugov_policy *sg_policy)
- +{
- + kfree(sg_policy);
- +}
- +
- +static int sugov_kthread_create(struct sugov_policy *sg_policy)
- +{
- + struct task_struct *thread;
- + struct sched_param param = { .sched_priority = 80 };
- + struct cpufreq_policy *policy = sg_policy->policy;
- + int ret;
- +
- + /* kthread only required for slow path */
- + if (policy->fast_switch_enabled)
- + return 0;
- +
- + kthread_init_work(&sg_policy->work, sugov_work);
- + kthread_init_worker(&sg_policy->worker);
- + thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
- + "sugov:%d",
- + cpumask_first(policy->related_cpus));
- + if (IS_ERR(thread)) {
- + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
- + return PTR_ERR(thread);
- + }
- +
- + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m);
- + if (ret) {
- + kthread_stop(thread);
- + pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
- + return ret;
- + }
- +
- + sg_policy->thread = thread;
- + kthread_bind_mask(thread, policy->related_cpus);
- + init_irq_work(&sg_policy->irq_work, sugov_irq_work);
- + mutex_init(&sg_policy->work_lock);
- +
- + wake_up_process(thread);
- +
- + return 0;
- +}
- +
- +static void sugov_kthread_stop(struct sugov_policy *sg_policy)
- +{
- + /* kthread only required for slow path */
- + if (sg_policy->policy->fast_switch_enabled)
- + return;
- +
- + kthread_flush_worker(&sg_policy->worker);
- + kthread_stop(sg_policy->thread);
- + mutex_destroy(&sg_policy->work_lock);
- +}
- +
- +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
- +{
- + struct sugov_tunables *tunables;
- +
- + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
- + if (tunables) {
- + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
- + if (!have_governor_per_policy())
- + global_tunables = tunables;
- + }
- + return tunables;
- +}
- +
- +static void sugov_tunables_free(struct sugov_tunables *tunables)
- +{
- + if (!have_governor_per_policy())
- + global_tunables = NULL;
- +
- + kfree(tunables);
- +}
- +
- +static int sugov_init(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy;
- + struct sugov_tunables *tunables;
- + int ret = 0;
- +
- + /* State should be equivalent to EXIT */
- + if (policy->governor_data)
- + return -EBUSY;
- +
- + cpufreq_enable_fast_switch(policy);
- +
- + sg_policy = sugov_policy_alloc(policy);
- + if (!sg_policy) {
- + ret = -ENOMEM;
- + goto disable_fast_switch;
- + }
- +
- + ret = sugov_kthread_create(sg_policy);
- + if (ret)
- + goto free_sg_policy;
- +
- + mutex_lock(&global_tunables_lock);
- +
- + if (global_tunables) {
- + if (WARN_ON(have_governor_per_policy())) {
- + ret = -EINVAL;
- + goto stop_kthread;
- + }
- + policy->governor_data = sg_policy;
- + sg_policy->tunables = global_tunables;
- +
- + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
- + goto out;
- + }
- +
- + tunables = sugov_tunables_alloc(sg_policy);
- + if (!tunables) {
- + ret = -ENOMEM;
- + goto stop_kthread;
- + }
- +
- + if (policy->up_transition_delay_us && policy->down_transition_delay_us) {
- + tunables->up_rate_limit_us = policy->up_transition_delay_us;
- + tunables->down_rate_limit_us = policy->down_transition_delay_us;
- + } else {
- + unsigned int lat;
- +
- + tunables->up_rate_limit_us = LATENCY_MULTIPLIER;
- + tunables->down_rate_limit_us = LATENCY_MULTIPLIER;
- + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- + if (lat) {
- + tunables->up_rate_limit_us *= lat;
- + tunables->down_rate_limit_us *= lat;
- + }
- + }
- +
- + tunables->iowait_boost_enable = policy->iowait_boost_enable;
- +
- + policy->governor_data = sg_policy;
- + sg_policy->tunables = tunables;
- +
- + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
- + get_governor_parent_kobj(policy), "%s",
- + cpufreq_gov_schedutil.name);
- + if (ret)
- + goto fail;
- +
- +out:
- + mutex_unlock(&global_tunables_lock);
- + return 0;
- +
- +fail:
- + policy->governor_data = NULL;
- + sugov_tunables_free(tunables);
- +
- +stop_kthread:
- + sugov_kthread_stop(sg_policy);
- +
- +free_sg_policy:
- + mutex_unlock(&global_tunables_lock);
- +
- + sugov_policy_free(sg_policy);
- +
- +disable_fast_switch:
- + cpufreq_disable_fast_switch(policy);
- +
- + pr_err("initialization failed (error %d)\n", ret);
- + return ret;
- +}
- +
- +static int sugov_exit(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy = policy->governor_data;
- + struct sugov_tunables *tunables = sg_policy->tunables;
- + unsigned int count;
- +
- + mutex_lock(&global_tunables_lock);
- +
- + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
- + policy->governor_data = NULL;
- + if (!count)
- + sugov_tunables_free(tunables);
- +
- + mutex_unlock(&global_tunables_lock);
- +
- + sugov_kthread_stop(sg_policy);
- + sugov_policy_free(sg_policy);
- +
- + cpufreq_disable_fast_switch(policy);
- + return 0;
- +}
- +
- +static int sugov_start(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy = policy->governor_data;
- + unsigned int cpu;
- +
- + sg_policy->up_rate_delay_ns =
- + sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
- + sg_policy->down_rate_delay_ns =
- + sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
- + update_min_rate_limit_us(sg_policy);
- + sg_policy->last_freq_update_time = 0;
- + sg_policy->next_freq = 0;
- + sg_policy->work_in_progress = false;
- + sg_policy->need_freq_update = false;
- + sg_policy->cached_raw_freq = 0;
- +
- + for_each_cpu(cpu, policy->cpus) {
- + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
- +
- + memset(sg_cpu, 0, sizeof(*sg_cpu));
- + sg_cpu->sg_policy = sg_policy;
- + sg_cpu->flags = SCHED_CPUFREQ_DL;
- + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
- + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- + policy_is_shared(policy) ?
- + sugov_update_shared :
- + sugov_update_single);
- + }
- + return 0;
- +}
- +
- +static int sugov_stop(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy = policy->governor_data;
- + unsigned int cpu;
- +
- + for_each_cpu(cpu, policy->cpus)
- + cpufreq_remove_update_util_hook(cpu);
- +
- + synchronize_sched();
- +
- + if (!policy->fast_switch_enabled) {
- + irq_work_sync(&sg_policy->irq_work);
- + kthread_cancel_work_sync(&sg_policy->work);
- + }
- + return 0;
- +}
- +
- +static int sugov_limits(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy = policy->governor_data;
- +
- + if (!policy->fast_switch_enabled) {
- + mutex_lock(&sg_policy->work_lock);
- + cpufreq_policy_apply_limits(policy);
- + mutex_unlock(&sg_policy->work_lock);
- + }
- +
- + sg_policy->need_freq_update = true;
- +
- + return 0;
- +}
- +
- +static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
- + unsigned int event)
- +{
- + switch(event) {
- + case CPUFREQ_GOV_POLICY_INIT:
- + return sugov_init(policy);
- + case CPUFREQ_GOV_POLICY_EXIT:
- + return sugov_exit(policy);
- + case CPUFREQ_GOV_START:
- + return sugov_start(policy);
- + case CPUFREQ_GOV_STOP:
- + return sugov_stop(policy);
- + case CPUFREQ_GOV_LIMITS:
- + return sugov_limits(policy);
- + default:
- + BUG();
- + }
- +}
- +
- +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
- +static
- +#endif
- +struct cpufreq_governor cpufreq_gov_schedutil = {
- + .name = "schedutil",
- + .governor = cpufreq_schedutil_cb,
- + .owner = THIS_MODULE,
- +};
- +
- +static int __init sugov_register(void)
- +{
- + return cpufreq_register_governor(&cpufreq_gov_schedutil);
- +}
- +fs_initcall(sugov_register);
- diff -Nur /home/ninez/android/marlin/kernel/sched/cputime.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cputime.c
- --- /home/ninez/android/marlin/kernel/sched/cputime.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cputime.c 2018-08-11 23:57:17.128607487 -0400
- @@ -306,6 +306,26 @@
- return false;
- }
- +#ifdef CONFIG_64BIT
- +static inline u64 read_sum_exec_runtime(struct task_struct *t)
- +{
- + return t->se.sum_exec_runtime;
- +}
- +#else
- +static u64 read_sum_exec_runtime(struct task_struct *t)
- +{
- + u64 ns;
- + struct rq_flags rf;
- + struct rq *rq;
- +
- + rq = task_rq_lock(t, &rf);
- + ns = t->se.sum_exec_runtime;
- + task_rq_unlock(rq, t, &rf);
- +
- + return ns;
- +}
- +#endif
- +
- /*
- * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
- * tasks (sum on group iteration) belonging to @tsk's group.
- @@ -318,6 +338,17 @@
- unsigned int seq, nextseq;
- unsigned long flags;
- + /*
- + * Update the current task's runtime to account for pending time since the
- + * last scheduler action or thread_group_cputime() call. This thread group
- + * might have other running tasks on different CPUs, but updating their
- + * runtime can affect syscall performance, so we skip accounting those
- + * pending times and rely only on values updated on tick or by other
- + * scheduler action.
- + */
- + if (same_thread_group(current, tsk))
- + (void) task_sched_runtime(current);
- +
- rcu_read_lock();
- /* Attempt a lockless read on the first round. */
- nextseq = 0;
- @@ -332,7 +363,7 @@
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- - times->sum_exec_runtime += task_sched_runtime(t);
- + times->sum_exec_runtime += read_sum_exec_runtime(t);
- }
- /* If lockless access failed, take the lock. */
- nextseq = 1;
- @@ -582,48 +613,43 @@
- }
- /*
- - * Atomically advance counter to the new value. Interrupts, vcpu
- - * scheduling, and scaling inaccuracies can cause cputime_advance
- - * to be occasionally called with a new value smaller than counter.
- - * Let's enforce atomicity.
- + * Adjust tick based cputime random precision against scheduler runtime
- + * accounting.
- *
- - * Normally a caller will only go through this loop once, or not
- - * at all in case a previous caller updated counter the same jiffy.
- - */
- -static void cputime_advance(cputime_t *counter, cputime_t new)
- -{
- - cputime_t old;
- -
- - while (new > (old = READ_ONCE(*counter)))
- - cmpxchg_cputime(counter, old, new);
- -}
- -
- -/*
- - * Adjust tick based cputime random precision against scheduler
- - * runtime accounting.
- + * Tick-based cputime accounting depends on whether a task's scheduling
- + * timeslices happen to be interrupted by the timer. Depending on these
- + * circumstances, the number of such interrupts may over- or under-estimate
- + * the real user and system cputime, so the tick-based values match reality
- + * only with variable precision.
- + *
- + * Fix this by scaling these tick based values against the total runtime
- + * accounted by the CFS scheduler.
- + *
- + * This code provides the following guarantees:
- + *
- + * stime + utime == rtime
- + * stime_i+1 >= stime_i, utime_i+1 >= utime_i
- + *
- + * Assuming that rtime_i+1 >= rtime_i.
- */
- static void cputime_adjust(struct task_cputime *curr,
- - struct cputime *prev,
- + struct prev_cputime *prev,
- cputime_t *ut, cputime_t *st)
- {
- cputime_t rtime, stime, utime;
- + unsigned long flags;
- - /*
- - * Tick based cputime accounting depend on random scheduling
- - * timeslices of a task to be interrupted or not by the timer.
- - * Depending on these circumstances, the number of these interrupts
- - * may be over or under-optimistic, matching the real user and system
- - * cputime with a variable precision.
- - *
- - * Fix this by scaling these tick based values against the total
- - * runtime accounted by the CFS scheduler.
- - */
- + /* Serialize concurrent callers such that we can honour our guarantees */
- + raw_spin_lock_irqsave(&prev->lock, flags);
- rtime = nsecs_to_cputime(curr->sum_exec_runtime);
- /*
- - * Update userspace visible utime/stime values only if actual execution
- - * time is bigger than already exported. Note that can happen, that we
- - * provided bigger values due to scaling inaccuracy on big numbers.
- + * This is possible under two circumstances:
- + * - rtime isn't monotonic after all (a bug);
- + * - we got reordered by the lock.
- + *
- + * In both cases this acts as a filter such that the rest of the code
- + * can assume it is monotonic regardless of anything else.
- */
- if (prev->stime + prev->utime >= rtime)
- goto out;
- @@ -633,22 +659,46 @@
- if (utime == 0) {
- stime = rtime;
- - } else if (stime == 0) {
- - utime = rtime;
- - } else {
- - cputime_t total = stime + utime;
- + goto update;
- + }
- - stime = scale_stime((__force u64)stime,
- - (__force u64)rtime, (__force u64)total);
- - utime = rtime - stime;
- + if (stime == 0) {
- + utime = rtime;
- + goto update;
- }
- - cputime_advance(&prev->stime, stime);
- - cputime_advance(&prev->utime, utime);
- + stime = scale_stime((__force u64)stime, (__force u64)rtime,
- + (__force u64)(stime + utime));
- +
- + /*
- + * Make sure stime doesn't go backwards; this preserves monotonicity
- + * for utime because rtime is monotonic.
- + *
- + * utime_i+1 = rtime_i+1 - stime_i
- + * = rtime_i+1 - (rtime_i - utime_i)
- + * = (rtime_i+1 - rtime_i) + utime_i
- + * >= utime_i
- + */
- + if (stime < prev->stime)
- + stime = prev->stime;
- + utime = rtime - stime;
- +
- + /*
- + * Make sure utime doesn't go backwards; this still preserves
- + * monotonicity for stime, analogous argument to above.
- + */
- + if (utime < prev->utime) {
- + utime = prev->utime;
- + stime = rtime - utime;
- + }
- +update:
- + prev->stime = stime;
- + prev->utime = utime;
- out:
- *ut = prev->utime;
- *st = prev->stime;
- + raw_spin_unlock_irqrestore(&prev->lock, flags);
- }
- void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
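- The rewritten cputime_adjust() above replaces the cmpxchg-based cputime_advance() with a lock plus explicit clamping, which is what delivers the stime + utime == rtime and per-field monotonicity guarantees stated in its comment. A worked example with made-up cputime values:
- /*
-  * previously reported: prev->stime = 60, prev->utime = 40  (rtime = 100)
-  * new tick samples:    stime = 55, utime = 55, rtime = 112
-  *
-  * scaled stime = 55 * 112 / (55 + 55) = 56
-  * 56 < prev->stime, so stime is clamped to 60
-  * utime = rtime - stime = 112 - 60 = 52  (>= prev->utime)
-  *
-  * The reported pair becomes (60, 52): both fields stay monotonic and they
-  * still sum to rtime.
-  */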
- diff -Nur /home/ninez/android/marlin/kernel/sched/deadline.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/deadline.c
- --- /home/ninez/android/marlin/kernel/sched/deadline.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/deadline.c 2018-08-26 16:43:11.647206295 -0400
- @@ -18,6 +18,8 @@
- #include <linux/slab.h>
- +#include "walt.h"
- +
- struct dl_bandwidth def_dl_bandwidth;
- static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
- @@ -87,7 +89,7 @@
- dl_b->total_bw = 0;
- }
- -void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
- +void init_dl_rq(struct dl_rq *dl_rq)
- {
- dl_rq->rb_root = RB_ROOT;
- @@ -152,7 +154,7 @@
- {
- struct task_struct *p = dl_task_of(dl_se);
- - if (p->nr_cpus_allowed > 1)
- + if (tsk_nr_cpus_allowed(p) > 1)
- dl_rq->dl_nr_migratory++;
- update_dl_migration(dl_rq);
- @@ -162,7 +164,7 @@
- {
- struct task_struct *p = dl_task_of(dl_se);
- - if (p->nr_cpus_allowed > 1)
- + if (tsk_nr_cpus_allowed(p) > 1)
- dl_rq->dl_nr_migratory--;
- update_dl_migration(dl_rq);
- @@ -231,17 +233,23 @@
- return dl_task(prev);
- }
- -static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
- +static DEFINE_PER_CPU(struct callback_head, dl_push_head);
- +static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
- static void push_dl_tasks(struct rq *);
- +static void pull_dl_task(struct rq *);
- static inline void queue_push_tasks(struct rq *rq)
- {
- if (!has_pushable_dl_tasks(rq))
- return;
- - queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu),
- - push_dl_tasks);
- + queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
- +}
- +
- +static inline void queue_pull_task(struct rq *rq)
- +{
- + queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
- }
- static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
- @@ -322,14 +330,17 @@
- return false;
- }
- -static inline int pull_dl_task(struct rq *rq)
- +static inline void pull_dl_task(struct rq *rq)
- {
- - return 0;
- }
- static inline void queue_push_tasks(struct rq *rq)
- {
- }
- +
- +static inline void queue_pull_task(struct rq *rq)
- +{
- +}
- #endif /* CONFIG_SMP */
- static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
- @@ -450,13 +461,13 @@
- *
- * This function returns true if:
- *
- - * runtime / (deadline - t) > dl_runtime / dl_deadline ,
- + * runtime / (deadline - t) > dl_runtime / dl_period ,
- *
- * IOW we can't recycle current parameters.
- *
- - * Notice that the bandwidth check is done against the deadline. For
- + * Notice that the bandwidth check is done against the period. For
- * task with deadline equal to period this is the same of using
- - * dl_period instead of dl_deadline in the equation above.
- + * dl_deadline instead of dl_period in the equation above.
- */
- static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, u64 t)
- @@ -481,7 +492,7 @@
- * of anything below microseconds resolution is actually fiction
- * (but still we want to give the user that illusion >;).
- */
- - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
- + left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
- right = ((dl_se->deadline - t) >> DL_SCALE) *
- (pi_se->dl_runtime >> DL_SCALE);
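- The doc-comment fix above matters because the code really does cross-multiply with dl_period: overflow is declared when runtime / (deadline - t) > dl_runtime / dl_period. A worked example with assumed parameters (dl_runtime = 10 ms, dl_period = 100 ms, 4 ms of runtime left, 30 ms until the current deadline):
- /*
-  * left  ~ dl_period * runtime         = 100 * 4  = 400
-  * right ~ (deadline - t) * dl_runtime =  30 * 10 = 300
-  *
-  * left > right, i.e. 4/30 > 10/100: the remaining bandwidth is denser
-  * than the reserved one, so the entity gets fresh deadline/runtime
-  * parameters instead of recycling the current ones.
-  */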
- @@ -596,16 +607,10 @@
- struct sched_dl_entity,
- dl_timer);
- struct task_struct *p = dl_task_of(dl_se);
- + struct rq_flags rf;
- struct rq *rq;
- -again:
- - rq = task_rq(p);
- - raw_spin_lock(&rq->lock);
- - if (rq != task_rq(p)) {
- - /* Task was moved, retrying. */
- - raw_spin_unlock(&rq->lock);
- - goto again;
- - }
- + rq = task_rq_lock(p, &rf);
- /*
- * The task might have changed its scheduling policy to something
- @@ -686,12 +691,19 @@
- * Queueing this task back might have overloaded rq, check if we need
- * to kick someone away.
- */
- - if (has_pushable_dl_tasks(rq))
- + if (has_pushable_dl_tasks(rq)) {
- + /*
- + * Nothing relies on rq->lock after this, so it's safe to drop
- + * rq->lock.
- + */
- + lockdep_unpin_lock(&rq->lock, rf.cookie);
- push_dl_task(rq);
- + lockdep_repin_lock(&rq->lock, rf.cookie);
- + }
- #endif
- unlock:
- - raw_spin_unlock(&rq->lock);
- + task_rq_unlock(rq, p, &rf);
- /*
- * This can free the task_struct, including this hrtimer, do not touch
- @@ -711,7 +723,7 @@
- }
- static
- -int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
- +int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
- {
- return (dl_se->runtime <= 0);
- }
- @@ -743,6 +755,9 @@
- if (unlikely((s64)delta_exec <= 0))
- return;
- + /* kick cpufreq (see the comment in kernel/sched/sched.h). */
- + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
- +
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
- @@ -753,7 +768,7 @@
- cpuacct_charge(curr, delta_exec);
- dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- - if (dl_runtime_exceeded(rq, dl_se)) {
- + if (dl_runtime_exceeded(dl_se)) {
- dl_se->dl_throttled = 1;
- __dequeue_task_dl(rq, curr, 0);
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
- @@ -869,6 +884,7 @@
- WARN_ON(!dl_prio(prio));
- dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
- + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
- inc_dl_deadline(dl_rq, deadline);
- inc_dl_migration(dl_se, dl_rq);
- @@ -883,6 +899,7 @@
- WARN_ON(!dl_rq->dl_nr_running);
- dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
- + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
- dec_dl_deadline(dl_rq, dl_se->deadline);
- dec_dl_migration(dl_se, dl_rq);
- @@ -969,7 +986,7 @@
- /*
- * Use the scheduling parameters of the top pi-waiter
- - * task if we have one and its (relative) deadline is
- + * task if we have one and its (absolute) deadline is
- * smaller than our one... OTW we keep our runtime and
- * deadline.
- */
- @@ -998,7 +1015,7 @@
- enqueue_dl_entity(&p->dl, pi_se, flags);
- - if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
- + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
- enqueue_pushable_dl_task(rq, p);
- }
- @@ -1038,7 +1055,14 @@
- rq->curr->dl.dl_yielded = 1;
- p->dl.runtime = 0;
- }
- + update_rq_clock(rq);
- update_curr_dl(rq);
- + /*
- + * Tell update_rq_clock() that we've just updated,
- + * so we don't do microscopic update in schedule()
- + * and double the fastpath cost.
- + */
- + rq_clock_skip_update(rq, true);
- }
- #ifdef CONFIG_SMP
- @@ -1046,12 +1070,13 @@
- static int find_later_rq(struct task_struct *task);
- static int
- -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
- +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
- + int sibling_count_hint)
- {
- struct task_struct *curr;
- struct rq *rq;
- - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
- + if (sd_flag != SD_BALANCE_WAKE)
- goto out;
- rq = cpu_rq(cpu);
- @@ -1069,12 +1094,15 @@
- * try to make it stay here, it might be important.
- */
- if (unlikely(dl_task(curr)) &&
- - (curr->nr_cpus_allowed < 2 ||
- + (tsk_nr_cpus_allowed(curr) < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- - (p->nr_cpus_allowed > 1)) {
- + (tsk_nr_cpus_allowed(p) > 1)) {
- int target = find_later_rq(p);
- - if (target != -1)
- + if (target != -1 &&
- + (dl_time_before(p->dl.deadline,
- + cpu_rq(target)->dl.earliest_dl.curr) ||
- + (cpu_rq(target)->dl.dl_nr_running == 0)))
- cpu = target;
- }
- rcu_read_unlock();
- @@ -1089,7 +1117,7 @@
- * Current can't be migrated, useless to reschedule,
- * let's hope p can move out.
- */
- - if (rq->curr->nr_cpus_allowed == 1 ||
- + if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
- return;
- @@ -1097,15 +1125,13 @@
- * p is migratable, so let's not schedule it and
- * see if it is pushed or pulled somewhere else.
- */
- - if (p->nr_cpus_allowed != 1 &&
- + if (tsk_nr_cpus_allowed(p) != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
- return;
- resched_curr(rq);
- }
- -static int pull_dl_task(struct rq *this_rq);
- -
- #endif /* CONFIG_SMP */
- /*
- @@ -1136,6 +1162,10 @@
- {
- hrtick_start(rq, p->dl.runtime);
- }
- +#else /* !CONFIG_SCHED_HRTICK */
- +static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
- +{
- +}
- #endif
- static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
- @@ -1149,7 +1179,8 @@
- return rb_entry(left, struct sched_dl_entity, rb_node);
- }
- -struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
- +struct task_struct *
- +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- struct sched_dl_entity *dl_se;
- struct task_struct *p;
- @@ -1158,7 +1189,15 @@
- dl_rq = &rq->dl;
- if (need_pull_dl_task(rq, prev)) {
- + /*
- + * This is OK because current is on_cpu, which keeps it from being
- + * picked for load balancing; preemption/IRQs are still disabled,
- + * avoiding further scheduler activity on it, and we are careful to
- + * re-start the picking loop.
- + */
- + lockdep_unpin_lock(&rq->lock, cookie);
- pull_dl_task(rq);
- + lockdep_repin_lock(&rq->lock, cookie);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a stop task can slip in, in which case we need to
- @@ -1189,10 +1228,8 @@
- /* Running task will never be pushed. */
- dequeue_pushable_dl_task(rq, p);
- -#ifdef CONFIG_SCHED_HRTICK
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
- -#endif
- queue_push_tasks(rq);
- @@ -1203,7 +1240,7 @@
- {
- update_curr_dl(rq);
- - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
- + if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
- enqueue_pushable_dl_task(rq, p);
- }
- @@ -1211,10 +1248,14 @@
- {
- update_curr_dl(rq);
- -#ifdef CONFIG_SCHED_HRTICK
- - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
- + /*
- + * Even when we have runtime, update_curr_dl() might have resulted in us
- + * not being the leftmost task anymore. In that case NEED_RESCHED will
- + * be set and schedule() will start a new hrtick for the next task.
- + */
- + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
- + is_leftmost(p, &rq->dl))
- start_hrtick_dl(rq, p);
- -#endif
- }
- static void task_fork_dl(struct task_struct *p)
- @@ -1287,6 +1328,32 @@
- return NULL;
- }
- +/*
- + * Return the earliest pushable rq's task, which is suitable to be executed
- + * on the CPU, NULL otherwise:
- + */
- +static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
- +{
- + struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
- + struct task_struct *p = NULL;
- +
- + if (!has_pushable_dl_tasks(rq))
- + return NULL;
- +
- +next_node:
- + if (next_node) {
- + p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
- +
- + if (pick_dl_task(rq, p, cpu))
- + return p;
- +
- + next_node = rb_next(next_node);
- + goto next_node;
- + }
- +
- + return NULL;
- +}
- +
- static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
- static int find_later_rq(struct task_struct *task)
- @@ -1300,16 +1367,13 @@
- if (unlikely(!later_mask))
- return -1;
- - if (task->nr_cpus_allowed == 1)
- + if (tsk_nr_cpus_allowed(task) == 1)
- return -1;
- /*
- * We have to consider system topology and task affinity
- * first, then we can look for a suitable cpu.
- */
- - cpumask_copy(later_mask, task_rq(task)->rd->span);
- - cpumask_and(later_mask, later_mask, cpu_active_mask);
- - cpumask_and(later_mask, later_mask, &task->cpus_allowed);
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
- @@ -1393,6 +1457,18 @@
- later_rq = cpu_rq(cpu);
- + if (later_rq->dl.dl_nr_running &&
- + !dl_time_before(task->dl.deadline,
- + later_rq->dl.earliest_dl.curr)) {
- + /*
- + * The target rq has tasks of equal or earlier deadline;
- + * retrying does not release any lock and is unlikely
- + * to yield a different result.
- + */
- + later_rq = NULL;
- + break;
- + }
- +
- /* Retry if something changed. */
- if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
- @@ -1436,7 +1512,7 @@
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- - BUG_ON(p->nr_cpus_allowed <= 1);
- + BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!dl_task(p));
- @@ -1453,6 +1529,7 @@
- {
- struct task_struct *next_task;
- struct rq *later_rq;
- + int ret = 0;
- if (!rq->dl.overloaded)
- return 0;
- @@ -1474,7 +1551,7 @@
- */
- if (dl_task(rq->curr) &&
- dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
- - rq->curr->nr_cpus_allowed > 1) {
- + tsk_nr_cpus_allowed(rq->curr) > 1) {
- resched_curr(rq);
- return 0;
- }
- @@ -1498,7 +1575,6 @@
- * The task is still there. We don't try
- * again, some other cpu will pull it when ready.
- */
- - dequeue_pushable_dl_task(rq, next_task);
- goto out;
- }
- @@ -1513,9 +1589,12 @@
- deactivate_task(rq, next_task, 0);
- clear_average_bw(&next_task->dl, &rq->dl);
- + next_task->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(next_task, later_rq->cpu);
- + next_task->on_rq = TASK_ON_RQ_QUEUED;
- add_average_bw(&next_task->dl, &later_rq->dl);
- activate_task(later_rq, next_task, 0);
- + ret = 1;
- resched_curr(later_rq);
- @@ -1524,25 +1603,26 @@
- out:
- put_task_struct(next_task);
- - return 1;
- + return ret;
- }
- static void push_dl_tasks(struct rq *rq)
- {
- - /* Terminates as it moves a -deadline task */
- + /* push_dl_task() will return true if it moved a -deadline task */
- while (push_dl_task(rq))
- ;
- }
- -static int pull_dl_task(struct rq *this_rq)
- +static void pull_dl_task(struct rq *this_rq)
- {
- - int this_cpu = this_rq->cpu, ret = 0, cpu;
- + int this_cpu = this_rq->cpu, cpu;
- struct task_struct *p;
- + bool resched = false;
- struct rq *src_rq;
- u64 dmin = LONG_MAX;
- if (likely(!dl_overloaded(this_rq)))
- - return 0;
- + return;
- /*
- * Match the barrier from dl_set_overloaded; this guarantees that if we
- @@ -1575,7 +1655,7 @@
- if (src_rq->dl.dl_nr_running <= 1)
- goto skip;
- - p = pick_next_earliest_dl_task(src_rq, this_cpu);
- + p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
- /*
- * We found a task to be pulled if:
- @@ -1597,11 +1677,13 @@
- src_rq->curr->dl.deadline))
- goto skip;
- - ret = 1;
- + resched = true;
- deactivate_task(src_rq, p, 0);
- clear_average_bw(&p->dl, &src_rq->dl);
- + p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, this_cpu);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- add_average_bw(&p->dl, &this_rq->dl);
- activate_task(this_rq, p, 0);
- dmin = p->dl.deadline;
- @@ -1612,7 +1694,8 @@
- double_unlock_balance(this_rq, src_rq);
- }
- - return ret;
- + if (resched)
- + resched_curr(this_rq);
- }
- /*
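- Both the push and pull paths above now bracket set_task_cpu() with p->on_rq = TASK_ON_RQ_MIGRATING / TASK_ON_RQ_QUEUED. That lets code running under either rq lock tell a task in transit apart from one that is genuinely queued; the check used elsewhere (for instance by the wait-time rework in the fair.c portion of this diff) is simply:
- static inline int task_on_rq_migrating(struct task_struct *p)
- {
-         return p->on_rq == TASK_ON_RQ_MIGRATING;
- }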
- @@ -1623,11 +1706,10 @@
- {
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- - has_pushable_dl_tasks(rq) &&
- - p->nr_cpus_allowed > 1 &&
- + tsk_nr_cpus_allowed(p) > 1 &&
- dl_task(rq->curr) &&
- - (rq->curr->nr_cpus_allowed < 2 ||
- - dl_entity_preempt(&rq->curr->dl, &p->dl))) {
- + (tsk_nr_cpus_allowed(rq->curr) < 2 ||
- + !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
- push_dl_tasks(rq);
- }
- }
- @@ -1635,44 +1717,34 @@
- static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask)
- {
- + struct root_domain *src_rd;
- struct rq *rq;
- - int weight;
- BUG_ON(!dl_task(p));
- - /*
- - * Update only if the task is actually running (i.e.,
- - * it is on the rq AND it is not throttled).
- - */
- - if (!on_dl_rq(&p->dl))
- - return;
- -
- - weight = cpumask_weight(new_mask);
- -
- - /*
- - * Only update if the process changes its state from whether it
- - * can migrate or not.
- - */
- - if ((p->nr_cpus_allowed > 1) == (weight > 1))
- - return;
- -
- rq = task_rq(p);
- -
- + src_rd = rq->rd;
- /*
- - * The process used to be able to migrate OR it can now migrate
- + * Migrating a SCHED_DEADLINE task between exclusive
- + * cpusets (different root_domains) entails a bandwidth
- + * update. We already made space for us in the destination
- + * domain (see cpuset_can_attach()).
- */
- - if (weight <= 1) {
- - if (!task_current(rq, p))
- - dequeue_pushable_dl_task(rq, p);
- - BUG_ON(!rq->dl.dl_nr_migratory);
- - rq->dl.dl_nr_migratory--;
- - } else {
- - if (!task_current(rq, p))
- - enqueue_pushable_dl_task(rq, p);
- - rq->dl.dl_nr_migratory++;
- + if (!cpumask_intersects(src_rd->span, new_mask)) {
- + struct dl_bw *src_dl_b;
- +
- + src_dl_b = dl_bw_of(cpu_of(rq));
- + /*
- + * We now free resources of the root_domain we are migrating
- + * off. In the worst case, sched_setattr() may temporary fail
- + * until we complete the update.
- + */
- + raw_spin_lock(&src_dl_b->lock);
- + __dl_clear(src_dl_b, p->dl.dl_bw);
- + raw_spin_unlock(&src_dl_b->lock);
- }
- - update_dl_migration(&rq->dl);
- + set_cpus_allowed_common(p, new_mask);
- }
- /* Assumes rq->lock is held */
- @@ -1681,6 +1753,7 @@
- if (rq->dl.overloaded)
- dl_set_overload(rq);
- + cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
- if (rq->dl.dl_nr_running > 0)
- cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
- }
- @@ -1692,9 +1765,10 @@
- dl_clear_overload(rq);
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
- + cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
- }
- -void init_sched_dl_class(void)
- +void __init init_sched_dl_class(void)
- {
- unsigned int i;
- @@ -1726,8 +1800,7 @@
- if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
- return;
- - if (pull_dl_task(rq))
- - resched_curr(rq);
- + queue_pull_task(rq);
- }
- /*
- @@ -1736,28 +1809,15 @@
- */
- static void switched_to_dl(struct rq *rq, struct task_struct *p)
- {
- - int check_resched = 1;
- -
- - /*
- - * If p is throttled, don't consider the possibility
- - * of preempting rq->curr, the check will be done right
- - * after its runtime will get replenished.
- - */
- - if (unlikely(p->dl.dl_throttled))
- - return;
- -
- if (task_on_rq_queued(p) && rq->curr != p) {
- #ifdef CONFIG_SMP
- - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
- - /* Only reschedule if pushing failed */
- - check_resched = 0;
- -#endif /* CONFIG_SMP */
- - if (check_resched) {
- - if (dl_task(rq->curr))
- - check_preempt_curr_dl(rq, p, 0);
- - else
- - resched_curr(rq);
- - }
- + if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
- + queue_push_tasks(rq);
- +#endif
- + if (dl_task(rq->curr))
- + check_preempt_curr_dl(rq, p, 0);
- + else
- + resched_curr(rq);
- }
- }
- @@ -1777,15 +1837,14 @@
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- - pull_dl_task(rq);
- + queue_pull_task(rq);
- /*
- * If we now have a earlier deadline task than p,
- * then reschedule, provided p is still on this
- * runqueue.
- */
- - if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
- - rq->curr == p)
- + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
- resched_curr(rq);
- #else
- /*
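- The switched_from_dl()/prio_changed_dl() changes above stop calling pull_dl_task()/push_dl_task() directly under the rq lock and instead go through queue_pull_task()/queue_push_tasks(). Those wrappers live in kernel/sched/sched.h (outside this diff) and defer the work onto the rq's balance_callback list, which runs once the lock is about to be released. A rough sketch of the underlying helper, assuming this tree follows the upstream layout:
- static inline void
- queue_balance_callback(struct rq *rq, struct callback_head *head,
-                        void (*func)(struct rq *rq))
- {
-         lockdep_assert_held(&rq->lock);
- 
-         /* nothing to do if this callback is already queued */
-         if (unlikely(head->next))
-                 return;
- 
-         head->func = (void (*)(struct callback_head *))func;
-         head->next = rq->balance_callback;
-         rq->balance_callback = head;
- }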
- diff -Nur /home/ninez/android/marlin/kernel/sched/debug.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/debug.c
- --- /home/ninez/android/marlin/kernel/sched/debug.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/debug.c 2018-08-26 16:43:11.647206295 -0400
- @@ -65,8 +65,12 @@
- #define P(F) \
- SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
- +#define P_SCHEDSTAT(F) \
- + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
- #define PN(F) \
- SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
- +#define PN_SCHEDSTAT(F) \
- + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
- if (!se)
- return;
- @@ -74,25 +78,27 @@
- PN(se->exec_start);
- PN(se->vruntime);
- PN(se->sum_exec_runtime);
- -#ifdef CONFIG_SCHEDSTATS
- - PN(se->statistics.wait_start);
- - PN(se->statistics.sleep_start);
- - PN(se->statistics.block_start);
- - PN(se->statistics.sleep_max);
- - PN(se->statistics.block_max);
- - PN(se->statistics.exec_max);
- - PN(se->statistics.slice_max);
- - PN(se->statistics.wait_max);
- - PN(se->statistics.wait_sum);
- - P(se->statistics.wait_count);
- -#endif
- + if (schedstat_enabled()) {
- + PN_SCHEDSTAT(se->statistics.wait_start);
- + PN_SCHEDSTAT(se->statistics.sleep_start);
- + PN_SCHEDSTAT(se->statistics.block_start);
- + PN_SCHEDSTAT(se->statistics.sleep_max);
- + PN_SCHEDSTAT(se->statistics.block_max);
- + PN_SCHEDSTAT(se->statistics.exec_max);
- + PN_SCHEDSTAT(se->statistics.slice_max);
- + PN_SCHEDSTAT(se->statistics.wait_max);
- + PN_SCHEDSTAT(se->statistics.wait_sum);
- + P_SCHEDSTAT(se->statistics.wait_count);
- + }
- P(se->load.weight);
- #ifdef CONFIG_SMP
- P(se->avg.load_avg);
- P(se->avg.util_avg);
- - P(se->avg.util_est);
- #endif
- +
- +#undef PN_SCHEDSTAT
- #undef PN
- +#undef P_SCHEDSTAT
- #undef P
- }
- #endif
- @@ -123,13 +129,17 @@
- (long long)(p->nvcsw + p->nivcsw),
- p->prio);
- #ifdef CONFIG_SCHEDSTATS
- + if (schedstat_enabled()) {
- + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- + SPLIT_NS(p->se.statistics.wait_sum),
- + SPLIT_NS(p->se.sum_exec_runtime),
- + SPLIT_NS(p->se.statistics.sum_sleep_runtime));
- + }
- +#else
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- - SPLIT_NS(p->se.vruntime),
- + 0LL, 0L,
- SPLIT_NS(p->se.sum_exec_runtime),
- - SPLIT_NS(p->se.statistics.sum_sleep_runtime));
- -#else
- - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
- - 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
- + 0LL, 0L);
- #endif
- #ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d", task_node(p));
- @@ -148,7 +158,7 @@
- SEQ_printf(m,
- "\nrunnable tasks:\n"
- " task PID tree-key switches prio"
- - " exec-runtime sum-exec sum-sleep\n"
- + " wait-time sum-exec sum-sleep\n"
- "------------------------------------------------------"
- "----------------------------------------------------\n");
- @@ -210,8 +220,6 @@
- cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
- cfs_rq->avg.util_avg);
- - SEQ_printf(m, " .%-30s: %lu\n", "util_est",
- - cfs_rq->avg.util_est);
- SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
- atomic_long_read(&cfs_rq->removed_load_avg));
- SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
- @@ -297,6 +305,7 @@
- PN(next_balance);
- SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
- PN(clock);
- + PN(clock_task);
- P(cpu_load[0]);
- P(cpu_load[1]);
- P(cpu_load[2]);
- @@ -305,25 +314,23 @@
- #undef P
- #undef PN
- -#ifdef CONFIG_SCHEDSTATS
- -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
- -#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
- -
- - P(yld_count);
- -
- - P(sched_count);
- - P(sched_goidle);
- #ifdef CONFIG_SMP
- +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
- P64(avg_idle);
- P64(max_idle_balance_cost);
- +#undef P64
- #endif
- - P(ttwu_count);
- - P(ttwu_local);
- -
- +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
- + if (schedstat_enabled()) {
- + P(yld_count);
- + P(sched_count);
- + P(sched_goidle);
- + P(ttwu_count);
- + P(ttwu_local);
- + }
- #undef P
- -#undef P64
- -#endif
- +
- spin_lock_irqsave(&sched_debug_lock, flags);
- print_cfs_stats(m, cpu);
- print_rt_stats(m, cpu);
- @@ -556,10 +563,14 @@
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
- #define P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
- +#define P_SCHEDSTAT(F) \
- + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
- #define __PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
- #define PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
- +#define PN_SCHEDSTAT(F) \
- + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
- PN(se.exec_start);
- PN(se.vruntime);
- @@ -567,38 +578,66 @@
- nr_switches = p->nvcsw + p->nivcsw;
- -#ifdef CONFIG_SCHEDSTATS
- - PN(se.statistics.wait_start);
- - PN(se.statistics.sleep_start);
- - PN(se.statistics.block_start);
- - PN(se.statistics.sleep_max);
- - PN(se.statistics.block_max);
- - PN(se.statistics.exec_max);
- - PN(se.statistics.slice_max);
- - PN(se.statistics.wait_max);
- - PN(se.statistics.wait_sum);
- - P(se.statistics.wait_count);
- - PN(se.statistics.iowait_sum);
- - P(se.statistics.iowait_count);
- +
- P(se.nr_migrations);
- - P(se.statistics.nr_migrations_cold);
- - P(se.statistics.nr_failed_migrations_affine);
- - P(se.statistics.nr_failed_migrations_running);
- - P(se.statistics.nr_failed_migrations_hot);
- - P(se.statistics.nr_forced_migrations);
- - P(se.statistics.nr_wakeups);
- - P(se.statistics.nr_wakeups_sync);
- - P(se.statistics.nr_wakeups_migrate);
- - P(se.statistics.nr_wakeups_local);
- - P(se.statistics.nr_wakeups_remote);
- - P(se.statistics.nr_wakeups_affine);
- - P(se.statistics.nr_wakeups_affine_attempts);
- - P(se.statistics.nr_wakeups_passive);
- - P(se.statistics.nr_wakeups_idle);
- - {
- + if (schedstat_enabled()) {
- u64 avg_atom, avg_per_cpu;
- + PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
- + PN_SCHEDSTAT(se.statistics.wait_start);
- + PN_SCHEDSTAT(se.statistics.sleep_start);
- + PN_SCHEDSTAT(se.statistics.block_start);
- + PN_SCHEDSTAT(se.statistics.sleep_max);
- + PN_SCHEDSTAT(se.statistics.block_max);
- + PN_SCHEDSTAT(se.statistics.exec_max);
- + PN_SCHEDSTAT(se.statistics.slice_max);
- + PN_SCHEDSTAT(se.statistics.wait_max);
- + PN_SCHEDSTAT(se.statistics.wait_sum);
- + P_SCHEDSTAT(se.statistics.wait_count);
- + PN_SCHEDSTAT(se.statistics.iowait_sum);
- + P_SCHEDSTAT(se.statistics.iowait_count);
- + P_SCHEDSTAT(se.statistics.nr_migrations_cold);
- + P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
- + P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
- + P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
- + P_SCHEDSTAT(se.statistics.nr_forced_migrations);
- + P_SCHEDSTAT(se.statistics.nr_wakeups);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_local);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
- + /* eas */
- + /* select_idle_sibling() */
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_cache_affine);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_suff_cap);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle_cpu);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_count);
- + /* select_energy_cpu_brute() */
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_sync);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_idle_bt);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_insuff_cap);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_no_nrg_sav);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_nrg_sav);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_count);
- + /* find_best_target() */
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_cpu);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_sd);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_pref_idle);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_count);
- + /* cas */
- + /* select_task_rq_fair() */
- + P_SCHEDSTAT(se.statistics.nr_wakeups_cas_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_cas_count);
- +
- avg_atom = p->se.sum_exec_runtime;
- if (nr_switches)
- avg_atom = div64_ul(avg_atom, nr_switches);
- @@ -616,7 +655,7 @@
- __PN(avg_atom);
- __PN(avg_per_cpu);
- }
- -#endif
- +
- __P(nr_switches);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "nr_voluntary_switches", (long long)p->nvcsw);
- @@ -629,13 +668,14 @@
- P(se.avg.util_sum);
- P(se.avg.load_avg);
- P(se.avg.util_avg);
- - P(se.avg.util_est);
- P(se.avg.last_update_time);
- #endif
- P(policy);
- P(prio);
- +#undef PN_SCHEDSTAT
- #undef PN
- #undef __PN
- +#undef P_SCHEDSTAT
- #undef P
- #undef __P
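- The debug.c rework leans on schedstat_enabled() and schedstat_val(), which keep the statistics compiled in but skippable at run time behind a static key. They are defined in kernel/sched/stats.h rather than in this diff; upstream the definitions are roughly as below (the exact variant in this tree may differ slightly):
- #ifdef CONFIG_SCHEDSTATS
- extern struct static_key_false sched_schedstats;
- #define schedstat_enabled()      static_branch_unlikely(&sched_schedstats)
- #define schedstat_inc(var)       do { if (schedstat_enabled()) { var++; } } while (0)
- #define schedstat_add(var, amt)  do { if (schedstat_enabled()) { var += (amt); } } while (0)
- #define schedstat_set(var, val)  do { if (schedstat_enabled()) { var = (val); } } while (0)
- #define schedstat_val(var)       (var)
- #else /* !CONFIG_SCHEDSTATS */
- #define schedstat_enabled()      0
- #define schedstat_inc(var)       do { } while (0)
- #define schedstat_add(var, amt)  do { } while (0)
- #define schedstat_set(var, val)  do { } while (0)
- #define schedstat_val(var)       0
- #endif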
- diff -Nur /home/ninez/android/marlin/kernel/sched/energy.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/energy.c
- --- /home/ninez/android/marlin/kernel/sched/energy.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/energy.c 2018-08-11 23:57:17.128607487 -0400
- @@ -27,8 +27,6 @@
- #include <linux/sched_energy.h>
- #include <linux/stddef.h>
- -#include "sched.h"
- -
- struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
- static void free_resources(void)
- @@ -61,14 +59,12 @@
- for_each_possible_cpu(cpu) {
- cn = of_get_cpu_node(cpu, NULL);
- if (!cn) {
- - if (sched_feat(ENERGY_AWARE))
- - pr_warn("CPU device node missing for CPU %d\n", cpu);
- + pr_warn("CPU device node missing for CPU %d\n", cpu);
- return;
- }
- if (!of_find_property(cn, "sched-energy-costs", NULL)) {
- - if (sched_feat(ENERGY_AWARE))
- - pr_warn("CPU device node has no sched-energy-costs\n");
- + pr_warn("CPU device node has no sched-energy-costs\n");
- return;
- }
- @@ -79,8 +75,7 @@
- prop = of_find_property(cp, "busy-cost-data", NULL);
- if (!prop || !prop->value) {
- - if (sched_feat(ENERGY_AWARE))
- - pr_warn("No busy-cost data, skipping sched_energy init\n");
- + pr_warn("No busy-cost data, skipping sched_energy init\n");
- goto out;
- }
- @@ -102,8 +97,7 @@
- prop = of_find_property(cp, "idle-cost-data", NULL);
- if (!prop || !prop->value) {
- - if (sched_feat(ENERGY_AWARE))
- - pr_warn("No idle-cost data, skipping sched_energy init\n");
- + pr_warn("No idle-cost data, skipping sched_energy init\n");
- goto out;
- }
- @@ -123,7 +117,6 @@
- }
- pr_info("Sched-energy-costs installed from DT\n");
- - set_energy_aware();
- return;
- out:
- diff -Nur /home/ninez/android/marlin/kernel/sched/fair.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/fair.c
- --- /home/ninez/android/marlin/kernel/sched/fair.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/fair.c 2018-08-26 16:43:11.647206295 -0400
- @@ -20,8 +20,8 @@
- */
- -#include <linux/latencytop.h>
- #include <linux/sched.h>
- +#include <linux/latencytop.h>
- #include <linux/cpumask.h>
- #include <linux/cpuidle.h>
- #include <linux/slab.h>
- @@ -53,14 +53,18 @@
- unsigned int sysctl_sched_latency = 6000000ULL;
- unsigned int normalized_sysctl_sched_latency = 6000000ULL;
- -unsigned int sysctl_sched_is_big_little = 0;
- unsigned int sysctl_sched_sync_hint_enable = 1;
- unsigned int sysctl_sched_initial_task_util = 0;
- unsigned int sysctl_sched_cstate_aware = 1;
- #ifdef CONFIG_SCHED_WALT
- +#ifdef CONFIG_SCHED_WALT_DEFAULT
- unsigned int sysctl_sched_use_walt_cpu_util = 1;
- unsigned int sysctl_sched_use_walt_task_util = 1;
- +#else
- +unsigned int sysctl_sched_use_walt_cpu_util = 0;
- +unsigned int sysctl_sched_use_walt_task_util = 0;
- +#endif
- __read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
- (10 * NSEC_PER_MSEC);
- #endif
- @@ -128,6 +132,12 @@
- unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
- #endif
- +/*
- + * The margin used when comparing utilization with CPU capacity:
- + * util * margin < capacity * 1024
- + */
- +unsigned int capacity_margin = 1280; /* ~20% */
- +
- static inline void update_load_add(struct load_weight *lw, unsigned long inc)
- {
- lw->weight += inc;
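- The new capacity_margin constant encodes roughly 20% of headroom: utilization u is treated as fitting a CPU of capacity c only when u * 1280 < c * 1024, i.e. u < 0.8 * c, so on a 1024-capacity core anything above ~819 is considered too big. fair.c open-codes that comparison at its call sites; a hypothetical helper, named here purely for illustration, would read:
- static inline bool fits_with_margin(unsigned long util, unsigned long cap)
- {
-         /* util * margin < capacity * 1024, as per the comment above */
-         return util * capacity_margin < cap * SCHED_CAPACITY_SCALE;
- }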
- @@ -155,9 +165,9 @@
- *
- * This idea comes from the SD scheduler of Con Kolivas:
- */
- -static int get_update_sysctl_factor(void)
- +static unsigned int get_update_sysctl_factor(void)
- {
- - unsigned int cpus = min_t(int, num_online_cpus(), 8);
- + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
- unsigned int factor;
- switch (sysctl_sched_tunable_scaling) {
- @@ -270,9 +280,7 @@
- static inline struct task_struct *task_of(struct sched_entity *se)
- {
- -#ifdef CONFIG_SCHED_DEBUG
- - WARN_ON_ONCE(!entity_is_task(se));
- -#endif
- + SCHED_WARN_ON(!entity_is_task(se));
- return container_of(se, struct task_struct, se);
- }
- @@ -300,19 +308,59 @@
- static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
- {
- if (!cfs_rq->on_list) {
- + struct rq *rq = rq_of(cfs_rq);
- + int cpu = cpu_of(rq);
- /*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- - * enqueued. The fact that we always enqueue bottom-up
- - * reduces this to two cases.
- + * enqueued. The fact that we always enqueue bottom-up
- + * reduces this to two cases and a special case for the root
- + * cfs_rq. Furthermore, it also means that we will always reset
- + * tmp_alone_branch either when the branch is connected
- + * to a tree or when we reach the beg of the tree
- */
- if (cfs_rq->tg->parent &&
- - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- - list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- - &rq_of(cfs_rq)->leaf_cfs_rq_list);
- - } else {
- + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
- + /*
- + * If parent is already on the list, we add the child
- + * just before. Thanks to circular linked property of
- + * the list, this means to put the child at the tail
- + * of the list that starts by parent.
- + */
- + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
- + /*
- + * The branch is now connected to its tree so we can
- + * reset tmp_alone_branch to the beginning of the
- + * list.
- + */
- + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- + } else if (!cfs_rq->tg->parent) {
- + /*
- + * cfs rq without parent should be put
- + * at the tail of the list.
- + */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- - &rq_of(cfs_rq)->leaf_cfs_rq_list);
- + &rq->leaf_cfs_rq_list);
- + /*
- + * We have reach the beg of a tree so we can reset
- + * tmp_alone_branch to the beginning of the list.
- + */
- + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- + } else {
- + /*
- + * The parent has not already been added so we want to
- + * make sure that it will be put after us.
- + * tmp_alone_branch points to the beg of the branch
- + * where we will add parent.
- + */
- + list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- + rq->tmp_alone_branch);
- + /*
- + * update tmp_alone_branch to points to the new beg
- + * of the branch
- + */
- + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
- }
- cfs_rq->on_list = 1;
- @@ -470,17 +518,23 @@
- static void update_min_vruntime(struct cfs_rq *cfs_rq)
- {
- + struct sched_entity *curr = cfs_rq->curr;
- +
- u64 vruntime = cfs_rq->min_vruntime;
- - if (cfs_rq->curr)
- - vruntime = cfs_rq->curr->vruntime;
- + if (curr) {
- + if (curr->on_rq)
- + vruntime = curr->vruntime;
- + else
- + curr = NULL;
- + }
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
- - if (!cfs_rq->curr)
- + if (!curr)
- vruntime = se->vruntime;
- else
- vruntime = min_vruntime(vruntime, se->vruntime);
- @@ -585,7 +639,7 @@
- loff_t *ppos)
- {
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- - int factor = get_update_sysctl_factor();
- + unsigned int factor = get_update_sysctl_factor();
- if (ret || !write)
- return ret;
- @@ -670,16 +724,17 @@
- }
- #ifdef CONFIG_SMP
- -static int select_idle_sibling(struct task_struct *p, int cpu);
- +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
- static unsigned long task_h_load(struct task_struct *p);
- /*
- * We choose a half-life close to 1 scheduling period.
- - * Note: The tables below are dependent on this value.
- + * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
- + * dependent on this value.
- */
- -#define LOAD_AVG_PERIOD 16
- -#define LOAD_AVG_MAX 24117 /* maximum possible load avg */
- -#define LOAD_AVG_MAX_N 172 /* number of full periods to produce LOAD_AVG_MAX */
- +#define LOAD_AVG_PERIOD 32
- +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
- +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
- /* Give new sched_entity start runnable values to heavy its load in infant time */
- void init_entity_runnable_average(struct sched_entity *se)
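- These constants restore the upstream 32-period PELT series: the decay factor y is defined by y^32 = 1/2, so a contribution halves every 32 periods of roughly 1 ms. In exact arithmetic the saturated sum would be
-     y^32 = 1/2   =>   y = 2^(-1/32) ≈ 0.97857
-     1024 * (1 + y + y^2 + ...) = 1024 / (1 - y) ≈ 47.8k
- and with the kernel's 32-bit fixed-point decay the series settles at LOAD_AVG_MAX = 47742, reached (to within rounding) after LOAD_AVG_MAX_N = 345 full periods. The previous 16/24117/172 values correspond to the same series with a shorter, 16-period half-life.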
- @@ -693,23 +748,117 @@
- * will definitely be update (after enqueue).
- */
- sa->period_contrib = 1023;
- - sa->load_avg = scale_load_down(se->load.weight);
- + /*
- + * Tasks are intialized with full load to be seen as heavy tasks until
- + * they get a chance to stabilize to their real load level.
- + * Group entities are intialized with zero load to reflect the fact that
- + * nothing has been attached to the task group yet.
- + */
- + if (entity_is_task(se))
- + sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- - sa->util_avg = sched_freq() ?
- - sysctl_sched_initial_task_util :
- - scale_load_down(SCHED_LOAD_SCALE);
- - sa->util_est = sa->util_avg;
- - sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
- + /*
- + * In previous Android versions, we used to have:
- + * sa->util_avg = sched_freq() ?
- + * sysctl_sched_initial_task_util :
- + * scale_load_down(SCHED_LOAD_SCALE);
- + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
- + * However, that functionality has been moved to enqueue.
- + * It is unclear if we should restore this in enqueue.
- + */
- + /*
- + * At this point, util_avg won't be used in select_task_rq_fair anyway
- + */
- + sa->util_avg = 0;
- + sa->util_sum = 0;
- /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
- }
- -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
- -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
- +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
- +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
- +static void attach_entity_cfs_rq(struct sched_entity *se);
- +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
- +
- +/*
- + * With new tasks being created, their initial util_avgs are extrapolated
- + * based on the cfs_rq's current util_avg:
- + *
- + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
- + *
- + * However, in many cases, the above util_avg does not give a desired
- + * value. Moreover, the sum of the util_avgs may be divergent, such
- + * as when the series is a harmonic series.
- + *
- + * To solve this problem, we also cap the util_avg of successive tasks to
- + * only 1/2 of the left utilization budget:
- + *
- + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
- + *
- + * where n denotes the nth task.
- + *
- + * For example, a simplest series from the beginning would be like:
- + *
- + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
- + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
- + *
- + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
- + * if util_avg > util_avg_cap.
- + */
- +void post_init_entity_util_avg(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = cfs_rq_of(se);
- + struct sched_avg *sa = &se->avg;
- + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
- +
- + if (cap > 0) {
- + if (cfs_rq->avg.util_avg != 0) {
- + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- + sa->util_avg /= (cfs_rq->avg.load_avg + 1);
- +
- + if (sa->util_avg > cap)
- + sa->util_avg = cap;
- + } else {
- + sa->util_avg = cap;
- + }
- + /*
- + * If we wish to restore tuning via setting initial util,
- + * this is where we should do it.
- + */
- + sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
- + }
- +
- + if (entity_is_task(se)) {
- + struct task_struct *p = task_of(se);
- + if (p->sched_class != &fair_sched_class) {
- + /*
- + * For !fair tasks do:
- + *
- + update_cfs_rq_load_avg(now, cfs_rq, false);
- + attach_entity_load_avg(cfs_rq, se);
- + switched_from_fair(rq, p);
- + *
- + * such that the next switched_to_fair() has the
- + * expected state.
- + */
- + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
- + return;
- + }
- + }
- +
- + attach_entity_cfs_rq(se);
- +}
- +
- #else
- void init_entity_runnable_average(struct sched_entity *se)
- {
- }
- -#endif
- +void post_init_entity_util_avg(struct sched_entity *se)
- +{
- +}
- +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
- +{
- +}
- +#endif /* CONFIG_SMP */
- /*
- * Update the current task's runtime statistics.
- @@ -733,7 +882,7 @@
- max(delta_exec, curr->statistics.exec_max));
- curr->sum_exec_runtime += delta_exec;
- - schedstat_add(cfs_rq, exec_clock, delta_exec);
- + schedstat_add(cfs_rq->exec_clock, delta_exec);
- curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_min_vruntime(cfs_rq);
- @@ -757,48 +906,165 @@
- static inline void
- update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
- + u64 wait_start, prev_wait_start;
- +
- + if (!schedstat_enabled())
- + return;
- +
- + wait_start = rq_clock(rq_of(cfs_rq));
- + prev_wait_start = schedstat_val(se->statistics.wait_start);
- +
- + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- + likely(wait_start > prev_wait_start))
- + wait_start -= prev_wait_start;
- +
- + schedstat_set(se->statistics.wait_start, wait_start);
- +}
- +
- +static inline void
- +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +{
- + struct task_struct *p;
- + u64 delta;
- +
- + if (!schedstat_enabled())
- + return;
- +
- + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
- +
- + if (entity_is_task(se)) {
- + p = task_of(se);
- + if (task_on_rq_migrating(p)) {
- + /*
- + * Preserve migrating task's wait time so wait_start
- + * time stamp can be adjusted to accumulate wait time
- + * prior to migration.
- + */
- + schedstat_set(se->statistics.wait_start, delta);
- + return;
- + }
- + trace_sched_stat_wait(p, delta);
- + }
- +
- + schedstat_set(se->statistics.wait_max,
- + max(schedstat_val(se->statistics.wait_max), delta));
- + schedstat_inc(se->statistics.wait_count);
- + schedstat_add(se->statistics.wait_sum, delta);
- + schedstat_set(se->statistics.wait_start, 0);
- +}
- +
- +static inline void
- +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +{
- + struct task_struct *tsk = NULL;
- + u64 sleep_start, block_start;
- +
- + if (!schedstat_enabled())
- + return;
- +
- + sleep_start = schedstat_val(se->statistics.sleep_start);
- + block_start = schedstat_val(se->statistics.block_start);
- +
- + if (entity_is_task(se))
- + tsk = task_of(se);
- +
- + if (sleep_start) {
- + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
- +
- + if ((s64)delta < 0)
- + delta = 0;
- +
- + if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- + schedstat_set(se->statistics.sleep_max, delta);
- +
- + schedstat_set(se->statistics.sleep_start, 0);
- + schedstat_add(se->statistics.sum_sleep_runtime, delta);
- +
- + if (tsk) {
- + account_scheduler_latency(tsk, delta >> 10, 1);
- + trace_sched_stat_sleep(tsk, delta);
- + }
- + }
- + if (block_start) {
- + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
- +
- + if ((s64)delta < 0)
- + delta = 0;
- +
- + if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- + schedstat_set(se->statistics.block_max, delta);
- +
- + schedstat_set(se->statistics.block_start, 0);
- + schedstat_add(se->statistics.sum_sleep_runtime, delta);
- +
- + if (tsk) {
- + if (tsk->in_iowait) {
- + schedstat_add(se->statistics.iowait_sum, delta);
- + schedstat_inc(se->statistics.iowait_count);
- + trace_sched_stat_iowait(tsk, delta);
- + }
- +
- + trace_sched_stat_blocked(tsk, delta);
- +
- + /*
- + * Blocking time is in units of nanosecs, so shift by
- + * 20 to get a milliseconds-range estimation of the
- + * amount of time that the task spent sleeping:
- + */
- + if (unlikely(prof_on == SLEEP_PROFILING)) {
- + profile_hits(SLEEP_PROFILING,
- + (void *)get_wchan(tsk),
- + delta >> 20);
- + }
- + account_scheduler_latency(tsk, delta >> 10, 0);
- + }
- + }
- }
- /*
- * Task is being enqueued - update stats:
- */
- -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +static inline void
- +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
- + if (!schedstat_enabled())
- + return;
- +
- /*
- * Are we enqueueing a waiting task? (for current tasks
- * a dequeue/enqueue event is a NOP)
- */
- if (se != cfs_rq->curr)
- update_stats_wait_start(cfs_rq, se);
- -}
- -static void
- -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
- - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- -#ifdef CONFIG_SCHEDSTATS
- - if (entity_is_task(se)) {
- - trace_sched_stat_wait(task_of(se),
- - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- - }
- -#endif
- - schedstat_set(se->statistics.wait_start, 0);
- + if (flags & ENQUEUE_WAKEUP)
- + update_stats_enqueue_sleeper(cfs_rq, se);
- }
- static inline void
- -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
- +
- + if (!schedstat_enabled())
- + return;
- +
- /*
- * Mark the end of the wait period if dequeueing a
- * waiting task:
- */
- if (se != cfs_rq->curr)
- update_stats_wait_end(cfs_rq, se);
- +
- + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
- + struct task_struct *tsk = task_of(se);
- +
- + if (tsk->state & TASK_INTERRUPTIBLE)
- + schedstat_set(se->statistics.sleep_start,
- + rq_clock(rq_of(cfs_rq)));
- + if (tsk->state & TASK_UNINTERRUPTIBLE)
- + schedstat_set(se->statistics.block_start,
- + rq_clock(rq_of(cfs_rq)));
- + }
- }
- /*
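- One subtlety in the wait-time accounting above is how a runnable task migrated by the load balancer is handled. Worked example: the task starts waiting at t = 100 on CPU0 and is detached at t = 130; update_stats_wait_end() sees task_on_rq_migrating() and stashes the accumulated delta (30) back into wait_start instead of charging it. When the task is enqueued on CPU1 at t = 135, update_stats_wait_start() again sees the migrating state and records wait_start = 135 - 30 = 105, so if the wait ends at t = 150 the full 45 units land in wait_sum rather than only the 15 spent on CPU1.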
- @@ -1309,8 +1575,16 @@
- * One idle CPU per node is evaluated for a task numa move.
- * Call select_idle_sibling to maybe find a better one.
- */
- - if (!cur)
- - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
- + if (!cur) {
- + /*
- + * select_idle_siblings() uses an per-cpu cpumask that
- + * can be used from IRQ context.
- + */
- + local_irq_disable();
- + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
- + env->dst_cpu);
- + local_irq_enable();
- + }
- assign:
- task_numa_assign(env, cur, imp);
- @@ -1612,6 +1886,11 @@
- u64 runtime, period;
- spinlock_t *group_lock = NULL;
- + /*
- + * The p->mm->numa_scan_seq field gets updated without
- + * exclusive access. Use READ_ONCE() here to ensure
- + * that the field is read in a single access:
- + */
- seq = READ_ONCE(p->mm->numa_scan_seq);
- if (p->numa_scan_seq == seq)
- return;
- @@ -1857,7 +2136,7 @@
- int local = !!(flags & TNF_FAULT_LOCAL);
- int priv;
- - if (!numabalancing_enabled)
- + if (!static_branch_likely(&sched_numa_balancing))
- return;
- /* for example, ksmd faulting in a user's mm */
- @@ -1929,6 +2208,14 @@
- static void reset_ptenuma_scan(struct task_struct *p)
- {
- + /*
- + * We only did a read acquisition of the mmap sem, so
- + * p->mm->numa_scan_seq is written to without exclusive access
- + * and the update is not guaranteed to be atomic. That's not
- + * much of an issue though, since this is just used for
- + * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
- + * expensive, to avoid any form of compiler optimizations:
- + */
- WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
- p->mm->numa_scan_offset = 0;
- }
- @@ -1945,9 +2232,9 @@
- struct vm_area_struct *vma;
- unsigned long start, end;
- unsigned long nr_pte_updates = 0;
- - long pages;
- + long pages, virtpages;
- - WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
- + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
- work->next = work; /* protect against double add */
- /*
- @@ -1991,9 +2278,11 @@
- start = mm->numa_scan_offset;
- pages = sysctl_numa_balancing_scan_size;
- pages <<= 20 - PAGE_SHIFT; /* MB in pages */
- + virtpages = pages * 8; /* Scan up to this much virtual space */
- if (!pages)
- return;
- +
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, start);
- if (!vma) {
- @@ -2003,7 +2292,7 @@
- }
- for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
- - is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
- + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
- continue;
- }
- @@ -2028,18 +2317,22 @@
- start = max(start, vma->vm_start);
- end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
- end = min(end, vma->vm_end);
- - nr_pte_updates += change_prot_numa(vma, start, end);
- + nr_pte_updates = change_prot_numa(vma, start, end);
- /*
- - * Scan sysctl_numa_balancing_scan_size but ensure that
- - * at least one PTE is updated so that unused virtual
- - * address space is quickly skipped.
- + * Try to scan sysctl_numa_balancing_size worth of
- + * hpages that have at least one present PTE that
- + * is not already pte-numa. If the VMA contains
- + * areas that are unused or already full of prot_numa
- + * PTEs, scan up to virtpages, to skip through those
- + * areas faster.
- */
- if (nr_pte_updates)
- pages -= (end - start) >> PAGE_SHIFT;
- + virtpages -= (end - start) >> PAGE_SHIFT;
- start = end;
- - if (pages <= 0)
- + if (pages <= 0 || virtpages <= 0)
- goto out;
- cond_resched();
- @@ -2140,28 +2433,22 @@
- #ifdef CONFIG_FAIR_GROUP_SCHED
- # ifdef CONFIG_SMP
- -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
- +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
- {
- - long tg_weight;
- + long tg_weight, load, shares;
- /*
- - * Use this CPU's real-time load instead of the last load contribution
- - * as the updating of the contribution is delayed, and we will use the
- - * the real-time load to calc the share. See update_tg_load_avg().
- + * This really should be: cfs_rq->avg.load_avg, but instead we use
- + * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- + * the shares for small weight interactive tasks.
- */
- - tg_weight = atomic_long_read(&tg->load_avg);
- - tg_weight -= cfs_rq->tg_load_avg_contrib;
- - tg_weight += cfs_rq->load.weight;
- + load = scale_load_down(cfs_rq->load.weight);
- - return tg_weight;
- -}
- -
- -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
- -{
- - long tg_weight, load, shares;
- + tg_weight = atomic_long_read(&tg->load_avg);
- - tg_weight = calc_tg_weight(tg, cfs_rq);
- - load = cfs_rq->load.weight;
- + /* Ensure tg_weight >= load */
- + tg_weight -= cfs_rq->tg_load_avg_contrib;
- + tg_weight += load;
- shares = (tg->shares * load);
- if (tg_weight)
- @@ -2198,16 +2485,20 @@
- static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
- -static void update_cfs_shares(struct cfs_rq *cfs_rq)
- +static void update_cfs_shares(struct sched_entity *se)
- {
- + struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg;
- - struct sched_entity *se;
- long shares;
- - tg = cfs_rq->tg;
- - se = tg->se[cpu_of(rq_of(cfs_rq))];
- - if (!se || throttled_hierarchy(cfs_rq))
- + if (!cfs_rq)
- + return;
- +
- + if (throttled_hierarchy(cfs_rq))
- return;
- +
- + tg = cfs_rq->tg;
- +
- #ifndef CONFIG_SMP
- if (likely(se->load.weight == tg->shares))
- return;
- @@ -2216,8 +2507,9 @@
- reweight_entity(cfs_rq_of(se), se, shares);
- }
- +
- #else /* CONFIG_FAIR_GROUP_SCHED */
- -static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
- +static inline void update_cfs_shares(struct sched_entity *se)
- {
- }
- #endif /* CONFIG_FAIR_GROUP_SCHED */
- @@ -2225,8 +2517,12 @@
- #ifdef CONFIG_SMP
- /* Precomputed fixed inverse multiplies for multiplication by y^n */
- static const u32 runnable_avg_yN_inv[] = {
- - 0xffff, 0xf524, 0xeabf, 0xe0cb, 0xd744, 0xce23, 0xc566, 0xbd07,
- - 0xb504, 0xad57, 0xa5fe, 0x9ef4, 0x9837, 0x91c3, 0x8b95, 0x85aa,
- + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
- + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
- + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
- + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
- + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
- + 0x85aac367, 0x82cd8698,
- };
- /*
- @@ -2234,8 +2530,19 @@
- * over-estimates when re-combining.
- */
- static const u32 runnable_avg_yN_sum[] = {
- - 0, 980, 1919, 2818, 3679, 4503, 5292, 6048, 6772, 7465, 8129,
- - 8764, 9373, 9956,10514,11048,11560,
- + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
- + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
- + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
- +};
- +
- +/*
- + * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- + * lower integers. See Documentation/scheduler/sched-avg.txt how these
- + * were generated:
- + */
- +static const u32 __accumulated_sum_N32[] = {
- + 0, 23371, 35056, 40899, 43820, 45281,
- + 46011, 46376, 46559, 46650, 46696, 46719,
- };
- /*
- @@ -2266,8 +2573,7 @@
- local_n %= LOAD_AVG_PERIOD;
- }
- - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n],
- - LOAD_AVG_PERIOD);
- + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
- return val;
- }
- @@ -2287,22 +2593,13 @@
- else if (unlikely(n >= LOAD_AVG_MAX_N))
- return LOAD_AVG_MAX;
- - /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
- - do {
- - contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
- - contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
- -
- - n -= LOAD_AVG_PERIOD;
- - } while (n > LOAD_AVG_PERIOD);
- -
- + /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- + contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- + n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
- }
- -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
- -#error "load tracking assumes 2^10 as unit"
- -#endif
- -
- #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
- /*
- @@ -2439,10 +2736,42 @@
- return decayed;
- }
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- - * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- - * and effective_load (which is not done because it is too costly).
- + * Signed add and clamp on underflow.
- + *
- + * Explicitly do a load-store to ensure the intermediate value never hits
- + * memory. This allows lockless observations without ever seeing the negative
- + * values.
- + */
- +#define add_positive(_ptr, _val) do { \
- + typeof(_ptr) ptr = (_ptr); \
- + typeof(_val) val = (_val); \
- + typeof(*ptr) res, var = READ_ONCE(*ptr); \
- + \
- + res = var + val; \
- + \
- + if (val < 0 && res > var) \
- + res = 0; \
- + \
- + WRITE_ONCE(*ptr, res); \
- +} while (0)
- +
- +#ifdef CONFIG_FAIR_GROUP_SCHED
- +/**
- + * update_tg_load_avg - update the tg's load avg
- + * @cfs_rq: the cfs_rq whose avg changed
- + * @force: update regardless of how small the difference
- + *
- + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
- + * However, because tg->load_avg is a global value there are performance
- + * considerations.
- + *
- + * In order to avoid having to look at the other cfs_rq's, we use a
- + * differential update where we store the last value we propagated. This in
- + * turn allows skipping updates if the differential is 'small'.
- + *
- + * Updating tg's load_avg is necessary before update_cfs_share() (which is
- + * done) and effective_load() (which is not done because it is too costly).
- */
- static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
- {
- @@ -2506,29 +2835,249 @@
- se->avg.last_update_time = n_last_update_time;
- }
- }
- +
- +/* Take into account change of utilization of a child task group */
- +static inline void
- +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +{
- + struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- +
- + /* Nothing to update */
- + if (!delta)
- + return;
- +
- + /* Set new sched_entity's utilization */
- + se->avg.util_avg = gcfs_rq->avg.util_avg;
- + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
- +
- + /* Update parent cfs_rq utilization */
- + add_positive(&cfs_rq->avg.util_avg, delta);
- + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
- +}
- +
- +/* Take into account change of load of a child task group */
- +static inline void
- +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +{
- + struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- + long delta, load = gcfs_rq->avg.load_avg;
- +
- + /*
- + * If the load of group cfs_rq is null, the load of the
- + * sched_entity will also be null so we can skip the formula
- + */
- + if (load) {
- + long tg_load;
- +
- + /* Get tg's load and ensure tg_load > 0 */
- + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
- +
- + /* Ensure tg_load >= load and updated with current load*/
- + tg_load -= gcfs_rq->tg_load_avg_contrib;
- + tg_load += load;
- +
- + /*
- + * We need to compute a correction term in the case that the
- + * task group is consuming more CPU than a task of equal
- + * weight. A task with a weight equals to tg->shares will have
- + * a load less or equal to scale_load_down(tg->shares).
- + * Similarly, the sched_entities that represent the task group
- + * at parent level, can't have a load higher than
- + * scale_load_down(tg->shares). And the Sum of sched_entities'
- + * load must be <= scale_load_down(tg->shares).
- + */
- + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- + /* scale gcfs_rq's load into tg's shares*/
- + load *= scale_load_down(gcfs_rq->tg->shares);
- + load /= tg_load;
- + }
- + }
- +
- + delta = load - se->avg.load_avg;
- +
- + /* Nothing to update */
- + if (!delta)
- + return;
- +
- + /* Set new sched_entity's load */
- + se->avg.load_avg = load;
- + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
- +
- + /* Update parent cfs_rq load */
- + add_positive(&cfs_rq->avg.load_avg, delta);
- + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
- +
- + /*
- + * If the sched_entity is already enqueued, we also have to update the
- + * runnable load avg.
- + */
- + if (se->on_rq) {
- + /* Update parent cfs_rq runnable_load_avg */
- + add_positive(&cfs_rq->runnable_load_avg, delta);
- + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
- + }
- +}
- +
- +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
- +{
- + cfs_rq->propagate_avg = 1;
- +}
- +
- +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = group_cfs_rq(se);
- +
- + if (!cfs_rq->propagate_avg)
- + return 0;
- +
- + cfs_rq->propagate_avg = 0;
- + return 1;
- +}
- +
- +/* Update task and its cfs_rq load average */
- +static inline int propagate_entity_load_avg(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq;
- +
- + if (entity_is_task(se))
- + return 0;
- +
- + if (!test_and_clear_tg_cfs_propagate(se))
- + return 0;
- +
- + cfs_rq = cfs_rq_of(se);
- +
- + set_tg_cfs_propagate(cfs_rq);
- +
- + update_tg_cfs_util(cfs_rq, se);
- + update_tg_cfs_load(cfs_rq, se);
- +
- + return 1;
- +}
- +
- +/*
- + * Check if we need to update the load and the utilization of a blocked
- + * group_entity:
- + */
- +static inline bool skip_blocked_update(struct sched_entity *se)
- +{
- + struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- +
- + /*
- + * If sched_entity still have not zero load or utilization, we have to
- + * decay it:
- + */
- + if (se->avg.load_avg || se->avg.util_avg)
- + return false;
- +
- + /*
- + * If there is a pending propagation, we have to update the load and
- + * the utilization of the sched_entity:
- + */
- + if (gcfs_rq->propagate_avg)
- + return false;
- +
- + /*
- + * Otherwise, the load and the utilization of the sched_entity is
- + * already zero and there is no pending propagation, so it will be a
- + * waste of time to try to decay it:
- + */
- + return true;
- +}
- +
- #else /* CONFIG_FAIR_GROUP_SCHED */
- +
- static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
- +
- +static inline int propagate_entity_load_avg(struct sched_entity *se)
- +{
- + return 0;
- +}
- +
- +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
- +
- #endif /* CONFIG_FAIR_GROUP_SCHED */
- +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
- +{
- + if (&this_rq()->cfs == cfs_rq) {
- + /*
- + * There are a few boundary cases this might miss but it should
- + * get called often enough that that should (hopefully) not be
- + * a real problem -- added to that it only calls on the local
- + * CPU, so if we enqueue remotely we'll miss an update, but
- + * the next tick/schedule should update.
- + *
- + * It will not get called when we go idle, because the idle
- + * thread is a different class (!fair), nor will the utilization
- + * number include things like RT tasks.
- + *
- + * As is, the util number is not freq-invariant (we'd have to
- + * implement arch_scale_freq_capacity() for that).
- + *
- + * See cpu_util().
- + */
- + cpufreq_update_util(rq_of(cfs_rq), 0);
- + }
- +}
- +
- static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
- -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
- -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
- +/*
- + * Unsigned subtract and clamp on underflow.
- + *
- + * Explicitly do a load-store to ensure the intermediate value never hits
- + * memory. This allows lockless observations without ever seeing the negative
- + * values.
- + */
- +#define sub_positive(_ptr, _val) do { \
- + typeof(_ptr) ptr = (_ptr); \
- + typeof(*ptr) val = (_val); \
- + typeof(*ptr) res, var = READ_ONCE(*ptr); \
- + res = var - val; \
- + if (res > var) \
- + res = 0; \
- + WRITE_ONCE(*ptr, res); \
- +} while (0)
- +
- +/**
- + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- + * @now: current time, as per cfs_rq_clock_task()
- + * @cfs_rq: cfs_rq to update
- + * @update_freq: should we call cfs_rq_util_change() or will the call do so
- + *
- + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
- + * avg. The immediate corollary is that all (fair) tasks must be attached, see
- + * post_init_entity_util_avg().
- + *
- + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
- + *
- + * Returns true if the load decayed or we removed load.
- + *
- + * Since both these conditions indicate a changed cfs_rq->avg.load we should
- + * call update_tg_load_avg() when this function returns true.
- + */
- +static inline int
- +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
- {
- struct sched_avg *sa = &cfs_rq->avg;
- - int decayed, removed = 0;
- + int decayed, removed = 0, removed_util = 0;
- if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- - sa->load_avg = max_t(long, sa->load_avg - r, 0);
- - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
- + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- + sub_positive(&sa->load_avg, r);
- + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
- removed = 1;
- + set_tg_cfs_propagate(cfs_rq);
- }
- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- - sa->util_avg = max_t(long, sa->util_avg - r, 0);
- - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
- + sub_positive(&sa->util_avg, r);
- + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
- + removed_util = 1;
- + set_tg_cfs_propagate(cfs_rq);
- }
- decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- @@ -2539,81 +3088,94 @@
- cfs_rq->load_last_update_time_copy = sa->last_update_time;
- #endif
- + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
- + if (cfs_rq == &rq_of(cfs_rq)->cfs)
- + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
- +
- + if (update_freq && (decayed || removed_util))
- + cfs_rq_util_change(cfs_rq);
- +
- return decayed || removed;
- }
- -static inline unsigned long task_util_est(struct task_struct *p)
- -{
- - return p->se.avg.util_est;
- -}
- +/*
- + * Optional action to be done while updating the load average
- + */
- +#define UPDATE_TG 0x1
- +#define SKIP_AGE_LOAD 0x2
- +#define SKIP_CPUFREQ 0x4
- /* Update task and its cfs_rq load average */
- -static inline void update_load_avg(struct sched_entity *se, int update_tg)
- +static inline void update_load_avg(struct sched_entity *se, int flags)
- {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- int cpu = cpu_of(rq_of(cfs_rq));
- + int decayed;
- + void *ptr = NULL;
- /*
- * Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
- */
- - __update_load_avg(now, cpu, &se->avg,
- + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
- + __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- + }
- - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
- - update_tg_load_avg(cfs_rq, 0);
- + decayed = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ));
- + decayed |= propagate_entity_load_avg(se);
- - if (entity_is_task(se))
- - trace_sched_load_avg_task(task_of(se), &se->avg);
- - trace_sched_load_avg_cpu(cpu, cfs_rq);
- + if (decayed && (flags & UPDATE_TG))
- + update_tg_load_avg(cfs_rq, 0);
- - /* Update task estimated utilization */
- - if (se->avg.util_est < se->avg.util_avg) {
- - cfs_rq->avg.util_est += (se->avg.util_avg - se->avg.util_est);
- - se->avg.util_est = se->avg.util_avg;
- + if (entity_is_task(se)) {
- +#ifdef CONFIG_SCHED_WALT
- + ptr = (void *)&(task_of(se)->ravg);
- +#endif
- + trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
- }
- -
- }
- +/**
- + * attach_entity_load_avg - attach this entity to its cfs_rq load avg
- + * @cfs_rq: cfs_rq to attach to
- + * @se: sched_entity to attach
- + *
- + * Must call update_cfs_rq_load_avg() before this, since we rely on
- + * cfs_rq->avg.last_update_time being current.
- + */
- static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- - if (!sched_feat(ATTACH_AGE_LOAD))
- - goto skip_aging;
- -
- - /*
- - * If we got migrated (either between CPUs or between cgroups) we'll
- - * have aged the average right before clearing @last_update_time.
- - */
- - if (se->avg.last_update_time) {
- - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- - &se->avg, 0, 0, NULL);
- -
- - /*
- - * XXX: we could have just aged the entire load away if we've been
- - * absent from the fair class for too long.
- - */
- - }
- -
- -skip_aging:
- se->avg.last_update_time = cfs_rq->avg.last_update_time;
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se->avg.load_sum;
- cfs_rq->avg.util_avg += se->avg.util_avg;
- cfs_rq->avg.util_sum += se->avg.util_sum;
- + set_tg_cfs_propagate(cfs_rq);
- +
- + cfs_rq_util_change(cfs_rq);
- }
- +/**
- + * detach_entity_load_avg - detach this entity from its cfs_rq load avg
- + * @cfs_rq: cfs_rq to detach from
- + * @se: sched_entity to detach
- + *
- + * Must call update_cfs_rq_load_avg() before this, since we rely on
- + * cfs_rq->avg.last_update_time being current.
- + */
- static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- - &se->avg, se->on_rq * scale_load_down(se->load.weight),
- - cfs_rq->curr == se, NULL);
- - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
- - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
- - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
- - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
- + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
- + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- + set_tg_cfs_propagate(cfs_rq);
- +
- + cfs_rq_util_change(cfs_rq);
- }
- /* Add the load generated by se into cfs_rq's load average */
- @@ -2621,34 +3183,20 @@
- enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- struct sched_avg *sa = &se->avg;
- - u64 now = cfs_rq_clock_task(cfs_rq);
- - int migrated, decayed;
- -
- - migrated = !sa->last_update_time;
- - if (!migrated) {
- - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- - se->on_rq * scale_load_down(se->load.weight),
- - cfs_rq->curr == se, NULL);
- - }
- -
- - decayed = update_cfs_rq_load_avg(now, cfs_rq);
- cfs_rq->runnable_load_avg += sa->load_avg;
- cfs_rq->runnable_load_sum += sa->load_sum;
- - if (migrated)
- + if (!sa->last_update_time) {
- attach_entity_load_avg(cfs_rq, se);
- -
- - if (decayed || migrated)
- update_tg_load_avg(cfs_rq, 0);
- + }
- }
- /* Remove the runnable load generated by se from cfs_rq's runnable load average */
- static inline void
- dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- - update_load_avg(se, 1);
- -
- cfs_rq->runnable_load_avg =
- max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
- cfs_rq->runnable_load_sum =
- @@ -2677,46 +3225,36 @@
- #endif
- /*
- + * Synchronize entity load avg of dequeued entity without locking
- + * the previous rq.
- + */
- +void sync_entity_load_avg(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = cfs_rq_of(se);
- + u64 last_update_time;
- +
- + last_update_time = cfs_rq_last_update_time(cfs_rq);
- + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
- +}
- +
- +/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
- void remove_entity_load_avg(struct sched_entity *se)
- {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- - u64 last_update_time;
- /*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
- */
- - if (se->avg.last_update_time == 0)
- - return;
- -
- - last_update_time = cfs_rq_last_update_time(cfs_rq);
- - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
- + sync_entity_load_avg(se);
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
- }
- -/*
- - * Update the rq's load with the elapsed running time before entering
- - * idle. if the last scheduled task is not a CFS task, idle_enter will
- - * be the only way to update the runnable statistic.
- - */
- -void idle_enter_fair(struct rq *this_rq)
- -{
- -}
- -
- -/*
- - * Update the rq's load with the elapsed idle time before a task is
- - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
- - * be the only way to update the runnable statistic.
- - */
- -void idle_exit_fair(struct rq *this_rq)
- -{
- -}
- -
- static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
- {
- return cfs_rq->runnable_load_avg;
- @@ -2731,7 +3269,17 @@
- #else /* CONFIG_SMP */
- -static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
- +static inline int
- +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
- +{
- + return 0;
- +}
- +
- +#define UPDATE_TG 0x0
- +#define SKIP_AGE_LOAD 0x0
- +#define SKIP_CPUFREQ 0x0
- +
- +static inline void update_load_avg(struct sched_entity *se, int not_used1){}
- static inline void
- enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
- static inline void
- @@ -2750,69 +3298,6 @@
- #endif /* CONFIG_SMP */
- -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
- -#ifdef CONFIG_SCHEDSTATS
- - struct task_struct *tsk = NULL;
- -
- - if (entity_is_task(se))
- - tsk = task_of(se);
- -
- - if (se->statistics.sleep_start) {
- - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
- -
- - if ((s64)delta < 0)
- - delta = 0;
- -
- - if (unlikely(delta > se->statistics.sleep_max))
- - se->statistics.sleep_max = delta;
- -
- - se->statistics.sleep_start = 0;
- - se->statistics.sum_sleep_runtime += delta;
- -
- - if (tsk) {
- - account_scheduler_latency(tsk, delta >> 10, 1);
- - trace_sched_stat_sleep(tsk, delta);
- - }
- - }
- - if (se->statistics.block_start) {
- - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
- -
- - if ((s64)delta < 0)
- - delta = 0;
- -
- - if (unlikely(delta > se->statistics.block_max))
- - se->statistics.block_max = delta;
- -
- - se->statistics.block_start = 0;
- - se->statistics.sum_sleep_runtime += delta;
- -
- - if (tsk) {
- - if (tsk->in_iowait) {
- - se->statistics.iowait_sum += delta;
- - se->statistics.iowait_count++;
- - trace_sched_stat_iowait(tsk, delta);
- - }
- -
- - trace_sched_stat_blocked(tsk, delta);
- - trace_sched_blocked_reason(tsk);
- -
- - /*
- - * Blocking time is in units of nanosecs, so shift by
- - * 20 to get a milliseconds-range estimation of the
- - * amount of time that the task spent sleeping:
- - */
- - if (unlikely(prof_on == SLEEP_PROFILING)) {
- - profile_hits(SLEEP_PROFILING,
- - (void *)get_wchan(tsk),
- - delta >> 20);
- - }
- - account_scheduler_latency(tsk, delta >> 10, 0);
- - }
- - }
- -#endif
- -}
- -
- static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- #ifdef CONFIG_SCHED_DEBUG
- @@ -2822,7 +3307,7 @@
- d = -d;
- if (d > 3*sysctl_sched_latency)
- - schedstat_inc(cfs_rq, nr_spread_over);
- + schedstat_inc(cfs_rq->nr_spread_over);
- #endif
- }
- @@ -2860,6 +3345,26 @@
- static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
- +static inline void check_schedstat_required(void)
- +{
- +#ifdef CONFIG_SCHEDSTATS
- + if (schedstat_enabled())
- + return;
- +
- + /* Force schedstat enabled if a dependent tracepoint is active */
- + if (trace_sched_stat_wait_enabled() ||
- + trace_sched_stat_sleep_enabled() ||
- + trace_sched_stat_iowait_enabled() ||
- + trace_sched_stat_blocked_enabled() ||
- + trace_sched_stat_runtime_enabled()) {
- + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
- + "stat_blocked and stat_runtime require the "
- + "kernel parameter schedstats=enabled or "
- + "kernel.sched_schedstats=1\n");
- + }
- +#endif
- +}
- +
- static void
- enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
- @@ -2874,16 +3379,16 @@
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- + update_load_avg(se, UPDATE_TG);
- enqueue_entity_load_avg(cfs_rq, se);
- + update_cfs_shares(se);
- account_entity_enqueue(cfs_rq, se);
- - update_cfs_shares(cfs_rq);
- - if (flags & ENQUEUE_WAKEUP) {
- + if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
- - enqueue_sleeper(cfs_rq, se);
- - }
- - update_stats_enqueue(cfs_rq, se);
- + check_schedstat_required();
- + update_stats_enqueue(cfs_rq, se, flags);
- check_spread(cfs_rq, se);
- if (se != cfs_rq->curr)
- __enqueue_entity(cfs_rq, se);
- @@ -2945,25 +3450,30 @@
- static void
- dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
- + int update_flags;
- +
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- - dequeue_entity_load_avg(cfs_rq, se);
- - update_stats_dequeue(cfs_rq, se);
- - if (flags & DEQUEUE_SLEEP) {
- -#ifdef CONFIG_SCHEDSTATS
- - if (entity_is_task(se)) {
- - struct task_struct *tsk = task_of(se);
- + /*
- + * When dequeuing a sched_entity, we must:
- + * - Update loads to have both entity and cfs_rq synced with now.
- + * - Subtract its load from the cfs_rq->runnable_avg.
- + * - Subtract its previous weight from cfs_rq->load.weight.
- + * - For group entity, update its weight to reflect the new share
- + * of its group cfs_rq.
- + */
- + update_flags = UPDATE_TG;
- - if (tsk->state & TASK_INTERRUPTIBLE)
- - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- - if (tsk->state & TASK_UNINTERRUPTIBLE)
- - se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- - }
- -#endif
- - }
- + if (flags & DEQUEUE_IDLE)
- + update_flags |= SKIP_CPUFREQ;
- +
- + update_load_avg(se, update_flags);
- + dequeue_entity_load_avg(cfs_rq, se);
- +
- + update_stats_dequeue(cfs_rq, se, flags);
- clear_buddies(cfs_rq, se);
- @@ -2983,8 +3493,16 @@
- /* return excess runtime on last dequeue */
- return_cfs_rq_runtime(cfs_rq);
- - update_min_vruntime(cfs_rq);
- - update_cfs_shares(cfs_rq);
- + update_cfs_shares(se);
- +
- + /*
- + * Now advance min_vruntime if @se was the entity holding it back,
- + * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- + * put back on, and if we advance min_vruntime, we'll be placed back
- + * further than we started -- ie. we'll be penalized.
- + */
- + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
- + update_min_vruntime(cfs_rq);
- }
- /*
- @@ -3039,22 +3557,23 @@
- */
- update_stats_wait_end(cfs_rq, se);
- __dequeue_entity(cfs_rq, se);
- - update_load_avg(se, 1);
- + update_load_avg(se, UPDATE_TG);
- }
- update_stats_curr_start(cfs_rq, se);
- cfs_rq->curr = se;
- -#ifdef CONFIG_SCHEDSTATS
- +
- /*
- * Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
- * when there are only lesser-weight tasks around):
- */
- - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- - se->statistics.slice_max = max(se->statistics.slice_max,
- - se->sum_exec_runtime - se->prev_sum_exec_runtime);
- + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- + schedstat_set(se->statistics.slice_max,
- + max((u64)schedstat_val(se->statistics.slice_max),
- + se->sum_exec_runtime - se->prev_sum_exec_runtime));
- }
- -#endif
- +
- se->prev_sum_exec_runtime = se->sum_exec_runtime;
- }
- @@ -3134,6 +3653,7 @@
- check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
- +
- if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
- /* Put 'current' back into the tree. */
- @@ -3155,8 +3675,8 @@
- /*
- * Ensure that runnable average is periodically updated.
- */
- - update_load_avg(curr, 1);
- - update_cfs_shares(cfs_rq);
- + update_load_avg(curr, UPDATE_TG);
- + update_cfs_shares(curr);
- #ifdef CONFIG_SCHED_HRTICK
- /*
- @@ -3255,7 +3775,7 @@
- static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
- {
- if (unlikely(cfs_rq->throttle_count))
- - return cfs_rq->throttled_clock_task;
- + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
- }
- @@ -3393,13 +3913,11 @@
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- cfs_rq->throttle_count--;
- -#ifdef CONFIG_SMP
- if (!cfs_rq->throttle_count) {
- /* adjust cfs_rq_clock_task() */
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
- }
- -#endif
- return 0;
- }
- @@ -3766,6 +4284,23 @@
- throttle_cfs_rq(cfs_rq);
- }
- +static void sync_throttle(struct task_group *tg, int cpu)
- +{
- + struct cfs_rq *pcfs_rq, *cfs_rq;
- +
- + if (!cfs_bandwidth_used())
- + return;
- +
- + if (!tg->parent)
- + return;
- +
- + cfs_rq = tg->cfs_rq[cpu];
- + pcfs_rq = tg->parent->cfs_rq[cpu];
- +
- + cfs_rq->throttle_count = pcfs_rq->throttle_count;
- + pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
- +}
- +
- /* conditionally throttle active cfs_rq's from put_prev_entity() */
- static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
- {
- @@ -3851,6 +4386,10 @@
- static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
- {
- + /* init_cfs_bandwidth() was not called */
- + if (!cfs_b->throttled_cfs_rq.next)
- + return;
- +
- hrtimer_cancel(&cfs_b->period_timer);
- hrtimer_cancel(&cfs_b->slack_timer);
- }
- @@ -3901,6 +4440,7 @@
- static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
- static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
- static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
- +static inline void sync_throttle(struct task_group *tg, int cpu) {}
- static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
- static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
- @@ -3945,9 +4485,9 @@
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- - WARN_ON(task_rq(p) != rq);
- + SCHED_WARN_ON(task_rq(p) != rq);
- - if (cfs_rq->nr_running > 1) {
- + if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
- u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
- s64 delta = slice - ran;
- @@ -3988,21 +4528,25 @@
- #endif
- #ifdef CONFIG_SMP
- +static bool __cpu_overutilized(int cpu, int delta);
- static bool cpu_overutilized(int cpu);
- +unsigned long boosted_cpu_util(int cpu);
- +#else
- +#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
- #endif
- -#ifdef CONFIG_CPU_FREQ_GOV_SCHED
- -static void update_capacity_of(int cpu, bool request)
- +#ifdef CONFIG_SMP
- +static void update_capacity_of(int cpu)
- {
- unsigned long req_cap;
- if (!sched_freq())
- return;
- - /* Convert scale-invariant capacity to cpu. */
- + /* Normalize scale-invariant capacity to cpu. */
- req_cap = boosted_cpu_util(cpu);
- req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
- - set_cfs_cpu_capacity(cpu, request, req_cap);
- + set_cfs_cpu_capacity(cpu, true, req_cap);
- }
- #endif
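- As a side note on the hunk above: update_capacity_of() normalizes the boosted utilization against the CPU's original capacity before requesting it from the cpufreq hook. A tiny standalone sketch of that arithmetic (not patch code; the function name and numbers are illustrative, SCHED_CAPACITY_SCALE = 1024 as in the kernel):
- #include <stdio.h>
- 
- #define SCHED_CAPACITY_SCALE 1024UL
- 
- /* Normalize a scale-invariant utilization against the CPU's original capacity. */
- static unsigned long norm_cap_request(unsigned long boosted_util,
-                                       unsigned long capacity_orig)
- {
-         return boosted_util * SCHED_CAPACITY_SCALE / capacity_orig;
- }
- 
- int main(void)
- {
-         /* e.g. a little CPU with capacity_orig 430 and a boosted util of 215 */
-         printf("req_cap = %lu\n", norm_cap_request(215, 430)); /* -> 512, half of full scale */
-         return 0;
- }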
- @@ -4019,8 +4563,35 @@
- #ifdef CONFIG_SMP
- int task_new = flags & ENQUEUE_WAKEUP_NEW;
- int task_wakeup = flags & ENQUEUE_WAKEUP;
- +
- + /*
- + * Update SchedTune accounting.
- + *
- + * We do it before updating the CPU capacity to ensure the
- + * boost value of the current task is accounted for in the
- + * selection of the OPP.
- + *
- + * We do it also in the case where we enqueue a throttled task;
- + * we could argue that a throttled task should not boost a CPU,
- + * however:
- + * a) properly implementing CPU boosting considering throttled
- + * tasks will increase a lot the complexity of the solution
- + * b) it's not easy to quantify the benefits introduced by
- + * such a more complex solution.
- + * Thus, for the time being we go for the simple solution and boost
- + * also for throttled RQs.
- + */
- + schedtune_enqueue_task(p, cpu_of(rq));
- #endif
- + /*
- + * If in_iowait is set, the code below may not trigger any cpufreq
- + * utilization updates, so do it here explicitly with the IOWAIT flag
- + * passed.
- + */
- + if (p->in_iowait)
- + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
- +
- for_each_sched_entity(se) {
- if (se->on_rq)
- break;
- @@ -4032,7 +4603,7 @@
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running increment below.
- - */
- + */
- if (cfs_rq_throttled(cfs_rq))
- break;
- cfs_rq->h_nr_running++;
- @@ -4049,34 +4620,14 @@
- if (cfs_rq_throttled(cfs_rq))
- break;
- - update_load_avg(se, 1);
- - update_cfs_shares(cfs_rq);
- + update_load_avg(se, UPDATE_TG);
- + update_cfs_shares(se);
- }
- if (!se)
- add_nr_running(rq, 1);
- #ifdef CONFIG_SMP
- -
- - /*
- - * Update SchedTune accouting.
- - *
- - * We do it before updating the CPU capacity to ensure the
- - * boost value of the current task is accounted for in the
- - * selection of the OPP.
- - *
- - * We do it also in the case where we enqueue a trottled task;
- - * we could argue that a throttled task should not boost a CPU,
- - * however:
- - * a) properly implementing CPU boosting considering throttled
- - * tasks will increase a lot the complexity of the solution
- - * b) it's not easy to quantify the benefits introduced by
- - * such a more complex solution.
- - * Thus, for the time being we go for the simple solution and boost
- - * also for throttled RQs.
- - */
- - schedtune_enqueue_task(p, cpu_of(rq));
- -
- if (!se) {
- walt_inc_cumulative_runnable_avg(rq, p);
- if (!task_new && !rq->rd->overutilized &&
- @@ -4093,17 +4644,10 @@
- * request after load balancing is done.
- */
- if (task_new || task_wakeup)
- - update_capacity_of(cpu_of(rq), true);
- + update_capacity_of(cpu_of(rq));
- }
- - /* Get the top level CFS RQ for the task CPU */
- - cfs_rq = &(task_rq(p)->cfs);
- -
- - /* Update RQ estimated utilization */
- - cfs_rq->avg.util_est += task_util_est(p);
- -
- #endif /* CONFIG_SMP */
- -
- hrtick_update(rq);
- }
- @@ -4120,6 +4664,20 @@
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
- + if (task_sleep && rq->nr_running == 1)
- + flags |= DEQUEUE_IDLE;
- +
- +#ifdef CONFIG_SMP
- + /*
- + * Update SchedTune accounting
- + *
- + * We do it before updating the CPU capacity to ensure the
- + * boost value of the current task is accounted for in the
- + * selection of the OPP.
- + */
- + schedtune_dequeue_task(p, cpu_of(rq));
- +#endif
- +
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, flags);
- @@ -4137,21 +4695,22 @@
- /* Don't dequeue parent if it has other entities besides us */
- if (cfs_rq->load.weight) {
- + /* Avoid re-evaluating load for this entity: */
- + se = parent_entity(se);
- /*
- * Bias pick_next to pick a task from this cfs_rq, as
- * p is sleeping when it is within its sched_slice.
- */
- - if (task_sleep && parent_entity(se))
- - set_next_buddy(parent_entity(se));
- -
- - /* avoid re-evaluating load for this entity */
- - se = parent_entity(se);
- + if (task_sleep && se && !throttled_hierarchy(cfs_rq))
- + set_next_buddy(se);
- break;
- }
- flags |= DEQUEUE_SLEEP;
- }
- for_each_sched_entity(se) {
- + int update_flags;
- +
- cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_running--;
- walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
- @@ -4159,24 +4718,19 @@
- if (cfs_rq_throttled(cfs_rq))
- break;
- - update_load_avg(se, 1);
- - update_cfs_shares(cfs_rq);
- + update_flags = UPDATE_TG;
- +
- + if (flags & DEQUEUE_IDLE)
- + update_flags |= SKIP_CPUFREQ;
- +
- + update_load_avg(se, update_flags);
- + update_cfs_shares(se);
- }
- if (!se)
- sub_nr_running(rq, 1);
- #ifdef CONFIG_SMP
- -
- - /*
- - * Update SchedTune accouting
- - *
- - * We do it before updating the CPU capacity to ensure the
- - * boost value of the current task is accounted for in the
- - * selection of the OPP.
- - */
- - schedtune_dequeue_task(p, cpu_of(rq));
- -
- if (!se) {
- walt_dec_cumulative_runnable_avg(rq, p);
- @@ -4190,26 +4744,12 @@
- */
- if (task_sleep) {
- if (rq->cfs.nr_running)
- - update_capacity_of(cpu_of(rq), true);
- + update_capacity_of(cpu_of(rq));
- else if (sched_freq())
- - update_capacity_of(cpu_of(rq), false);
- + set_cfs_cpu_capacity(cpu_of(rq), false, 0); /* no normalization required for 0 */
- }
- }
- - /* Get the top level CFS RQ for the task CPU */
- - cfs_rq = &(task_rq(p)->cfs);
- -
- - /* Update RQ estimated utilization */
- - if (cfs_rq->avg.util_est >= task_util_est(p))
- - cfs_rq->avg.util_est -= task_util_est(p);
- - else
- - cfs_rq->avg.util_est = 0;
- -
- -
- - /* Update estimated utilization */
- - if (task_sleep)
- - p->se.avg.util_est = p->se.avg.util_avg;
- -
- #endif /* CONFIG_SMP */
- hrtick_update(rq);
- @@ -4545,25 +5085,30 @@
- return wl;
- for_each_sched_entity(se) {
- - long w, W;
- + struct cfs_rq *cfs_rq = se->my_q;
- + long W, w = cfs_rq_load_avg(cfs_rq);
- - tg = se->my_q->tg;
- + tg = cfs_rq->tg;
- /*
- * W = @wg + \Sum rw_j
- */
- - W = wg + calc_tg_weight(tg, se->my_q);
- + W = wg + atomic_long_read(&tg->load_avg);
- +
- + /* Ensure \Sum rw_j >= rw_i */
- + W -= cfs_rq->tg_load_avg_contrib;
- + W += w;
- /*
- * w = rw_i + @wl
- */
- - w = cfs_rq_load_avg(se->my_q) + wl;
- + w += wl;
- /*
- * wl = S * s'_i; see (2)
- */
- if (W > 0 && w < W)
- - wl = (w * tg->shares) / W;
- + wl = (w * (long)tg->shares) / W;
- else
- wl = tg->shares;
- @@ -4612,16 +5157,95 @@
- >> SCHED_CAPACITY_SHIFT;
- }
- +/*
- + * Returns the current capacity of cpu after applying both
- + * cpu and min freq scaling.
- + */
- +unsigned long capacity_min_of(int cpu)
- +{
- + if (!sched_feat(MIN_CAPACITY_CAPPING))
- + return 0;
- + return arch_scale_cpu_capacity(NULL, cpu) *
- + arch_scale_min_freq_capacity(NULL, cpu)
- + >> SCHED_CAPACITY_SHIFT;
- +}
- +
- +
- static inline bool energy_aware(void)
- {
- return sched_feat(ENERGY_AWARE);
- }
- /*
- + * CPU candidates.
- + *
- + * These are labels to reference CPU candidates for an energy_diff.
- + * Currently we support only two possible candidates: the task's previous CPU
- + * and another candidate CPU.
- + * More advanced/aggressive EAS selection policies can consider more
- + * candidates.
- + */
- +#define EAS_CPU_PRV 0
- +#define EAS_CPU_NXT 1
- +#define EAS_CPU_BKP 2
- +#define EAS_CPU_CNT 3
- +
- +/*
- + * energy_diff - supports the computation of the estimated energy impact in
- + * moving a "task"'s "util_delta" between different CPU candidates.
- + */
- +struct energy_env {
- + /* Utilization to move */
- + struct task_struct *p;
- + int util_delta;
- +
- + /* Mask of CPUs candidates to evaluate */
- + cpumask_t cpus_mask;
- +
- + /* CPU candidates to evaluate */
- + struct {
- +
- + /* CPU ID, must be in cpus_mask */
- + int cpu_id;
- +
- + /*
- + * Index (into sched_group_energy::cap_states) of the OPP the
- + * CPU needs to run at if the task is placed on it.
- + * This includes both the active and blocked load, due to
- + * other tasks on this CPU, as well as the task's own
- + * utilization.
- + */
- + int cap_idx;
- + int cap;
- +
- + /* Estimated system energy */
- + unsigned int energy;
- +
- + /* Estimated energy variation wrt EAS_CPU_PRV */
- + int nrg_delta;
- +
- + } cpu[EAS_CPU_CNT];
- +
- + /*
- + * Index (into energy_env::cpu) of the most energy efficient CPU for
- + * the specified energy_env::task
- + */
- + int next_idx;
- +
- + /* Support data */
- + struct sched_group *sg_top;
- + struct sched_group *sg_cap;
- + struct sched_group *sg;
- +};
- +
- +static int cpu_util_wake(int cpu, struct task_struct *p);
- +
- +/*
- * __cpu_norm_util() returns the cpu util relative to a specific capacity,
- - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
- - * energy calculations. Using the scale-invariant util returned by
- - * cpu_util() and approximating scale-invariant util by:
- + * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
- + * energy calculations.
- + *
- + * Since util is a scale-invariant utilization defined as:
- *
- * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
- *
- @@ -4631,34 +5255,41 @@
- *
- * norm_util = running_time/time ~ util/capacity
- */
- -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
- +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
- {
- - int util = __cpu_util(cpu, delta, UTIL_EST);
- -
- if (util >= capacity)
- return SCHED_CAPACITY_SCALE;
- return (util << SCHED_CAPACITY_SHIFT)/capacity;
- }
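- The busy-ratio computation in __cpu_norm_util() above is easy to see with concrete numbers. A minimal standalone sketch (illustration only, not patch code; constants mirror the kernel's SCHED_CAPACITY_SHIFT/SCALE):
- #include <stdio.h>
- 
- #define SCHED_CAPACITY_SHIFT 10
- #define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)
- 
- /* Busy ratio of 'util' against 'capacity', saturated at full scale. */
- static unsigned long cpu_norm_util_sketch(unsigned long util, unsigned long capacity)
- {
-         if (util >= capacity)
-                 return SCHED_CAPACITY_SCALE;
-         return (util << SCHED_CAPACITY_SHIFT) / capacity;
- }
- 
- int main(void)
- {
-         /* util 100 on a capacity-400 OPP -> 256, i.e. a 25% busy ratio */
-         printf("%lu\n", cpu_norm_util_sketch(100, 400));
-         return 0;
- }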
- -static int calc_util_delta(struct energy_env *eenv, int cpu)
- -{
- - if (cpu == eenv->src_cpu)
- - return -eenv->util_delta;
- - if (cpu == eenv->dst_cpu)
- - return eenv->util_delta;
- - return 0;
- -}
- -
- -static
- -unsigned long group_max_util(struct energy_env *eenv)
- +static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
- {
- - int i, delta;
- unsigned long max_util = 0;
- + unsigned long util;
- + int cpu;
- +
- + for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
- + util = cpu_util_wake(cpu, eenv->p);
- +
- + /*
- + * If we are looking at the target CPU specified by the eenv,
- + * then we should add the (estimated) utilization of the task
- + * assuming we will wake it up on that CPU.
- + */
- + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
- + util += eenv->util_delta;
- +
- + max_util = max(max_util, util);
- +
- + /*
- + * Take into account any minimum frequency imposed
- + * elsewhere which limits the energy states available
- + * If the MIN_CAPACITY_CAPPING feature is not enabled
- + * capacity_min_of will return 0 (not capped).
- + */
- + max_util = max(max_util, capacity_min_of(cpu));
- - for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
- - delta = calc_util_delta(eenv, i);
- - max_util = max(max_util, __cpu_util(i, delta, UTIL_EST));
- }
- return max_util;
- @@ -4666,93 +5297,67 @@
- /*
- * group_norm_util() returns the approximated group util relative to it's
- - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
- - * energy calculations. Since task executions may or may not overlap in time in
- - * the group the true normalized util is between max(cpu_norm_util(i)) and
- - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
- - * latter is used as the estimate as it leads to a more pessimistic energy
- + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
- + * in energy calculations.
- + *
- + * Since task executions may or may not overlap in time in the group the true
- + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
- + * when iterating over all CPUs in the group.
- + * The latter estimate is used as it leads to a more pessimistic energy
- * estimate (more busy).
- */
- static unsigned
- -long group_norm_util(struct energy_env *eenv)
- +long group_norm_util(struct energy_env *eenv, int cpu_idx)
- {
- - int i, delta;
- - unsigned long util_sum = 0;
- - struct sched_group *sg = eenv->sg;
- - unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
- + unsigned long capacity = eenv->cpu[cpu_idx].cap;
- + unsigned long util, util_sum = 0;
- + int cpu;
- - for_each_cpu(i, sched_group_cpus(sg)) {
- - delta = calc_util_delta(eenv, i);
- - util_sum += __cpu_norm_util(i, capacity, delta);
- - }
- + for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
- + util = cpu_util_wake(cpu, eenv->p);
- - if (util_sum > SCHED_CAPACITY_SCALE)
- - return SCHED_CAPACITY_SCALE;
- - return util_sum;
- -}
- + /*
- + * If we are looking at the target CPU specified by the eenv,
- + * then we should add the (estimated) utilization of the task
- + * assuming we will wake it up on that CPU.
- + */
- + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
- + util += eenv->util_delta;
- -#ifdef CONFIG_SCHED_TUNE
- -static inline int
- -find_min_capacity(struct energy_env *eenv)
- -{
- - const struct sched_group_energy const *sge = eenv->sg->sge;
- - unsigned long min_capacity, cur_capacity;
- - int min_cap_idx, cap_idx;
- - unsigned long min_util;
- -
- - /* Non boosted tasks do not affect the minimum capacity */
- - if (!schedtune_task_boost(eenv->task))
- - return eenv->cap_idx;
- -
- - /* Find minimum capacity to satify the task boost value */
- - min_util = boosted_task_util(eenv->task);
- - for (min_cap_idx = 0; min_cap_idx < (sge->nr_cap_states-1); min_cap_idx++) {
- - if (sge->cap_states[min_cap_idx].cap >= min_util)
- - break;
- + util_sum += __cpu_norm_util(util, capacity);
- }
- - min_capacity = sge->cap_states[min_cap_idx].cap;
- -
- - /* The current capacity is the one computed by the caller */
- - cur_capacity = sge->cap_states[eenv->cap_idx].cap;
- -
- - /*
- - * Compute the minumum CPU capacity required to support task boosting
- - * within this SG.
- - */
- - cur_capacity = max(min_capacity, cur_capacity);
- - cap_idx = max(eenv->cap_idx, min_cap_idx);
- - return cap_idx;
- + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
- }
- -#else
- -#define find_min_capacity(eenv) eenv->cap_idx
- -#endif /* CONFIG_SCHED_TUNE */
- -static int find_new_capacity(struct energy_env *eenv)
- +static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
- {
- - const struct sched_group_energy const *sge = eenv->sg->sge;
- + const struct sched_group_energy *sge = eenv->sg->sge;
- int idx, max_idx = sge->nr_cap_states - 1;
- - unsigned long util = group_max_util(eenv);
- + unsigned long util = group_max_util(eenv, cpu_idx);
- /* default is max_cap if we don't find a match */
- - eenv->cap_idx = max_idx;
- + eenv->cpu[cpu_idx].cap_idx = max_idx;
- + eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
- for (idx = 0; idx < sge->nr_cap_states; idx++) {
- if (sge->cap_states[idx].cap >= util) {
- - /* Keep track of SG's capacity index */
- - eenv->cap_idx = idx;
- + /* Keep track of SG's capacity */
- + eenv->cpu[cpu_idx].cap_idx = idx;
- + eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
- break;
- }
- }
- - /* Update SG's capacity based on boost value of the current task */
- - eenv->cap_idx = find_min_capacity(eenv);
- - return eenv->cap_idx;
- + return eenv->cpu[cpu_idx].cap_idx;
- }
- -static int group_idle_state(struct sched_group *sg)
- +static int group_idle_state(struct energy_env *eenv, int cpu_idx)
- {
- + struct sched_group *sg = eenv->sg;
- int i, state = INT_MAX;
- + int src_in_grp, dst_in_grp;
- + long grp_util = 0;
- /* Find the shallowest idle state in the sched group. */
- for_each_cpu(i, sched_group_cpus(sg))
- @@ -4761,114 +5366,161 @@
- /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
- state++;
- + src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
- + sched_group_cpus(sg));
- + dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
- + sched_group_cpus(sg));
- + if (src_in_grp == dst_in_grp) {
- + /* both CPUs under consideration are in the same group or not in
- + * either group, migration should leave idle state the same.
- + */
- + goto end;
- + }
- +
- + /*
- + * Try to estimate if a deeper idle state is
- + * achievable when we move the task.
- + */
- + for_each_cpu(i, sched_group_cpus(sg)) {
- + grp_util += cpu_util_wake(i, eenv->p);
- + if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
- + grp_util += eenv->util_delta;
- + }
- +
- + if (grp_util <=
- + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
- + /* after moving, this group is at most partly
- + * occupied, so it should have some idle time.
- + */
- + int max_idle_state_idx = sg->sge->nr_idle_states - 2;
- + int new_state = grp_util * max_idle_state_idx;
- + if (grp_util <= 0)
- + /* group will have no util, use lowest state */
- + new_state = max_idle_state_idx + 1;
- + else {
- + /* for partially idle, linearly map util to idle
- + * states, excluding the lowest one. This does not
- + * correspond to the state we expect to enter in
- + * reality, but an indication of what might happen.
- + */
- + new_state = min(max_idle_state_idx, (int)
- + (new_state / sg->sgc->max_capacity));
- + new_state = max_idle_state_idx - new_state;
- + }
- + state = new_state;
- + } else {
- + /* After moving, the group will be fully occupied
- + * so assume it will not be idle at all.
- + */
- + state = 0;
- + }
- +end:
- return state;
- }
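- The partially-idle branch of group_idle_state() above linearly maps residual group utilization onto the available idle-state indexes: more utilization means a shallower estimated state. A standalone sketch of just that mapping (not patch code; function name and numbers are illustrative, and the full-group capacity check is omitted):
- #include <stdio.h>
- 
- /* Map residual group utilization to an estimated idle-state index. */
- static int estimate_idle_state(long grp_util, long max_capacity, int nr_idle_states)
- {
-         int max_idle_state_idx = nr_idle_states - 2;
-         int new_state;
- 
-         if (grp_util <= 0)
-                 return max_idle_state_idx + 1;  /* no utilization: deepest state */
- 
-         new_state = grp_util * max_idle_state_idx / max_capacity;
-         if (new_state > max_idle_state_idx)
-                 new_state = max_idle_state_idx;
-         return max_idle_state_idx - new_state;  /* more util -> shallower state */
- }
- 
- int main(void)
- {
-         /* 4 idle states, per-CPU max_capacity 1024: a half-utilized group -> index 1 */
-         printf("%d\n", estimate_idle_state(512, 1024, 4));
-         return 0;
- }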
- /*
- - * Compute energy for the eenv's SG (i.e. eenv->sg).
- + * sched_group_energy(): Computes the absolute energy consumption of cpus
- + * belonging to the sched_group including shared resources shared only by
- + * members of the group. Iterates over all cpus in the hierarchy below the
- + * sched_group starting from the bottom working it's way up before going to
- + * the next cpu until all cpus are covered at all levels. The current
- + * implementation is likely to gather the same util statistics multiple times.
- + * This can probably be done in a faster but more complex way.
- + * Note: sched_group_energy() may fail when racing with sched_domain updates.
- + * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
- + *
- + * This works in iterations to compute the SG's energy for each CPU
- + * candidate defined by the energy_env's cpu array.
- *
- - * This works in two iterations:
- - * first iteration, before moving the utilization, i.e.
- - * util_delta == 0
- - * second iteration, after moving the utilization, i.e.
- - * util_delta != 0
- + * NOTE: in the following computations for busy_energy and idle_energy we do
- + * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
- + * The required scaling will be performed just one time, by the calling
- + * functions, once we have accumulated the contributions for all the SGs.
- */
- -static void before_after_energy(struct energy_env *eenv)
- +static void calc_sg_energy(struct energy_env *eenv)
- {
- -
- - int sg_busy_energy, sg_idle_energy;
- struct sched_group *sg = eenv->sg;
- - unsigned long util_delta;
- - unsigned long group_util;
- + int busy_energy, idle_energy;
- + unsigned int busy_power;
- + unsigned int idle_power;
- + unsigned long sg_util;
- int cap_idx, idle_idx;
- int total_energy = 0;
- - unsigned int cap;
- - bool after;
- -
- - util_delta = eenv->util_delta;
- - eenv->util_delta = 0;
- - after = false;
- -
- -compute_after:
- + int cpu_idx;
- - idle_idx = group_idle_state(sg);
- + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
- - cap_idx = find_new_capacity(eenv);
- - group_util = group_norm_util(eenv);
- - cap = sg->sge->cap_states[cap_idx].cap;
- - sg_busy_energy = group_util * sg->sge->cap_states[cap_idx].power;
- - sg_busy_energy >>= SCHED_CAPACITY_SHIFT;
- + if (eenv->cpu[cpu_idx].cpu_id == -1)
- + continue;
- + /* Compute ACTIVE energy */
- + cap_idx = find_new_capacity(eenv, cpu_idx);
- + busy_power = sg->sge->cap_states[cap_idx].power;
- + /*
- + * in order to calculate cpu_norm_util, we need to know which
- + * capacity level the group will be at, so calculate that first
- + */
- + sg_util = group_norm_util(eenv, cpu_idx);
- - sg_idle_energy = SCHED_CAPACITY_SCALE - group_util;
- - sg_idle_energy *= sg->sge->idle_states[idle_idx].power;
- - sg_idle_energy >>= SCHED_CAPACITY_SHIFT;
- + busy_energy = sg_util * busy_power;
- - total_energy = sg_busy_energy + sg_idle_energy;
- + /* Compute IDLE energy */
- + idle_idx = group_idle_state(eenv, cpu_idx);
- + idle_power = sg->sge->idle_states[idle_idx].power;
- - /* Account for "after" metrics */
- - if (after) {
- - if (sg->group_weight == 1 &&
- - cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
- - eenv->after.utilization = group_util;
- - eenv->after.capacity = cap;
- - }
- - eenv->after.energy += total_energy;
- - return;
- - }
- + idle_energy = SCHED_CAPACITY_SCALE - sg_util;
- + idle_energy *= idle_power;
- - /* Account for "before" metrics */
- - if (sg->group_weight == 1 &&
- - cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
- - eenv->after.utilization = group_util;
- - eenv->before.capacity = cap;
- + total_energy = busy_energy + idle_energy;
- + eenv->cpu[cpu_idx].energy += total_energy;
- }
- - eenv->before.energy += total_energy;
- -
- - /* Setup eenv for the "after" case */
- - eenv->util_delta = util_delta;
- - after = true;
- -
- - goto compute_after;
- -
- }
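- The NOTE in the calc_sg_energy() comment above can be shown with plain integers: busy and idle energy are accumulated unscaled and shifted only once by the caller, so only one rounding step is introduced. A standalone sketch (not patch code; all values are illustrative):
- #include <stdio.h>
- 
- #define SCHED_CAPACITY_SHIFT 10
- #define SCHED_CAPACITY_SCALE (1 << SCHED_CAPACITY_SHIFT)
- 
- int main(void)
- {
-         /* illustrative per-group numbers: normalized util, busy and idle power */
-         int sg_util = 300, busy_power = 500, idle_power = 20;
-         int busy_energy = sg_util * busy_power;
-         int idle_energy = (SCHED_CAPACITY_SCALE - sg_util) * idle_power;
-         int total_unscaled = busy_energy + idle_energy;
- 
-         /* a single shift at the very end keeps the rounding error to one step */
-         printf("energy = %d\n", total_unscaled >> SCHED_CAPACITY_SHIFT);
-         return 0;
- }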
- /*
- - * sched_group_energy(): Computes the absolute energy consumption of cpus
- - * belonging to the sched_group including shared resources shared only by
- - * members of the group. Iterates over all cpus in the hierarchy below the
- - * sched_group starting from the bottom working it's way up before going to
- - * the next cpu until all cpus are covered at all levels. The current
- - * implementation is likely to gather the same util statistics multiple times.
- - * This can probably be done in a faster but more complex way.
- - * Note: sched_group_energy() may fail when racing with sched_domain updates.
- + * compute_energy() computes the absolute variation in energy consumption by
- + * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
- + *
- + * NOTE: compute_energy() may fail when racing with sched_domain updates, in
- + * which case we abort by returning -EINVAL.
- */
- -static int sched_group_energy(struct energy_env *eenv)
- +static int compute_energy(struct energy_env *eenv)
- {
- - struct sched_domain *sd;
- struct cpumask visit_cpus;
- - struct sched_group *sg;
- - int cpu;
- + int cpu_count;
- WARN_ON(!eenv->sg_top->sge);
- cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
- + /* If a cpu is hotplugged in while we are in this function,
- + * it does not appear in the existing visit_cpus mask
- + * which came from the sched_group pointer of the
- + * sched_domain pointed at by sd_ea for either the prev
- + * or next cpu and was dereferenced in __energy_diff.
- + * Since we will dereference sd_scs later as we iterate
- + * through the CPUs we expect to visit, new CPUs can
- + * be present which are not in the visit_cpus mask.
- + * Guard this with cpu_count.
- + */
- + cpu_count = cpumask_weight(&visit_cpus);
- while (!cpumask_empty(&visit_cpus)) {
- struct sched_group *sg_shared_cap = NULL;
- -
- - cpu = cpumask_first(&visit_cpus);
- + int cpu = cpumask_first(&visit_cpus);
- + struct sched_domain *sd;
- /*
- * Is the group utilization affected by cpus outside this
- * sched_group?
- + * This sd may have groups with cpus which were not present
- + * when we took visit_cpus.
- */
- sd = rcu_dereference(per_cpu(sd_scs, cpu));
- +
- if (sd && sd->parent)
- sg_shared_cap = sd->parent->groups;
- for_each_domain(cpu, sd) {
- - sg = sd->groups;
- + struct sched_group *sg = sd->groups;
- /* Has this sched_domain already been visited? */
- if (sd->child && group_first_cpu(sg) != cpu)
- @@ -4878,18 +5530,52 @@
- eenv->sg_cap = sg;
- if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
- eenv->sg_cap = sg_shared_cap;
- + else
- + eenv->sg_cap = sg;
- + /*
- + * Compute the energy for all the candidate
- + * CPUs in the current visited SG.
- + */
- eenv->sg = sg;
- - before_after_energy(eenv);
- -
- - if (!sd->child)
- + calc_sg_energy(eenv);
- +
- + /* remove CPUs we have just visited */
- + if (!sd->child) {
- + /*
- + * cpu_count here is the number of
- + * cpus we expect to visit in this
- + * calculation. If we race against
- + * hotplug, we can have extra cpus
- + * added to the groups we are
- + * iterating which do not appear in
- + * the visit_cpus mask. In that case
- + * we are not able to calculate energy
- + * without restarting so we will bail
- + * out and use prev_cpu this time.
- + */
- + if (!cpu_count)
- + return -EINVAL;
- cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
- + cpu_count--;
- + }
- if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
- goto next_cpu;
- } while (sg = sg->next, sg != sd->groups);
- }
- +
- + /*
- + * If we raced with hotplug and got an sd NULL-pointer;
- + * returning a wrong energy estimation is better than
- + * entering an infinite loop.
- + * Specifically: If a cpu is unplugged after we took
- + * the visit_cpus mask, it no longer has an sd_scs
- + * pointer, so when we dereference it, we get NULL.
- + */
- + if (cpumask_test_cpu(cpu, &visit_cpus))
- + return -EINVAL;
- next_cpu:
- cpumask_clear_cpu(cpu, &visit_cpus);
- continue;
- @@ -4903,168 +5589,103 @@
- return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
- }
- -static inline int normalize_energy(int energy_diff);
- -
- -#define eenv_before(__X) eenv->before.__X
- -#define eenv_after(__X) eenv->after.__X
- -#define eenv_delta(__X) eenv->after.__X - eenv->before.__X
- -
- -static inline void
- -__update_perf_energy_deltas(struct energy_env *eenv)
- -{
- - unsigned long task_util = eenv->util_delta;
- -
- - /*
- - * SpeedUp Index
- - *
- - * SPI := cpu_capacity - task_util
- - *
- - * which estimate how sooner a task will complete when running
- - * on an higher OPP wrt the minimum required.
- - */
- - eenv_before(speedup_idx) = eenv_before(capacity) - task_util;
- - eenv_after(speedup_idx) = eenv_after(capacity) - task_util;
- -
- - /*
- - * Delay Index
- - *
- - * DLI := 1024 * (cpu_util - task_util) / cpu_util
- - *
- - * which represents the "fraction" of CPU bandwidth consumed by other
- - * tasks in the worst case, i.e. assuming all other tasks runs before.
- - *
- - * NOTE: in the above formula we assume that "cpu_util" includes
- - * already the task utilization.
- - */
- - eenv_before(delay_idx) = SCHED_CAPACITY_SCALE;
- - eenv_before(delay_idx) *= (eenv_before(utilization) - task_util);
- - eenv_before(delay_idx) /= eenv_before(utilization);
- - eenv_after(delay_idx) = SCHED_CAPACITY_SCALE;
- - eenv_after(delay_idx) *= (eenv_after(utilization) - task_util);
- - eenv_after(delay_idx) /= eenv_after(utilization);
- -
- - /* Performance Variation */
- - eenv->prf_delta = eenv_delta(speedup_idx) - eenv_delta(delay_idx);
- -
- - /* Energy Variation */
- - eenv->nrg_delta = normalize_energy(eenv_delta(energy));
- -
- -}
- -
- /*
- - * energy_diff(): Estimate the energy impact of changing the utilization
- - * distribution. eenv specifies the change: utilisation amount, source, and
- - * destination cpu. Source or destination cpu may be -1 in which case the
- - * utilization is removed from or added to the system (e.g. task wake-up). If
- - * both are specified, the utilization is migrated.
- + * select_energy_cpu_idx(): estimate the energy impact of changing the
- + * utilization distribution.
- + *
- + * The eenv parameter specifies the changes: utilisation amount and a pair of
- + * possible CPU candidates (the previous CPU and a different target CPU).
- + *
- + * This function returns the index of a CPU candidate specified by the
- + * energy_env which corresponds to the first CPU saving energy.
- + * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
- + * efficient than running on prev_cpu. This is also the value returned in case
- + * of abort due to error conditions during the computations.
- + * A value greater than zero means that the first energy-efficient CPU is the
- + * one represented by eenv->cpu[eenv->next_idx].cpu_id.
- */
- -static inline int __energy_diff(struct energy_env *eenv)
- +static inline int select_energy_cpu_idx(struct energy_env *eenv)
- {
- struct sched_domain *sd;
- struct sched_group *sg;
- int sd_cpu = -1;
- + int cpu_idx;
- + int margin;
- - if (eenv->src_cpu == eenv->dst_cpu)
- - return 0;
- -
- - sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
- + sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
- sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
- +
- if (!sd)
- - return 0; /* Error */
- + return EAS_CPU_PRV;
- +
- + cpumask_clear(&eenv->cpus_mask);
- + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
- + int cpu = eenv->cpu[cpu_idx].cpu_id;
- +
- + if (cpu < 0)
- + continue;
- + cpumask_set_cpu(cpu, &eenv->cpus_mask);
- + }
- sg = sd->groups;
- +
- do {
- - if (!cpu_in_sg(sg, eenv->src_cpu) &&
- - !cpu_in_sg(sg, eenv->dst_cpu))
- + /* Skip SGs which do not contains a candidate CPU */
- + if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
- continue;
- eenv->sg_top = sg;
- - if (sched_group_energy(eenv))
- - return 0; /* Invalid result abort */
- + /* energy is unscaled to reduce rounding errors */
- + if (compute_energy(eenv) == -EINVAL)
- + return EAS_CPU_PRV;
- } while (sg = sg->next, sg != sd->groups);
- - __update_perf_energy_deltas(eenv);
- -
- - trace_sched_energy_diff(eenv);
- - trace_sched_energy_perf_deltas(eenv);
- -
- - return eenv->nrg_delta;
- -}
- + /* Scale energy before comparisons */
- + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
- + eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
- -#ifdef CONFIG_SCHED_TUNE
- -
- -struct target_nrg schedtune_target_nrg;
- -
- -/*
- - * System energy normalization
- - * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
- - * corresponding to the specified energy variation.
- - */
- -static inline int
- -normalize_energy(int energy_diff)
- -{
- - u32 normalized_nrg;
- -#ifdef CONFIG_SCHED_DEBUG
- - int max_delta;
- -
- - /* Check for boundaries */
- - max_delta = schedtune_target_nrg.max_power;
- - max_delta -= schedtune_target_nrg.min_power;
- - WARN_ON(abs(energy_diff) >= max_delta);
- -#endif
- -
- - /* Do scaling using positive numbers to increase the range */
- - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
- -
- - /* Scale by energy magnitude */
- - normalized_nrg <<= SCHED_CAPACITY_SHIFT;
- -
- - /* Normalize on max energy for target platform */
- - normalized_nrg = reciprocal_divide(
- - normalized_nrg, schedtune_target_nrg.rdiv);
- -
- - return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
- -}
- -
- -static inline bool filter_energy(void)
- -{
- - return sched_feat(ENERGY_FILTER);
- -}
- -
- -static inline int
- -energy_diff(struct energy_env *eenv)
- -{
- - int boost;
- + /*
- + * Compute the dead-zone margin used to prevent too many task
- + * migrations with negligible energy savings.
- + * An energy saving is considered meaningful if it reduces the energy
- + * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56%
- + */
- + margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
- - /* Conpute "absolute" energy diff */
- - __energy_diff(eenv);
- - if (!filter_energy())
- - return eenv->nrg_delta;
- + /*
- + * By default the EAS_CPU_PRV CPU is considered the most energy
- + * efficient, with a 0 energy variation.
- + */
- + eenv->next_idx = EAS_CPU_PRV;
- - /* Return energy diff when boost margin is 0 */
- - boost = schedtune_task_boost(eenv->task);
- - if (boost == 0)
- - return eenv->nrg_delta;
- + /*
- + * Compare the other CPU candidates to find a CPU which can be
- + * more energy efficient than EAS_CPU_PRV
- + */
- + for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
- + /* Skip not valid scheduled candidates */
- + if (eenv->cpu[cpu_idx].cpu_id < 0)
- + continue;
- + /* Compute energy delta wrt EAS_CPU_PRV */
- + eenv->cpu[cpu_idx].nrg_delta =
- + eenv->cpu[cpu_idx].energy -
- + eenv->cpu[EAS_CPU_PRV].energy;
- + /* filter energy variations within the dead-zone margin */
- + if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
- + eenv->cpu[cpu_idx].nrg_delta = 0;
- + /* update the schedule candidate with min(nrg_delta) */
- + if (eenv->cpu[cpu_idx].nrg_delta <
- + eenv->cpu[eenv->next_idx].nrg_delta) {
- + eenv->next_idx = cpu_idx;
- + if (sched_feat(FBT_STRICT_ORDER))
- + break;
- + }
- + }
- - eenv->payoff = schedtune_accept_deltas(
- - eenv->nrg_delta,
- - eenv->prf_delta,
- - eenv->task);
- -
- - /*
- - * When SchedTune is enabled, the energy_diff() function will return
- - * the computed energy payoff value. Since the energy_diff() return
- - * value is expected to be negative by its callers, this evaluation
- - * function return a negative value each time the evaluation return a
- - * positive payoff, which is the condition for the acceptance of
- - * a scheduling decision
- - */
- - return -eenv->payoff;
- + return eenv->next_idx;
- +
- }
- -#else /* CONFIG_SCHED_TUNE */
- -#define energy_diff(eenv) __energy_diff(eenv)
- -#endif
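- For the dead-zone filter in select_energy_cpu_idx() above, energy >> 6 is a divide by 64, i.e. roughly a 1.56% threshold on the previous CPU's estimated energy. A quick standalone sketch (illustrative values, not patch code):
- #include <stdio.h>
- #include <stdlib.h>
- 
- int main(void)
- {
-         int prv_energy = 2000, nxt_energy = 1980;       /* illustrative scaled energies */
-         int margin = prv_energy >> 6;                   /* 2000 / 64 = 31, ~1.56% of prv */
-         int nrg_delta = nxt_energy - prv_energy;        /* -20 */
- 
-         /* deltas smaller than the margin are treated as "no saving" */
-         if (abs(nrg_delta) < margin)
-                 nrg_delta = 0;
-         printf("margin=%d delta=%d\n", margin, nrg_delta);
-         return 0;
- }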
- /*
- * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
- @@ -5078,31 +5699,34 @@
- * being client/server, worker/dispatcher, interrupt source or whatever is
- * irrelevant, spread criteria is apparent partner count exceeds socket size.
- */
- -static int wake_wide(struct task_struct *p)
- +static int wake_wide(struct task_struct *p, int sibling_count_hint)
- {
- unsigned int master = current->wakee_flips;
- unsigned int slave = p->wakee_flips;
- - int factor = this_cpu_read(sd_llc_size);
- + int llc_size = this_cpu_read(sd_llc_size);
- +
- + if (sibling_count_hint >= llc_size)
- + return 1;
- if (master < slave)
- swap(master, slave);
- - if (slave < factor || master < slave * factor)
- + if (slave < llc_size || master < slave * llc_size)
- return 0;
- return 1;
- }
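- The wake_wide() heuristic above declares a wakeup "wide" when the sibling count hint already reaches the LLC size, or when the ordered wakee-flip counts suggest more apparent partners than the LLC can hold. A standalone sketch of that decision (not patch code; the function name and numbers are illustrative):
- #include <stdio.h>
- 
- static int wake_wide_sketch(unsigned int master_flips, unsigned int slave_flips,
-                             unsigned int llc_size, int sibling_count_hint)
- {
-         if (sibling_count_hint >= (int)llc_size)
-                 return 1;
-         if (master_flips < slave_flips) {       /* keep master as the larger count */
-                 unsigned int tmp = master_flips;
-                 master_flips = slave_flips;
-                 slave_flips = tmp;
-         }
-         if (slave_flips < llc_size || master_flips < slave_flips * llc_size)
-                 return 0;
-         return 1;
- }
- 
- int main(void)
- {
-         /* llc_size 4: a dispatcher with 40 flips waking a worker with 5 -> spread (1) */
-         printf("%d\n", wake_wide_sketch(40, 5, 4, 1));
-         return 0;
- }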
- -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
- +static int wake_affine(struct sched_domain *sd, struct task_struct *p,
- + int prev_cpu, int sync)
- {
- s64 this_load, load;
- s64 this_eff_load, prev_eff_load;
- - int idx, this_cpu, prev_cpu;
- + int idx, this_cpu;
- struct task_group *tg;
- unsigned long weight;
- int balanced;
- idx = sd->wake_idx;
- this_cpu = smp_processor_id();
- - prev_cpu = task_cpu(p);
- load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
- @@ -5146,18 +5770,29 @@
- balanced = this_eff_load <= prev_eff_load;
- - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
- + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
- if (!balanced)
- return 0;
- - schedstat_inc(sd, ttwu_move_affine);
- - schedstat_inc(p, se.statistics.nr_wakeups_affine);
- + schedstat_inc(sd->ttwu_move_affine);
- + schedstat_inc(p->se.statistics.nr_wakeups_affine);
- return 1;
- }
- -unsigned int capacity_margin = 1280; /* ~20% margin */
- +static inline unsigned long task_util(struct task_struct *p)
- +{
- +#ifdef CONFIG_SCHED_WALT
- + if (!walt_disabled && sysctl_sched_use_walt_task_util) {
- + unsigned long demand = p->ravg.demand;
- + return (demand << 10) / walt_ravg_window;
- + }
- +#endif
- + return p->se.avg.util_avg;
- +}
- +
- +static inline unsigned long boosted_task_util(struct task_struct *p);
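- With CONFIG_SCHED_WALT, task_util() above scales the WALT demand into the [0..1024] capacity range via (demand << 10) / walt_ravg_window. A minimal sketch of that conversion (not patch code; it simplifies demand to plain busy time inside the window, and the window length is illustrative):
- #include <stdio.h>
- 
- /* Scale a WALT demand within the window to the 0..1024 util range. */
- static unsigned long long walt_task_util_sketch(unsigned long long demand_ns,
-                                                 unsigned long long window_ns)
- {
-         return (demand_ns << 10) / window_ns;
- }
- 
- int main(void)
- {
-         /* 5ms of demand inside a 20ms window -> util 256, one quarter of full scale */
-         printf("%llu\n", walt_task_util_sketch(5000000ULL, 20000000ULL));
-         return 0;
- }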
- static inline bool __task_fits(struct task_struct *p, int cpu, int util)
- {
- @@ -5182,29 +5817,131 @@
- return __task_fits(p, cpu, 0);
- }
- -static inline bool task_fits_spare(struct task_struct *p, int cpu)
- +static bool __cpu_overutilized(int cpu, int delta)
- {
- - return __task_fits(p, cpu, cpu_util(cpu, UTIL_EST));
- + return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
- }
- static bool cpu_overutilized(int cpu)
- {
- - return (capacity_of(cpu) * 1024) < (cpu_util(cpu, UTIL_AVG) * capacity_margin);
- + return __cpu_overutilized(cpu, 0);
- +}
- +
- +#ifdef CONFIG_SCHED_TUNE
- +
- +struct reciprocal_value schedtune_spc_rdiv;
- +
- +static long
- +schedtune_margin(unsigned long signal, long boost)
- +{
- + long long margin = 0;
- +
- + /*
- + * Signal proportional compensation (SPC)
- + *
- + * The Boost (B) value is used to compute a Margin (M) which is
- + * proportional to the complement of the original Signal (S):
- + * M = B * (SCHED_CAPACITY_SCALE - S)
- + * The obtained M could be used by the caller to "boost" S.
- + */
- + if (boost >= 0) {
- + margin = SCHED_CAPACITY_SCALE - signal;
- + margin *= boost;
- + } else {
- + margin = -signal * boost;
- + }
- +
- + margin = reciprocal_divide(margin, schedtune_spc_rdiv);
- + if (boost < 0)
- + margin *= -1;
- +
- + return margin;
- +}
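- The SPC formula in schedtune_margin() above, M = B * (SCHED_CAPACITY_SCALE - S), is easy to check by hand. A standalone sketch (not patch code), under the assumption that boost is a percentage in [-100..100] and schedtune_spc_rdiv encodes a division by 100 as set up in schedtune.c:
- #include <stdio.h>
- 
- #define SCHED_CAPACITY_SCALE 1024LL
- 
- /* M = B% * (SCHED_CAPACITY_SCALE - S) for positive boost, -S * B% for negative. */
- static long long spc_margin(long long signal, long long boost_pct)
- {
-         long long margin;
- 
-         if (boost_pct >= 0)
-                 margin = (SCHED_CAPACITY_SCALE - signal) * boost_pct;
-         else
-                 margin = -signal * boost_pct;
-         margin /= 100;          /* stands in for reciprocal_divide(.., schedtune_spc_rdiv) */
-         return boost_pct < 0 ? -margin : margin;
- }
- 
- int main(void)
- {
-         /* util 200 with a 50% boost -> margin 412, so the boosted util is 612 */
-         printf("%lld\n", spc_margin(200, 50));
-         return 0;
- }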
- +
- +static inline int
- +schedtune_cpu_margin(unsigned long util, int cpu)
- +{
- + int boost = schedtune_cpu_boost(cpu);
- +
- + if (boost == 0)
- + return 0;
- +
- + return schedtune_margin(util, boost);
- +}
- +
- +static inline long
- +schedtune_task_margin(struct task_struct *p)
- +{
- + int boost = schedtune_task_boost(p);
- + unsigned long util;
- + long margin;
- +
- + if (boost == 0)
- + return 0;
- +
- + util = task_util(p);
- + margin = schedtune_margin(util, boost);
- +
- + return margin;
- +}
- +
- +#else /* CONFIG_SCHED_TUNE */
- +
- +static inline int
- +schedtune_cpu_margin(unsigned long util, int cpu)
- +{
- + return 0;
- +}
- +
- +static inline int
- +schedtune_task_margin(struct task_struct *p)
- +{
- + return 0;
- +}
- +
- +#endif /* CONFIG_SCHED_TUNE */
- +
- +unsigned long
- +boosted_cpu_util(int cpu)
- +{
- + unsigned long util = cpu_util_freq(cpu);
- + long margin = schedtune_cpu_margin(util, cpu);
- +
- + trace_sched_boost_cpu(cpu, util, margin);
- +
- + return util + margin;
- +}
- +
- +static inline unsigned long
- +boosted_task_util(struct task_struct *p)
- +{
- + unsigned long util = task_util(p);
- + long margin = schedtune_task_margin(p);
- +
- + trace_sched_boost_task(p, util, margin);
- +
- + return util + margin;
- +}
- +
- +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
- +{
- + return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
- }
- /*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- + *
- + * Assumes p is allowed on at least one CPU in sd.
- */
- static struct sched_group *
- find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
- {
- struct sched_group *idlest = NULL, *group = sd->groups;
- - struct sched_group *fit_group = NULL, *spare_group = NULL;
- - unsigned long min_load = ULONG_MAX, this_load = 0;
- - unsigned long fit_capacity = ULONG_MAX;
- - unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
- + struct sched_group *most_spare_sg = NULL;
- + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
- + unsigned long most_spare = 0, this_spare = 0;
- int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
- @@ -5212,7 +5949,7 @@
- load_idx = sd->wake_idx;
- do {
- - unsigned long load, avg_load, spare_capacity;
- + unsigned long load, avg_load, spare_cap, max_spare_cap;
- int local_group;
- int i;
- @@ -5224,8 +5961,12 @@
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
- - /* Tally up the load of all CPUs in the group */
- + /*
- + * Tally up the load of all CPUs in the group and find
- + * the group containing the CPU with most spare capacity.
- + */
- avg_load = 0;
- + max_spare_cap = 0;
- for_each_cpu(i, sched_group_cpus(group)) {
- /* Bias balancing toward cpus of our domain */
- @@ -5236,24 +5977,10 @@
- avg_load += load;
- - /*
- - * Look for most energy-efficient group that can fit
- - * that can fit the task.
- - */
- - if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
- - fit_capacity = capacity_of(i);
- - fit_group = group;
- - }
- + spare_cap = capacity_spare_wake(i, p);
- - /*
- - * Look for group which has most spare capacity on a
- - * single cpu.
- - */
- - spare_capacity = capacity_of(i) - cpu_util(i, UTIL_EST);
- - if (spare_capacity > max_spare_capacity) {
- - max_spare_capacity = spare_capacity;
- - spare_group = group;
- - }
- + if (spare_cap > max_spare_cap)
- + max_spare_cap = spare_cap;
- }
- /* Adjust by relative CPU capacity of the group */
- @@ -5261,28 +5988,51 @@
- if (local_group) {
- this_load = avg_load;
- - } else if (avg_load < min_load) {
- - min_load = avg_load;
- - idlest = group;
- + this_spare = max_spare_cap;
- + } else {
- + if (avg_load < min_load) {
- + min_load = avg_load;
- + idlest = group;
- + }
- +
- + if (most_spare < max_spare_cap) {
- + most_spare = max_spare_cap;
- + most_spare_sg = group;
- + }
- }
- } while (group = group->next, group != sd->groups);
- - if (fit_group)
- - return fit_group;
- + /*
- + * The cross-over point between using spare capacity or least load
- + * is too conservative for high utilization tasks on partially
- + * utilized systems if we require spare_capacity > task_util(p)
- + * so we allow for some task stuffing by using
- + * spare_capacity > task_util(p)/2.
- + *
- + * Spare capacity can't be used for fork because the utilization has
- + * not been set yet, we must first select a rq to compute the initial
- + * utilization.
- + */
- + if (sd_flag & SD_BALANCE_FORK)
- + goto skip_spare;
- - if (spare_group)
- - return spare_group;
- + if (this_spare > task_util(p) / 2 &&
- + imbalance*this_spare > 100*most_spare)
- + return NULL;
- + else if (most_spare > task_util(p) / 2)
- + return most_spare_sg;
- +skip_spare:
- if (!idlest || 100*this_load < imbalance*min_load)
- return NULL;
- return idlest;
- }
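- The spare-capacity vs. least-load cross-over at the end of find_idlest_group() above prefers the local group (returns NULL) when its spare capacity covers at least half of the task's utilization and beats the best remote spare by the imbalance percentage. A small standalone sketch of that decision (not patch code; names and numbers are illustrative):
- #include <stdio.h>
- 
- /* Decision helper mirroring the tail of find_idlest_group() (sketch only). */
- static const char *pick_strategy(unsigned long task_util,
-                                  unsigned long this_spare,
-                                  unsigned long most_spare,
-                                  unsigned long imbalance /* 100 + (imbalance_pct-100)/2 */)
- {
-         if (this_spare > task_util / 2 && imbalance * this_spare > 100UL * most_spare)
-                 return "stay local";
-         if (most_spare > task_util / 2)
-                 return "use most-spare group";
-         return "fall back to least-loaded group";
- }
- 
- int main(void)
- {
-         /* task_util 300, local spare 400, best remote spare 300, imbalance 112 */
-         printf("%s\n", pick_strategy(300, 400, 300, 112));
-         return 0;
- }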
- /*
- - * find_idlest_cpu - find the idlest cpu among the cpus in group.
- + * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
- */
- static int
- -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
- +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
- {
- unsigned long load, min_load = ULONG_MAX;
- unsigned int min_exit_latency = UINT_MAX;
- @@ -5291,9 +6041,13 @@
- int shallowest_idle_cpu = -1;
- int i;
- + /* Check if we have any choice: */
- + if (group->group_weight == 1)
- + return cpumask_first(sched_group_cpus(group));
- +
- /* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- - if (task_fits_spare(p, i)) {
- + if (idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
- struct cpuidle_state *idle = idle_get_state(rq);
- if (idle && idle->exit_latency < min_exit_latency) {
- @@ -5305,8 +6059,7 @@
- min_exit_latency = idle->exit_latency;
- latest_idle_timestamp = rq->idle_stamp;
- shallowest_idle_cpu = i;
- - } else if (idle_cpu(i) &&
- - (!idle || idle->exit_latency == min_exit_latency) &&
- + } else if ((!idle || idle->exit_latency == min_exit_latency) &&
- rq->idle_stamp > latest_idle_timestamp) {
- /*
- * If equal or no active idle state, then
- @@ -5315,15 +6068,8 @@
- */
- latest_idle_timestamp = rq->idle_stamp;
- shallowest_idle_cpu = i;
- - } else if (shallowest_idle_cpu == -1) {
- - /*
- - * If we haven't found an idle CPU yet
- - * pick a non-idle one that can fit the task as
- - * fallback.
- - */
- - shallowest_idle_cpu = i;
- }
- - } else {
- + } else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
- if (load < min_load || (load == min_load && i == this_cpu)) {
- min_load = load;
- @@ -5333,29 +6079,99 @@
- }
- return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
- + }
- +
- +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
- + int cpu, int prev_cpu, int sd_flag)
- +{
- + int new_cpu = cpu;
- + int wu = sd_flag & SD_BALANCE_WAKE;
- + int cas_cpu = -1;
- +
- + if (wu) {
- + schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts);
- + schedstat_inc(this_rq()->eas_stats.cas_attempts);
- + }
- +
- + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
- + return prev_cpu;
- +
- + while (sd) {
- + struct sched_group *group;
- + struct sched_domain *tmp;
- + int weight;
- +
- + if (wu)
- + schedstat_inc(sd->eas_stats.cas_attempts);
- +
- + if (!(sd->flags & sd_flag)) {
- + sd = sd->child;
- + continue;
- + }
- +
- + group = find_idlest_group(sd, p, cpu, sd_flag);
- + if (!group) {
- + sd = sd->child;
- + continue;
- + }
- +
- + new_cpu = find_idlest_group_cpu(group, p, cpu);
- + if (new_cpu == cpu) {
- + /* Now try balancing at a lower domain level of cpu */
- + sd = sd->child;
- + continue;
- + }
- +
- + /* Now try balancing at a lower domain level of new_cpu */
- + cpu = cas_cpu = new_cpu;
- + weight = sd->span_weight;
- + sd = NULL;
- + for_each_domain(cpu, tmp) {
- + if (weight <= tmp->span_weight)
- + break;
- + if (tmp->flags & sd_flag)
- + sd = tmp;
- + }
- + /* while loop will break here if sd == NULL */
- + }
- +
- + if (wu && (cas_cpu >= 0)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_cas_count);
- + schedstat_inc(this_rq()->eas_stats.cas_count);
- + }
- +
- + return new_cpu;
- }
- /*
- * Try and locate an idle CPU in the sched_domain.
- */
- -static int select_idle_sibling(struct task_struct *p, int target)
- +static int select_idle_sibling(struct task_struct *p, int prev, int target)
- {
- struct sched_domain *sd;
- struct sched_group *sg;
- - int i = task_cpu(p);
- - int best_idle = -1;
- - int best_idle_cstate = -1;
- - int best_idle_capacity = INT_MAX;
- + int best_idle_cpu = -1;
- + int best_idle_cstate = INT_MAX;
- + unsigned long best_idle_capacity = ULONG_MAX;
- +
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_attempts);
- + schedstat_inc(this_rq()->eas_stats.sis_attempts);
- if (!sysctl_sched_cstate_aware) {
- - if (idle_cpu(target))
- + if (idle_cpu(target)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_idle);
- + schedstat_inc(this_rq()->eas_stats.sis_idle);
- return target;
- + }
- /*
- * If the previous cpu is cache affine and idle, don't be stupid.
- */
- - if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- - return i;
- + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine);
- + schedstat_inc(this_rq()->eas_stats.sis_cache_affine);
- + return prev;
- + }
- }
- /*
- @@ -5365,24 +6181,30 @@
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- + int i;
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
- if (sysctl_sched_cstate_aware) {
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
- - struct rq *rq = cpu_rq(i);
- - int idle_idx = idle_get_state_idx(rq);
- + int idle_idx = idle_get_state_idx(cpu_rq(i));
- unsigned long new_usage = boosted_task_util(p);
- unsigned long capacity_orig = capacity_orig_of(i);
- +
- if (new_usage > capacity_orig || !idle_cpu(i))
- goto next;
- - if (i == target && new_usage <= capacity_curr_of(target))
- + if (i == target && new_usage <= capacity_curr_of(target)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_suff_cap);
- + schedstat_inc(this_rq()->eas_stats.sis_suff_cap);
- + schedstat_inc(sd->eas_stats.sis_suff_cap);
- return target;
- + }
- - if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
- - best_idle = i;
- + if (idle_idx < best_idle_cstate &&
- + capacity_orig <= best_idle_capacity) {
- + best_idle_cpu = i;
- best_idle_cstate = idle_idx;
- best_idle_capacity = capacity_orig;
- }
- @@ -5395,6 +6217,9 @@
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_idle_cpu);
- + schedstat_inc(this_rq()->eas_stats.sis_idle_cpu);
- + schedstat_inc(sd->eas_stats.sis_idle_cpu);
- goto done;
- }
- next:
- @@ -5402,171 +6227,112 @@
- } while (sg != sd->groups);
- }
- - if (best_idle > 0)
- - target = best_idle;
- + if (best_idle_cpu >= 0)
- + target = best_idle_cpu;
- done:
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_count);
- + schedstat_inc(this_rq()->eas_stats.sis_count);
- +
- return target;
- }
- -static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
- +/*
- + * cpu_util_wake: Compute cpu utilization with any contributions from
- + * the waking task p removed. check_for_migration() looks for a better CPU of
- + * rq->curr. For that case we should return cpu util with contributions from
- + * currently running task p removed.
- + */
- +static int cpu_util_wake(int cpu, struct task_struct *p)
- {
- - int iter_cpu;
- - int target_cpu = -1;
- - int target_util = 0;
- - int backup_capacity = 0;
- - int best_idle_cpu = -1;
- - int best_idle_cstate = INT_MAX;
- - int backup_cpu = -1;
- - unsigned long min_util;
- - unsigned long new_util;
- -
- - min_util = boosted_task_util(p);
- - for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
- - int cur_capacity;
- - struct rq *rq;
- - int idle_idx;
- -
- - /*
- - * Iterate from higher cpus for boosted tasks.
- - */
- - int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
- -
- - if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
- - continue;
- -
- - /*
- - * p's blocked utilization is still accounted for on prev_cpu
- - * so prev_cpu will receive a negative bias due to the double
- - * accounting. However, the blocked utilization may be zero.
- - */
- - new_util = cpu_util(i, UTIL_EST) + task_util(p, UTIL_EST);
- -
- - /*
- - * Ensure minimum capacity to grant the required boost.
- - * The target CPU can be already at a capacity level higher
- - * than the one required to boost the task.
- - */
- - new_util = max(min_util, new_util);
- - if (new_util > capacity_orig_of(i))
- - continue;
- + unsigned long util, capacity;
- #ifdef CONFIG_SCHED_WALT
- - if (walt_cpu_high_irqload(i))
- - continue;
- + /*
- + * WALT does not decay idle tasks in the same manner
- + * as PELT, so it makes little sense to subtract task
- + * utilization from cpu utilization. Instead just use
- + * cpu_util for this case.
- + */
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
- + p->state == TASK_WAKING)
- + return cpu_util(cpu);
- #endif
- + /* Task has no contribution or is new */
- + if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
- + return cpu_util(cpu);
- - /*
- - * Unconditionally favoring tasks that prefer idle cpus to
- - * improve latency.
- - */
- - if (idle_cpu(i) && prefer_idle) {
- - if (best_idle_cpu < 0)
- - best_idle_cpu = i;
- - continue;
- - }
- -
- - cur_capacity = capacity_curr_of(i);
- - rq = cpu_rq(i);
- - idle_idx = idle_get_state_idx(rq);
- + capacity = capacity_orig_of(cpu);
- + util = max_t(long, cpu_util(cpu) - task_util(p), 0);
- - if (new_util < cur_capacity) {
- - if (cpu_rq(i)->nr_running) {
- - if (!prefer_idle) {
- - /* Find a target cpu with highest
- - * utilization.
- - */
- - if (target_util == 0 ||
- - target_util < new_util) {
- - target_cpu = i;
- - target_util = new_util;
- - }
- - } else {
- - /* Find a target cpu with lowest
- - * utilization.
- - */
- - if (target_util == 0 ||
- - target_util > new_util) {
- - target_cpu = i;
- - target_util = new_util;
- - }
- - }
- - } else if (!prefer_idle) {
- - if (best_idle_cpu < 0 ||
- - (sysctl_sched_cstate_aware &&
- - best_idle_cstate > idle_idx)) {
- - best_idle_cstate = idle_idx;
- - best_idle_cpu = i;
- - }
- - }
- - } else if (backup_capacity == 0 ||
- - backup_capacity > cur_capacity) {
- - // Find a backup cpu with least capacity.
- - backup_capacity = cur_capacity;
- - backup_cpu = i;
- - }
- - }
- + return (util >= capacity) ? capacity : util;
- +}
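Stripped of the WALT and last_update_time special cases handled above, cpu_util_wake() is a clamped subtraction. The standalone sketch below models only that arithmetic with made-up utilization figures; it is not kernel code.

#include <stdio.h>

/* Remove the waking task's contribution from a CPU's utilization,
 * never going negative and never exceeding the CPU's capacity. */
static long cpu_util_wake_model(long cpu_util, long task_util, long capacity)
{
	long util = cpu_util - task_util;

	if (util < 0)
		util = 0;
	return (util >= capacity) ? capacity : util;
}

int main(void)
{
	printf("%ld\n", cpu_util_wake_model(600, 150, 1024));	/* 450  */
	printf("%ld\n", cpu_util_wake_model(100, 150, 1024));	/* 0    */
	printf("%ld\n", cpu_util_wake_model(2000, 150, 1024));	/* 1024 */
	return 0;
}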
- - if (prefer_idle && best_idle_cpu >= 0)
- - target_cpu = best_idle_cpu;
- - else if (target_cpu < 0)
- - target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
- +static int start_cpu(bool boosted)
- +{
- + struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- - return target_cpu;
- + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
- }
- -static int energy_aware_wake_cpu(struct task_struct *p, int target)
- +static inline int find_best_target(struct task_struct *p, int *backup_cpu,
- + bool boosted, bool prefer_idle)
- {
- + unsigned long min_util = boosted_task_util(p);
- + unsigned long target_capacity = ULONG_MAX;
- + unsigned long min_wake_util = ULONG_MAX;
- + unsigned long target_max_spare_cap = 0;
- + unsigned long best_active_util = ULONG_MAX;
- + int best_idle_cstate = INT_MAX;
- struct sched_domain *sd;
- - struct sched_group *sg, *sg_target;
- - int target_max_cap = INT_MAX;
- - int target_cpu = task_cpu(p);
- - unsigned long min_util;
- - unsigned long new_util;
- - int i;
- + struct sched_group *sg;
- + int best_active_cpu = -1;
- + int best_idle_cpu = -1;
- + int target_cpu = -1;
- + int cpu, i;
- - sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
- + *backup_cpu = -1;
- - if (!sd)
- - return target;
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts);
- + schedstat_inc(this_rq()->eas_stats.fbt_attempts);
- - sg = sd->groups;
- - sg_target = sg;
- + /* Find start CPU based on boost value */
- + cpu = start_cpu(boosted);
- + if (cpu < 0) {
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu);
- + schedstat_inc(this_rq()->eas_stats.fbt_no_cpu);
- + return -1;
- + }
- - if (sysctl_sched_is_big_little) {
- + /* Find SD for the start CPU */
- + sd = rcu_dereference(per_cpu(sd_ea, cpu));
- + if (!sd) {
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd);
- + schedstat_inc(this_rq()->eas_stats.fbt_no_sd);
- + return -1;
- + }
- - /*
- - * Find group with sufficient capacity. We only get here if no cpu is
- - * overutilized. We may end up overutilizing a cpu by adding the task,
- - * but that should not be any worse than select_idle_sibling().
- - * load_balance() should sort it out later as we get above the tipping
- - * point.
- - */
- - do {
- - /* Assuming all cpus are the same in group */
- - int max_cap_cpu = group_first_cpu(sg);
- + /* Scan CPUs in all SDs */
- + sg = sd->groups;
- + do {
- + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
- + unsigned long capacity_curr = capacity_curr_of(i);
- + unsigned long capacity_orig = capacity_orig_of(i);
- + unsigned long wake_util, new_util;
- - /*
- - * Assume smaller max capacity means more energy-efficient.
- - * Ideally we should query the energy model for the right
- - * answer but it easily ends up in an exhaustive search.
- - */
- - if (capacity_of(max_cap_cpu) < target_max_cap &&
- - task_fits_max(p, max_cap_cpu)) {
- - sg_target = sg;
- - target_max_cap = capacity_of(max_cap_cpu);
- - }
- - } while (sg = sg->next, sg != sd->groups);
- + if (!cpu_online(i))
- + continue;
- +
- + if (walt_cpu_high_irqload(i))
- + continue;
- - /* Find cpu with sufficient capacity */
- - min_util = boosted_task_util(p);
- - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- - new_util = cpu_util(i, UTIL_EST) + task_util(p, UTIL_EST);
- + wake_util = cpu_util_wake(i, p);
- + new_util = wake_util + task_util(p);
- /*
- * Ensure minimum capacity to grant the required boost.
- @@ -5574,49 +6340,349 @@
- * than the one required to boost the task.
- */
- new_util = max(min_util, new_util);
- - if (new_util > capacity_orig_of(i))
- + if (new_util > capacity_orig)
- + continue;
- +
- + /*
- + * Case A) Latency sensitive tasks
- + *
- + * Unconditionally favoring tasks that prefer idle CPUs to
- + * improve latency.
- + *
- + * Looking for:
- + * - an idle CPU, whatever its idle_state is, since
- + * the first CPUs we explore are more likely to be
- + * reserved for latency sensitive tasks.
- + * - a non idle CPU where the task fits in its current
- + * capacity and has the maximum spare capacity.
- + * - a non idle CPU with lower contention from other
- + * tasks and running at the lowest possible OPP.
- + *
- + * The last two goals try to favor a non idle CPU
- + * where the task can run as if it is "almost alone".
- + * A maximum spare capacity CPU is favoured since
- + * the task already fits into that CPU's capacity
- + * without waiting for an OPP change.
- + *
- + * The following code path is the only one in the CPUs
- + * exploration loop which is always used by
- + * prefer_idle tasks. It exits the loop with either a
- + * best_active_cpu or a target_cpu which should
- + * represent an optimal choice for latency sensitive
- + * tasks.
- + */
- + if (prefer_idle) {
- +
- + /*
- + * Case A.1: IDLE CPU
- + * Return the first IDLE CPU we find.
- + */
- + if (idle_cpu(i)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle);
- + schedstat_inc(this_rq()->eas_stats.fbt_pref_idle);
- +
- + trace_sched_find_best_target(p,
- + prefer_idle, min_util,
- + cpu, best_idle_cpu,
- + best_active_cpu, i);
- +
- + return i;
- + }
- +
- + /*
- + * Case A.2: Target ACTIVE CPU
- + * Favor CPUs with max spare capacity.
- + */
- + if ((capacity_curr > new_util) &&
- + (capacity_orig - new_util > target_max_spare_cap)) {
- + target_max_spare_cap = capacity_orig - new_util;
- + target_cpu = i;
- + continue;
- + }
- + if (target_cpu != -1)
- + continue;
- +
- +
- + /*
- + * Case A.3: Backup ACTIVE CPU
- + * Favor CPUs with:
- + * - lower utilization due to other tasks
- + * - lower utilization with the task in
- + */
- + if (wake_util > min_wake_util)
- + continue;
- + if (new_util > best_active_util)
- + continue;
- + min_wake_util = wake_util;
- + best_active_util = new_util;
- + best_active_cpu = i;
- continue;
- + }
- - if (new_util < capacity_curr_of(i)) {
- - target_cpu = i;
- - if (cpu_rq(i)->nr_running)
- - break;
- + /*
- + * Enforce EAS mode
- + *
- + * For non latency sensitive tasks, skip CPUs that
- + * will be overutilized by moving the task there.
- + *
- + * The goal here is to remain in EAS mode as long as
- + * possible at least for !prefer_idle tasks.
- + */
- + if ((new_util * capacity_margin) >
- + (capacity_orig * SCHED_CAPACITY_SCALE))
- + continue;
- +
- + /*
- + * Favor CPUs with smaller capacity for Non latency
- + * sensitive tasks.
- + */
- + if (capacity_orig > target_capacity)
- + continue;
- +
- + /*
- + * Case B) Non latency sensitive tasks on IDLE CPUs.
- + *
- + * Find an optimal backup IDLE CPU for non latency
- + * sensitive tasks.
- + *
- + * Looking for:
- + * - minimizing the capacity_orig,
- + * i.e. preferring LITTLE CPUs
- + * - favoring shallowest idle states
- + * i.e. avoid waking up deep-idle CPUs
- + *
- + * The following code path is used by non latency
- + * sensitive tasks if IDLE CPUs are available. If at
- + * least one such CPU is available, it sets the
- + * best_idle_cpu to the most suitable idle CPU to be
- + * selected.
- + *
- + * If idle CPUs are available, favour these CPUs to
- + * improve performance by spreading tasks.
- + * Indeed, the energy_diff() computed by the caller
- + * will take care of minimizing energy
- + * consumption without affecting performance.
- + */
- + if (idle_cpu(i)) {
- + int idle_idx = idle_get_state_idx(cpu_rq(i));
- +
- + /*
- + * Skip CPUs in deeper idle state, but only
- + * if they are also less energy efficient.
- + * IOW, prefer a deep IDLE LITTLE CPU vs a
- + * shallow idle big CPU.
- + */
- + if (sysctl_sched_cstate_aware &&
- + best_idle_cstate <= idle_idx)
- + continue;
- +
- + /* Keep track of best idle CPU */
- + target_capacity = capacity_orig;
- + best_idle_cstate = idle_idx;
- + best_idle_cpu = i;
- + continue;
- }
- - /* cpu has capacity at higher OPP, keep it as fallback */
- - if (target_cpu == task_cpu(p))
- - target_cpu = i;
- + /*
- + * Case C) Non latency sensitive tasks on ACTIVE CPUs.
- + *
- + * Pack tasks in the most energy efficient capacities.
- + *
- + * This task packing strategy prefers more energy
- + * efficient CPUs (i.e. pack on smaller maximum
- + * capacity CPUs) while also trying to spread tasks to
- + * run them all at the lower OPP.
- + *
- + * This assumes for example that it's more energy
- + * efficient to run two tasks on two CPUs at a lower
- + * OPP than packing both on a single CPU but running
- + * that CPU at a higher OPP.
- + *
- + * Thus, this case keeps track of the CPU with the
- + * smallest maximum capacity and highest spare maximum
- + * capacity.
- + */
- +
- + /* Favor CPUs with maximum spare capacity */
- + if ((capacity_orig - new_util) < target_max_spare_cap)
- + continue;
- +
- + target_max_spare_cap = capacity_orig - new_util;
- + target_capacity = capacity_orig;
- + target_cpu = i;
- }
- - } else {
- - /*
- - * Find a cpu with sufficient capacity
- - */
- - bool boosted = schedtune_task_boost(p) > 0;
- - bool prefer_idle = schedtune_prefer_idle(p) > 0;
- - int tmp_target = find_best_target(p, boosted, prefer_idle);
- - if (tmp_target >= 0) {
- - target_cpu = tmp_target;
- - if ((boosted || prefer_idle) && idle_cpu(target_cpu))
- - return target_cpu;
- +
- + } while (sg = sg->next, sg != sd->groups);
- +
- + /*
- + * For non latency sensitive tasks, cases B and C in the previous loop,
- + * we pick the best IDLE CPU only if we were not able to find a target
- + * ACTIVE CPU.
- + *
- + * Policies priorities:
- + *
- + * - prefer_idle tasks:
- + *
- + * a) IDLE CPU available, we return immediately
- + * b) ACTIVE CPU where the task fits and has the largest maximum spare
- + * capacity (i.e. target_cpu)
- + * c) ACTIVE CPU with less contention due to other tasks
- + * (i.e. best_active_cpu)
- + *
- + * - NON prefer_idle tasks:
- + *
- + * a) ACTIVE CPU: target_cpu
- + * b) IDLE CPU: best_idle_cpu
- + */
- + if (target_cpu == -1)
- + target_cpu = prefer_idle
- + ? best_active_cpu
- + : best_idle_cpu;
- + else
- + *backup_cpu = prefer_idle
- + ? best_active_cpu
- + : best_idle_cpu;
- +
- + trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
- + best_idle_cpu, best_active_cpu,
- + target_cpu);
- +
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_count);
- + schedstat_inc(this_rq()->eas_stats.fbt_count);
- +
- + return target_cpu;
- +}
- +
- +/*
- + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
- + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
- + *
- + * In that case WAKE_AFFINE doesn't make sense and we'll let
- + * BALANCE_WAKE sort things out.
- + */
- +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
- +{
- + long min_cap, max_cap;
- + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
- + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
- + /* Minimum capacity is close to max, no need to abort wake_affine */
- + if (max_cap - min_cap < max_cap >> 3)
- + return 0;
- +
- + /* Bring task utilization in sync with prev_cpu */
- + sync_entity_load_avg(&p->se);
- +
- + return min_cap * 1024 < task_util(p) * capacity_margin;
- +}
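wake_cap() above disables wake-affine placement when the task does not fit the smaller of the waking and previous CPUs with some headroom. The sketch below is a userspace model of those two checks; the capacity figures and the margin of 1280 (roughly: the task must fit within ~80% of the CPU's capacity) are assumptions for the example, since the kernel uses its own capacity_margin value.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

static int wake_cap_model(unsigned long min_cap, unsigned long max_cap,
			  unsigned long task_util, unsigned long margin)
{
	/* Capacities are within 12.5% of each other: keep wake-affine. */
	if (max_cap - min_cap < max_cap >> 3)
		return 0;

	/* Task does not fit the smaller CPU with margin: disable wake-affine. */
	return min_cap * SCHED_CAPACITY_SCALE < task_util * margin;
}

int main(void)
{
	/* Assumed big.LITTLE pair: LITTLE capacity 430, big capacity 1024. */
	printf("%d\n", wake_cap_model(430, 1024, 400, 1280));	/* 1: too big for LITTLE */
	printf("%d\n", wake_cap_model(430, 1024, 200, 1280));	/* 0: fits with margin   */
	return 0;
}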
- +
- +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
- +{
- + bool boosted, prefer_idle;
- + struct sched_domain *sd;
- + int target_cpu;
- + int backup_cpu;
- + int next_cpu;
- +
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
- + schedstat_inc(this_rq()->eas_stats.secb_attempts);
- +
- + if (sysctl_sched_sync_hint_enable && sync) {
- + int cpu = smp_processor_id();
- +
- + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_sync);
- + schedstat_inc(this_rq()->eas_stats.secb_sync);
- + return cpu;
- }
- }
- - if (target_cpu != task_cpu(p)) {
- + rcu_read_lock();
- +#ifdef CONFIG_CGROUP_SCHEDTUNE
- + boosted = schedtune_task_boost(p) > 0;
- + prefer_idle = schedtune_prefer_idle(p) > 0;
- +#else
- + boosted = get_sysctl_sched_cfs_boost() > 0;
- + prefer_idle = 0;
- +#endif
- +
- +
- +
- + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
- + if (!sd) {
- + target_cpu = prev_cpu;
- + goto unlock;
- + }
- +
- + sync_entity_load_avg(&p->se);
- +
- + /* Find a cpu with sufficient capacity */
- + next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);
- + if (next_cpu == -1) {
- + target_cpu = prev_cpu;
- + goto unlock;
- + }
- +
- + /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
- + if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
- + schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
- + target_cpu = next_cpu;
- + goto unlock;
- + }
- +
- + target_cpu = prev_cpu;
- + if (next_cpu != prev_cpu) {
- + int delta = 0;
- struct energy_env eenv = {
- - .util_delta = task_util(p, UTIL_EST),
- - .src_cpu = task_cpu(p),
- - .dst_cpu = target_cpu,
- - .task = p,
- + .p = p,
- + .util_delta = task_util(p),
- + /* Task's previous CPU candidate */
- + .cpu[EAS_CPU_PRV] = {
- + .cpu_id = prev_cpu,
- + },
- + /* Main alternative CPU candidate */
- + .cpu[EAS_CPU_NXT] = {
- + .cpu_id = next_cpu,
- + },
- + /* Backup alternative CPU candidate */
- + .cpu[EAS_CPU_BKP] = {
- + .cpu_id = backup_cpu,
- + },
- };
- +
- +#ifdef CONFIG_SCHED_WALT
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
- + p->state == TASK_WAKING)
- + delta = task_util(p);
- +#endif
- /* Not enough spare capacity on previous cpu */
- - if (cpu_overutilized(task_cpu(p)))
- - return target_cpu;
- + if (__cpu_overutilized(prev_cpu, delta)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
- + schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
- + target_cpu = next_cpu;
- + goto unlock;
- + }
- - if (energy_diff(&eenv) >= 0)
- - return task_cpu(p);
- + /* Check if EAS_CPU_NXT is a more energy efficient CPU */
- + if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
- + schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
- + target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
- + goto unlock;
- + }
- +
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
- + schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
- + target_cpu = prev_cpu;
- + goto unlock;
- }
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_count);
- + schedstat_inc(this_rq()->eas_stats.secb_count);
- +
- +unlock:
- + rcu_read_unlock();
- return target_cpu;
- }
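select_energy_cpu_brute() above ends up comparing three candidates: the task's previous CPU, the find_best_target() pick, and its backup. The toy model below only illustrates that comparison shape with invented energy numbers; the real decision comes from the platform energy model through select_energy_cpu_idx() and is further shaped by features such as FBT_STRICT_ORDER.

#include <stdio.h>

enum { EAS_CPU_PRV, EAS_CPU_NXT, EAS_CPU_BKP, EAS_CPU_CNT };

/* Stay with the previous CPU unless an alternative strictly saves energy. */
static int pick_candidate(const long energy[EAS_CPU_CNT])
{
	int best = EAS_CPU_PRV;
	int i;

	for (i = EAS_CPU_NXT; i < EAS_CPU_CNT; i++)
		if (energy[i] < energy[best])
			best = i;
	return best;
}

int main(void)
{
	long estimate[EAS_CPU_CNT] = { 1000, 900, 950 };	/* invented energy estimates */

	printf("winner: %d\n", pick_candidate(estimate));	/* 1 == EAS_CPU_NXT */
	return 0;
}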
- @@ -5633,7 +6699,8 @@
- * preempt must be disabled.
- */
- static int
- -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
- +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
- + int sibling_count_hint)
- {
- struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
- int cpu = smp_processor_id();
- @@ -5641,13 +6708,15 @@
- int want_affine = 0;
- int sync = wake_flags & WF_SYNC;
- - if (p->nr_cpus_allowed == 1)
- - return prev_cpu;
- + if (sd_flag & SD_BALANCE_WAKE) {
- + record_wakee(p);
- + want_affine = !wake_wide(p, sibling_count_hint) &&
- + !wake_cap(p, cpu, prev_cpu) &&
- + cpumask_test_cpu(cpu, &p->cpus_allowed);
- + }
- - if (sd_flag & SD_BALANCE_WAKE)
- - want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
- - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
- - energy_aware();
- + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
- + return select_energy_cpu_brute(p, prev_cpu, sync);
- rcu_read_lock();
- for_each_domain(cpu, tmp) {
- @@ -5672,65 +6741,25 @@
- if (affine_sd) {
- sd = NULL; /* Prefer wake_affine over balance flags */
- - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
- new_cpu = cpu;
- }
- - if (!sd) {
- - int sync_used = 0;
- - bool about_to_idle = (cpu_rq(cpu)->nr_running < 2);
- -
- - if (sysctl_sched_sync_hint_enable && sync
- - && about_to_idle) {
- - cpumask_t search_cpus;
- - cpumask_and(&search_cpus, tsk_cpus_allowed(p),
- - cpu_online_mask);
- - if (cpumask_test_cpu(cpu, &search_cpus)) {
- - sync_used = 1;
- - new_cpu = cpu;
- - }
- - }
- -
- - if (!sync_used) {
- - if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- - new_cpu = energy_aware_wake_cpu(p, prev_cpu);
- - else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- - new_cpu = select_idle_sibling(p, new_cpu);
- - }
- -
- - } else while (sd) {
- - struct sched_group *group;
- - int weight;
- -
- - if (!(sd->flags & sd_flag)) {
- - sd = sd->child;
- - continue;
- - }
- -
- - group = find_idlest_group(sd, p, cpu, sd_flag);
- - if (!group) {
- - sd = sd->child;
- - continue;
- - }
- + if (sd && !(sd_flag & SD_BALANCE_FORK)) {
- + /*
- + * We're going to need the task's util for capacity_spare_wake
- + * in find_idlest_group. Sync it up to prev_cpu's
- + * last_update_time.
- + */
- + sync_entity_load_avg(&p->se);
- + }
- - new_cpu = find_idlest_cpu(group, p, cpu);
- - if (new_cpu == -1 || new_cpu == cpu) {
- - /* Now try balancing at a lower domain level of cpu */
- - sd = sd->child;
- - continue;
- - }
- + if (!sd) {
- + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- - /* Now try balancing at a lower domain level of new_cpu */
- - cpu = new_cpu;
- - weight = sd->span_weight;
- - sd = NULL;
- - for_each_domain(cpu, tmp) {
- - if (weight <= tmp->span_weight)
- - break;
- - if (tmp->flags & sd_flag)
- - sd = tmp;
- - }
- - /* while loop will break here if sd == NULL */
- + } else {
- + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
- }
- rcu_read_unlock();
- @@ -5742,7 +6771,7 @@
- * cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
- */
- -static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
- +static void migrate_task_rq_fair(struct task_struct *p)
- {
- /*
- * We are supposed to update the task to "current" time, then its up to date
- @@ -5929,7 +6958,7 @@
- }
- static struct task_struct *
- -pick_next_task_fair(struct rq *rq, struct task_struct *prev)
- +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- struct cfs_rq *cfs_rq = &rq->cfs;
- struct sched_entity *se;
- @@ -6041,8 +7070,15 @@
- idle:
- rq->misfit_task = 0;
- -
- + /*
- + * This is OK, because current is on_cpu, which avoids it being picked
- + * for load-balance and preemption/IRQs are still disabled avoiding
- + * further scheduler activity on it and we're being very careful to
- + * re-start the picking loop.
- + */
- + lockdep_unpin_lock(&rq->lock, cookie);
- new_tasks = idle_balance(rq);
- + lockdep_repin_lock(&rq->lock, cookie);
- /*
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
- * possible for any higher priority task to appear. In that case we
- @@ -6101,7 +7137,7 @@
- * so we don't do microscopic update in schedule()
- * and double the fastpath cost.
- */
- - rq->skip_clock_update = 1;
- + rq_clock_skip_update(rq, true);
- }
- set_skip_buddy(se);
- @@ -6320,90 +7356,57 @@
- }
- #ifdef CONFIG_NUMA_BALANCING
- -/* Returns true if the destination node has incurred more faults */
- -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
- +/*
- + * Returns 1, if task migration degrades locality
- + * Returns 0, if task migration improves locality i.e migration preferred.
- + * Returns -1, if task migration is not affected by locality.
- + */
- +static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
- {
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
- + unsigned long src_faults, dst_faults;
- int src_nid, dst_nid;
- - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
- - !(env->sd->flags & SD_NUMA)) {
- - return false;
- - }
- + if (!static_branch_likely(&sched_numa_balancing))
- + return -1;
- +
- + if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- + return -1;
- src_nid = cpu_to_node(env->src_cpu);
- dst_nid = cpu_to_node(env->dst_cpu);
- if (src_nid == dst_nid)
- - return false;
- -
- - if (numa_group) {
- - /* Task is already in the group's interleave set. */
- - if (node_isset(src_nid, numa_group->active_nodes))
- - return false;
- -
- - /* Task is moving into the group's interleave set. */
- - if (node_isset(dst_nid, numa_group->active_nodes))
- - return true;
- + return -1;
- - return group_faults(p, dst_nid) > group_faults(p, src_nid);
- + /* Migrating away from the preferred node is always bad. */
- + if (src_nid == p->numa_preferred_nid) {
- + if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
- + return 1;
- + else
- + return -1;
- }
- /* Encourage migration to the preferred node. */
- if (dst_nid == p->numa_preferred_nid)
- - return true;
- -
- - return task_faults(p, dst_nid) > task_faults(p, src_nid);
- -}
- -
- -
- -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
- -{
- - struct numa_group *numa_group = rcu_dereference(p->numa_group);
- - int src_nid, dst_nid;
- -
- - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
- - return false;
- -
- - if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
- - return false;
- -
- - src_nid = cpu_to_node(env->src_cpu);
- - dst_nid = cpu_to_node(env->dst_cpu);
- -
- - if (src_nid == dst_nid)
- - return false;
- + return 0;
- if (numa_group) {
- - /* Task is moving within/into the group's interleave set. */
- - if (node_isset(dst_nid, numa_group->active_nodes))
- - return false;
- -
- - /* Task is moving out of the group's interleave set. */
- - if (node_isset(src_nid, numa_group->active_nodes))
- - return true;
- -
- - return group_faults(p, dst_nid) < group_faults(p, src_nid);
- + src_faults = group_faults(p, src_nid);
- + dst_faults = group_faults(p, dst_nid);
- + } else {
- + src_faults = task_faults(p, src_nid);
- + dst_faults = task_faults(p, dst_nid);
- }
- - /* Migrating away from the preferred node is always bad. */
- - if (src_nid == p->numa_preferred_nid)
- - return true;
- -
- - return task_faults(p, dst_nid) < task_faults(p, src_nid);
- + return dst_faults < src_faults;
- }
- #else
- -static inline bool migrate_improves_locality(struct task_struct *p,
- +static inline int migrate_degrades_locality(struct task_struct *p,
- struct lb_env *env)
- {
- - return false;
- -}
- -
- -static inline bool migrate_degrades_locality(struct task_struct *p,
- - struct lb_env *env)
- -{
- - return false;
- + return -1;
- }
- #endif
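migrate_degrades_locality() now reports a tri-state result instead of a boolean: 1 degrades locality, 0 improves it, -1 means NUMA locality has nothing to say. The userspace model below shows how such a result is folded into the cache-hot decision, mirroring the can_migrate_task() hunk further down; all sample inputs are made up.

#include <stdio.h>

static int can_migrate_model(int degrades_locality, int task_hot,
			     int nr_balance_failed, int cache_nice_tries)
{
	int tsk_cache_hot = degrades_locality;

	/* Only consult cache hotness when locality gives no opinion. */
	if (tsk_cache_hot == -1)
		tsk_cache_hot = task_hot;

	/* Migrate if locality improves, the task is cold, or balancing keeps failing. */
	return tsk_cache_hot <= 0 || nr_balance_failed > cache_nice_tries;
}

int main(void)
{
	printf("%d\n", can_migrate_model(0, 1, 0, 5));	/* 1: locality improves       */
	printf("%d\n", can_migrate_model(1, 0, 0, 5));	/* 0: migration would degrade */
	printf("%d\n", can_migrate_model(-1, 1, 6, 5));	/* 1: hot, but forced anyway  */
	return 0;
}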
- @@ -6413,7 +7416,7 @@
- static
- int can_migrate_task(struct task_struct *p, struct lb_env *env)
- {
- - int tsk_cache_hot = 0;
- + int tsk_cache_hot;
- lockdep_assert_held(&env->src_rq->lock);
- @@ -6430,7 +7433,7 @@
- if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
- int cpu;
- - schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
- + schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
- env->flags |= LBF_SOME_PINNED;
- @@ -6461,7 +7464,7 @@
- env->flags &= ~LBF_ALL_PINNED;
- if (task_running(env->src_rq, p)) {
- - schedstat_inc(p, se.statistics.nr_failed_migrations_running);
- + schedstat_inc(p->se.statistics.nr_failed_migrations_running);
- return 0;
- }
- @@ -6471,20 +7474,20 @@
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
- */
- - tsk_cache_hot = task_hot(p, env);
- - if (!tsk_cache_hot)
- - tsk_cache_hot = migrate_degrades_locality(p, env);
- + tsk_cache_hot = migrate_degrades_locality(p, env);
- + if (tsk_cache_hot == -1)
- + tsk_cache_hot = task_hot(p, env);
- - if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
- + if (tsk_cache_hot <= 0 ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- - if (tsk_cache_hot) {
- - schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- - schedstat_inc(p, se.statistics.nr_forced_migrations);
- + if (tsk_cache_hot == 1) {
- + schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- + schedstat_inc(p->se.statistics.nr_forced_migrations);
- }
- return 1;
- }
- - schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
- + schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
- return 0;
- }
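A large share of the hunks in this file only track a schedstat API change: the two-argument schedstat_inc(ptr, field) form becomes a single-expression schedstat_inc(ptr->field). The sketch below models that call-convention change with deliberately simplified macro bodies; they are assumptions for illustration, not the kernel's definitions.

#include <stdio.h>

struct stats { unsigned long lb_count; };

#define OLD_SCHEDSTAT_INC(ptr, field)	do { (ptr)->field++; } while (0)
#define NEW_SCHEDSTAT_INC(var)		do { (var)++; } while (0)

int main(void)
{
	struct stats s = { 0 };

	OLD_SCHEDSTAT_INC(&s, lb_count);	/* old style: schedstat_inc(sd, lb_count[idle])  */
	NEW_SCHEDSTAT_INC(s.lb_count);		/* new style: schedstat_inc(sd->lb_count[idle]) */
	printf("%lu\n", s.lb_count);		/* 2 */
	return 0;
}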
- @@ -6495,8 +7498,8 @@
- {
- lockdep_assert_held(&env->src_rq->lock);
- - deactivate_task(env->src_rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- + deactivate_task(env->src_rq, p, 0);
- double_lock_balance(env->src_rq, env->dst_rq);
- set_task_cpu(p, env->dst_cpu);
- double_unlock_balance(env->src_rq, env->dst_rq);
- @@ -6526,7 +7529,7 @@
- * so we can safely collect stats here rather than
- * inside detach_tasks().
- */
- - schedstat_inc(env->sd, lb_gained[env->idle]);
- + schedstat_inc(env->sd->lb_gained[env->idle]);
- return p;
- }
- return NULL;
- @@ -6618,7 +7621,7 @@
- * so we can safely collect detach_one_task() stats here rather
- * than inside detach_one_task().
- */
- - schedstat_add(env->sd, lb_gained[env->idle], detached);
- + schedstat_add(env->sd->lb_gained[env->idle], detached);
- return detached;
- }
- @@ -6631,8 +7634,8 @@
- lockdep_assert_held(&rq->lock);
- BUG_ON(task_rq(p) != rq);
- - p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(rq, p, 0);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- check_preempt_curr(rq, p, 0);
- }
- @@ -6647,7 +7650,7 @@
- /*
- * We want to potentially raise target_cpu's OPP.
- */
- - update_capacity_of(cpu_of(rq), true);
- + update_capacity_of(cpu_of(rq));
- raw_spin_unlock(&rq->lock);
- }
- @@ -6672,7 +7675,7 @@
- /*
- * We want to potentially raise env.dst_cpu's OPP.
- */
- - update_capacity_of(env->dst_cpu, true);
- + update_capacity_of(env->dst_cpu);
- raw_spin_unlock(&env->dst_rq->lock);
- }
- @@ -6692,12 +7695,20 @@
- * list_add_leaf_cfs_rq() for details.
- */
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- + struct sched_entity *se;
- +
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
- - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
- + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
- + true))
- update_tg_load_avg(cfs_rq, 0);
- +
- + /* Propagate pending load changes to the parent, if any: */
- + se = cfs_rq->tg->se[cpu];
- + if (se && !skip_blocked_update(se))
- + update_load_avg(se, 0);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -6757,7 +7768,7 @@
- raw_spin_lock_irqsave(&rq->lock, flags);
- update_rq_clock(rq);
- - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
- + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -6908,6 +7919,9 @@
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
- + capacity *= arch_scale_max_freq_capacity(sd, cpu);
- + capacity >>= SCHED_CAPACITY_SHIFT;
- +
- mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
- raw_spin_lock_irqsave(&mcc->lock, flags);
- @@ -6937,13 +7951,14 @@
- cpu_rq(cpu)->cpu_capacity = capacity;
- sdg->sgc->capacity = capacity;
- sdg->sgc->max_capacity = capacity;
- + sdg->sgc->min_capacity = capacity;
- }
- void update_group_capacity(struct sched_domain *sd, int cpu)
- {
- struct sched_domain *child = sd->child;
- struct sched_group *group, *sdg = sd->groups;
- - unsigned long capacity, max_capacity;
- + unsigned long capacity, max_capacity, min_capacity;
- unsigned long interval;
- interval = msecs_to_jiffies(sd->balance_interval);
- @@ -6957,6 +7972,7 @@
- capacity = 0;
- max_capacity = 0;
- + min_capacity = ULONG_MAX;
- if (child->flags & SD_OVERLAP) {
- /*
- @@ -6987,6 +8003,7 @@
- }
- max_capacity = max(capacity, max_capacity);
- + min_capacity = min(capacity, min_capacity);
- }
- } else {
- /*
- @@ -7000,12 +8017,14 @@
- capacity += sgc->capacity;
- max_capacity = max(sgc->max_capacity, max_capacity);
- + min_capacity = min(sgc->min_capacity, min_capacity);
- group = group->next;
- } while (group != child->groups);
- }
- sdg->sgc->capacity = capacity;
- sdg->sgc->max_capacity = max_capacity;
- + sdg->sgc->min_capacity = min_capacity;
- }
- /*
- @@ -7112,9 +8131,9 @@
- ref->sgc->max_capacity;
- }
- -static enum group_type group_classify(struct lb_env *env,
- - struct sched_group *group,
- - struct sg_lb_stats *sgs)
- +static inline enum
- +group_type group_classify(struct sched_group *group,
- + struct sg_lb_stats *sgs)
- {
- if (sgs->group_no_capacity)
- return group_overloaded;
- @@ -7128,6 +8147,38 @@
- return group_other;
- }
- +#ifdef CONFIG_NO_HZ_COMMON
- +/*
- + * idle load balancing data
- + * - used by the nohz balance, but we want it available here
- + * so that we can see which CPUs have no tick.
- + */
- +static struct {
- + cpumask_var_t idle_cpus_mask;
- + atomic_t nr_cpus;
- + unsigned long next_balance; /* in jiffy units */
- +} nohz ____cacheline_aligned;
- +
- +static inline void update_cpu_stats_if_tickless(struct rq *rq)
- +{
- + /* only called from update_sg_lb_stats when irqs are disabled */
- + if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
- + /* rate limit updates to once per jiffy at most */
- + if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
- + return;
- +
- + raw_spin_lock(&rq->lock);
- + update_rq_clock(rq);
- + update_idle_cpu_load(rq);
- + update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
- + raw_spin_unlock(&rq->lock);
- + }
- +}
- +
- +#else
- +static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
- +#endif
- +
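update_cpu_stats_if_tickless() above refreshes load statistics for tick-stopped CPUs at most once per jiffy. The standalone sketch below models only that rate limit, with plain counters standing in for jiffies and rq->last_load_update_tick; in the kernel the stamp is refreshed by the load-update path itself rather than by the check.

#include <stdio.h>

static unsigned long jiffies;			/* simulated global tick counter */
static unsigned long last_load_update_tick;	/* simulated per-rq stamp */

static int should_update(void)
{
	if (jiffies <= last_load_update_tick)
		return 0;			/* already updated this jiffy */
	last_load_update_tick = jiffies;
	return 1;
}

int main(void)
{
	jiffies = 100;
	printf("%d %d\n", should_update(), should_update());	/* 1 0 */
	jiffies = 101;
	printf("%d\n", should_update());			/* 1 */
	return 0;
}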
- /**
- * update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @env: The load balancing environment.
- @@ -7151,6 +8202,12 @@
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- struct rq *rq = cpu_rq(i);
- + /* if we are entering idle and there are CPUs with
- + * their tick stopped, do an update for them
- + */
- + if (env->idle == CPU_NEWLY_IDLE)
- + update_cpu_stats_if_tickless(rq);
- +
- /* Bias balancing toward cpus of our domain */
- if (local_group)
- load = target_load(i, load_idx);
- @@ -7158,7 +8215,7 @@
- load = source_load(i, load_idx);
- sgs->group_load += load;
- - sgs->group_util += cpu_util(i, UTIL_AVG);
- + sgs->group_util += cpu_util(i);
- sgs->sum_nr_running += rq->cfs.h_nr_running;
- nr_running = rq->nr_running;
- @@ -7193,7 +8250,7 @@
- sgs->group_weight = group->group_weight;
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- - sgs->group_type = group_classify(env, group, sgs);
- + sgs->group_type = group_classify(group, sgs);
- }
- /**
- @@ -7233,18 +8290,27 @@
- if (sgs->avg_load <= busiest->avg_load)
- return false;
- + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- + goto asym_packing;
- +
- /*
- - * Candiate sg has no more than one task per cpu and has higher
- - * per-cpu capacity. No reason to pull tasks to less capable cpus.
- + * Candidate sg has no more than one task per CPU and
- + * has higher per-CPU capacity. Migrating tasks to less
- + * capable CPUs may harm throughput. We maximize throughput here;
- + * power/energy consequences are not considered.
- */
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
- return false;
- +asym_packing:
- /* This is the busiest node in its class. */
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return true;
- + /* No ASYM_PACKING if target cpu is already busy */
- + if (env->idle == CPU_NOT_IDLE)
- + return true;
- /*
- * ASYM_PACKING needs to move all the work to the lowest
- * numbered CPUs in the group, therefore mark all groups
- @@ -7254,7 +8320,8 @@
- if (!sds->busiest)
- return true;
- - if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
- + /* Prefer to pull work from the highest-numbered CPU */
- + if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
- return true;
- }
- @@ -7291,6 +8358,9 @@
- }
- #endif /* CONFIG_NUMA_BALANCING */
- +#define lb_sd_parent(sd) \
- + (sd->parent && sd->parent->groups != sd->parent->groups->next)
- +
- /**
- * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- * @env: The load balancing environment.
- @@ -7343,7 +8413,7 @@
- group_has_capacity(env, &sds->local_stat) &&
- (sgs->sum_nr_running > 1)) {
- sgs->group_no_capacity = 1;
- - sgs->group_type = group_overloaded;
- + sgs->group_type = group_classify(sg, sgs);
- }
- /*
- @@ -7373,7 +8443,7 @@
- env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- - if (!env->sd->parent) {
- + if (!lb_sd_parent(env->sd)) {
- /* update overload indicator if we are at root domain */
- if (env->dst_rq->rd->overload != overload)
- env->dst_rq->rd->overload = overload;
- @@ -7422,6 +8492,9 @@
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return 0;
- + if (env->idle == CPU_NOT_IDLE)
- + return 0;
- +
- if (!sds->busiest)
- return 0;
- @@ -7639,8 +8712,7 @@
- busiest = &sds.busiest_stat;
- /* ASYM feature bypasses nice load balance check */
- - if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
- - check_asym_packing(env, &sds))
- + if (check_asym_packing(env, &sds))
- return sds.busiest;
- /* There is no busy sibling group to pull tasks from */
- @@ -7658,8 +8730,11 @@
- if (busiest->group_type == group_imbalanced)
- goto force_balance;
- - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
- + /*
- + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
- + * capacities from resulting in underutilization due to avg_load.
- + */
- + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
- busiest->group_no_capacity)
- goto force_balance;
- @@ -7827,6 +8902,7 @@
- }
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
- + ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
- env->src_rq->cfs.h_nr_running == 1 &&
- cpu_overutilized(env->src_cpu) &&
- !cpu_overutilized(env->dst_cpu)) {
- @@ -7881,7 +8957,7 @@
- int *continue_balancing)
- {
- int ld_moved, cur_ld_moved, active_balance = 0;
- - struct sched_domain *sd_parent = sd->parent;
- + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
- struct sched_group *group;
- struct rq *busiest;
- unsigned long flags;
- @@ -7908,7 +8984,7 @@
- cpumask_copy(cpus, cpu_active_mask);
- - schedstat_inc(sd, lb_count[idle]);
- + schedstat_inc(sd->lb_count[idle]);
- redo:
- if (!should_we_balance(&env)) {
- @@ -7918,19 +8994,19 @@
- group = find_busiest_group(&env);
- if (!group) {
- - schedstat_inc(sd, lb_nobusyg[idle]);
- + schedstat_inc(sd->lb_nobusyg[idle]);
- goto out_balanced;
- }
- busiest = find_busiest_queue(&env, group);
- if (!busiest) {
- - schedstat_inc(sd, lb_nobusyq[idle]);
- + schedstat_inc(sd->lb_nobusyq[idle]);
- goto out_balanced;
- }
- BUG_ON(busiest == env.dst_rq);
- - schedstat_add(sd, lb_imbalance[idle], env.imbalance);
- + schedstat_add(sd->lb_imbalance[idle], env.imbalance);
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
- @@ -7948,6 +9024,7 @@
- more_balance:
- raw_spin_lock_irqsave(&busiest->lock, flags);
- + update_rq_clock(busiest);
- /*
- * cur_ld_moved - load moved in current iteration
- @@ -7958,7 +9035,7 @@
- * We want to potentially lower env.src_cpu's OPP.
- */
- if (cur_ld_moved)
- - update_capacity_of(env.src_cpu, true);
- + update_capacity_of(env.src_cpu);
- /*
- * We've detached some tasks from busiest_rq. Every
- @@ -8032,7 +9109,24 @@
- /* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(env.flags & LBF_ALL_PINNED)) {
- cpumask_clear_cpu(cpu_of(busiest), cpus);
- - if (!cpumask_empty(cpus)) {
- + /*
- + * dst_cpu is not a valid busiest cpu in the following
- + * check since load cannot be pulled from dst_cpu to be
- + * put on dst_cpu.
- + */
- + cpumask_clear_cpu(env.dst_cpu, cpus);
- + /*
- + * Go back to "redo" iff the load-balance cpumask
- + * contains other potential busiest cpus for the
- + * current sched domain.
- + */
- + if (cpumask_intersects(cpus, sched_domain_span(env.sd))) {
- + /*
- + * Now that the check has passed, reenable
- + * dst_cpu so that load can be calculated on
- + * it in the redo path.
- + */
- + cpumask_set_cpu(env.dst_cpu, cpus);
- env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
- goto redo;
- @@ -8042,7 +9136,7 @@
- }
- if (!ld_moved) {
- - schedstat_inc(sd, lb_failed[idle]);
- + schedstat_inc(sd->lb_failed[idle]);
- /*
- * Increment the failure counter only on periodic balance.
- * We do not want newidle balance, which can be very
- @@ -8086,10 +9180,7 @@
- &busiest->active_balance_work);
- }
- - /*
- - * We've kicked active balancing, reset the failure
- - * counter.
- - */
- + /* We've kicked active balancing, force task migration. */
- sd->nr_balance_failed = sd->cache_nice_tries+1;
- }
- } else
- @@ -8129,7 +9220,7 @@
- * we can't migrate them. Let the imbalance flag set so parent level
- * can try to migrate them.
- */
- - schedstat_inc(sd, lb_balanced[idle]);
- + schedstat_inc(sd->lb_balanced[idle]);
- sd->nr_balance_failed = 0;
- @@ -8185,8 +9276,6 @@
- u64 curr_cost = 0;
- long removed_util = 0;
- - idle_enter_fair(this_rq);
- -
- /*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
- @@ -8283,14 +9372,13 @@
- pulled_task = -1;
- if (pulled_task) {
- - idle_exit_fair(this_rq);
- this_rq->idle_stamp = 0;
- } else if (removed_util) {
- /*
- * No task pulled and someone has been migrated away.
- * Good case to trigger an OPP update.
- */
- - update_capacity_of(this_cpu, true);
- + update_capacity_of(this_cpu);
- }
- return pulled_task;
- @@ -8308,8 +9396,18 @@
- int busiest_cpu = cpu_of(busiest_rq);
- int target_cpu = busiest_rq->push_cpu;
- struct rq *target_rq = cpu_rq(target_cpu);
- - struct sched_domain *sd;
- + struct sched_domain *sd = NULL;
- struct task_struct *p = NULL;
- + struct task_struct *push_task = NULL;
- + int push_task_detached = 0;
- + struct lb_env env = {
- + .sd = sd,
- + .dst_cpu = target_cpu,
- + .dst_rq = target_rq,
- + .src_cpu = busiest_rq->cpu,
- + .src_rq = busiest_rq,
- + .idle = CPU_IDLE,
- + };
- raw_spin_lock_irq(&busiest_rq->lock);
- @@ -8329,6 +9427,17 @@
- */
- BUG_ON(busiest_rq == target_rq);
- + push_task = busiest_rq->push_task;
- + if (push_task) {
- + if (task_on_rq_queued(push_task) &&
- + task_cpu(push_task) == busiest_cpu &&
- + cpu_online(target_cpu)) {
- + detach_task(push_task, &env);
- + push_task_detached = 1;
- + }
- + goto out_unlock;
- + }
- +
- /* Search for an sd spanning us and the target CPU. */
- rcu_read_lock();
- for_each_domain(target_cpu, sd) {
- @@ -8338,33 +9447,36 @@
- }
- if (likely(sd)) {
- - struct lb_env env = {
- - .sd = sd,
- - .dst_cpu = target_cpu,
- - .dst_rq = target_rq,
- - .src_cpu = busiest_rq->cpu,
- - .src_rq = busiest_rq,
- - .idle = CPU_IDLE,
- - };
- -
- - schedstat_inc(sd, alb_count);
- + env.sd = sd;
- + schedstat_inc(sd->alb_count);
- + update_rq_clock(busiest_rq);
- p = detach_one_task(&env);
- if (p) {
- - schedstat_inc(sd, alb_pushed);
- + schedstat_inc(sd->alb_pushed);
- /*
- * We want to potentially lower env.src_cpu's OPP.
- */
- - update_capacity_of(env.src_cpu, true);
- + update_capacity_of(env.src_cpu);
- }
- else
- - schedstat_inc(sd, alb_failed);
- + schedstat_inc(sd->alb_failed);
- }
- rcu_read_unlock();
- out_unlock:
- busiest_rq->active_balance = 0;
- +
- + if (push_task)
- + busiest_rq->push_task = NULL;
- +
- raw_spin_unlock(&busiest_rq->lock);
- + if (push_task) {
- + if (push_task_detached)
- + attach_one_task(target_rq, push_task);
- + put_task_struct(push_task);
- + }
- +
- if (p)
- attach_one_task(target_rq, p);
- @@ -8385,12 +9497,6 @@
- * needed, they will kick the idle load balancer, which then does idle
- * load balancing for all the idle CPUs.
- */
- -static struct {
- - cpumask_var_t idle_cpus_mask;
- - atomic_t nr_cpus;
- - unsigned long next_balance; /* in jiffy units */
- -} nohz ____cacheline_aligned;
- -
- static inline int find_new_ilb(void)
- {
- int ilb = cpumask_first(nohz.idle_cpus_mask);
- @@ -8449,13 +9555,13 @@
- int cpu = smp_processor_id();
- rcu_read_lock();
- - sd = rcu_dereference(per_cpu(sd_busy, cpu));
- + sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd || !sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 0;
- - atomic_inc(&sd->groups->sgc->nr_busy_cpus);
- + atomic_inc(&sd->shared->nr_busy_cpus);
- unlock:
- rcu_read_unlock();
- }
- @@ -8466,13 +9572,13 @@
- int cpu = smp_processor_id();
- rcu_read_lock();
- - sd = rcu_dereference(per_cpu(sd_busy, cpu));
- + sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd || sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 1;
- - atomic_dec(&sd->groups->sgc->nr_busy_cpus);
- + atomic_dec(&sd->shared->nr_busy_cpus);
- unlock:
- rcu_read_unlock();
- }
- @@ -8711,8 +9817,8 @@
- static inline bool nohz_kick_needed(struct rq *rq)
- {
- unsigned long now = jiffies;
- + struct sched_domain_shared *sds;
- struct sched_domain *sd;
- - struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
- bool kick = false;
- @@ -8740,12 +9846,18 @@
- (!energy_aware() || cpu_overutilized(cpu)))
- return true;
- - rcu_read_lock();
- - sd = rcu_dereference(per_cpu(sd_busy, cpu));
- - if (sd && !energy_aware()) {
- - sgc = sd->groups->sgc;
- - nr_busy = atomic_read(&sgc->nr_busy_cpus);
- + /* Do idle load balance if there is a misfit task */
- + if (energy_aware())
- + return rq->misfit_task;
- + rcu_read_lock();
- + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- + if (sds) {
- + /*
- + * XXX: write a coherent comment on why we do this.
- + * See also: http://lkml.kernel.org/r/[email protected]
- + */
- + nr_busy = atomic_read(&sds->nr_busy_cpus);
- if (nr_busy > 1) {
- kick = true;
- goto unlock;
- @@ -8831,6 +9943,47 @@
- unthrottle_offline_cfs_rqs(rq);
- }
- +static inline int
- +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
- +{
- + int rc = 0;
- +
- + /* Invoke active balance to force migrate currently running task */
- + raw_spin_lock(&rq->lock);
- + if (!rq->active_balance) {
- + rq->active_balance = 1;
- + rq->push_cpu = new_cpu;
- + get_task_struct(p);
- + rq->push_task = p;
- + rc = 1;
- + }
- + raw_spin_unlock(&rq->lock);
- +
- + return rc;
- +}
- +
- +void check_for_migration(struct rq *rq, struct task_struct *p)
- +{
- + int new_cpu;
- + int active_balance;
- + int cpu = task_cpu(p);
- +
- + if (energy_aware() && rq->misfit_task) {
- + if (rq->curr->state != TASK_RUNNING ||
- + rq->curr->nr_cpus_allowed == 1)
- + return;
- +
- + new_cpu = select_energy_cpu_brute(p, cpu, 0);
- + if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
- + active_balance = kick_active_balance(rq, p, new_cpu);
- + if (active_balance)
- + stop_one_cpu_nowait(cpu,
- + active_load_balance_cpu_stop,
- + rq, &rq->active_balance_work);
- + }
- + }
- +}
- +
- #endif /* CONFIG_SMP */
- /*
- @@ -8846,7 +9999,7 @@
- entity_tick(cfs_rq, se, queued);
- }
- - if (numabalancing_enabled)
- + if (static_branch_unlikely(&sched_numa_balancing))
- task_tick_numa(rq, curr);
- #ifdef CONFIG_SMP
- @@ -8869,31 +10022,17 @@
- {
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se, *curr;
- - int this_cpu = smp_processor_id();
- struct rq *rq = this_rq();
- - unsigned long flags;
- -
- - raw_spin_lock_irqsave(&rq->lock, flags);
- + raw_spin_lock(&rq->lock);
- update_rq_clock(rq);
- cfs_rq = task_cfs_rq(current);
- curr = cfs_rq->curr;
- -
- - /*
- - * Not only the cpu but also the task_group of the parent might have
- - * been changed after parent->se.parent,cfs_rq were copied to
- - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- - * of child point to valid ones.
- - */
- - rcu_read_lock();
- - __set_task_cpu(p, this_cpu);
- - rcu_read_unlock();
- -
- - update_curr(cfs_rq);
- -
- - if (curr)
- + if (curr) {
- + update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- + }
- place_entity(cfs_rq, se, 1);
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- @@ -8906,8 +10045,7 @@
- }
- se->vruntime -= cfs_rq->min_vruntime;
- -
- - raw_spin_unlock_irqrestore(&rq->lock, flags);
- + raw_spin_unlock(&rq->lock);
- }
- /*
- @@ -8959,6 +10097,61 @@
- return false;
- }
- +#ifdef CONFIG_FAIR_GROUP_SCHED
- +/*
- + * Propagate the changes of the sched_entity across the tg tree to make it
- + * visible to the root
- + */
- +static void propagate_entity_cfs_rq(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq;
- +
- + /* Start to propagate at parent */
- + se = se->parent;
- +
- + for_each_sched_entity(se) {
- + cfs_rq = cfs_rq_of(se);
- +
- + if (cfs_rq_throttled(cfs_rq))
- + break;
- +
- + update_load_avg(se, UPDATE_TG);
- + }
- +}
- +#else
- +static void propagate_entity_cfs_rq(struct sched_entity *se) { }
- +#endif
- +
- +static void detach_entity_cfs_rq(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = cfs_rq_of(se);
- +
- + /* Catch up with the cfs_rq and remove our load when we leave */
- + update_load_avg(se, 0);
- + detach_entity_load_avg(cfs_rq, se);
- + update_tg_load_avg(cfs_rq, false);
- + propagate_entity_cfs_rq(se);
- +}
- +
- +static void attach_entity_cfs_rq(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = cfs_rq_of(se);
- +
- +#ifdef CONFIG_FAIR_GROUP_SCHED
- + /*
- + * Since the real-depth could have been changed (only FAIR
- + * class maintain depth value), reset depth properly.
- + */
- + se->depth = se->parent ? se->parent->depth + 1 : 0;
- +#endif
- +
- + /* Synchronize entity with its cfs_rq */
- + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- + attach_entity_load_avg(cfs_rq, se);
- + update_tg_load_avg(cfs_rq, false);
- + propagate_entity_cfs_rq(se);
- +}
- +
- static void detach_task_cfs_rq(struct task_struct *p)
- {
- struct sched_entity *se = &p->se;
- @@ -8973,8 +10166,7 @@
- se->vruntime -= cfs_rq->min_vruntime;
- }
- - /* Catch up with the cfs_rq and remove our load when we leave */
- - detach_entity_load_avg(cfs_rq, se);
- + detach_entity_cfs_rq(se);
- }
- static void attach_task_cfs_rq(struct task_struct *p)
- @@ -8982,16 +10174,7 @@
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- - /*
- - * Since the real-depth could have been changed (only FAIR
- - * class maintain depth value), reset depth properly.
- - */
- - se->depth = se->parent ? se->parent->depth + 1 : 0;
- -#endif
- -
- - /* Synchronize task with its cfs_rq */
- - attach_entity_load_avg(cfs_rq, se);
- + attach_entity_cfs_rq(se);
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
- @@ -9045,12 +10228,23 @@
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
- #endif
- #ifdef CONFIG_SMP
- +#ifdef CONFIG_FAIR_GROUP_SCHED
- + cfs_rq->propagate_avg = 0;
- +#endif
- atomic_long_set(&cfs_rq->removed_load_avg, 0);
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
- #endif
- }
- #ifdef CONFIG_FAIR_GROUP_SCHED
- +static void task_set_group_fair(struct task_struct *p)
- +{
- + struct sched_entity *se = &p->se;
- +
- + set_task_rq(p, task_cpu(p));
- + se->depth = se->parent ? se->parent->depth + 1 : 0;
- +}
- +
- static void task_move_group_fair(struct task_struct *p)
- {
- detach_task_cfs_rq(p);
- @@ -9063,6 +10257,19 @@
- attach_task_cfs_rq(p);
- }
- +static void task_change_group_fair(struct task_struct *p, int type)
- +{
- + switch (type) {
- + case TASK_SET_GROUP:
- + task_set_group_fair(p);
- + break;
- +
- + case TASK_MOVE_GROUP:
- + task_move_group_fair(p);
- + break;
- + }
- +}
- +
- void free_fair_sched_group(struct task_group *tg)
- {
- int i;
- @@ -9085,8 +10292,9 @@
- int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
- {
- - struct cfs_rq *cfs_rq;
- struct sched_entity *se;
- + struct cfs_rq *cfs_rq;
- + struct rq *rq;
- int i;
- tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
- @@ -9101,6 +10309,8 @@
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
- for_each_possible_cpu(i) {
- + rq = cpu_rq(i);
- +
- cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
- GFP_KERNEL, cpu_to_node(i));
- if (!cfs_rq)
- @@ -9114,6 +10324,11 @@
- init_cfs_rq(cfs_rq);
- init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
- init_entity_runnable_average(se);
- +
- + raw_spin_lock_irq(&rq->lock);
- + post_init_entity_util_avg(se);
- + sync_throttle(tg, i);
- + raw_spin_unlock_irq(&rq->lock);
- }
- return 1;
- @@ -9202,8 +10417,10 @@
- /* Possible calls to update_curr() need rq clock */
- update_rq_clock(rq);
- - for_each_sched_entity(se)
- - update_cfs_shares(group_cfs_rq(se));
- + for_each_sched_entity(se) {
- + update_load_avg(se, UPDATE_TG);
- + update_cfs_shares(se);
- + }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -9264,6 +10481,7 @@
- .task_waking = task_waking_fair,
- .task_dead = task_dead_fair,
- + .set_cpus_allowed = set_cpus_allowed_common,
- #endif
- .set_curr_task = set_curr_task_fair,
- @@ -9279,7 +10497,7 @@
- .update_curr = update_curr_fair,
- #ifdef CONFIG_FAIR_GROUP_SCHED
- - .task_move_group = task_move_group_fair,
- + .task_change_group = task_change_group_fair,
- #endif
- };
- diff -Nur /home/ninez/android/marlin/kernel/sched/features.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/features.h
- --- /home/ninez/android/marlin/kernel/sched/features.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/features.h 2018-08-15 17:51:31.901600413 -0400
- @@ -49,7 +49,7 @@
- * Queue remote wakeups on the target CPU and process them
- * using the scheduler IPI. Reduces rq->lock contention/bounces.
- */
- -SCHED_FEAT(TTWU_QUEUE, true)
- +SCHED_FEAT(TTWU_QUEUE, false)
- #ifdef HAVE_RT_PUSH_IPI
- /*
- @@ -66,48 +66,39 @@
- SCHED_FEAT(FORCE_SD_OVERLAP, false)
- SCHED_FEAT(RT_RUNTIME_SHARE, true)
- +SCHED_FEAT(RT_RUNTIME_GREED, false)
- SCHED_FEAT(LB_MIN, false)
- SCHED_FEAT(ATTACH_AGE_LOAD, true)
- /*
- - * Apply the automatic NUMA scheduling policy. Enabled automatically
- - * at runtime if running on a NUMA machine. Can be controlled via
- - * numa_balancing=
- - */
- -#ifdef CONFIG_NUMA_BALANCING
- -SCHED_FEAT(NUMA, false)
- -
- -/*
- - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
- - * higher number of hinting faults are recorded during active load
- - * balancing.
- - */
- -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
- -
- -/*
- - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
- - * lower number of hinting faults have been recorded. As this has
- - * the potential to prevent a task ever migrating to a new node
- - * due to CPU overload it is disabled by default.
- + * Energy aware scheduling. Use platform energy model to guide scheduling
- + * decisions optimizing for energy efficiency.
- */
- -SCHED_FEAT(NUMA_RESIST_LOWER, false)
- -#endif
- +SCHED_FEAT(ENERGY_AWARE, true)
- /*
- - * Energy aware scheduling. Use platform energy model to guide scheduling
- - * decisions optimizing for energy efficiency.
- + * Minimum capacity capping. Keep track of minimum capacity factor when
- + * minimum frequency available to a policy is modified.
- + * If enabled, this can be used to inform the scheduler about capacity
- + * restrictions.
- */
- -SCHED_FEAT(ENERGY_AWARE, false)
- +SCHED_FEAT(MIN_CAPACITY_CAPPING, false)
- /*
- - * UtilEstimation. Use estimated CPU utiliation.
- + * Enforce the priority of candidates selected by find_best_target()
- + * ON: If the target CPU saves any energy, use that.
- + * OFF: Use whichever of target or backup saves most.
- */
- -SCHED_FEAT(UTIL_EST, false)
- +SCHED_FEAT(FBT_STRICT_ORDER, false)
- /*
- - * SchedTune. Use Performance/Energy filtering function to evaluate the
- - * trade off between energy consumption and performance impact when comparing
- - * previous and next candidate CPUs.
- + * Apply schedtune boost hold to tasks of all sched classes.
- + * If enabled, schedtune will hold the boost applied to a CPU
- + * for 50ms regardless of task activation - if the task is
- + * still running 50ms later, the boost hold expires and schedtune
- + * boost will expire immediately once the task stops.
- + * If disabled, this behaviour will only apply to tasks of the
- + * RT class.
- */
- -SCHED_FEAT(ENERGY_FILTER, true)
- +SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false)
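For reference, SCHED_FEAT() entries become bits in sysctl_sched_features and are tested with sched_feat(); the real macro appears in the sched.h hunk later in this patch. Below is a minimal, self-contained userspace model of that mechanism, covering only a few of the features above with the defaults this patch sets (it is an illustration of the X-macro/bitmask idea, not kernel code).

#include <stdio.h>

/* Mirror of the SCHED_FEAT(name, default) X-macro idea. */
#define SCHED_FEATURES(F)			\
	F(TTWU_QUEUE, 0)			\
	F(RT_RUNTIME_GREED, 0)			\
	F(ENERGY_AWARE, 1)			\
	F(SCHEDTUNE_BOOST_HOLD_ALL, 0)

enum {
#define F(name, enabled) __SCHED_FEAT_##name,
	SCHED_FEATURES(F)
#undef F
	__SCHED_FEAT_NR
};

/* Default bitmask built from the per-feature defaults. */
static unsigned long sysctl_sched_features =
#define F(name, enabled) ((unsigned long)(enabled) << __SCHED_FEAT_##name) |
	SCHED_FEATURES(F)
#undef F
	0;

#define sched_feat(x) (!!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)))

int main(void)
{
	printf("ENERGY_AWARE=%d TTWU_QUEUE=%d RT_RUNTIME_GREED=%d\n",
	       sched_feat(ENERGY_AWARE), sched_feat(TTWU_QUEUE),
	       sched_feat(RT_RUNTIME_GREED));
	return 0;
}

The real definition (with the optional jump-label fast path under SCHED_DEBUG) is visible further down in the sched.h part of this diff.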
- diff -Nur /home/ninez/android/marlin/kernel/sched/idle.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle.c
- --- /home/ninez/android/marlin/kernel/sched/idle.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle.c 2018-08-11 23:57:17.131940887 -0400
- @@ -58,7 +58,8 @@
- rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
- local_irq_enable();
- - while (!tif_need_resched())
- + while (!tif_need_resched() &&
- + (cpu_idle_force_poll || tick_check_broadcast_expired()))
- cpu_relax();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
- rcu_idle_exit();
- @@ -208,6 +209,8 @@
- goto exit_idle;
- }
- +DEFINE_PER_CPU(bool, cpu_dead_idle);
- +
- /*
- * Generic idle loop implementation
- *
- @@ -233,8 +236,13 @@
- check_pgt_cache();
- rmb();
- - if (cpu_is_offline(smp_processor_id()))
- + if (cpu_is_offline(smp_processor_id())) {
- + rcu_cpu_notify(NULL, CPU_DYING_IDLE,
- + (void *)(long)smp_processor_id());
- + smp_mb(); /* all activity before dead. */
- + this_cpu_write(cpu_dead_idle, true);
- arch_cpu_idle_dead();
- + }
- local_irq_disable();
- arch_cpu_idle_enter();
- diff -Nur /home/ninez/android/marlin/kernel/sched/idle_task.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle_task.c
- --- /home/ninez/android/marlin/kernel/sched/idle_task.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle_task.c 2018-08-26 16:43:11.650539699 -0400
- @@ -9,7 +9,8 @@
- #ifdef CONFIG_SMP
- static int
- -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
- +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
- + int sibling_count_hint)
- {
- return task_cpu(p); /* IDLE tasks are never migrated */
- }
- @@ -24,11 +25,16 @@
- }
- static struct task_struct *
- -pick_next_task_idle(struct rq *rq, struct task_struct *prev)
- +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- + if (sched_feat(RT_RUNTIME_GREED))
- + if (try_to_unthrottle_rt_rq(&rq->rt))
- + return RETRY_TASK;
- +
- put_prev_task(rq, prev);
- - schedstat_inc(rq, sched_goidle);
- + update_idle_core(rq);
- + schedstat_inc(rq->sched_goidle);
- return rq->idle;
- }
- @@ -47,7 +53,6 @@
- static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
- {
- - idle_exit_fair(rq);
- rq_last_tick_reset(rq);
- }
- @@ -96,6 +101,7 @@
- #ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_idle,
- + .set_cpus_allowed = set_cpus_allowed_common,
- #endif
- .set_curr_task = set_curr_task_idle,
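The RT_RUNTIME_GREED hook in pick_next_task_idle() works through the core scheduler's RETRY_TASK convention: if the idle class finds a throttled RT runqueue and nothing else runnable, it unthrottles it and asks the pick loop to start over so the RT class gets another chance. A toy single-threaded model of that retry loop (the names, the sentinel value and the simplified pick order are illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define RETRY_TASK ((const char *)-1)	/* sentinel meaning "pick again" */

static bool rt_throttled = true;	/* pretend the RT rq ran out of runtime */
static bool rt_runtime_greed = true;	/* sched_feat(RT_RUNTIME_GREED) */

static const char *pick_next_task_rt(void)
{
	return rt_throttled ? NULL : "some_rt_task";
}

static const char *pick_next_task_idle(void)
{
	if (rt_runtime_greed && rt_throttled) {
		rt_throttled = false;		/* try_to_unthrottle_rt_rq() */
		return RETRY_TASK;		/* ask the core to pick again */
	}
	return "swapper/0";
}

int main(void)
{
	const char *p;

again:						/* simplified pick_next_task() loop */
	p = pick_next_task_rt();
	if (!p)
		p = pick_next_task_idle();
	if (p == RETRY_TASK)
		goto again;

	printf("picked: %s\n", p);
	return 0;
}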
- diff -Nur /home/ninez/android/marlin/kernel/sched/loadavg.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/loadavg.c
- --- /home/ninez/android/marlin/kernel/sched/loadavg.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/loadavg.c 2018-08-11 23:57:17.131940887 -0400
- @@ -168,7 +168,7 @@
- * If the folding window started, make sure we start writing in the
- * next idle-delta.
- */
- - if (!time_before(jiffies, calc_load_update))
- + if (!time_before(jiffies, READ_ONCE(calc_load_update)))
- idx++;
- return idx & 1;
- @@ -201,8 +201,9 @@
- struct rq *this_rq = this_rq();
- /*
- - * If we're still before the sample window, we're done.
- + * If we're still before the pending sample window, we're done.
- */
- + this_rq->calc_load_update = READ_ONCE(calc_load_update);
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
- @@ -211,7 +212,6 @@
- * accounted through the nohz accounting, so skip the entire deal and
- * sync up for the next window.
- */
- - this_rq->calc_load_update = calc_load_update;
- if (time_before(jiffies, this_rq->calc_load_update + 10))
- this_rq->calc_load_update += LOAD_FREQ;
- }
- @@ -307,13 +307,15 @@
- */
- static void calc_global_nohz(void)
- {
- + unsigned long sample_window;
- long delta, active, n;
- - if (!time_before(jiffies, calc_load_update + 10)) {
- + sample_window = READ_ONCE(calc_load_update);
- + if (!time_before(jiffies, sample_window + 10)) {
- /*
- * Catch-up, fold however many we are behind still
- */
- - delta = jiffies - calc_load_update - 10;
- + delta = jiffies - sample_window - 10;
- n = 1 + (delta / LOAD_FREQ);
- active = atomic_long_read(&calc_load_tasks);
- @@ -323,7 +325,7 @@
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- - calc_load_update += n * LOAD_FREQ;
- + WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
- }
- /*
- @@ -351,9 +353,11 @@
- */
- void calc_global_load(unsigned long ticks)
- {
- + unsigned long sample_window;
- long active, delta;
- - if (time_before(jiffies, calc_load_update + 10))
- + sample_window = READ_ONCE(calc_load_update);
- + if (time_before(jiffies, sample_window + 10))
- return;
- /*
- @@ -370,7 +374,7 @@
- avenrun[1] = calc_load(avenrun[1], EXP_5, active);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active);
- - calc_load_update += LOAD_FREQ;
- + WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
- /*
- * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
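The pattern adopted throughout this file is: read calc_load_update once into a local sample_window, do every comparison against that snapshot, and publish the new window with WRITE_ONCE(), so a concurrently folding nohz CPU can never observe two different window values in one pass. A compact userspace model of the same idea (HZ here is an assumed value and the kernel's wraparound-safe time_before() is simplified to a plain compare):

#include <stdio.h>

/* GCC/Clang-style one-shot accessors, same shape as the kernel's. */
#define READ_ONCE(x)	 (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

#define HZ		250			/* assumption for this model */
#define LOAD_FREQ	(5 * HZ + 1)		/* 5s folding interval */

static unsigned long calc_load_update;		/* shared sample window */

static void calc_global_load(unsigned long jiffies_now)
{
	unsigned long sample_window = READ_ONCE(calc_load_update);

	if (jiffies_now < sample_window + 10)
		return;

	/* ... fold calc_load_tasks into avenrun[] against this window ... */

	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
}

int main(void)
{
	calc_load_update = 1000;
	calc_global_load(1005);			/* too early: window unchanged */
	calc_global_load(1011);			/* folds and advances the window */
	printf("next sample window at jiffy %lu\n", calc_load_update);
	return 0;
}

The same snapshot is what calc_global_nohz() reuses above when it catches up on several missed LOAD_FREQ windows at once.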
- diff -Nur /home/ninez/android/marlin/kernel/sched/Makefile /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/Makefile
- --- /home/ninez/android/marlin/kernel/sched/Makefile 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/Makefile 2018-08-21 23:16:53.820436609 -0400
- @@ -2,15 +2,6 @@
- CFLAGS_REMOVE_clock.o = -pg
- endif
- -# KASAN instrumentation is temporarily disabled for energy.o due to the repeated
- -# reports that caused the kernel to not boot as seen in b/31800756. Should a fix
- -# be provided, this line can be removed again. But given that KCOV is also disabled
- -# for this module, it might be worth thinking about whether or not we should also
- -# just turn off KASAN instrumentation entirely here.
- -KASAN_SANITIZE_core.o := n
- -KASAN_SANITIZE_energy.o := n
- -KASAN_SANITIZE_fair.o := n
- -
- # These files are disabled because they produce non-interesting flaky coverage
- # that is not a function of syscall inputs. E.g. involuntary context switches.
- KCOV_INSTRUMENT := n
- @@ -26,7 +17,7 @@
- obj-y += core.o loadavg.o clock.o cputime.o
- obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
- -obj-y += wait.o completion.o idle.o
- +obj-y += wait.o swait.o swork.o completion.o idle.o
- obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
- obj-$(CONFIG_SCHED_WALT) += walt.o
- obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
- @@ -34,4 +25,7 @@
- obj-$(CONFIG_SCHED_DEBUG) += debug.o
- obj-$(CONFIG_SCHED_TUNE) += tune.o
- obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
- +obj-$(CONFIG_CPU_FREQ) += cpufreq.o
- obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
- +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
- +obj-y += boost.o
- diff -Nur /home/ninez/android/marlin/kernel/sched/rt.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/rt.c
- --- /home/ninez/android/marlin/kernel/sched/rt.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/rt.c 2018-08-26 16:43:11.650539699 -0400
- @@ -8,10 +8,9 @@
- #include <linux/interrupt.h>
- #include <linux/slab.h>
- #include <linux/irq_work.h>
- -#include <linux/hrtimer.h>
- #include "walt.h"
- -#include "tune.h"
- +#include "tune.h"
- int sched_rr_timeslice = RR_TIMESLICE;
- @@ -69,11 +68,7 @@
- raw_spin_unlock(&rt_b->rt_runtime_lock);
- }
- -#ifdef CONFIG_SMP
- -static void push_irq_work_func(struct irq_work *work);
- -#endif
- -
- -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
- +void init_rt_rq(struct rt_rq *rt_rq)
- {
- struct rt_prio_array *array;
- int i;
- @@ -92,13 +87,6 @@
- rt_rq->rt_nr_migratory = 0;
- rt_rq->overloaded = 0;
- plist_head_init(&rt_rq->pushable_tasks);
- -
- -#ifdef HAVE_RT_PUSH_IPI
- - rt_rq->push_flags = 0;
- - rt_rq->push_cpu = nr_cpu_ids;
- - raw_spin_lock_init(&rt_rq->push_lock);
- - init_irq_work(&rt_rq->push_work, push_irq_work_func);
- -#endif
- #endif /* CONFIG_SMP */
- /* We start in dequeued state, because no RT tasks are queued */
- rt_rq->rt_queued = 0;
- @@ -214,7 +202,7 @@
- if (!rt_se)
- goto err_free_rq;
- - init_rt_rq(rt_rq, cpu_rq(i));
- + init_rt_rq(rt_rq);
- rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
- }
- @@ -331,7 +319,7 @@
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
- rt_rq->rt_nr_total++;
- - if (p->nr_cpus_allowed > 1)
- + if (tsk_nr_cpus_allowed(p) > 1)
- rt_rq->rt_nr_migratory++;
- update_rt_migration(rt_rq);
- @@ -348,7 +336,7 @@
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
- rt_rq->rt_nr_total--;
- - if (p->nr_cpus_allowed > 1)
- + if (tsk_nr_cpus_allowed(p) > 1)
- rt_rq->rt_nr_migratory--;
- update_rt_migration(rt_rq);
- @@ -370,14 +358,12 @@
- if (!has_pushable_tasks(rq))
- return;
- - queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu),
- - push_rt_tasks);
- + queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
- }
- static inline void queue_pull_task(struct rq *rq)
- {
- - queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu),
- - pull_rt_task);
- + queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
- }
- static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
- @@ -443,7 +429,7 @@
- static inline int on_rt_rq(struct sched_rt_entity *rt_se)
- {
- - return !list_empty(&rt_se->run_list);
- + return rt_se->on_rq;
- }
- #ifdef CONFIG_RT_GROUP_SCHED
- @@ -489,8 +475,8 @@
- return rt_se->my_q;
- }
- -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
- -static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
- +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
- +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
- static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
- {
- @@ -506,7 +492,7 @@
- if (!rt_se)
- enqueue_top_rt_rq(rt_rq);
- else if (!on_rt_rq(rt_se))
- - enqueue_rt_entity(rt_se, false);
- + enqueue_rt_entity(rt_se, 0);
- if (rt_rq->highest_prio.curr < curr->prio)
- resched_curr(rq);
- @@ -523,7 +509,7 @@
- if (!rt_se)
- dequeue_top_rt_rq(rt_rq);
- else if (on_rt_rq(rt_se))
- - dequeue_rt_entity(rt_se);
- + dequeue_rt_entity(rt_se, 0);
- }
- static inline int rt_rq_throttled(struct rt_rq *rt_rq)
- @@ -630,6 +616,22 @@
- #endif /* CONFIG_RT_GROUP_SCHED */
- +static inline void unthrottle_rt_rq(struct rt_rq *rt_rq)
- +{
- + rt_rq->rt_time = 0;
- + rt_rq->rt_throttled = 0;
- + sched_rt_rq_enqueue(rt_rq);
- +}
- +
- +int try_to_unthrottle_rt_rq(struct rt_rq *rt_rq)
- +{
- + if (rt_rq_throttled(rt_rq)) {
- + unthrottle_rt_rq(rt_rq);
- + return 1;
- + }
- + return 0;
- +}
- +
- bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
- {
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- @@ -642,11 +644,11 @@
- /*
- * We ran out of runtime, see if we can borrow some from our neighbours.
- */
- -static int do_balance_runtime(struct rt_rq *rt_rq)
- +static void do_balance_runtime(struct rt_rq *rt_rq)
- {
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
- - int i, weight, more = 0;
- + int i, weight;
- u64 rt_period;
- weight = cpumask_weight(rd->span);
- @@ -680,7 +682,6 @@
- diff = rt_period - rt_rq->rt_runtime;
- iter->rt_runtime -= diff;
- rt_rq->rt_runtime += diff;
- - more = 1;
- if (rt_rq->rt_runtime == rt_period) {
- raw_spin_unlock(&iter->rt_runtime_lock);
- break;
- @@ -690,8 +691,6 @@
- raw_spin_unlock(&iter->rt_runtime_lock);
- }
- raw_spin_unlock(&rt_b->rt_runtime_lock);
- -
- - return more;
- }
- /*
- @@ -803,26 +802,19 @@
- }
- }
- -static int balance_runtime(struct rt_rq *rt_rq)
- +static void balance_runtime(struct rt_rq *rt_rq)
- {
- - int more = 0;
- -
- if (!sched_feat(RT_RUNTIME_SHARE))
- - return more;
- + return;
- if (rt_rq->rt_time > rt_rq->rt_runtime) {
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- - more = do_balance_runtime(rt_rq);
- + do_balance_runtime(rt_rq);
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- }
- -
- - return more;
- }
- #else /* !CONFIG_SMP */
- -static inline int balance_runtime(struct rt_rq *rt_rq)
- -{
- - return 0;
- -}
- +static inline void balance_runtime(struct rt_rq *rt_rq) {}
- #endif /* CONFIG_SMP */
- static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
- @@ -848,6 +840,17 @@
- int enqueue = 0;
- struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
- struct rq *rq = rq_of_rt_rq(rt_rq);
- + int skip;
- +
- + /*
- + * When span == cpu_online_mask, taking each rq->lock
- + * can be time-consuming. Try to avoid it when possible.
- + */
- + raw_spin_lock(&rt_rq->rt_runtime_lock);
- + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
- + raw_spin_unlock(&rt_rq->rt_runtime_lock);
- + if (skip)
- + continue;
- raw_spin_lock(&rq->lock);
- update_rq_clock(rq);
- @@ -865,11 +868,14 @@
- enqueue = 1;
- /*
- - * Force a clock update if the CPU was idle,
- - * lest wakeup -> unthrottle time accumulate.
- + * When we're idle and a woken (rt) task is
- + * throttled check_preempt_curr() will set
- + * skip_update and the time between the wakeup
- + * and this unthrottle will get accounted as
- + * 'runtime'.
- */
- if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- - rq->skip_clock_update = -1;
- + rq_clock_skip_update(rq, false);
- }
- if (rt_rq->rt_time || rt_rq->rt_nr_running)
- idle = 0;
- @@ -973,8 +979,22 @@
- * but accrue some time due to boosting.
- */
- if (likely(rt_b->rt_runtime)) {
- +
- static bool once = false;
- + if (sched_feat(RT_RUNTIME_GREED)) {
- + struct rq *rq = rq_of_rt_rq(rt_rq);
- + /*
- + * If there are no other tasks able to run
- + * on this rq, let's be greedy and reset our
- + * rt_time.
- + */
- + if (rq->nr_running == rt_rq->rt_nr_running) {
- + rt_rq->rt_time = 0;
- + return 0;
- + }
- + }
- +
- rt_rq->rt_throttled = 1;
- if (!once) {
- @@ -999,73 +1019,6 @@
- return 0;
- }
- -/* TODO: Make configurable */
- -#define RT_SCHEDTUNE_INTERVAL 50000000ULL
- -
- -static void sched_rt_update_capacity_req(struct rq *rq, bool tick);
- -
- -static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer)
- -{
- - struct sched_rt_entity *rt_se = container_of(timer,
- - struct sched_rt_entity,
- - schedtune_timer);
- - struct task_struct *p = rt_task_of(rt_se);
- - struct rq *rq = task_rq(p);
- -
- - raw_spin_lock(&rq->lock);
- -
- - /*
- - * Nothing to do if:
- - * - task has switched runqueues
- - * - task isn't RT anymore
- - */
- - if (rq != task_rq(p) || (p->sched_class != &rt_sched_class))
- - goto out;
- -
- - /*
- - * If task got enqueued back during callback time, it means we raced
- - * with the enqueue on another cpu, that's Ok, just do nothing as
- - * enqueue path would have tried to cancel us and we shouldn't run
- - * Also check the schedtune_enqueued flag as class-switch on a
- - * sleeping task may have already canceled the timer and done dq
- - */
- - if (p->on_rq || rt_se->schedtune_enqueued == false)
- - goto out;
- -
- - /*
- - * RT task is no longer active, cancel boost
- - */
- - rt_se->schedtune_enqueued = false;
- - schedtune_dequeue_task(p, cpu_of(rq));
- - sched_rt_update_capacity_req(rq, false);
- -out:
- - raw_spin_unlock(&rq->lock);
- -
- - /*
- - * This can free the task_struct if no more references.
- - */
- - put_task_struct(p);
- -
- - return HRTIMER_NORESTART;
- -}
- -
- -void init_rt_schedtune_timer(struct sched_rt_entity *rt_se)
- -{
- - struct hrtimer *timer = &rt_se->schedtune_timer;
- -
- - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- - timer->function = rt_schedtune_timer;
- - rt_se->schedtune_enqueued = false;
- -}
- -
- -static void start_schedtune_timer(struct sched_rt_entity *rt_se)
- -{
- - struct hrtimer *timer = &rt_se->schedtune_timer;
- -
- - hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL),
- - HRTIMER_MODE_REL_PINNED);
- -}
- -
- /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
- @@ -1083,6 +1036,9 @@
- if (unlikely((s64)delta_exec <= 0))
- return;
- + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
- +
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
- @@ -1276,12 +1232,27 @@
- }
- static inline
- +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
- +{
- + struct rt_rq *group_rq = group_rt_rq(rt_se);
- + struct task_struct *tsk;
- +
- + if (group_rq)
- + return group_rq->rr_nr_running;
- +
- + tsk = rt_task_of(rt_se);
- +
- + return (tsk->policy == SCHED_RR) ? 1 : 0;
- +}
- +
- +static inline
- void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
- {
- int prio = rt_se_prio(rt_se);
- WARN_ON(!rt_prio(prio));
- rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
- + rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
- inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
- @@ -1294,13 +1265,37 @@
- WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
- + rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
- dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
- dec_rt_group(rt_se, rt_rq);
- }
- -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
- +/*
- + * Change rt_se->run_list location unless SAVE && !MOVE
- + *
- + * assumes ENQUEUE/DEQUEUE flags match
- + */
- +static inline bool move_entity(unsigned int flags)
- +{
- + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
- + return false;
- +
- + return true;
- +}
- +
- +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
- +{
- + list_del_init(&rt_se->run_list);
- +
- + if (list_empty(array->queue + rt_se_prio(rt_se)))
- + __clear_bit(rt_se_prio(rt_se), array->bitmap);
- +
- + rt_se->on_list = 0;
- +}
- +
- +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
- struct rt_prio_array *array = &rt_rq->active;
- @@ -1313,26 +1308,37 @@
- * get throttled and the current group doesn't have any other
- * active members.
- */
- - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
- + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
- + if (rt_se->on_list)
- + __delist_rt_entity(rt_se, array);
- return;
- + }
- - if (head)
- - list_add(&rt_se->run_list, queue);
- - else
- - list_add_tail(&rt_se->run_list, queue);
- - __set_bit(rt_se_prio(rt_se), array->bitmap);
- + if (move_entity(flags)) {
- + WARN_ON_ONCE(rt_se->on_list);
- + if (flags & ENQUEUE_HEAD)
- + list_add(&rt_se->run_list, queue);
- + else
- + list_add_tail(&rt_se->run_list, queue);
- +
- + __set_bit(rt_se_prio(rt_se), array->bitmap);
- + rt_se->on_list = 1;
- + }
- + rt_se->on_rq = 1;
- inc_rt_tasks(rt_se, rt_rq);
- }
- -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
- +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
- struct rt_prio_array *array = &rt_rq->active;
- - list_del_init(&rt_se->run_list);
- - if (list_empty(array->queue + rt_se_prio(rt_se)))
- - __clear_bit(rt_se_prio(rt_se), array->bitmap);
- + if (move_entity(flags)) {
- + WARN_ON_ONCE(!rt_se->on_list);
- + __delist_rt_entity(rt_se, array);
- + }
- + rt_se->on_rq = 0;
- dec_rt_tasks(rt_se, rt_rq);
- }
- @@ -1341,7 +1347,7 @@
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
- */
- -static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
- +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct sched_rt_entity *back = NULL;
- @@ -1354,35 +1360,64 @@
- for (rt_se = back; rt_se; rt_se = rt_se->back) {
- if (on_rt_rq(rt_se))
- - __dequeue_rt_entity(rt_se);
- + __dequeue_rt_entity(rt_se, flags);
- }
- }
- -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
- +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct rq *rq = rq_of_rt_se(rt_se);
- - dequeue_rt_stack(rt_se);
- + dequeue_rt_stack(rt_se, flags);
- for_each_sched_rt_entity(rt_se)
- - __enqueue_rt_entity(rt_se, head);
- + __enqueue_rt_entity(rt_se, flags);
- enqueue_top_rt_rq(&rq->rt);
- }
- -static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
- +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct rq *rq = rq_of_rt_se(rt_se);
- - dequeue_rt_stack(rt_se);
- + dequeue_rt_stack(rt_se, flags);
- for_each_sched_rt_entity(rt_se) {
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
- if (rt_rq && rt_rq->rt_nr_running)
- - __enqueue_rt_entity(rt_se, false);
- + __enqueue_rt_entity(rt_se, flags);
- }
- enqueue_top_rt_rq(&rq->rt);
- }
- +static void sched_rt_update_capacity_req(struct rq *rq)
- +{
- + u64 total, used, age_stamp, avg;
- + s64 delta;
- +
- + if (!sched_freq())
- + return;
- +
- + sched_avg_update(rq);
- + /*
- + * Since we're reading these variables without serialization make sure
- + * we read them once before doing sanity checks on them.
- + */
- + age_stamp = READ_ONCE(rq->age_stamp);
- + avg = READ_ONCE(rq->rt_avg);
- + delta = rq_clock(rq) - age_stamp;
- +
- + if (unlikely(delta < 0))
- + delta = 0;
- +
- + total = sched_avg_period() + delta;
- +
- + used = div_u64(avg, total);
- + if (unlikely(used > SCHED_CAPACITY_SCALE))
- + used = SCHED_CAPACITY_SCALE;
- +
- + set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
- +}
- +
- /*
- * Adding/removing a task to/from a priority array:
- */
- @@ -1391,65 +1426,37 @@
- {
- struct sched_rt_entity *rt_se = &p->rt;
- +#ifdef CONFIG_SMP
- + schedtune_enqueue_task(p, cpu_of(rq));
- +#endif
- +
- if (flags & ENQUEUE_WAKEUP)
- rt_se->timeout = 0;
- - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- + enqueue_rt_entity(rt_se, flags);
- walt_inc_cumulative_runnable_avg(rq, p);
- - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) {
- + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
- enqueue_pushable_task(rq, p);
- - }
- - if (!schedtune_task_boost(p))
- - return;
- - /*
- - * If schedtune timer is active, that means a boost was already
- - * done, just cancel the timer so that deboost doesn't happen.
- - * Otherwise, increase the boost. If an enqueued timer was
- - * cancelled, put the task reference.
- - */
- - if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
- - put_task_struct(p);
- -
- - /*
- - * schedtune_enqueued can be true in the following situation:
- - * enqueue_task_rt grabs rq lock before timer fires
- - * or before its callback acquires rq lock
- - * schedtune_enqueued can be false if timer callback is running
- - * and timer just released rq lock, or if the timer finished
- - * running and canceling the boost
- - */
- - if (rt_se->schedtune_enqueued == true)
- - return;
- -
- - rt_se->schedtune_enqueued = true;
- - schedtune_enqueue_task(p, cpu_of(rq));
- - sched_rt_update_capacity_req(rq, false);
- + sched_rt_update_capacity_req(rq);
- }
- static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
- {
- struct sched_rt_entity *rt_se = &p->rt;
- +#ifdef CONFIG_SMP
- + schedtune_dequeue_task(p, cpu_of(rq));
- +#endif
- +
- update_curr_rt(rq);
- - dequeue_rt_entity(rt_se);
- + dequeue_rt_entity(rt_se, flags);
- walt_dec_cumulative_runnable_avg(rq, p);
- dequeue_pushable_task(rq, p);
- - if (rt_se->schedtune_enqueued == false)
- - return;
- -
- - if (flags == DEQUEUE_SLEEP) {
- - get_task_struct(p);
- - start_schedtune_timer(rt_se);
- - return;
- - }
- -
- - rt_se->schedtune_enqueued = false;
- - schedtune_dequeue_task(p, cpu_of(rq));
- - sched_rt_update_capacity_req(rq, false);
- + sched_rt_update_capacity_req(rq);
- }
- /*
- @@ -1499,20 +1506,6 @@
- return !!((pc & SOFTIRQ_MASK)>= SOFTIRQ_DISABLE_OFFSET);
- }
- -static bool is_top_app_cpu(int cpu)
- -{
- - bool boosted = (schedtune_cpu_boost(cpu) > 0);
- -
- - return boosted;
- -}
- -
- -static bool is_top_app(struct task_struct *cur)
- -{
- - bool boosted = (schedtune_task_boost(cur) > 0);
- -
- - return boosted;
- -}
- -
- /*
- * Return whether the task on the given cpu is currently non-preemptible
- * while handling a potentially long softint, or if the task is likely
- @@ -1527,14 +1520,8 @@
- struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
- int task_pc = 0;
- - if (task) {
- - if (is_top_app(task))
- - return true;
- + if (task)
- task_pc = task_preempt_count(task);
- - }
- -
- - if (is_top_app_cpu(cpu))
- - return true;
- if (softirq_masked(task_pc))
- return true;
- @@ -1544,37 +1531,12 @@
- task_pc & SOFTIRQ_MASK));
- }
- -static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p)
- -{
- - struct sched_rt_entity *rt_se = &p->rt;
- -
- - BUG_ON(!raw_spin_is_locked(&rq->lock));
- -
- - if (rt_se->schedtune_enqueued == false)
- - return;
- -
- - /*
- - * Incase of class change cancel any active timers. Otherwise, increase
- - * the boost. If an enqueued timer was cancelled, put the task ref.
- - */
- - if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
- - put_task_struct(p);
- -
- - /* schedtune_enqueued is true, deboost it */
- - rt_se->schedtune_enqueued = false;
- - schedtune_dequeue_task(p, task_cpu(p));
- - sched_rt_update_capacity_req(rq, false);
- -}
- -
- static int
- -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
- +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
- + int sibling_count_hint)
- {
- struct task_struct *curr;
- struct rq *rq;
- - bool may_not_preempt;
- -
- - if (p->nr_cpus_allowed == 1)
- - goto out;
- /* For anything but wake ups, just return the task_cpu */
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
- @@ -1586,12 +1548,7 @@
- curr = READ_ONCE(rq->curr); /* unlocked access */
- /*
- - * If the current task on @p's runqueue is a softirq task,
- - * it may run without preemption for a time that is
- - * ill-suited for a waiting RT task. Therefore, try to
- - * wake this RT task on another runqueue.
- - *
- - * Also, if the current task on @p's runqueue is an RT task, then
- + * If the current task on @p's runqueue is an RT task, then
- * try to see if we can wake this RT task up on another
- * runqueue. Otherwise simply start this RT task
- * on its current runqueue.
- @@ -1612,54 +1569,43 @@
- * This test is optimistic, if we get it wrong the load-balancer
- * will have to sort it out.
- */
- - may_not_preempt = task_may_not_preempt(curr, cpu);
- - if (curr && (may_not_preempt ||
- - (unlikely(rt_task(curr)) &&
- - (curr->nr_cpus_allowed < 2 ||
- - curr->prio <= p->prio)))) {
- + if (curr && unlikely(rt_task(curr)) &&
- + (tsk_nr_cpus_allowed(curr) < 2 ||
- + curr->prio <= p->prio)) {
- int target = find_lowest_rq(p);
- +
- /*
- - * If cpu is non-preemptible, prefer remote cpu
- - * even if it's running a higher-prio task.
- - * Otherwise: Possible race. Don't bother moving it if the
- - * destination CPU is not running a lower priority task.
- + * Don't bother moving it if the destination CPU is
- + * not running a lower priority task.
- */
- if (target != -1 &&
- - (may_not_preempt ||
- - p->prio < cpu_rq(target)->rt.highest_prio.curr))
- + p->prio < cpu_rq(target)->rt.highest_prio.curr)
- cpu = target;
- }
- rcu_read_unlock();
- out:
- - /*
- - * If previous CPU was different, make sure to cancel any active
- - * schedtune timers and deboost.
- - */
- - if (task_cpu(p) != cpu) {
- - unsigned long fl;
- - struct rq *prq = task_rq(p);
- -
- - raw_spin_lock_irqsave(&prq->lock, fl);
- - schedtune_dequeue_rt(prq, p);
- - raw_spin_unlock_irqrestore(&prq->lock, fl);
- - }
- -
- return cpu;
- }
- static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
- {
- - if (rq->curr->nr_cpus_allowed == 1)
- + /*
- + * Current can't be migrated, useless to reschedule,
- + * let's hope p can move out.
- + */
- + if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
- + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- return;
- - if (p->nr_cpus_allowed != 1
- + /*
- + * p is migratable, so let's not schedule it and
- + * see if it is pushed or pulled somewhere else.
- + */
- + if (tsk_nr_cpus_allowed(p) != 1
- && cpupri_find(&rq->rd->cpupri, p, NULL))
- return;
- - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- - return;
- -
- /*
- * There appears to be other cpus that can accept
- * current and none to run 'p', so lets reschedule
- @@ -1699,61 +1645,6 @@
- #endif
- }
- -#ifdef CONFIG_SMP
- -
- -static void sched_rt_update_capacity_req(struct rq *rq, bool tick)
- -{
- - u64 total, used, age_stamp, avg;
- - s64 delta;
- - int cpu = cpu_of(rq);
- -
- - if (!sched_freq())
- - return;
- -
- -#ifdef CONFIG_SCHED_WALT
- - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
- - unsigned long cpu_utilization = boosted_cpu_util(cpu);
- - unsigned long capacity_curr = capacity_curr_of(cpu);
- - int req = 1;
- -
- - /*
- - * During a tick, we don't throttle frequency down, just update
- - * the rt utilization.
- - */
- - if (tick && cpu_utilization <= capacity_curr)
- - req = 0;
- -
- - set_rt_cpu_capacity(cpu, req, cpu_utilization);
- -
- - return;
- - }
- -#endif
- - sched_avg_update(rq);
- - /*
- - * Since we're reading these variables without serialization make sure
- - * we read them once before doing sanity checks on them.
- - */
- - age_stamp = READ_ONCE(rq->age_stamp);
- - avg = READ_ONCE(rq->rt_avg);
- - delta = rq_clock(rq) - age_stamp;
- -
- - if (unlikely(delta < 0))
- - delta = 0;
- -
- - total = sched_avg_period() + delta;
- -
- - used = div_u64(avg, total);
- - if (unlikely(used > SCHED_CAPACITY_SCALE))
- - used = SCHED_CAPACITY_SCALE;
- -
- - set_rt_cpu_capacity(cpu, 1, (unsigned long)(used));
- -}
- -#else
- -static inline void sched_rt_update_capacity_req(struct rq *rq, bool tick)
- -{ }
- -
- -#endif
- -
- static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
- struct rt_rq *rt_rq)
- {
- @@ -1790,13 +1681,21 @@
- }
- static struct task_struct *
- -pick_next_task_rt(struct rq *rq, struct task_struct *prev)
- +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- struct task_struct *p;
- struct rt_rq *rt_rq = &rq->rt;
- if (need_pull_rt_task(rq, prev)) {
- + /*
- + * This is OK, because current is on_cpu, which avoids it being
- + * picked for load-balance and preemption/IRQs are still
- + * disabled avoiding further scheduler activity on it and we're
- + * being very careful to re-start the picking loop.
- + */
- + lockdep_unpin_lock(&rq->lock, cookie);
- pull_rt_task(rq);
- + lockdep_repin_lock(&rq->lock, cookie);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl or stop task can slip in, in which case we need
- @@ -1822,7 +1721,7 @@
- * This value will be the used as an estimation of the next
- * activity.
- */
- - sched_rt_update_capacity_req(rq, false);
- + sched_rt_update_capacity_req(rq);
- return NULL;
- }
- @@ -1846,7 +1745,7 @@
- * The previous task needs to be made eligible for pushing
- * if it is still active
- */
- - if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
- + if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
- enqueue_pushable_task(rq, p);
- }
- @@ -1896,7 +1795,7 @@
- if (unlikely(!lowest_mask))
- return -1;
- - if (task->nr_cpus_allowed == 1)
- + if (tsk_nr_cpus_allowed(task) == 1)
- return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
- @@ -1956,9 +1855,7 @@
- cpu = cpumask_any(lowest_mask);
- if (cpu < nr_cpu_ids)
- return cpu;
- -
- - cpu = -1;
- - return cpu;
- + return -1;
- }
- /* Will lock the rq it finds */
- @@ -1986,6 +1883,16 @@
- break;
- }
- + if (lowest_rq->rt.highest_prio.curr <= task->prio) {
- + /*
- + * Target rq has tasks of equal or higher priority,
- + * retrying does not release any lock and is unlikely
- + * to yield a different result.
- + */
- + lowest_rq = NULL;
- + break;
- + }
- +
- /* if the prio of this runqueue changed, try again */
- if (double_lock_balance(rq, lowest_rq)) {
- /*
- @@ -1998,6 +1905,7 @@
- !cpumask_test_cpu(lowest_rq->cpu,
- tsk_cpus_allowed(task)) ||
- task_running(rq, task) ||
- + !rt_task(task) ||
- !task_on_rq_queued(task))) {
- double_unlock_balance(rq, lowest_rq);
- @@ -2030,7 +1938,7 @@
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- - BUG_ON(p->nr_cpus_allowed <= 1);
- + BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
- @@ -2111,7 +2019,9 @@
- }
- deactivate_task(rq, next_task, 0);
- + next_task->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(next_task, lowest_rq->cpu);
- + next_task->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(lowest_rq, next_task, 0);
- ret = 1;
- @@ -2133,160 +2043,172 @@
- }
- #ifdef HAVE_RT_PUSH_IPI
- +
- /*
- - * The search for the next cpu always starts at rq->cpu and ends
- - * when we reach rq->cpu again. It will never return rq->cpu.
- - * This returns the next cpu to check, or nr_cpu_ids if the loop
- - * is complete.
- + * When a high priority task schedules out from a CPU and a lower priority
- + * task is scheduled in, a check is made to see if there's any RT tasks
- + * on other CPUs that are waiting to run because a higher priority RT task
- + * is currently running on its CPU. In this case, the CPU with multiple RT
- + * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- + * up that may be able to run one of its non-running queued RT tasks.
- + *
- + * All CPUs with overloaded RT tasks need to be notified as there is currently
- + * no way to know which of these CPUs have the highest priority task waiting
- + * to run. Instead of trying to take a spinlock on each of these CPUs,
- + * which has shown to cause large latency when done on machines with many
- + * CPUs, sending an IPI to the CPUs to have them push off the overloaded
- + * RT tasks waiting to run.
- + *
- + * Just sending an IPI to each of the CPUs is also an issue, as on large
- + * count CPU machines, this can cause an IPI storm on a CPU, especially
- + * if its the only CPU with multiple RT tasks queued, and a large number
- + * of CPUs scheduling a lower priority task at the same time.
- + *
- + * Each root domain has its own irq work function that can iterate over
- + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- + * tasks must be checked if there's one or many CPUs that are lowering
- + * their priority, there's a single irq work iterator that will try to
- + * push off RT tasks that are waiting to run.
- + *
- + * When a CPU schedules a lower priority task, it will kick off the
- + * irq work iterator that will jump to each CPU with overloaded RT tasks.
- + * As it only takes the first CPU that schedules a lower priority task
- + * to start the process, the rto_start variable is incremented and if
- + * the atomic result is one, then that CPU will try to take the rto_lock.
- + * This prevents high contention on the lock as the process handles all
- + * CPUs scheduling lower priority tasks.
- + *
- + * All CPUs that are scheduling a lower priority task will increment the
- + * rt_loop_next variable. This will make sure that the irq work iterator
- + * checks all RT overloaded CPUs whenever a CPU schedules a new lower
- + * priority task, even if the iterator is in the middle of a scan. Incrementing
- + * the rt_loop_next will cause the iterator to perform another scan.
- *
- - * rq->rt.push_cpu holds the last cpu returned by this function,
- - * or if this is the first instance, it must hold rq->cpu.
- */
- -static int rto_next_cpu(struct rq *rq)
- +static int rto_next_cpu(struct root_domain *rd)
- {
- - int prev_cpu = rq->rt.push_cpu;
- + int next;
- int cpu;
- - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
- -
- /*
- - * If the previous cpu is less than the rq's CPU, then it already
- - * passed the end of the mask, and has started from the beginning.
- - * We end if the next CPU is greater or equal to rq's CPU.
- + * When starting the IPI RT pushing, the rto_cpu is set to -1,
- + * rt_next_cpu() will simply return the first CPU found in
- + * the rto_mask.
- + *
- + * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
- + * will return the next CPU found in the rto_mask.
- + *
- + * If there are no more CPUs left in the rto_mask, then a check is made
- + * against rto_loop and rto_loop_next. rto_loop is only updated with
- + * the rto_lock held, but any CPU may increment the rto_loop_next
- + * without any locking.
- */
- - if (prev_cpu < rq->cpu) {
- - if (cpu >= rq->cpu)
- - return nr_cpu_ids;
- + for (;;) {
- - } else if (cpu >= nr_cpu_ids) {
- - /*
- - * We passed the end of the mask, start at the beginning.
- - * If the result is greater or equal to the rq's CPU, then
- - * the loop is finished.
- - */
- - cpu = cpumask_first(rq->rd->rto_mask);
- - if (cpu >= rq->cpu)
- - return nr_cpu_ids;
- - }
- - rq->rt.push_cpu = cpu;
- + /* When rto_cpu is -1 this acts like cpumask_first() */
- + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- - /* Return cpu to let the caller know if the loop is finished or not */
- - return cpu;
- -}
- + rd->rto_cpu = cpu;
- -static int find_next_push_cpu(struct rq *rq)
- -{
- - struct rq *next_rq;
- - int cpu;
- + if (cpu < nr_cpu_ids)
- + return cpu;
- - while (1) {
- - cpu = rto_next_cpu(rq);
- - if (cpu >= nr_cpu_ids)
- - break;
- - next_rq = cpu_rq(cpu);
- + rd->rto_cpu = -1;
- +
- + /*
- + * ACQUIRE ensures we see the @rto_mask changes
- + * made prior to the @next value observed.
- + *
- + * Matches WMB in rt_set_overload().
- + */
- + next = atomic_read_acquire(&rd->rto_loop_next);
- - /* Make sure the next rq can push to this rq */
- - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
- + if (rd->rto_loop == next)
- break;
- +
- + rd->rto_loop = next;
- }
- - return cpu;
- + return -1;
- }
- -#define RT_PUSH_IPI_EXECUTING 1
- -#define RT_PUSH_IPI_RESTART 2
- +static inline bool rto_start_trylock(atomic_t *v)
- +{
- + return !atomic_cmpxchg_acquire(v, 0, 1);
- +}
- -static void tell_cpu_to_push(struct rq *rq)
- +static inline void rto_start_unlock(atomic_t *v)
- {
- - int cpu;
- + atomic_set_release(v, 0);
- +}
- - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- - raw_spin_lock(&rq->rt.push_lock);
- - /* Make sure it's still executing */
- - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- - /*
- - * Tell the IPI to restart the loop as things have
- - * changed since it started.
- - */
- - rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- - raw_spin_unlock(&rq->rt.push_lock);
- - return;
- - }
- - raw_spin_unlock(&rq->rt.push_lock);
- - }
- +static void tell_cpu_to_push(struct rq *rq)
- +{
- + int cpu = -1;
- - /* When here, there's no IPI going around */
- + /* Keep the loop going if the IPI is currently active */
- + atomic_inc(&rq->rd->rto_loop_next);
- - rq->rt.push_cpu = rq->cpu;
- - cpu = find_next_push_cpu(rq);
- - if (cpu >= nr_cpu_ids)
- + /* Only one CPU can initiate a loop at a time */
- + if (!rto_start_trylock(&rq->rd->rto_loop_start))
- return;
- - rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
- + raw_spin_lock(&rq->rd->rto_lock);
- - irq_work_queue_on(&rq->rt.push_work, cpu);
- + /*
- + * The rto_cpu is updated under the lock, if it has a valid cpu
- + * then the IPI is still running and will continue due to the
- + * update to loop_next, and nothing needs to be done here.
- + * Otherwise it is finishing up and an ipi needs to be sent.
- + */
- + if (rq->rd->rto_cpu < 0)
- + cpu = rto_next_cpu(rq->rd);
- +
- + raw_spin_unlock(&rq->rd->rto_lock);
- +
- + rto_start_unlock(&rq->rd->rto_loop_start);
- +
- + if (cpu >= 0) {
- + /* Make sure the rd does not get freed while pushing */
- + sched_get_rd(rq->rd);
- + irq_work_queue_on(&rq->rd->rto_push_work, cpu);
- + }
- }
- /* Called from hardirq context */
- -static void try_to_push_tasks(void *arg)
- +void rto_push_irq_work_func(struct irq_work *work)
- {
- - struct rt_rq *rt_rq = arg;
- - struct rq *rq, *src_rq;
- - int this_cpu;
- + struct root_domain *rd =
- + container_of(work, struct root_domain, rto_push_work);
- + struct rq *rq;
- int cpu;
- - this_cpu = rt_rq->push_cpu;
- -
- - /* Paranoid check */
- - BUG_ON(this_cpu != smp_processor_id());
- + rq = this_rq();
- - rq = cpu_rq(this_cpu);
- - src_rq = rq_of_rt_rq(rt_rq);
- -
- -again:
- + /*
- + * We do not need to grab the lock to check for has_pushable_tasks.
- + * When it gets updated, a check is made if a push is possible.
- + */
- if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
- - push_rt_task(rq);
- + push_rt_tasks(rq);
- raw_spin_unlock(&rq->lock);
- }
- - /* Pass the IPI to the next rt overloaded queue */
- - raw_spin_lock(&rt_rq->push_lock);
- - /*
- - * If the source queue changed since the IPI went out,
- - * we need to restart the search from that CPU again.
- - */
- - if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- - rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- - rt_rq->push_cpu = src_rq->cpu;
- - }
- + raw_spin_lock(&rd->rto_lock);
- - cpu = find_next_push_cpu(src_rq);
- + /* Pass the IPI to the next rt overloaded queue */
- + cpu = rto_next_cpu(rd);
- - if (cpu >= nr_cpu_ids)
- - rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- - raw_spin_unlock(&rt_rq->push_lock);
- + raw_spin_unlock(&rd->rto_lock);
- - if (cpu >= nr_cpu_ids)
- + if (cpu < 0) {
- + sched_put_rd(rd);
- return;
- -
- - /*
- - * It is possible that a restart caused this CPU to be
- - * chosen again. Don't bother with an IPI, just see if we
- - * have more to push.
- - */
- - if (unlikely(cpu == rq->cpu))
- - goto again;
- + }
- /* Try the next RT overloaded CPU */
- - irq_work_queue_on(&rt_rq->push_work, cpu);
- -}
- -
- -static void push_irq_work_func(struct irq_work *work)
- -{
- - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
- -
- - try_to_push_tasks(rt_rq);
- + irq_work_queue_on(&rd->rto_push_work, cpu);
- }
- #endif /* HAVE_RT_PUSH_IPI */
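The new IPI chain is easiest to read as a small state machine: rto_cpu is the cursor (-1 when no chain is in flight), rto_loop_start ensures only one CPU kicks a chain off, and rto_loop/rto_loop_next force one more full pass over rto_mask whenever another CPU lowered its priority mid-scan. The single-threaded sketch below models just that iterator logic; the rto_lock and the acquire/release atomics are deliberately left out, so it is an illustration of the control flow, not of the memory ordering.

#include <stdio.h>

#define NR_CPUS 4

static int rto_mask[NR_CPUS] = { 0, 1, 0, 1 };	/* CPUs 1 and 3 are RT-overloaded */
static int rto_cpu = -1;			/* iterator cursor, -1 = no chain in flight */
static int rto_loop, rto_loop_next;		/* finished generation / requested generation */

static int rto_next_cpu(void)
{
	for (;;) {
		int cpu;

		/* With rto_cpu == -1 this behaves like cpumask_first(). */
		for (cpu = rto_cpu + 1; cpu < NR_CPUS; cpu++) {
			if (rto_mask[cpu]) {
				rto_cpu = cpu;
				return cpu;
			}
		}

		rto_cpu = -1;

		/* Nobody asked for another pass while we were scanning: stop. */
		if (rto_loop == rto_loop_next)
			return -1;

		rto_loop = rto_loop_next;	/* someone did: rescan the mask */
	}
}

static void tell_cpu_to_push(void)
{
	rto_loop_next++;			/* keep any running chain going */

	if (rto_cpu < 0) {			/* no chain in flight: start one */
		int cpu = rto_next_cpu();

		if (cpu >= 0)
			printf("queue rto_push_work on CPU %d\n", cpu);
	}
}

int main(void)
{
	int cpu;

	tell_cpu_to_push();	/* first CPU scheduling a lower-prio task starts the chain */
	tell_cpu_to_push();	/* a second one only bumps rto_loop_next */

	/* Each irq-work run pushes on its own CPU, then passes the baton. */
	while ((cpu = rto_next_cpu()) >= 0)
		printf("rto_push_work passes to CPU %d\n", cpu);

	return 0;
}

The key property this buys over the old per-rt_rq push_cpu scheme: a scan already in flight never has to be restarted from scratch; later requests just extend it by one more generation.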
- @@ -2296,8 +2218,9 @@
- bool resched = false;
- struct task_struct *p;
- struct rq *src_rq;
- + int rt_overload_count = rt_overloaded(this_rq);
- - if (likely(!rt_overloaded(this_rq)))
- + if (likely(!rt_overload_count))
- return;
- /*
- @@ -2306,6 +2229,11 @@
- */
- smp_rmb();
- + /* If we are the only overloaded CPU do nothing */
- + if (rt_overload_count == 1 &&
- + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
- + return;
- +
- #ifdef HAVE_RT_PUSH_IPI
- if (sched_feat(RT_PUSH_IPI)) {
- tell_cpu_to_push(this_rq);
- @@ -2365,7 +2293,9 @@
- resched = true;
- deactivate_task(src_rq, p, 0);
- + p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, this_cpu);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(this_rq, p, 0);
- /*
- * We continue with the search, just in
- @@ -2390,53 +2320,13 @@
- {
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- - has_pushable_tasks(rq) &&
- - p->nr_cpus_allowed > 1 &&
- + tsk_nr_cpus_allowed(p) > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
- - (rq->curr->nr_cpus_allowed < 2 ||
- + (tsk_nr_cpus_allowed(rq->curr) < 2 ||
- rq->curr->prio <= p->prio))
- push_rt_tasks(rq);
- }
- -static void set_cpus_allowed_rt(struct task_struct *p,
- - const struct cpumask *new_mask)
- -{
- - struct rq *rq;
- - int weight;
- -
- - BUG_ON(!rt_task(p));
- -
- - if (!task_on_rq_queued(p))
- - return;
- -
- - weight = cpumask_weight(new_mask);
- -
- - /*
- - * Only update if the process changes its state from whether it
- - * can migrate or not.
- - */
- - if ((p->nr_cpus_allowed > 1) == (weight > 1))
- - return;
- -
- - rq = task_rq(p);
- -
- - /*
- - * The process used to be able to migrate OR it can now migrate
- - */
- - if (weight <= 1) {
- - if (!task_current(rq, p))
- - dequeue_pushable_task(rq, p);
- - BUG_ON(!rq->rt.rt_nr_migratory);
- - rq->rt.rt_nr_migratory--;
- - } else {
- - if (!task_current(rq, p))
- - enqueue_pushable_task(rq, p);
- - rq->rt.rt_nr_migratory++;
- - }
- -
- - update_rt_migration(&rq->rt);
- -}
- -
- /* Assumes rq->lock is held */
- static void rq_online_rt(struct rq *rq)
- {
- @@ -2466,13 +2356,6 @@
- static void switched_from_rt(struct rq *rq, struct task_struct *p)
- {
- /*
- - * On class switch from rt, always cancel active schedtune timers,
- - * this handles the cases where we switch class for a task that is
- - * already rt-dequeued but has a running timer.
- - */
- - schedtune_dequeue_rt(rq, p);
- -
- - /*
- * If there are other RT tasks then we will reschedule
- * and the scheduling of the other RT tasks will handle
- * the balancing. But if we are the last RT task
- @@ -2512,7 +2395,7 @@
- */
- if (task_on_rq_queued(p) && rq->curr != p) {
- #ifdef CONFIG_SMP
- - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
- + if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
- queue_push_tasks(rq);
- #endif /* CONFIG_SMP */
- if (p->prio < rq->curr->prio)
- @@ -2590,7 +2473,7 @@
- update_curr_rt(rq);
- if (rq->rt.rt_nr_running)
- - sched_rt_update_capacity_req(rq, true);
- + sched_rt_update_capacity_req(rq);
- watchdog(rq, p);
- @@ -2654,7 +2537,7 @@
- #ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_rt,
- - .set_cpus_allowed = set_cpus_allowed_rt,
- + .set_cpus_allowed = set_cpus_allowed_common,
- .rq_online = rq_online_rt,
- .rq_offline = rq_offline_rt,
- .task_woken = task_woken_rt,
- diff -Nur /home/ninez/android/marlin/kernel/sched/sched.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/sched.h
- --- /home/ninez/android/marlin/kernel/sched/sched.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/sched.h 2018-08-26 16:43:11.650539699 -0400
- @@ -1,3 +1,4 @@
- +
- #include <linux/sched.h>
- #include <linux/sched/sysctl.h>
- #include <linux/sched/rt.h>
- @@ -13,6 +14,12 @@
- #include "cpudeadline.h"
- #include "cpuacct.h"
- +#ifdef CONFIG_SCHED_DEBUG
- +#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
- +#else
- +#define SCHED_WARN_ON(x) ((void)(x))
- +#endif
- +
- struct rq;
- struct cpuidle_state;
- @@ -34,6 +41,12 @@
- static inline void update_cpu_load_active(struct rq *this_rq) { }
- #endif
- +#ifdef CONFIG_SCHED_SMT
- +extern void update_idle_core(struct rq *rq);
- +#else
- +static inline void update_idle_core(struct rq *rq) { }
- +#endif
- +
- /*
- * Helpers for converting nanosecond timing to jiffy resolution
- */
- @@ -47,23 +60,30 @@
- * and does not change the user-interface for setting shares/weights.
- *
- * We increase resolution only if we have enough bits to allow this increased
- - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
- - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
- - * increased costs.
- - */
- -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
- -# define SCHED_LOAD_RESOLUTION 10
- -# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
- -# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
- + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
- + * pretty high and the returns do not justify the increased costs.
- + *
- + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
- + * increase coverage and consistency always enable it on 64bit platforms.
- + */
- +#ifdef CONFIG_64BIT
- +# define SCHED_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
- +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
- +# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
- #else
- -# define SCHED_LOAD_RESOLUTION 0
- +# define SCHED_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
- # define scale_load(w) (w)
- # define scale_load_down(w) (w)
- #endif
- -#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
- #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
- +/*
- + * NICE_0's weight (visible to users) and its load (invisible to users) have
- + * independent ranges, but they should be well calibrated. We use scale_load()
- + * and scale_load_down(w) to convert between them, and the following must be true:
- + * scale_load(sched_prio_to_weight[20]) == NICE_0_LOAD
- + */
- #define NICE_0_LOAD SCHED_LOAD_SCALE
- #define NICE_0_SHIFT SCHED_LOAD_SHIFT
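The calibration requirement stated in the new comment can be checked with plain arithmetic. Assuming SCHED_FIXEDPOINT_SHIFT is 10 (its usual mainline value; it is not defined in this hunk), the 64-bit branch gives SCHED_LOAD_SHIFT = 20 and NICE_0_LOAD = 1 << 20, and scale_load(1024) lands exactly on it:

#include <assert.h>
#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10	/* assumed, as in mainline */

/* 64-bit case from the hunk above */
#define SCHED_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
#define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)

static const long nice0_weight = 1024;	/* sched_prio_to_weight[20] */

int main(void)
{
	/* The calibration the comment demands: user weight 1024 <-> NICE_0_LOAD. */
	assert(scale_load(nice0_weight) == NICE_0_LOAD);
	assert(scale_load_down(NICE_0_LOAD) == nice0_weight);

	printf("SCHED_LOAD_SHIFT=%d NICE_0_LOAD=%ld\n",
	       SCHED_LOAD_SHIFT, (long)NICE_0_LOAD);
	return 0;
}

On 32-bit the shift collapses back to 10 and scale_load()/scale_load_down() become the identity, which is exactly the #else branch above.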
- @@ -83,6 +103,10 @@
- */
- #define RUNTIME_INF ((u64)~0ULL)
- +static inline int idle_policy(int policy)
- +{
- + return policy == SCHED_IDLE;
- +}
- static inline int fair_policy(int policy)
- {
- return policy == SCHED_NORMAL || policy == SCHED_BATCH;
- @@ -97,6 +121,11 @@
- {
- return policy == SCHED_DEADLINE;
- }
- +static inline bool valid_policy(int policy)
- +{
- + return idle_policy(policy) || fair_policy(policy) ||
- + rt_policy(policy) || dl_policy(policy);
- +}
- static inline int task_has_rt_policy(struct task_struct *p)
- {
- @@ -108,11 +137,6 @@
- return dl_policy(p->policy);
- }
- -static inline bool dl_time_before(u64 a, u64 b)
- -{
- - return (s64)(a - b) < 0;
- -}
- -
- /*
- * Tells if entity @a should preempt entity @b.
- */
- @@ -183,6 +207,25 @@
- u64 bw, total_bw;
- };
- +static inline
- +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
- +{
- + dl_b->total_bw -= tsk_bw;
- +}
- +
- +static inline
- +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
- +{
- + dl_b->total_bw += tsk_bw;
- +}
- +
- +static inline
- +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
- +{
- + return dl_b->bw != -1 &&
- + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
- +}
- +
- extern struct mutex sched_domains_mutex;
- #ifdef CONFIG_CGROUP_SCHED
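The three __dl_* helpers above are the whole deadline-class admission test: a task of bandwidth new_bw fits a root domain of `cpus` CPUs as long as bw * cpus >= total_bw - old_bw + new_bw, with bw == -1 meaning unlimited. A standalone illustration of that check; the 1<<20 fixed-point scale and the 95% per-CPU cap are assumptions of this sketch mirroring the usual sysctl defaults, not something defined in this hunk.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dl_bw {
	int64_t bw;		/* per-CPU cap, -1 == unlimited */
	int64_t total_bw;	/* sum of admitted task bandwidths */
};

/* Same logic as the helpers in the hunk above. */
static bool __dl_overflow(struct dl_bw *dl_b, int cpus,
			  int64_t old_bw, int64_t new_bw)
{
	return dl_b->bw != -1 &&
	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

static void __dl_add(struct dl_bw *dl_b, int64_t tsk_bw)
{
	dl_b->total_bw += tsk_bw;
}

int main(void)
{
	/* 95% of each CPU, expressed in 1<<20 fixed point (assumed scale). */
	struct dl_bw dl_b = { .bw = (95 << 20) / 100, .total_bw = 0 };
	int cpus = 4;
	/* runtime 30ms every 100ms -> utilization 0.30 */
	int64_t tsk_bw = (30 << 20) / 100;
	int i;

	for (i = 0; i < 16; i++) {
		if (__dl_overflow(&dl_b, cpus, 0, tsk_bw)) {
			printf("task %d rejected (would overflow)\n", i);
			break;
		}
		__dl_add(&dl_b, tsk_bw);
		printf("task %d admitted, total_bw=%lld\n", i, (long long)dl_b.total_bw);
	}
	return 0;
}

With these numbers, twelve 30%-utilization tasks fit on four CPUs (3.6 of the allowed 3.8) and the thirteenth is refused.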
- @@ -365,6 +408,7 @@
- unsigned long runnable_load_avg;
- #ifdef CONFIG_FAIR_GROUP_SCHED
- unsigned long tg_load_avg_contrib;
- + unsigned long propagate_avg;
- #endif
- atomic_long_t removed_load_avg, removed_util_avg;
- #ifndef CONFIG_64BIT
- @@ -422,7 +466,7 @@
- }
- /* RT IPI pull logic requires IRQ_WORK */
- -#ifdef CONFIG_IRQ_WORK
- +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
- # define HAVE_RT_PUSH_IPI
- #endif
- @@ -430,6 +474,7 @@
- struct rt_rq {
- struct rt_prio_array active;
- unsigned int rt_nr_running;
- + unsigned int rr_nr_running;
- #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- struct {
- int curr; /* highest queued rt task prio */
- @@ -443,12 +488,6 @@
- unsigned long rt_nr_total;
- int overloaded;
- struct plist_head pushable_tasks;
- -#ifdef HAVE_RT_PUSH_IPI
- - int push_flags;
- - int push_cpu;
- - struct irq_work push_work;
- - raw_spinlock_t push_lock;
- -#endif
- #endif /* CONFIG_SMP */
- int rt_queued;
- @@ -466,6 +505,8 @@
- #endif
- };
- +int try_to_unthrottle_rt_rq(struct rt_rq *rt_rq);
- +
- /* Deadline class' related fields in a runqueue */
- struct dl_rq {
- /* runqueue is an rbtree, ordered by deadline */
- @@ -541,6 +582,19 @@
- struct dl_bw dl_bw;
- struct cpudl cpudl;
- +#ifdef HAVE_RT_PUSH_IPI
- + /*
- + * For IPI pull requests, loop across the rto_mask.
- + */
- + struct irq_work rto_push_work;
- + raw_spinlock_t rto_lock;
- + /* These are only updated and read within rto_lock */
- + int rto_loop;
- + int rto_cpu;
- + /* These atomics are updated outside of a lock */
- + atomic_t rto_loop_next;
- + atomic_t rto_loop_start;
- +#endif
- /*
- * The "RT overload" flag: it gets set if a CPU has more than
- * one runnable RT task.
- @@ -550,10 +604,18 @@
- /* Maximum cpu capacity in the system. */
- struct max_cpu_capacity max_cpu_capacity;
- +
- + /* First cpu with maximum and minimum original capacity */
- + int max_cap_orig_cpu, min_cap_orig_cpu;
- };
- extern struct root_domain def_root_domain;
- +extern void sched_get_rd(struct root_domain *rd);
- +extern void sched_put_rd(struct root_domain *rd);
- +#ifdef HAVE_RT_PUSH_IPI
- +extern void rto_push_irq_work_func(struct irq_work *work);
- +#endif
- #endif /* CONFIG_SMP */
- /*
- @@ -587,7 +649,13 @@
- #ifdef CONFIG_NO_HZ_FULL
- unsigned long last_sched_tick;
- #endif
- - int skip_clock_update;
- +
- +#ifdef CONFIG_CPU_QUIET
- + /* time-based average load */
- + u64 nr_last_stamp;
- + u64 nr_running_integral;
- + seqcount_t ave_seqcnt;
- +#endif
- /* capture load from *all* tasks on this cpu: */
- struct load_weight load;
- @@ -601,6 +669,7 @@
- #ifdef CONFIG_FAIR_GROUP_SCHED
- /* list of leaf cfs_rq on this cpu: */
- struct list_head leaf_cfs_rq_list;
- + struct list_head *tmp_alone_branch;
- #endif /* CONFIG_FAIR_GROUP_SCHED */
- /*
- @@ -615,6 +684,7 @@
- unsigned long next_balance;
- struct mm_struct *prev_mm;
- + unsigned int clock_skip_update;
- u64 clock;
- u64 clock_task;
- @@ -633,6 +703,7 @@
- /* For active balancing */
- int active_balance;
- int push_cpu;
- + struct task_struct *push_task;
- struct cpu_stop_work active_balance_work;
- /* cpu of this runqueue: */
- int cpu;
- @@ -651,24 +722,14 @@
- #endif
- #ifdef CONFIG_SCHED_WALT
- - /*
- - * max_freq = user or thermal defined maximum
- - * max_possible_freq = maximum supported by hardware
- - */
- - unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
- - struct cpumask freq_domain_cpumask;
- -
- u64 cumulative_runnable_avg;
- - int efficiency; /* Differentiate cpus with different IPC capability */
- - int load_scale_factor;
- - int capacity;
- - int max_possible_capacity;
- u64 window_start;
- u64 curr_runnable_sum;
- u64 prev_runnable_sum;
- u64 cur_irqload;
- u64 avg_irqload;
- u64 irqload_ts;
- + u64 cum_window_demand;
- #endif /* CONFIG_SCHED_WALT */
- @@ -710,6 +771,8 @@
- /* try_to_wake_up() stats */
- unsigned int ttwu_count;
- unsigned int ttwu_local;
- +
- + struct eas_stats eas_stats;
- #endif
- #ifdef CONFIG_SMP
- @@ -742,7 +805,7 @@
- static inline u64 __rq_clock_broken(struct rq *rq)
- {
- - return ACCESS_ONCE(rq->clock);
- + return READ_ONCE(rq->clock);
- }
- static inline u64 rq_clock(struct rq *rq)
- @@ -757,6 +820,18 @@
- return rq->clock_task;
- }
- +#define RQCF_REQ_SKIP 0x01
- +#define RQCF_ACT_SKIP 0x02
- +
- +static inline void rq_clock_skip_update(struct rq *rq, bool skip)
- +{
- + lockdep_assert_held(&rq->lock);
- + if (skip)
- + rq->clock_skip_update |= RQCF_REQ_SKIP;
- + else
- + rq->clock_skip_update &= ~RQCF_REQ_SKIP;
- +}
- +
- #ifdef CONFIG_NUMA_BALANCING
- extern void sched_setnuma(struct task_struct *p, int node);
- extern int migrate_task_to(struct task_struct *p, int cpu);
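rq_clock_skip_update() above only records or clears the request bit (RQCF_REQ_SKIP); the consumer side is not part of this hunk. The sketch below assumes the usual upstream behaviour, where __schedule() promotes the request to RQCF_ACT_SKIP and update_rq_clock() then skips re-reading the clock once, so treat the promotion/clearing details as an approximation rather than a quote of this kernel.

#include <stdio.h>

#define RQCF_REQ_SKIP 0x01
#define RQCF_ACT_SKIP 0x02

struct rq {
	unsigned int clock_skip_update;
	unsigned long long clock;
};

static void rq_clock_skip_update(struct rq *rq, int skip)
{
	if (skip)
		rq->clock_skip_update |= RQCF_REQ_SKIP;
	else
		rq->clock_skip_update &= ~RQCF_REQ_SKIP;
}

/* Assumed consumer side. */
static void update_rq_clock(struct rq *rq, unsigned long long now)
{
	if (rq->clock_skip_update & RQCF_ACT_SKIP)
		return;			/* a caller asked us not to re-read the clock */
	rq->clock = now;
}

static void schedule_model(struct rq *rq, unsigned long long now)
{
	rq->clock_skip_update <<= 1;	/* promote REQ_SKIP to ACT_SKIP */
	update_rq_clock(rq, now);
	rq->clock_skip_update = 0;	/* the request is one-shot */
}

int main(void)
{
	struct rq rq = { 0, 100 };

	rq_clock_skip_update(&rq, 1);	/* a path that knows the clock is already fresh */
	schedule_model(&rq, 200);
	printf("clock after skipped update: %llu\n", rq.clock);	/* still 100 */

	schedule_model(&rq, 300);
	printf("clock after normal update:  %llu\n", rq.clock);	/* now 300 */
	return 0;
}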
- @@ -836,8 +911,8 @@
- DECLARE_PER_CPU(struct sched_domain *, sd_llc);
- DECLARE_PER_CPU(int, sd_llc_size);
- DECLARE_PER_CPU(int, sd_llc_id);
- +DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
- DECLARE_PER_CPU(struct sched_domain *, sd_numa);
- -DECLARE_PER_CPU(struct sched_domain *, sd_busy);
- DECLARE_PER_CPU(struct sched_domain *, sd_asym);
- DECLARE_PER_CPU(struct sched_domain *, sd_ea);
- DECLARE_PER_CPU(struct sched_domain *, sd_scs);
- @@ -850,12 +925,9 @@
- */
- unsigned long capacity;
- unsigned long max_capacity; /* Max per-cpu capacity in group */
- + unsigned long min_capacity; /* Min per-CPU capacity in group */
- unsigned long next_update;
- int imbalance; /* XXX unrelated to capacity but shared group state */
- - /*
- - * Number of busy cpus in this group.
- - */
- - atomic_t nr_busy_cpus;
- unsigned long cpumask[0]; /* iteration mask */
- };
- @@ -866,7 +938,7 @@
- unsigned int group_weight;
- struct sched_group_capacity *sgc;
- - const struct sched_group_energy const *sge;
- + const struct sched_group_energy *sge;
- /*
- * The CPUs this group covers.
- @@ -878,9 +950,6 @@
- unsigned long cpumask[0];
- };
- -void set_energy_aware(void);
- -void clear_energy_aware(void);
- -
- static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
- {
- return to_cpumask(sg->cpumask);
- @@ -961,7 +1030,6 @@
- {
- return NULL;
- }
- -
- #endif /* CONFIG_CGROUP_SCHED */
- static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
- @@ -1022,17 +1090,8 @@
- #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
- #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
- -#ifdef CONFIG_NUMA_BALANCING
- -#define sched_feat_numa(x) sched_feat(x)
- -#ifdef CONFIG_SCHED_DEBUG
- -#define numabalancing_enabled sched_feat_numa(NUMA)
- -#else
- -extern bool numabalancing_enabled;
- -#endif /* CONFIG_SCHED_DEBUG */
- -#else
- -#define sched_feat_numa(x) (0)
- -#define numabalancing_enabled (0)
- -#endif /* CONFIG_NUMA_BALANCING */
- +extern struct static_key_false sched_numa_balancing;
- +extern struct static_key_false sched_schedstats;
- static inline u64 global_rt_period(void)
- {
- @@ -1074,9 +1133,6 @@
- #ifndef prepare_arch_switch
- # define prepare_arch_switch(next) do { } while (0)
- #endif
- -#ifndef finish_arch_switch
- -# define finish_arch_switch(prev) do { } while (0)
- -#endif
- #ifndef finish_arch_post_lock_switch
- # define finish_arch_post_lock_switch() do { } while (0)
- #endif
- @@ -1101,7 +1157,7 @@
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- *
- - * Pairs with the control dependency and rmb in try_to_wake_up().
- + * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
- */
- smp_store_release(&prev->on_cpu, 0);
- #endif
- @@ -1139,59 +1195,45 @@
- #define WEIGHT_IDLEPRIO 3
- #define WMULT_IDLEPRIO 1431655765
- -/*
- - * Nice levels are multiplicative, with a gentle 10% change for every
- - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- - * nice 1, it will get ~10% less CPU time than another CPU-bound task
- - * that remained on nice 0.
- - *
- - * The "10% effect" is relative and cumulative: from _any_ nice level,
- - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- - * If a task goes up by ~10% and another task goes down by ~10% then
- - * the relative distance between them is ~25%.)
- - */
- -static const int prio_to_weight[40] = {
- - /* -20 */ 88761, 71755, 56483, 46273, 36291,
- - /* -15 */ 29154, 23254, 18705, 14949, 11916,
- - /* -10 */ 9548, 7620, 6100, 4904, 3906,
- - /* -5 */ 3121, 2501, 1991, 1586, 1277,
- - /* 0 */ 1024, 820, 655, 526, 423,
- - /* 5 */ 335, 272, 215, 172, 137,
- - /* 10 */ 110, 87, 70, 56, 45,
- - /* 15 */ 36, 29, 23, 18, 15,
- -};
- +extern const int sched_prio_to_weight[40];
- +extern const u32 sched_prio_to_wmult[40];
- /*
- - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
- + * {de,en}queue flags:
- *
- - * In cases where the weight does not change often, we can use the
- - * precalculated inverse to speed up arithmetics by turning divisions
- - * into multiplications:
- - */
- -static const u32 prio_to_wmult[40] = {
- - /* -20 */ 48388, 59856, 76040, 92818, 118348,
- - /* -15 */ 147320, 184698, 229616, 287308, 360437,
- - /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
- -};
- + * DEQUEUE_SLEEP - task is no longer runnable
- + * ENQUEUE_WAKEUP - task just became runnable
- + *
- + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
- + * are in a known state which allows modification. Such pairs
- + * should preserve as much state as possible.
- + *
- + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- + * in the runqueue.
- + *
- + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
- + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
- + * ENQUEUE_WAKING - sched_class::task_waking was called
- + *
- + */
- -#define ENQUEUE_WAKEUP 1
- -#define ENQUEUE_HEAD 2
- +#define DEQUEUE_SLEEP 0x01
- +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
- +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
- +#define DEQUEUE_IDLE 0x80 /* The last dequeue before IDLE */
- +
- +#define ENQUEUE_WAKEUP 0x01
- +#define ENQUEUE_RESTORE 0x02
- +#define ENQUEUE_MOVE 0x04
- +
- +#define ENQUEUE_HEAD 0x08
- +#define ENQUEUE_REPLENISH 0x10
- #ifdef CONFIG_SMP
- -#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
- +#define ENQUEUE_WAKING 0x20
- #else
- -#define ENQUEUE_WAKING 0
- +#define ENQUEUE_WAKING 0x00
- #endif
- -#define ENQUEUE_REPLENISH 0x08
- -#define ENQUEUE_RESTORE 0x10
- -#define ENQUEUE_WAKEUP_NEW 0x20
- -
- -#define DEQUEUE_SLEEP 1
- +#define ENQUEUE_WAKEUP_NEW 0x40
- #define RETRY_TASK ((void *)-1UL)
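- The table removed above (now shared as sched_prio_to_weight[]) encodes the ~10% per-nice-level rule with a ~1.25 multiplier between adjacent levels. A worked example using the weights shown above for nice 0 (1024) and nice 1 (820); the share calculation is a sketch of how CFS proportions CPU time between two always-runnable tasks:
- /* Worked example: relative CPU shares of two CPU-bound tasks. */
- #include <stdio.h>
-
- int main(void)
- {
-         const int w_nice0 = 1024, w_nice1 = 820;  /* from the weight table above */
-         double total = w_nice0 + w_nice1;
-
-         /* prints ~55.5% vs ~44.5%; the weight ratio is ~1.25 */
-         printf("nice 0: %.1f%%  nice 1: %.1f%%  ratio %.2f\n",
-                100.0 * w_nice0 / total, 100.0 * w_nice1 / total,
-                (double)w_nice0 / w_nice1);
-         return 0;
- }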
- @@ -1214,12 +1256,14 @@
- * tasks.
- */
- struct task_struct * (*pick_next_task) (struct rq *rq,
- - struct task_struct *prev);
- + struct task_struct *prev,
- + struct pin_cookie cookie);
- void (*put_prev_task) (struct rq *rq, struct task_struct *p);
- #ifdef CONFIG_SMP
- - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- - void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
- + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
- + int sibling_count_hint);
- + void (*migrate_task_rq)(struct task_struct *p);
- void (*task_waking) (struct task_struct *task);
- void (*task_woken) (struct rq *this_rq, struct task_struct *task);
- @@ -1251,8 +1295,11 @@
- void (*update_curr) (struct rq *rq);
- +#define TASK_SET_GROUP 0
- +#define TASK_MOVE_GROUP 1
- +
- #ifdef CONFIG_FAIR_GROUP_SCHED
- - void (*task_move_group) (struct task_struct *p);
- + void (*task_change_group)(struct task_struct *p, int type);
- #endif
- };
- @@ -1261,6 +1308,11 @@
- prev->sched_class->put_prev_task(rq, prev);
- }
- +static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
- +{
- + curr->sched_class->set_curr_task(rq);
- +}
- +
- #define sched_class_highest (&stop_sched_class)
- #define for_each_class(class) \
- for (class = sched_class_highest; class; class = class->next)
- @@ -1279,13 +1331,7 @@
- extern void trigger_load_balance(struct rq *rq);
- -extern void idle_enter_fair(struct rq *this_rq);
- -extern void idle_exit_fair(struct rq *this_rq);
- -
- -#else
- -
- -static inline void idle_enter_fair(struct rq *rq) { }
- -static inline void idle_exit_fair(struct rq *rq) { }
- +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
- #endif
- @@ -1298,7 +1344,7 @@
- static inline struct cpuidle_state *idle_get_state(struct rq *rq)
- {
- - WARN_ON(!rcu_read_lock_held());
- + SCHED_WARN_ON(!rcu_read_lock_held());
- return rq->idle_state;
- }
- @@ -1340,7 +1386,6 @@
- extern void init_sched_dl_class(void);
- extern void init_sched_rt_class(void);
- extern void init_sched_fair_class(void);
- -extern void init_sched_dl_class(void);
- extern void resched_curr(struct rq *rq);
- extern void resched_cpu(int cpu);
- @@ -1350,14 +1395,14 @@
- extern struct dl_bandwidth def_dl_bandwidth;
- extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
- -extern void init_rt_schedtune_timer(struct sched_rt_entity *rt_se);
- extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
- unsigned long to_ratio(u64 period, u64 runtime);
- extern void init_entity_runnable_average(struct sched_entity *se);
- +extern void post_init_entity_util_avg(struct sched_entity *se);
- -static inline void add_nr_running(struct rq *rq, unsigned count)
- +static inline void __add_nr_running(struct rq *rq, unsigned count)
- {
- unsigned prev_nr = rq->nr_running;
- @@ -1385,11 +1430,48 @@
- }
- }
- -static inline void sub_nr_running(struct rq *rq, unsigned count)
- +static inline void __sub_nr_running(struct rq *rq, unsigned count)
- {
- rq->nr_running -= count;
- }
- +#ifdef CONFIG_CPU_QUIET
- +#define NR_AVE_SCALE(x) ((x) << FSHIFT)
- +static inline u64 do_nr_running_integral(struct rq *rq)
- +{
- + s64 nr, deltax;
- + u64 nr_running_integral = rq->nr_running_integral;
- +
- + deltax = rq->clock_task - rq->nr_last_stamp;
- + nr = NR_AVE_SCALE(rq->nr_running);
- +
- + nr_running_integral += nr * deltax;
- +
- + return nr_running_integral;
- +}
- +
- +static inline void add_nr_running(struct rq *rq, unsigned count)
- +{
- + write_seqcount_begin(&rq->ave_seqcnt);
- + rq->nr_running_integral = do_nr_running_integral(rq);
- + rq->nr_last_stamp = rq->clock_task;
- + __add_nr_running(rq, count);
- + write_seqcount_end(&rq->ave_seqcnt);
- +}
- +
- +static inline void sub_nr_running(struct rq *rq, unsigned count)
- +{
- + write_seqcount_begin(&rq->ave_seqcnt);
- + rq->nr_running_integral = do_nr_running_integral(rq);
- + rq->nr_last_stamp = rq->clock_task;
- + __sub_nr_running(rq, count);
- + write_seqcount_end(&rq->ave_seqcnt);
- +}
- +#else
- +#define add_nr_running __add_nr_running
- +#define sub_nr_running __sub_nr_running
- +#endif
- +
- static inline void rq_last_tick_reset(struct rq *rq)
- {
- #ifdef CONFIG_NO_HZ_FULL
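- The CONFIG_CPU_QUIET fields above maintain a fixed-point integral of nr_running over time; a consumer recovers the time-based average by differencing two samples of the integral and dividing by the elapsed clock_task time. A minimal sketch, with a hypothetical sampler (only do_nr_running_integral() and the seqcount come from this hunk; FSHIFT is the fixed-point shift used by NR_AVE_SCALE()):
- /* Hypothetical sampler: average nr_running since the last sample. */
- static unsigned int example_avg_nr_running(struct rq *rq,
-                                            u64 *last_integral, u64 *last_time)
- {
-         u64 integral, now, avg;
-         unsigned int seq;
-
-         do {
-                 seq = read_seqcount_begin(&rq->ave_seqcnt);
-                 integral = do_nr_running_integral(rq);
-                 now = rq->clock_task;
-         } while (read_seqcount_retry(&rq->ave_seqcnt, seq));
-
-         if (now == *last_time)
-                 return 0;
-
-         /* d(integral)/d(time) is the average, still scaled by 1 << FSHIFT */
-         avg = div64_u64(integral - *last_integral, now - *last_time);
-         *last_integral = integral;
-         *last_time = now;
-         return (unsigned int)(avg >> FSHIFT);
- }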
- @@ -1451,6 +1533,26 @@
- }
- #endif
- +#ifndef arch_scale_max_freq_capacity
- +static __always_inline
- +unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
- +{
- + return SCHED_CAPACITY_SCALE;
- +}
- +#endif
- +
- +#ifndef arch_scale_min_freq_capacity
- +static __always_inline
- +unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
- +{
- + /*
- + * Multiplied with any capacity value, this scale factor will return
- + * 0, which represents an un-capped state
- + */
- + return 0;
- +}
- +#endif
- +
- #ifndef arch_scale_cpu_capacity
- static __always_inline
- unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
- @@ -1473,33 +1575,9 @@
- return cpu_rq(cpu)->cpu_capacity_orig;
- }
- -/* Force usage of PELT signal, i.e. util_avg */
- -#define UTIL_AVG true
- -/* Use estimated utilization when possible, i.e. UTIL_EST feature enabled */
- -#define UTIL_EST false
- -static inline bool use_util_est(void)
- -{
- - return sched_feat(UTIL_EST);
- -}
- -
- extern unsigned int sysctl_sched_use_walt_cpu_util;
- extern unsigned int walt_ravg_window;
- -extern unsigned int walt_disabled;
- -
- -static inline unsigned long task_util(struct task_struct *p, bool use_pelt)
- -{
- -
- -#ifdef CONFIG_SCHED_WALT
- - if (!walt_disabled && sysctl_sched_use_walt_task_util) {
- - unsigned long demand = p->ravg.demand;
- - return (demand << 10) / walt_ravg_window;
- - }
- -#endif
- - if (use_util_est() && !use_pelt)
- - return p->se.avg.util_est;
- - return p->se.avg.util_avg;
- -}
- -
- +extern bool walt_disabled;
- /*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- @@ -1527,18 +1605,15 @@
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
- -static inline unsigned long __cpu_util(int cpu, int delta, bool use_pelt)
- +static inline unsigned long __cpu_util(int cpu, int delta)
- {
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
- - if (use_util_est() && !use_pelt)
- - util = max(util, cpu_rq(cpu)->cfs.avg.util_est);
- -
- #ifdef CONFIG_SCHED_WALT
- if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
- - util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) /
- - walt_ravg_window;
- + util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
- + walt_ravg_window >> SCHED_LOAD_SHIFT);
- #endif
- delta += util;
- if (delta < 0)
- @@ -1547,9 +1622,22 @@
- return (delta >= capacity) ? capacity : delta;
- }
- -static inline unsigned long cpu_util(int cpu, bool use_pelt)
- +static inline unsigned long cpu_util(int cpu)
- {
- - return __cpu_util(cpu, 0, use_pelt);
- + return __cpu_util(cpu, 0);
- +}
- +
- +static inline unsigned long cpu_util_freq(int cpu)
- +{
- + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- + unsigned long capacity = capacity_orig_of(cpu);
- +
- +#ifdef CONFIG_SCHED_WALT
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
- + util = div64_u64(cpu_rq(cpu)->prev_runnable_sum,
- + walt_ravg_window >> SCHED_LOAD_SHIFT);
- +#endif
- + return (util >= capacity) ? capacity : util;
- }
- #endif
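- A worked example of the WALT branch above, assuming SCHED_LOAD_SHIFT is 10 and the default ~20 ms window: the divisor walt_ravg_window >> 10 is about 19531 ns, so 10 ms of tracked demand in the window reports a utilization of roughly 512, i.e. half of SCHED_CAPACITY_SCALE:
- /* Worked example (userspace arithmetic only). */
- #include <stdio.h>
- #include <stdint.h>
-
- int main(void)
- {
-         uint64_t window = 20000000;   /* default walt_ravg_window, ns */
-         uint64_t demand = 10000000;   /* 10 ms of tracked demand */
-
-         /* prints 512: half of SCHED_CAPACITY_SCALE (1024) */
-         printf("util = %llu\n",
-                (unsigned long long)(demand / (window >> 10)));
-         return 0;
- }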
- @@ -1564,6 +1652,10 @@
- return static_key_false(&__sched_freq);
- }
- +/*
- + * sched_capacity_reqs expects capacity requests to be normalised.
- + * All capacities should sum to the range of 0-1024.
- + */
- DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
- void update_cpu_capacity_request(int cpu, bool request);
- @@ -1572,32 +1664,45 @@
- {
- struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- - if (scr->cfs == capacity)
- - return;
- - scr->cfs = capacity;
- - update_cpu_capacity_request(cpu, request);
- +#ifdef CONFIG_SCHED_WALT
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
- + int rtdl = scr->rt + scr->dl;
- + /*
- + * WALT tracks the utilization of a CPU considering the load
- + * generated by all the scheduling classes.
- + * Since the following call to:
- + * update_cpu_capacity
- + * is already adding the RT and DL utilizations let's remove
- + * these contributions from the WALT signal.
- + */
- + if (capacity > rtdl)
- + capacity -= rtdl;
- + else
- + capacity = 0;
- + }
- +#endif
- + if (scr->cfs != capacity) {
- + scr->cfs = capacity;
- + update_cpu_capacity_request(cpu, request);
- + }
- }
- static inline void set_rt_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
- {
- - struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- -
- - if (scr->rt == capacity)
- - return;
- - scr->rt = capacity;
- - update_cpu_capacity_request(cpu, request);
- + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
- + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
- + update_cpu_capacity_request(cpu, request);
- + }
- }
- static inline void set_dl_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
- {
- - struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- -
- - if (scr->dl == capacity)
- - return;
- - scr->dl = capacity;
- - update_cpu_capacity_request(cpu, request);
- + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) {
- + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity;
- + update_cpu_capacity_request(cpu, request);
- + }
- }
- #else
- static inline bool sched_freq(void) { return false; }
- @@ -1621,8 +1726,33 @@
- static inline void sched_avg_update(struct rq *rq) { }
- #endif
- -extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
- -extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
- +struct rq_flags {
- + unsigned long flags;
- + struct pin_cookie cookie;
- +};
- +
- +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- + __acquires(rq->lock);
- +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- + __acquires(p->pi_lock)
- + __acquires(rq->lock);
- +
- +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
- + __releases(rq->lock)
- +{
- + lockdep_unpin_lock(&rq->lock, rf->cookie);
- + raw_spin_unlock(&rq->lock);
- +}
- +
- +static inline void
- +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
- + __releases(rq->lock)
- + __releases(p->pi_lock)
- +{
- + lockdep_unpin_lock(&rq->lock, rf->cookie);
- + raw_spin_unlock(&rq->lock);
- + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
- +}
- #ifdef CONFIG_SMP
- #ifdef CONFIG_PREEMPT
- @@ -1811,8 +1941,8 @@
- extern void print_rt_stats(struct seq_file *m, int cpu);
- extern void init_cfs_rq(struct cfs_rq *cfs_rq);
- -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
- -extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
- +extern void init_rt_rq(struct rt_rq *rt_rq);
- +extern void init_dl_rq(struct dl_rq *dl_rq);
- extern void cfs_bandwidth_usage_inc(void);
- extern void cfs_bandwidth_usage_dec(void);
- @@ -1878,6 +2008,69 @@
- #endif /* CONFIG_64BIT */
- #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
- +#ifdef CONFIG_CPU_FREQ
- +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
- +
- +/**
- + * cpufreq_update_util - Take a note about CPU utilization changes.
- + * @rq: Runqueue to carry out the update for.
- + * @flags: Update reason flags.
- + *
- + * This function is called by the scheduler on the CPU whose utilization is
- + * being updated.
- + *
- + * It can only be called from RCU-sched read-side critical sections.
- + *
- + * The way cpufreq is currently arranged requires it to evaluate the CPU
- + * performance state (frequency/voltage) on a regular basis to prevent it from
- + * being stuck in a completely inadequate performance level for too long.
- + * That is not guaranteed to happen if the updates are only triggered from CFS,
- + * though, because they may not be coming in if RT or deadline tasks are active
- + * all the time (or there are RT and DL tasks only).
- + *
- + * As a workaround for that issue, this function is called by the RT and DL
- + * sched classes to trigger extra cpufreq updates to prevent it from stalling,
- + * but that really is a band-aid. Going forward it should be replaced with
- + * solutions targeted more specifically at RT and DL tasks.
- + */
- +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
- +{
- + struct update_util_data *data;
- +
- + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
- + if (data)
- + data->func(data, rq_clock(rq), flags);
- +}
- +
- +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
- +{
- + if (cpu_of(rq) == smp_processor_id())
- + cpufreq_update_util(rq, flags);
- +}
- +#else
- +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
- +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
- +#endif /* CONFIG_CPU_FREQ */
- +
- +#ifdef CONFIG_SCHED_WALT
- +
- +static inline bool
- +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
- +{
- + return cpu_of(rq) == task_cpu(p) &&
- + (p->on_rq || p->last_sleep_ts >= rq->window_start);
- +}
- +
- +#endif /* CONFIG_SCHED_WALT */
- +
- +#ifdef arch_scale_freq_capacity
- +#ifndef arch_scale_freq_invariant
- +#define arch_scale_freq_invariant() (true)
- +#endif
- +#else /* arch_scale_freq_capacity */
- +#define arch_scale_freq_invariant() (false)
- +#endif
- +
- /*
- * task_may_not_preempt - check whether a task may not be preemptible soon
- */
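- The cpufreq_update_util() hook above simply dereferences the per-CPU update_util_data pointer and calls its func with the rq clock and the reason flags. A sketch of the governor side, under two assumptions: struct update_util_data exposes the func member with the (data, time, flags) signature implied by the call above, and registration goes through the mainline-style cpufreq_add_update_util_hook() helper, which may be named differently in this tree:
- /* Sketch of a governor-side consumer; names here are illustrative. */
- struct example_gov_cpu {
-         struct update_util_data update_util;
-         int cpu;
- };
-
- static void example_gov_update(struct update_util_data *data, u64 time,
-                                unsigned int flags)
- {
-         struct example_gov_cpu *gc =
-                 container_of(data, struct example_gov_cpu, update_util);
-
-         /* re-evaluate capacity requests for gc->cpu, kick a freq change, ... */
-         (void)gc;
- }
-
- static void example_gov_start(struct example_gov_cpu *gc, int cpu)
- {
-         gc->cpu = cpu;
-         /* assumed mainline-style registration helper */
-         cpufreq_add_update_util_hook(cpu, &gc->update_util, example_gov_update);
- }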
- diff -Nur /home/ninez/android/marlin/kernel/sched/stats.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.c
- --- /home/ninez/android/marlin/kernel/sched/stats.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.c 2018-08-11 23:57:17.131940887 -0400
- @@ -12,6 +12,26 @@
- */
- #define SCHEDSTAT_VERSION 15
- +static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
- +{
- + /* eas-specific runqueue stats */
- + seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
- + stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine,
- + stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count);
- +
- + seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
- + stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt,
- + stats->secb_insuff_cap, stats->secb_no_nrg_sav,
- + stats->secb_nrg_sav, stats->secb_count);
- +
- + seq_printf(seq, "%llu %llu %llu %llu %llu ",
- + stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd,
- + stats->fbt_pref_idle, stats->fbt_count);
- +
- + seq_printf(seq, "%llu %llu\n",
- + stats->cas_attempts, stats->cas_count);
- +}
- +
- static int show_schedstat(struct seq_file *seq, void *v)
- {
- int cpu;
- @@ -44,6 +64,7 @@
- seq_printf(seq, "\n");
- + show_easstat(seq, &rq->eas_stats);
- #ifdef CONFIG_SMP
- /* domain-specific stats */
- rcu_read_lock();
- @@ -72,6 +93,8 @@
- sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
- sd->ttwu_wake_remote, sd->ttwu_move_affine,
- sd->ttwu_move_balance);
- +
- + show_easstat(seq, &sd->eas_stats);
- }
- rcu_read_unlock();
- #endif
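- For reference, show_easstat() above prints twenty counters after the literal "eas" token; the field order, taken directly from the three seq_printf() groups, is captured in the array below (a documentation aid only, not part of the patch):
- static const char *const eas_stat_fields[] = {
-         "sis_attempts", "sis_idle", "sis_cache_affine",
-         "sis_suff_cap", "sis_idle_cpu", "sis_count",
-         "secb_attempts", "secb_sync", "secb_idle_bt",
-         "secb_insuff_cap", "secb_no_nrg_sav", "secb_nrg_sav", "secb_count",
-         "fbt_attempts", "fbt_no_cpu", "fbt_no_sd",
-         "fbt_pref_idle", "fbt_count",
-         "cas_attempts", "cas_count",
- };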
- diff -Nur /home/ninez/android/marlin/kernel/sched/stats.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.h
- --- /home/ninez/android/marlin/kernel/sched/stats.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.h 2018-08-26 16:43:11.650539699 -0400
- @@ -29,9 +29,13 @@
- if (rq)
- rq->rq_sched_info.run_delay += delta;
- }
- -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
- -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
- -# define schedstat_set(var, val) do { var = (val); } while (0)
- +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
- +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
- +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
- +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
- +#define schedstat_val(var) (var)
- +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
- +
- #else /* !CONFIG_SCHEDSTATS */
- static inline void
- rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
- @@ -42,10 +46,13 @@
- static inline void
- rq_sched_info_depart(struct rq *rq, unsigned long long delta)
- {}
- -# define schedstat_inc(rq, field) do { } while (0)
- -# define schedstat_add(rq, field, amt) do { } while (0)
- -# define schedstat_set(var, val) do { } while (0)
- -#endif
- +#define schedstat_enabled() 0
- +#define schedstat_inc(var) do { } while (0)
- +#define schedstat_add(var, amt) do { } while (0)
- +#define schedstat_set(var, val) do { } while (0)
- +#define schedstat_val(var) 0
- +#define schedstat_val_or_zero(var) 0
- +#endif /* CONFIG_SCHEDSTATS */
- #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
- static inline void sched_info_reset_dequeued(struct task_struct *t)
- @@ -174,7 +181,8 @@
- {
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- - if (!cputimer->running)
- + /* Check if cputimer isn't running. This is accessed without locking. */
- + if (!READ_ONCE(cputimer->running))
- return false;
- /*
- @@ -215,9 +223,7 @@
- if (!cputimer_running(tsk))
- return;
- - raw_spin_lock(&cputimer->lock);
- - cputimer->cputime.utime += cputime;
- - raw_spin_unlock(&cputimer->lock);
- + atomic64_add(cputime, &cputimer->cputime_atomic.utime);
- }
- /**
- @@ -238,9 +244,7 @@
- if (!cputimer_running(tsk))
- return;
- - raw_spin_lock(&cputimer->lock);
- - cputimer->cputime.stime += cputime;
- - raw_spin_unlock(&cputimer->lock);
- + atomic64_add(cputime, &cputimer->cputime_atomic.stime);
- }
- /**
- @@ -261,7 +265,5 @@
- if (!cputimer_running(tsk))
- return;
- - raw_spin_lock(&cputimer->lock);
- - cputimer->cputime.sum_exec_runtime += ns;
- - raw_spin_unlock(&cputimer->lock);
- + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
- }
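- The schedstat macros above change shape: the old forms took a (rq, field) pair, while the new ones take the full lvalue and gate the update behind the sched_schedstats static branch. A minimal sketch of the call-site conversion this implies, reusing the rq->ttwu_count counter declared earlier in this diff:
- static inline void example_account_ttwu(struct rq *rq)
- {
-         /* old style, removed above:  schedstat_inc(rq, ttwu_count); */
-         schedstat_inc(rq->ttwu_count);  /* new style: pass the lvalue itself */
- }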
- diff -Nur /home/ninez/android/marlin/kernel/sched/stop_task.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stop_task.c
- --- /home/ninez/android/marlin/kernel/sched/stop_task.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stop_task.c 2018-08-21 23:22:44.643944617 -0400
- @@ -12,7 +12,8 @@
- #ifdef CONFIG_SMP
- static int
- -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
- +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
- + int sibling_count_hint)
- {
- return task_cpu(p); /* stop tasks as never migrate */
- }
- @@ -25,7 +26,7 @@
- }
- static struct task_struct *
- -pick_next_task_stop(struct rq *rq, struct task_struct *prev)
- +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- struct task_struct *stop = rq->stop;
- @@ -126,6 +127,7 @@
- #ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_stop,
- + .set_cpus_allowed = set_cpus_allowed_common,
- #endif
- .set_curr_task = set_curr_task_stop,
- diff -Nur /home/ninez/android/marlin/kernel/sched/swait.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swait.c
- --- /home/ninez/android/marlin/kernel/sched/swait.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swait.c 2018-08-13 18:40:12.199646700 -0400
- @@ -0,0 +1,134 @@
- +#include <linux/sched.h>
- +#include <linux/swait.h>
- +
- +void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
- + struct lock_class_key *key)
- +{
- + raw_spin_lock_init(&q->lock);
- + lockdep_set_class_and_name(&q->lock, key, name);
- + INIT_LIST_HEAD(&q->task_list);
- +}
- +EXPORT_SYMBOL(__init_swait_queue_head);
- +
- +/*
- + * The thing about the wake_up_state() return value; I think we can ignore it.
- + *
- + * If for some reason it would return 0, that means the previously waiting
- + * task is already running, so it will observe condition true (or has already).
- + */
- +void swake_up_locked(struct swait_queue_head *q)
- +{
- + struct swait_queue *curr;
- +
- + if (list_empty(&q->task_list))
- + return;
- +
- + curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
- + wake_up_process(curr->task);
- + list_del_init(&curr->task_list);
- +}
- +EXPORT_SYMBOL(swake_up_locked);
- +
- +void swake_up_all_locked(struct swait_queue_head *q)
- +{
- + struct swait_queue *curr;
- + int wakes = 0;
- +
- + while (!list_empty(&q->task_list)) {
- +
- + curr = list_first_entry(&q->task_list, typeof(*curr),
- + task_list);
- + wake_up_process(curr->task);
- + list_del_init(&curr->task_list);
- + wakes++;
- + }
- + //WARN_ON(wakes > 2);
- +}
- +EXPORT_SYMBOL(swake_up_all_locked);
- +
- +void swake_up(struct swait_queue_head *q)
- +{
- + unsigned long flags;
- +
- + raw_spin_lock_irqsave(&q->lock, flags);
- + swake_up_locked(q);
- + raw_spin_unlock_irqrestore(&q->lock, flags);
- +}
- +EXPORT_SYMBOL(swake_up);
- +
- +/*
- + * Does not allow usage from IRQ disabled, since we must be able to
- + * release IRQs to guarantee bounded hold time.
- + */
- +void swake_up_all(struct swait_queue_head *q)
- +{
- + struct swait_queue *curr;
- + LIST_HEAD(tmp);
- +
- + raw_spin_lock_irq(&q->lock);
- + list_splice_init(&q->task_list, &tmp);
- + while (!list_empty(&tmp)) {
- + curr = list_first_entry(&tmp, typeof(*curr), task_list);
- +
- + wake_up_state(curr->task, TASK_NORMAL);
- + list_del_init(&curr->task_list);
- +
- + if (list_empty(&tmp))
- + break;
- +
- + raw_spin_unlock_irq(&q->lock);
- + raw_spin_lock_irq(&q->lock);
- + }
- + raw_spin_unlock_irq(&q->lock);
- +}
- +EXPORT_SYMBOL(swake_up_all);
- +
- +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
- +{
- + wait->task = current;
- + if (list_empty(&wait->task_list))
- + list_add(&wait->task_list, &q->task_list);
- +}
- +
- +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
- +{
- + unsigned long flags;
- +
- + raw_spin_lock_irqsave(&q->lock, flags);
- + __prepare_to_swait(q, wait);
- + set_current_state(state);
- + raw_spin_unlock_irqrestore(&q->lock, flags);
- +}
- +EXPORT_SYMBOL(prepare_to_swait);
- +
- +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
- +{
- + if (signal_pending_state(state, current))
- + return -ERESTARTSYS;
- +
- + prepare_to_swait(q, wait, state);
- +
- + return 0;
- +}
- +EXPORT_SYMBOL(prepare_to_swait_event);
- +
- +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
- +{
- + __set_current_state(TASK_RUNNING);
- + if (!list_empty(&wait->task_list))
- + list_del_init(&wait->task_list);
- +}
- +
- +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
- +{
- + unsigned long flags;
- +
- + __set_current_state(TASK_RUNNING);
- +
- + if (!list_empty_careful(&wait->task_list)) {
- + raw_spin_lock_irqsave(&q->lock, flags);
- + list_del_init(&wait->task_list);
- + raw_spin_unlock_irqrestore(&q->lock, flags);
- + }
- +}
- +EXPORT_SYMBOL(finish_swait);
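- A minimal waiter/waker pair for the simple wait queues added above. Everything used below is defined in this file except DECLARE_SWAITQUEUE() and init_swait_queue_head(), which are assumed to come from <linux/swait.h> (swork.c later in this diff relies on the same init wrapper):
- static struct swait_queue_head example_wq;
- static bool example_condition;
-
- static void example_setup(void)
- {
-         init_swait_queue_head(&example_wq);
- }
-
- static void example_waiter(void)
- {
-         DECLARE_SWAITQUEUE(wait);
-
-         for (;;) {
-                 prepare_to_swait(&example_wq, &wait, TASK_INTERRUPTIBLE);
-                 if (READ_ONCE(example_condition))
-                         break;
-                 schedule();
-         }
-         finish_swait(&example_wq, &wait);
- }
-
- static void example_waker(void)
- {
-         WRITE_ONCE(example_condition, true);
-         swake_up(&example_wq);  /* wakes at most one waiter */
- }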
- diff -Nur /home/ninez/android/marlin/kernel/sched/swork.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swork.c
- --- /home/ninez/android/marlin/kernel/sched/swork.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swork.c 2018-08-12 21:14:08.273505429 -0400
- @@ -0,0 +1,172 @@
- +/*
- + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner [email protected]
- + *
- + * Provides a framework for enqueuing callbacks from irq context
- + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
- + */
- +
- +#include <linux/swait.h>
- +#include <linux/swork.h>
- +#include <linux/kthread.h>
- +#include <linux/slab.h>
- +#include <linux/spinlock.h>
- +
- +#define SWORK_EVENT_PENDING (1 << 0)
- +
- +static DEFINE_MUTEX(worker_mutex);
- +static struct sworker *glob_worker;
- +
- +struct sworker {
- + struct list_head events;
- + struct swait_queue_head wq;
- +
- + raw_spinlock_t lock;
- +
- + struct task_struct *task;
- + int refs;
- +};
- +
- +static bool swork_readable(struct sworker *worker)
- +{
- + bool r;
- +
- + if (kthread_should_stop())
- + return true;
- +
- + raw_spin_lock_irq(&worker->lock);
- + r = !list_empty(&worker->events);
- + raw_spin_unlock_irq(&worker->lock);
- +
- + return r;
- +}
- +
- +static int swork_kthread(void *arg)
- +{
- + struct sworker *worker = arg;
- +
- + for (;;) {
- + swait_event_interruptible(worker->wq,
- + swork_readable(worker));
- + if (kthread_should_stop())
- + break;
- +
- + raw_spin_lock_irq(&worker->lock);
- + while (!list_empty(&worker->events)) {
- + struct swork_event *sev;
- +
- + sev = list_first_entry(&worker->events,
- + struct swork_event, item);
- + list_del(&sev->item);
- + raw_spin_unlock_irq(&worker->lock);
- +
- + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
- + &sev->flags));
- + sev->func(sev);
- + raw_spin_lock_irq(&worker->lock);
- + }
- + raw_spin_unlock_irq(&worker->lock);
- + }
- + return 0;
- +}
- +
- +static struct sworker *swork_create(void)
- +{
- + struct sworker *worker;
- +
- + worker = kzalloc(sizeof(*worker), GFP_KERNEL);
- + if (!worker)
- + return ERR_PTR(-ENOMEM);
- +
- + INIT_LIST_HEAD(&worker->events);
- + raw_spin_lock_init(&worker->lock);
- + init_swait_queue_head(&worker->wq);
- +
- + worker->task = kthread_run(swork_kthread, worker, "kswork");
- + if (IS_ERR(worker->task)) {
- + kfree(worker);
- + return ERR_PTR(-ENOMEM);
- + }
- +
- + return worker;
- +}
- +
- +static void swork_destroy(struct sworker *worker)
- +{
- + kthread_stop(worker->task);
- +
- + WARN_ON(!list_empty(&worker->events));
- + kfree(worker);
- +}
- +
- +/**
- + * swork_queue - queue swork
- + *
- + * Returns %false if @sev was already on a queue, %true otherwise.
- + *
- + * The work is queued and processed on a random CPU
- + */
- +bool swork_queue(struct swork_event *sev)
- +{
- + unsigned long flags;
- +
- + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
- + return false;
- +
- + raw_spin_lock_irqsave(&glob_worker->lock, flags);
- + list_add_tail(&sev->item, &glob_worker->events);
- + raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
- +
- + swake_up(&glob_worker->wq);
- + return true;
- +}
- +EXPORT_SYMBOL_GPL(swork_queue);
- +
- +/**
- + * swork_get - get an instance of the sworker
- + *
- + * Returns a negative error code if the initialization of the worker
- + * failed, %0 otherwise.
- + *
- + */
- +int swork_get(void)
- +{
- + struct sworker *worker;
- +
- + mutex_lock(&worker_mutex);
- + if (!glob_worker) {
- + worker = swork_create();
- + if (IS_ERR(worker)) {
- + mutex_unlock(&worker_mutex);
- + return -ENOMEM;
- + }
- +
- + glob_worker = worker;
- + }
- +
- + glob_worker->refs++;
- + mutex_unlock(&worker_mutex);
- +
- + return 0;
- +}
- +EXPORT_SYMBOL_GPL(swork_get);
- +
- +/**
- + * swork_put - puts an instance of the sworker
- + *
- + * Will destroy the sworker thread. This function must not be called until all
- + * queued events have been completed.
- + */
- +void swork_put(void)
- +{
- + mutex_lock(&worker_mutex);
- +
- + glob_worker->refs--;
- + if (glob_worker->refs > 0)
- + goto out;
- +
- + swork_destroy(glob_worker);
- + glob_worker = NULL;
- +out:
- + mutex_unlock(&worker_mutex);
- +}
- +EXPORT_SYMBOL_GPL(swork_put);
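- A usage sketch for the swork framework above: a caller brackets its lifetime with swork_get()/swork_put() and queues a swork_event whose func runs in the "kswork" kthread. INIT_SWORK() is an assumption about <linux/swork.h>; only the event's func/item/flags members are visible here:
- static struct swork_event example_event;
-
- static void example_event_fn(struct swork_event *sev)
- {
-         /* runs in the kswork kthread, with interrupts enabled */
- }
-
- static int example_init(void)
- {
-         int ret = swork_get();  /* create or refcount the global worker */
-
-         if (ret)
-                 return ret;
-         INIT_SWORK(&example_event, example_event_fn);
-         return 0;
- }
-
- static void example_raise(void)
- {
-         /* safe from hard-IRQ context: swork_queue() only takes a raw spinlock */
-         swork_queue(&example_event);
- }
-
- static void example_exit(void)
- {
-         swork_put();  /* destroys the worker once the last user is gone */
- }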
- diff -Nur /home/ninez/android/marlin/kernel/sched/tune.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.c
- --- /home/ninez/android/marlin/kernel/sched/tune.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.c 2018-08-14 15:53:43.604124856 -0400
- @@ -12,13 +12,25 @@
- #include "tune.h"
- #ifdef CONFIG_CGROUP_SCHEDTUNE
- -static bool schedtune_initialized = false;
- +bool schedtune_initialized = false;
- #endif
- -unsigned int sysctl_sched_cfs_boost __read_mostly;
- +extern struct rq *lock_rq_of(struct task_struct *p, struct rq_flags *rf);
- +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
- -static struct reciprocal_value schedtune_spc_rdiv;
- -extern struct target_nrg schedtune_target_nrg;
- +int sysctl_sched_cfs_boost __read_mostly;
- +
- +/* We hold schedtune boost in effect for at least this long */
- +#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL
- +
- +extern struct reciprocal_value schedtune_spc_rdiv;
- +struct target_nrg schedtune_target_nrg;
- +
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- +static DEFINE_MUTEX(stune_boost_mutex);
- +static struct schedtune *getSchedtune(char *st_name);
- +static int dynamic_boost_write(struct schedtune *st, int boost);
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- /* Performance Boost region (B) threshold params */
- static int perf_boost_idx;
- @@ -130,6 +142,14 @@
- /* Hint to bias scheduling of tasks on that SchedTune CGroup
- * towards idle CPUs */
- int prefer_idle;
- +
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- + /*
- + * This tracks the default boost value and is used to restore
- + * the value when Dynamic SchedTune Boost is reset.
- + */
- + int boost_default;
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- };
- static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
- @@ -162,6 +182,9 @@
- .perf_boost_idx = 0,
- .perf_constrain_idx = 0,
- .prefer_idle = 0,
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- + .boost_default = 0,
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- };
- int
- @@ -206,7 +229,8 @@
- * implementation especially for the computation of the per-CPU boost
- * value
- */
- -#define BOOSTGROUPS_COUNT 5
- +
- +#define BOOSTGROUPS_COUNT 7
- /* Array of configured boostgroups */
- static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
- @@ -226,45 +250,68 @@
- /* Maximum boost value for all RUNNABLE tasks on a CPU */
- bool idle;
- int boost_max;
- + u64 boost_ts;
- struct {
- /* The boost for tasks on that boost group */
- int boost;
- /* Count of RUNNABLE tasks on that boost group */
- unsigned tasks;
- + /* Timestamp of boost activation */
- + u64 ts;
- } group[BOOSTGROUPS_COUNT];
- /* CPU's boost group locking */
- raw_spinlock_t lock;
- };
- /* Boost groups affecting each CPU in the system */
- -DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
- +static DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
- +
- +static inline bool schedtune_boost_timeout(u64 now, u64 ts)
- +{
- + return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
- +}
- +
- +static inline bool
- +schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now)
- +{
- + if (bg->group[idx].tasks)
- + return true;
- +
- + return !schedtune_boost_timeout(now, bg->group[idx].ts);
- +}
- static void
- -schedtune_cpu_update(int cpu)
- +schedtune_cpu_update(int cpu, u64 now)
- {
- struct boost_groups *bg;
- - int boost_max;
- + u64 boost_ts = now;
- + int boost_max = INT_MIN;
- int idx;
- bg = &per_cpu(cpu_boost_groups, cpu);
- - /* The root boost group is always active */
- - boost_max = bg->group[0].boost;
- - for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
- + for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) {
- /*
- * A boost group affects a CPU only if it has
- - * RUNNABLE tasks on that CPU
- + * RUNNABLE tasks on that CPU or it has hold
- + * in effect from a previous task.
- */
- - if (bg->group[idx].tasks == 0)
- + if (!schedtune_boost_group_active(idx, bg, now))
- + continue;
- +
- + /* this boost group is active */
- + if (boost_max > bg->group[idx].boost)
- continue;
- - boost_max = max(boost_max, bg->group[idx].boost);
- + boost_max = bg->group[idx].boost;
- + boost_ts = bg->group[idx].ts;
- }
- - /* Ensures boost_max is non-negative when all cgroup boost values
- - * are neagtive. Avoids under-accounting of cpu capacity which may cause
- - * task stacking and frequency spikes.*/
- - boost_max = max(boost_max, 0);
- +
- + /* If there are no active boost groups on the CPU, set no boost */
- + if (boost_max == INT_MIN)
- + boost_max = 0;
- bg->boost_max = boost_max;
- + bg->boost_ts = boost_ts;
- }
- static int
- @@ -274,6 +321,7 @@
- int cur_boost_max;
- int old_boost;
- int cpu;
- + u64 now;
- /* Update per CPU boost groups */
- for_each_possible_cpu(cpu) {
- @@ -290,16 +338,22 @@
- /* Update the boost value of this boost group */
- bg->group[idx].boost = boost;
- - /* Check if this update increase current max */
- - if (boost > cur_boost_max && bg->group[idx].tasks) {
- + now = sched_clock_cpu(cpu);
- + /*
- + * Check if this update increases the current max.
- + */
- + if (boost > cur_boost_max &&
- + schedtune_boost_group_active(idx, bg, now)) {
- bg->boost_max = boost;
- + bg->boost_ts = bg->group[idx].ts;
- +
- trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
- continue;
- }
- /* Check if this update has decreased current max */
- if (cur_boost_max == old_boost && old_boost > boost) {
- - schedtune_cpu_update(cpu);
- + schedtune_cpu_update(cpu, now);
- trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
- continue;
- }
- @@ -313,21 +367,38 @@
- #define ENQUEUE_TASK 1
- #define DEQUEUE_TASK -1
- +static inline bool
- +schedtune_update_timestamp(struct task_struct *p)
- +{
- + if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
- + return true;
- +
- + return task_has_rt_policy(p);
- +}
- +
- static inline void
- schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
- {
- struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
- int tasks = bg->group[idx].tasks + task_count;
- + u64 now;
- /* Update boosted tasks count while avoiding to make it negative */
- bg->group[idx].tasks = max(0, tasks);
- + /* Update timeout on enqueue */
- + if (task_count > 0) {
- + now = sched_clock_cpu(cpu);
- + if (schedtune_update_timestamp(p))
- + bg->group[idx].ts = now;
- +
- + /* Boost group activation or deactivation on that RQ */
- + if (bg->group[idx].tasks == 1)
- + schedtune_cpu_update(cpu, now);
- + }
- trace_sched_tune_tasks_update(p, cpu, tasks, idx,
- - bg->group[idx].boost, bg->boost_max);
- -
- - /* Boost group activation or deactivation on that RQ */
- - if (tasks == 1 || tasks == 0)
- - schedtune_cpu_update(cpu);
- + bg->group[idx].boost, bg->boost_max,
- + bg->group[idx].ts);
- }
- /*
- @@ -381,12 +452,13 @@
- {
- struct task_struct *task;
- struct boost_groups *bg;
- - unsigned long irq_flags;
- + struct rq_flags irq_flags;
- unsigned int cpu;
- struct rq *rq;
- int src_bg; /* Source boost group index */
- int dst_bg; /* Destination boost group index */
- int tasks;
- + u64 now;
- if (!unlikely(schedtune_initialized))
- return 0;
- @@ -431,18 +503,19 @@
- * current boost group.
- */
- + now = sched_clock_cpu(cpu);
- +
- /* Move task from src to dst boost group */
- tasks = bg->group[src_bg].tasks - 1;
- bg->group[src_bg].tasks = max(0, tasks);
- bg->group[dst_bg].tasks += 1;
- + bg->group[dst_bg].ts = now;
- +
- + /* update next time someone asks */
- + bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
- raw_spin_unlock(&bg->lock);
- unlock_rq_of(rq, task, &irq_flags);
- -
- - /* Update CPU boost group */
- - if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
- - schedtune_cpu_update(task_cpu(task));
- -
- }
- return 0;
- @@ -501,7 +574,7 @@
- void schedtune_exit_task(struct task_struct *tsk)
- {
- struct schedtune *st;
- - unsigned long irq_flags;
- + struct rq_flags irq_flags;
- unsigned int cpu;
- struct rq *rq;
- int idx;
- @@ -524,8 +597,15 @@
- int schedtune_cpu_boost(int cpu)
- {
- struct boost_groups *bg;
- + u64 now;
- bg = &per_cpu(cpu_boost_groups, cpu);
- + now = sched_clock_cpu(cpu);
- +
- + /* check to see if we have a hold in effect */
- + if (schedtune_boost_timeout(now, bg->boost_ts))
- + schedtune_cpu_update(cpu, now);
- +
- return bg->boost_max;
- }
- @@ -534,6 +614,9 @@
- struct schedtune *st;
- int task_boost;
- + if (!unlikely(schedtune_initialized))
- + return 0;
- +
- /* Get task boost value */
- rcu_read_lock();
- st = task_schedtune(p);
- @@ -548,6 +631,9 @@
- struct schedtune *st;
- int prefer_idle;
- + if (!unlikely(schedtune_initialized))
- + return 0;
- +
- /* Get prefer_idle value */
- rcu_read_lock();
- st = task_schedtune(p);
- @@ -606,6 +692,9 @@
- st->perf_constrain_idx = threshold_idx;
- st->boost = boost;
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- + st->boost_default = boost;
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- if (css == &root_schedtune.css) {
- sysctl_sched_cfs_boost = boost;
- perf_boost_idx = threshold_idx;
- @@ -615,11 +704,11 @@
- /* Update CPU boost */
- schedtune_boostgroup_update(st->idx, st->boost);
- - trace_sched_tune_config(st->boost,
- - threshold_gains[st->perf_boost_idx].nrg_gain,
- - threshold_gains[st->perf_boost_idx].cap_gain,
- - threshold_gains[st->perf_constrain_idx].nrg_gain,
- - threshold_gains[st->perf_constrain_idx].cap_gain);
- +// trace_sched_tune_config(st->boost,
- +// threshold_gains[st->perf_boost_idx].nrg_gain,
- +// threshold_gains[st->perf_boost_idx].cap_gain,
- +// threshold_gains[st->perf_constrain_idx].nrg_gain,
- +// threshold_gains[st->perf_constrain_idx].cap_gain);
- return 0;
- }
- @@ -652,6 +741,8 @@
- bg = &per_cpu(cpu_boost_groups, cpu);
- bg->group[st->idx].boost = 0;
- bg->group[st->idx].tasks = 0;
- + bg->group[st->idx].ts = 0;
- + raw_spin_lock_init(&bg->lock);
- }
- return 0;
- @@ -747,6 +838,114 @@
- schedtune_initialized = true;
- }
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- +static struct schedtune *getSchedtune(char *st_name)
- +{
- + int idx;
- +
- + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
- + char name_buf[NAME_MAX + 1];
- + struct schedtune *st = allocated_group[idx];
- +
- + if (!st) {
- + pr_warn("SCHEDTUNE: Could not find %s\n", st_name);
- + break;
- + }
- +
- + cgroup_name(st->css.cgroup, name_buf, sizeof(name_buf));
- + if (strncmp(name_buf, st_name, strlen(st_name)) == 0)
- + return st;
- + }
- +
- + return NULL;
- +}
- +
- +static int dynamic_boost_write(struct schedtune *st, int boost)
- +{
- + int ret;
- + /* Backup boost_default */
- + int boost_default_backup = st->boost_default;
- +
- + ret = boost_write(&st->css, NULL, boost);
- +
- + /* Restore boost_default */
- + st->boost_default = boost_default_backup;
- +
- + return ret;
- +}
- +
- +int do_stune_boost(char *st_name, int boost)
- +{
- + int ret = 0;
- + struct schedtune *st = getSchedtune(st_name);
- +
- + if (!st)
- + return -EINVAL;
- +
- + mutex_lock(&stune_boost_mutex);
- +
- + /* Boost if new value is greater than current */
- + if (boost > st->boost)
- + ret = dynamic_boost_write(st, boost);
- +
- + mutex_unlock(&stune_boost_mutex);
- +
- + return ret;
- +}
- +
- +int do_stune_unboost(char *st_name, int boost)
- +{
- + int ret = 0;
- + struct schedtune *st = getSchedtune(st_name);
- +
- + if (!st)
- + return -EINVAL;
- +
- + mutex_lock(&stune_boost_mutex);
- +
- + /* Unboost if new value is less than current */
- + if (boost < st->boost)
- + ret = dynamic_boost_write(st, boost);
- +
- + mutex_unlock(&stune_boost_mutex);
- +
- + return ret;
- +}
- +
- +int set_stune_boost(char *st_name, int boost)
- +{
- + int ret = 0;
- + struct schedtune *st = getSchedtune(st_name);
- +
- + if (!st)
- + return -EINVAL;
- +
- + mutex_lock(&stune_boost_mutex);
- +
- + /* Set boost regardless of whether the new value exceeds the current one */
- + ret = dynamic_boost_write(st, boost);
- +
- + mutex_unlock(&stune_boost_mutex);
- +
- + return ret;
- +}
- +
- +int reset_stune_boost(char *st_name)
- +{
- + int ret = 0;
- + struct schedtune *st = getSchedtune(st_name);
- +
- + if (!st)
- + return -EINVAL;
- +
- + mutex_lock(&stune_boost_mutex);
- + ret = dynamic_boost_write(st, st->boost_default);
- + mutex_unlock(&stune_boost_mutex);
- +
- + return ret;
- +}
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- +
- #else /* CONFIG_CGROUP_SCHEDTUNE */
- int
- @@ -894,79 +1093,6 @@
- }
- }
- -static long
- -schedtune_margin(unsigned long signal, long boost)
- -{
- - long long margin = 0;
- -
- - /*
- - * Signal proportional compensation (SPC)
- - *
- - * The Boost (B) value is used to compute a Margin (M) which is
- - * proportional to the complement of the original Signal (S):
- - * M = B * (SCHED_CAPACITY_SCALE - S)
- - * The obtained M could be used by the caller to "boost" S.
- - */
- - if (boost >= 0) {
- - margin = SCHED_CAPACITY_SCALE - signal;
- - margin *= boost;
- - } else
- - margin = -signal * boost;
- -
- - margin = reciprocal_divide(margin, schedtune_spc_rdiv);
- -
- - if (boost < 0)
- - margin *= -1;
- - return margin;
- -}
- -
- -static inline int
- -schedtune_cpu_margin(unsigned long util, int cpu)
- -{
- - int boost = schedtune_cpu_boost(cpu);
- -
- - if (boost == 0)
- - return 0;
- -
- - return schedtune_margin(util, boost);
- -}
- -
- -static inline long
- -schedtune_task_margin(struct task_struct *task)
- -{
- - int boost = schedtune_task_boost(task);
- - unsigned long util;
- - long margin;
- -
- - if (boost == 0)
- - return 0;
- -
- - util = task_util(task, UTIL_AVG);
- - margin = schedtune_margin(util, boost);
- -
- - return margin;
- -}
- -
- -unsigned long boosted_cpu_util(int cpu)
- -{
- - unsigned long util = cpu_util(cpu, UTIL_EST);
- - long margin = schedtune_cpu_margin(util, cpu);
- -
- - trace_sched_boost_cpu(cpu, util, margin);
- -
- - return util + margin;
- -}
- -
- -unsigned long boosted_task_util(struct task_struct *task)
- -{
- - unsigned long util = task_util(task, UTIL_EST);
- - long margin = schedtune_task_margin(task);
- -
- - trace_sched_boost_task(task, util, margin);
- -
- - return util + margin;
- -}
- -
- /*
- * Initialize the constants required to compute normalized energy.
- * The values of these constants depends on the EM data for the specific
- @@ -1033,3 +1159,4 @@
- }
- postcore_initcall(schedtune_init);
- +
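- The CONFIG_DYNAMIC_STUNE_BOOST block above exports do_stune_boost(), do_stune_unboost(), set_stune_boost() and reset_stune_boost(), keyed by cgroup name. A sketch of a hypothetical in-kernel consumer; the trigger and the boost value are illustrative only:
- static void example_on_interaction_start(void)
- {
-         /* temporarily raise the "top-app" group's boost if 15 exceeds it */
-         if (do_stune_boost("top-app", 15))
-                 pr_debug("dynamic stune boost request failed\n");
- }
-
- static void example_on_interaction_end(void)
- {
-         /* return to the value written via the cgroup's schedtune.boost */
-         reset_stune_boost("top-app");
- }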
- diff -Nur /home/ninez/android/marlin/kernel/sched/tune.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.h
- --- /home/ninez/android/marlin/kernel/sched/tune.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.h 2018-08-23 19:57:44.817608733 -0400
- @@ -24,6 +24,9 @@
- void schedtune_enqueue_task(struct task_struct *p, int cpu);
- void schedtune_dequeue_task(struct task_struct *p, int cpu);
- +int schedtune_accept_deltas(int nrg_delta, int cap_delta,
- + struct task_struct *task);
- +
- #else /* CONFIG_CGROUP_SCHEDTUNE */
- #define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
- @@ -39,13 +42,6 @@
- int schedtune_accept_deltas(int nrg_delta, int cap_delta,
- struct task_struct *task);
- -#ifdef CONFIG_SMP
- -unsigned long boosted_cpu_util(int cpu);
- -#else
- -#define boosted_cpu_util(cpu) cpu_util(cpu, UTIL_EST);
- -#endif
- -unsigned long boosted_task_util(struct task_struct *task);
- -
- #else /* CONFIG_SCHED_TUNE */
- #define schedtune_cpu_boost(cpu) 0
- @@ -58,7 +54,4 @@
- #define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
- -#define boosted_cpu_util(cpu) cpu_util(cpu, UTIL_EST);
- -#define boosted_task_util(cpu) task_util(cpu, UTIL_EST);
- -
- #endif /* CONFIG_SCHED_TUNE */
- diff -Nur /home/ninez/android/marlin/kernel/sched/wait.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/wait.c
- --- /home/ninez/android/marlin/kernel/sched/wait.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/wait.c 2018-08-11 23:57:17.131940887 -0400
- @@ -9,6 +9,7 @@
- #include <linux/mm.h>
- #include <linux/wait.h>
- #include <linux/hash.h>
- +#include <linux/kthread.h>
- void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
- {
- @@ -297,6 +298,10 @@
- }
- EXPORT_SYMBOL(autoremove_wake_function);
- +static inline bool is_kthread_should_stop(void)
- +{
- + return (current->flags & PF_KTHREAD) && kthread_should_stop();
- +}
- /*
- * DEFINE_WAIT_FUNC(wait, woken_wake_func);
- @@ -326,7 +331,7 @@
- * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
- * also observe all state before the wakeup.
- */
- - if (!(wait->flags & WQ_FLAG_WOKEN))
- + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
- timeout = schedule_timeout(timeout);
- __set_current_state(TASK_RUNNING);
- @@ -336,7 +341,7 @@
- * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
- * an event.
- */
- - set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
- + smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
- return timeout;
- }
- @@ -349,7 +354,7 @@
- * doesn't imply write barrier and the users expects write
- * barrier semantics on wakeup functions. The following
- * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
- - * and is paired with set_mb() in wait_woken().
- + * and is paired with smp_store_mb() in wait_woken().
- */
- smp_wmb(); /* C */
- wait->flags |= WQ_FLAG_WOKEN;
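- The is_kthread_should_stop() check added above keeps a kthread from sleeping in wait_woken() once kthread_should_stop() is pending. A sketch of the usual wait_woken() consumer loop this protects; the work helpers and wait queue are hypothetical, while DEFINE_WAIT_FUNC(), woken_wake_function() and wait_woken() are the existing API patched here:
- static DECLARE_WAIT_QUEUE_HEAD(example_waitq);
-
- static bool example_have_work(void);
- static void example_do_work(void);
-
- static int example_kthread_fn(void *unused)
- {
-         DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-         add_wait_queue(&example_waitq, &wait);
-         while (!kthread_should_stop()) {
-                 if (example_have_work())
-                         example_do_work();
-                 else
-                         wait_woken(&wait, TASK_INTERRUPTIBLE,
-                                    MAX_SCHEDULE_TIMEOUT);
-         }
-         remove_wait_queue(&example_waitq, &wait);
-         return 0;
- }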
- diff -Nur /home/ninez/android/marlin/kernel/sched/walt.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.c
- --- /home/ninez/android/marlin/kernel/sched/walt.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.c 2018-08-11 23:57:17.131940887 -0400
- @@ -20,7 +20,6 @@
- */
- #include <linux/syscore_ops.h>
- -#include <linux/cpufreq.h>
- #include <trace/events/sched.h>
- #include "sched.h"
- #include "walt.h"
- @@ -42,57 +41,49 @@
- unsigned int sysctl_sched_walt_init_task_load_pct = 15;
- -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
- -unsigned int __read_mostly walt_disabled = 0;
- -
- -static unsigned int max_possible_efficiency = 1024;
- -static unsigned int min_possible_efficiency = 1024;
- +/* true -> use PELT based load stats, false -> use window-based load stats */
- +bool __read_mostly walt_disabled = false;
- /*
- - * Maximum possible frequency across all cpus. Task demand and cpu
- - * capacity (cpu_power) metrics are scaled in reference to it.
- + * Window size (in ns). Adjust for the tick size so that the window
- + * rollover occurs just before the tick boundary.
- */
- -static unsigned int max_possible_freq = 1;
- -
- -/*
- - * Minimum possible max_freq across all cpus. This will be same as
- - * max_possible_freq on homogeneous systems and could be different from
- - * max_possible_freq on heterogenous systems. min_max_freq is used to derive
- - * capacity (cpu_power) of cpus.
- - */
- -static unsigned int min_max_freq = 1;
- -
- -static unsigned int max_capacity = 1024;
- -static unsigned int min_capacity = 1024;
- -static unsigned int max_load_scale_factor = 1024;
- -static unsigned int max_possible_capacity = 1024;
- -
- -/* Mask of all CPUs that have max_possible_capacity */
- -static cpumask_t mpc_mask = CPU_MASK_ALL;
- -
- -/* Window size (in ns) */
- -__read_mostly unsigned int walt_ravg_window = 20000000;
- -
- -/* Min window size (in ns) = 10ms */
- -#define MIN_SCHED_RAVG_WINDOW 10000000
- -
- -/* Max window size (in ns) = 1s */
- -#define MAX_SCHED_RAVG_WINDOW 1000000000
- +__read_mostly unsigned int walt_ravg_window =
- + (20000000 / TICK_NSEC) * TICK_NSEC;
- +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
- +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
- static unsigned int sync_cpu;
- static ktime_t ktime_last;
- -static bool walt_ktime_suspended;
- +static __read_mostly bool walt_ktime_suspended;
- static unsigned int task_load(struct task_struct *p)
- {
- return p->ravg.demand;
- }
- +static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
- +{
- + rq->cum_window_demand += delta;
- + if (unlikely((s64)rq->cum_window_demand < 0))
- + rq->cum_window_demand = 0;
- +}
- +
- void
- walt_inc_cumulative_runnable_avg(struct rq *rq,
- struct task_struct *p)
- {
- rq->cumulative_runnable_avg += p->ravg.demand;
- +
- + /*
- + * Add a task's contribution to the cumulative window demand when
- + *
- + * (1) task is enqueued with on_rq = 1, i.e. migration,
- + * prio/cgroup/class change.
- + * (2) task is waking for the first time in this window.
- + */
- + if (p->on_rq || (p->last_sleep_ts < rq->window_start))
- + fixup_cum_window_demand(rq, p->ravg.demand);
- }
- void
- @@ -101,16 +92,28 @@
- {
- rq->cumulative_runnable_avg -= p->ravg.demand;
- BUG_ON((s64)rq->cumulative_runnable_avg < 0);
- +
- + /*
- + * on_rq will be 1 for sleeping tasks. So check if the task
- + * is migrating or dequeuing in RUNNING state to change the
- + * prio/cgroup/class.
- + */
- + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
- + fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
- }
- static void
- fixup_cumulative_runnable_avg(struct rq *rq,
- - struct task_struct *p, s64 task_load_delta)
- + struct task_struct *p, u64 new_task_load)
- {
- + s64 task_load_delta = (s64)new_task_load - task_load(p);
- +
- rq->cumulative_runnable_avg += task_load_delta;
- if ((s64)rq->cumulative_runnable_avg < 0)
- panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
- task_load_delta, task_load(p));
- +
- + fixup_cum_window_demand(rq, task_load_delta);
- }
- u64 walt_ktime_clock(void)
- @@ -169,16 +172,33 @@
- static int __init set_walt_ravg_window(char *str)
- {
- + unsigned int adj_window;
- + bool no_walt = walt_disabled;
- +
- get_option(&str, &walt_ravg_window);
- - walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
- - walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
- + /* Adjust for CONFIG_HZ */
- + adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
- +
- + /* Warn if we're a bit too far away from the expected window size */
- + WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
- + "tick-adjusted window size %u, original was %u\n", adj_window,
- + walt_ravg_window);
- +
- + walt_ravg_window = adj_window;
- +
- + walt_disabled = walt_disabled ||
- + (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
- + walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
- +
- + WARN(!no_walt && walt_disabled,
- + "invalid window size, disabling WALT\n");
- +
- return 0;
- }
- early_param("walt_ravg_window", set_walt_ravg_window);
- -extern u64 arch_counter_get_cntpct(void);
- static void
- update_window_start(struct rq *rq, u64 wallclock)
- {
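- A worked example of the tick rounding above, with illustrative HZ values (TICK_NSEC is roughly NSEC_PER_SEC / HZ): at HZ=300 the 20 ms default rounds down to 6 ticks, 19,999,998 ns, while at HZ=100 it stays exactly 20,000,000 ns:
- /* Worked example (userspace arithmetic only). */
- #include <stdio.h>
- #include <stdint.h>
-
- int main(void)
- {
-         uint64_t requested = 20000000;          /* 20 ms, in ns */
-         uint64_t tick_hz300 = 3333333;          /* ~TICK_NSEC at HZ=300 */
-         uint64_t tick_hz100 = 10000000;         /* TICK_NSEC at HZ=100 */
-
-         /* 6 ticks = 19999998 ns, just under the requested window */
-         printf("HZ=300: %llu ns\n",
-                (unsigned long long)(requested / tick_hz300 * tick_hz300));
-         /* 2 ticks = 20000000 ns exactly */
-         printf("HZ=100: %llu ns\n",
-                (unsigned long long)(requested / tick_hz100 * tick_hz100));
-         return 0;
- }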
- @@ -188,10 +208,8 @@
- delta = wallclock - rq->window_start;
- /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
- if (delta < 0) {
- - if (arch_counter_get_cntpct() == 0)
- - delta = 0;
- - else
- - BUG_ON(1);
- + delta = 0;
- + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
- }
- if (delta < walt_ravg_window)
- @@ -199,26 +217,20 @@
- nr_windows = div64_u64(delta, walt_ravg_window);
- rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
- +
- + rq->cum_window_demand = rq->cumulative_runnable_avg;
- }
- +/*
- + * Translate absolute delta time accounted on a CPU
- + * to a scale where 1024 is the capacity of the most
- + * capable CPU running at FMAX
- + */
- static u64 scale_exec_time(u64 delta, struct rq *rq)
- {
- - unsigned int cur_freq = rq->cur_freq;
- - int sf;
- -
- - if (unlikely(cur_freq > max_possible_freq))
- - cur_freq = rq->max_possible_freq;
- + unsigned long capcurr = capacity_curr_of(cpu_of(rq));
- - /* round up div64 */
- - delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
- - max_possible_freq);
- -
- - sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
- -
- - delta *= sf;
- - delta >>= 10;
- -
- - return delta;
- + return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
- }
- static int cpu_is_waiting_on_io(struct rq *rq)
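
scale_exec_time() now normalizes a raw runtime delta by the CPU's current capacity from capacity_curr_of(), where 1024 represents the most capable CPU at Fmax. A worked example of the multiply-and-shift, using a hypothetical current capacity of 512:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10      /* capacity scale: 1024 == biggest CPU at Fmax */

/* Same arithmetic as the new scale_exec_time(); capcurr is assumed here. */
static uint64_t scale_exec_time(uint64_t delta_ns, unsigned long capcurr)
{
        return (delta_ns * capcurr) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
        /* 2 ms of wall time on a CPU currently worth half of max capacity... */
        uint64_t scaled = scale_exec_time(2000000ULL, 512);

        /* ...counts as 1 ms of "full speed" execution towards the window. */
        printf("scaled delta = %llu ns\n", (unsigned long long)scaled);
        return 0;
}
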
- @@ -595,10 +607,20 @@
- * A throttled deadline sched class task gets dequeued without
- * changing p->on_rq. Since the dequeue decrements hmp stats
- * avoid decrementing it here again.
- + *
- + * When window is rolled over, the cumulative window demand
- + * is reset to the cumulative runnable average (contribution from
- + * the tasks on the runqueue). If the current task is dequeued
- + * already, its demand is not included in the cumulative runnable
- + * average. So add the task demand separately to cumulative window
- + * demand.
- */
- - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
- - !p->dl.dl_throttled))
- - fixup_cumulative_runnable_avg(rq, p, demand);
- + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
- + if (task_on_rq_queued(p))
- + fixup_cumulative_runnable_avg(rq, p, demand);
- + else if (rq->curr == p)
- + fixup_cum_window_demand(rq, demand);
- + }
- p->ravg.demand = demand;
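
The comment in the hunk above explains why a running-but-dequeued task must add its demand back after a window rollover: the rollover reseeds the window demand from the queued tasks only. A toy model of that bookkeeping, with made-up demand numbers:

#include <stdint.h>
#include <stdio.h>

struct toy_rq {
        int64_t cumulative_runnable_avg;        /* queued tasks only */
        int64_t cum_window_demand;
};

int main(void)
{
        struct toy_rq rq = { .cumulative_runnable_avg = 300 };
        int64_t curr_demand = 120;      /* rq->curr, already dequeued */

        /* window rollover: reseed from the queued tasks' demand... */
        rq.cum_window_demand = rq.cumulative_runnable_avg;

        /* ...so the dequeued-but-running task is accounted separately. */
        rq.cum_window_demand += curr_demand;

        printf("cum_window_demand=%lld\n", (long long)rq.cum_window_demand);
        return 0;
}
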
- @@ -741,33 +763,6 @@
- p->ravg.mark_start = wallclock;
- }
- -unsigned long __weak arch_get_cpu_efficiency(int cpu)
- -{
- - return SCHED_LOAD_SCALE;
- -}
- -
- -void walt_init_cpu_efficiency(void)
- -{
- - int i, efficiency;
- - unsigned int max = 0, min = UINT_MAX;
- -
- - for_each_possible_cpu(i) {
- - efficiency = arch_get_cpu_efficiency(i);
- - cpu_rq(i)->efficiency = efficiency;
- -
- - if (efficiency > max)
- - max = efficiency;
- - if (efficiency < min)
- - min = efficiency;
- - }
- -
- - if (max)
- - max_possible_efficiency = max;
- -
- - if (min)
- - min_possible_efficiency = min;
- -}
- -
- static void reset_task_stats(struct task_struct *p)
- {
- u32 sum = 0;
- @@ -799,11 +794,11 @@
- int cpu = cpu_of(rq);
- struct rq *sync_rq = cpu_rq(sync_cpu);
- - if (rq->window_start)
- + if (likely(rq->window_start))
- return;
- if (cpu == sync_cpu) {
- - rq->window_start = walt_ktime_clock();
- + rq->window_start = 1;
- } else {
- raw_spin_unlock(&rq->lock);
- double_rq_lock(rq, sync_rq);
- @@ -846,6 +841,17 @@
- walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
- + /*
- + * When a task is migrating during the wakeup, adjust
- + * the task's contribution towards cumulative window
- + * demand.
- + */
- + if (p->state == TASK_WAKING &&
- + p->last_sleep_ts >= src_rq->window_start) {
- + fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
- + fixup_cum_window_demand(dest_rq, p->ravg.demand);
- + }
- +
- if (p->ravg.curr_window) {
- src_rq->curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->curr_runnable_sum += p->ravg.curr_window;
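
The wakeup-migration hunk moves a waking task's demand from the source runqueue's window demand to the destination's, but only if the task last slept inside the source's current window. A hedged sketch of that transfer; the structures and helper name are stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_rq {
        int64_t cum_window_demand;
        uint64_t window_start;
};

/* Transfer demand only when the task slept within src's current window. */
static void migrate_window_demand(struct toy_rq *src, struct toy_rq *dst,
                                  uint64_t last_sleep_ts, int64_t demand)
{
        bool slept_this_window = last_sleep_ts >= src->window_start;

        if (!slept_this_window)
                return;

        src->cum_window_demand -= demand;
        if (src->cum_window_demand < 0)
                src->cum_window_demand = 0;
        dst->cum_window_demand += demand;
}

int main(void)
{
        struct toy_rq src = { .cum_window_demand = 500, .window_start = 1000 };
        struct toy_rq dst = { .cum_window_demand = 200, .window_start = 1000 };

        migrate_window_demand(&src, &dst, 1500, 120);   /* slept at t=1500 */
        printf("src=%lld dst=%lld\n",
               (long long)src.cum_window_demand,
               (long long)dst.cum_window_demand);       /* 380 / 320 */
        return 0;
}
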
- @@ -872,283 +878,6 @@
- double_rq_unlock(src_rq, dest_rq);
- }
- -/* Keep track of max/min capacity possible across CPUs "currently" */
- -static void __update_min_max_capacity(void)
- -{
- - int i;
- - int max = 0, min = INT_MAX;
- -
- - for_each_online_cpu(i) {
- - if (cpu_rq(i)->capacity > max)
- - max = cpu_rq(i)->capacity;
- - if (cpu_rq(i)->capacity < min)
- - min = cpu_rq(i)->capacity;
- - }
- -
- - max_capacity = max;
- - min_capacity = min;
- -}
- -
- -static void update_min_max_capacity(void)
- -{
- - unsigned long flags;
- - int i;
- -
- - local_irq_save(flags);
- - for_each_possible_cpu(i)
- - raw_spin_lock(&cpu_rq(i)->lock);
- -
- - __update_min_max_capacity();
- -
- - for_each_possible_cpu(i)
- - raw_spin_unlock(&cpu_rq(i)->lock);
- - local_irq_restore(flags);
- -}
- -
- -/*
- - * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
- - * least efficient cpu gets capacity of 1024
- - */
- -static unsigned long capacity_scale_cpu_efficiency(int cpu)
- -{
- - return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
- -}
- -
- -/*
- - * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
- - * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
- - */
- -static unsigned long capacity_scale_cpu_freq(int cpu)
- -{
- - return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
- -}
- -
- -/*
- - * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
- - * that "most" efficient cpu gets a load_scale_factor of 1
- - */
- -static unsigned long load_scale_cpu_efficiency(int cpu)
- -{
- - return DIV_ROUND_UP(1024 * max_possible_efficiency,
- - cpu_rq(cpu)->efficiency);
- -}
- -
- -/*
- - * Return load_scale_factor of a cpu in reference to cpu with best max_freq
- - * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
- - * of 1.
- - */
- -static unsigned long load_scale_cpu_freq(int cpu)
- -{
- - return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
- -}
- -
- -static int compute_capacity(int cpu)
- -{
- - int capacity = 1024;
- -
- - capacity *= capacity_scale_cpu_efficiency(cpu);
- - capacity >>= 10;
- -
- - capacity *= capacity_scale_cpu_freq(cpu);
- - capacity >>= 10;
- -
- - return capacity;
- -}
- -
- -static int compute_load_scale_factor(int cpu)
- -{
- - int load_scale = 1024;
- -
- - /*
- - * load_scale_factor accounts for the fact that task load
- - * is in reference to "best" performing cpu. Task's load will need to be
- - * scaled (up) by a factor to determine suitability to be placed on a
- - * (little) cpu.
- - */
- - load_scale *= load_scale_cpu_efficiency(cpu);
- - load_scale >>= 10;
- -
- - load_scale *= load_scale_cpu_freq(cpu);
- - load_scale >>= 10;
- -
- - return load_scale;
- -}
- -
- -static int cpufreq_notifier_policy(struct notifier_block *nb,
- - unsigned long val, void *data)
- -{
- - struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
- - int i, update_max = 0;
- - u64 highest_mpc = 0, highest_mplsf = 0;
- - const struct cpumask *cpus = policy->related_cpus;
- - unsigned int orig_min_max_freq = min_max_freq;
- - unsigned int orig_max_possible_freq = max_possible_freq;
- - /* Initialized to policy->max in case policy->related_cpus is empty! */
- - unsigned int orig_max_freq = policy->max;
- -
- - if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
- - val != CPUFREQ_CREATE_POLICY)
- - return 0;
- -
- - if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
- - update_min_max_capacity();
- - return 0;
- - }
- -
- - for_each_cpu(i, policy->related_cpus) {
- - cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
- - policy->related_cpus);
- - orig_max_freq = cpu_rq(i)->max_freq;
- - cpu_rq(i)->min_freq = policy->min;
- - cpu_rq(i)->max_freq = policy->max;
- - cpu_rq(i)->cur_freq = policy->cur;
- - cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
- - }
- -
- - max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
- - if (min_max_freq == 1)
- - min_max_freq = UINT_MAX;
- - min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
- - BUG_ON(!min_max_freq);
- - BUG_ON(!policy->max);
- -
- - /* Changes to policy other than max_freq don't require any updates */
- - if (orig_max_freq == policy->max)
- - return 0;
- -
- - /*
- - * A changed min_max_freq or max_possible_freq (possible during bootup)
- - * needs to trigger re-computation of load_scale_factor and capacity for
- - * all possible cpus (even those offline). It also needs to trigger
- - * re-computation of nr_big_task count on all online cpus.
- - *
- - * A changed rq->max_freq otoh needs to trigger re-computation of
- - * load_scale_factor and capacity for just the cluster of cpus involved.
- - * Since small task definition depends on max_load_scale_factor, a
- - * changed load_scale_factor of one cluster could influence
- - * classification of tasks in another cluster. Hence a changed
- - * rq->max_freq will need to trigger re-computation of nr_big_task
- - * count on all online cpus.
- - *
- - * While it should be sufficient for nr_big_tasks to be
- - * re-computed for only online cpus, we have inadequate context
- - * information here (in policy notifier) with regard to hotplug-safety
- - * context in which notification is issued. As a result, we can't use
- - * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
- - * fixed up to issue notification always in hotplug-safe context,
- - * re-compute nr_big_task for all possible cpus.
- - */
- -
- - if (orig_min_max_freq != min_max_freq ||
- - orig_max_possible_freq != max_possible_freq) {
- - cpus = cpu_possible_mask;
- - update_max = 1;
- - }
- -
- - /*
- - * Changed load_scale_factor can trigger reclassification of tasks as
- - * big or small. Make this change "atomic" so that tasks are accounted
- - * properly due to changed load_scale_factor
- - */
- - for_each_cpu(i, cpus) {
- - struct rq *rq = cpu_rq(i);
- -
- - rq->capacity = compute_capacity(i);
- - rq->load_scale_factor = compute_load_scale_factor(i);
- -
- - if (update_max) {
- - u64 mpc, mplsf;
- -
- - mpc = div_u64(((u64) rq->capacity) *
- - rq->max_possible_freq, rq->max_freq);
- - rq->max_possible_capacity = (int) mpc;
- -
- - mplsf = div_u64(((u64) rq->load_scale_factor) *
- - rq->max_possible_freq, rq->max_freq);
- -
- - if (mpc > highest_mpc) {
- - highest_mpc = mpc;
- - cpumask_clear(&mpc_mask);
- - cpumask_set_cpu(i, &mpc_mask);
- - } else if (mpc == highest_mpc) {
- - cpumask_set_cpu(i, &mpc_mask);
- - }
- -
- - if (mplsf > highest_mplsf)
- - highest_mplsf = mplsf;
- - }
- - }
- -
- - if (update_max) {
- - max_possible_capacity = highest_mpc;
- - max_load_scale_factor = highest_mplsf;
- - }
- -
- - __update_min_max_capacity();
- -
- - return 0;
- -}
- -
- -static int cpufreq_notifier_trans(struct notifier_block *nb,
- - unsigned long val, void *data)
- -{
- - struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
- - unsigned int cpu = freq->cpu, new_freq = freq->new;
- - unsigned long flags;
- - int i;
- -
- - if (val != CPUFREQ_POSTCHANGE)
- - return 0;
- -
- - BUG_ON(!new_freq);
- -
- - if (cpu_rq(cpu)->cur_freq == new_freq)
- - return 0;
- -
- - for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
- - struct rq *rq = cpu_rq(i);
- -
- - raw_spin_lock_irqsave(&rq->lock, flags);
- - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- - walt_ktime_clock(), 0);
- - rq->cur_freq = new_freq;
- - raw_spin_unlock_irqrestore(&rq->lock, flags);
- - }
- -
- - return 0;
- -}
- -
- -static struct notifier_block notifier_policy_block = {
- - .notifier_call = cpufreq_notifier_policy
- -};
- -
- -static struct notifier_block notifier_trans_block = {
- - .notifier_call = cpufreq_notifier_trans
- -};
- -
- -static int register_sched_callback(void)
- -{
- - int ret;
- -
- - ret = cpufreq_register_notifier(&notifier_policy_block,
- - CPUFREQ_POLICY_NOTIFIER);
- -
- - if (!ret)
- - ret = cpufreq_register_notifier(&notifier_trans_block,
- - CPUFREQ_TRANSITION_NOTIFIER);
- -
- - return 0;
- -}
- -
- -/*
- - * cpufreq callbacks can be registered at core_initcall or later time.
- - * Any registration done prior to that is "forgotten" by cpufreq. See
- - * initialization of variable init_cpufreq_transition_notifier_list_called
- - * for further information.
- - */
- -core_initcall(register_sched_callback);
- -
- void walt_init_new_task_load(struct task_struct *p)
- {
- int i;
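
For reference, the large block removed above computed per-CPU capacity by composing two 1024-based fixed-point ratios (efficiency relative to the least efficient CPU, and max frequency relative to the lowest max frequency). A sketch of that retired composition with made-up scale values; the new code relies on capacity_curr_of() instead:

#include <stdio.h>

/*
 * The removed compute_capacity() composed:
 *   capacity ~ 1024 * (eff / min_eff) * (max_freq / min_max_freq)
 * with each ratio expressed on a 1024 scale and folded in via >> 10.
 */
static unsigned long compose(unsigned long a_scale, unsigned long b_scale)
{
        unsigned long capacity = 1024;

        capacity = (capacity * a_scale) >> 10;
        capacity = (capacity * b_scale) >> 10;
        return capacity;
}

int main(void)
{
        /* example big CPU: 1.5x the little CPU's efficiency, 2x its max freq */
        unsigned long eff_scale = (1024 * 3) / 2;       /* 1536 */
        unsigned long freq_scale = 1024 * 2;            /* 2048 */

        printf("legacy capacity = %lu\n", compose(eff_scale, freq_scale));
        return 0;
}
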
- diff -Nur /home/ninez/android/marlin/kernel/sched/walt.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.h
- --- /home/ninez/android/marlin/kernel/sched/walt.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.h 2018-08-11 23:57:17.131940887 -0400
- @@ -55,8 +55,10 @@
- static inline void walt_init_cpu_efficiency(void) { }
- static inline u64 walt_ktime_clock(void) { return 0; }
- +#define walt_cpu_high_irqload(cpu) false
- +
- #endif /* CONFIG_SCHED_WALT */
- -extern unsigned int walt_disabled;
- +extern bool walt_disabled;
- #endif