- diff -Nur /home/ninez/android/marlin/kernel/sched/auto_group.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/auto_group.c
- --- /home/ninez/android/marlin/kernel/sched/auto_group.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/auto_group.c 2018-08-11 23:57:17.128607487 -0400
- @@ -214,7 +214,7 @@
- ag = autogroup_task_get(p);
- down_write(&ag->lock);
- - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
- + err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
- if (!err)
- ag->nice = nice;
- up_write(&ag->lock);
- diff -Nur /home/ninez/android/marlin/kernel/sched/boost.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/boost.c
- --- /home/ninez/android/marlin/kernel/sched/boost.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/boost.c 2018-08-14 15:53:43.604124856 -0400
- @@ -0,0 +1,68 @@
- +/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License version 2 and
- + * only version 2 as published by the Free Software Foundation.
- + *
- + * This program is distributed in the hope that it will be useful,
- + * but WITHOUT ANY WARRANTY; without even the implied warranty of
- + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- + * GNU General Public License for more details.
- + */
- +
- +#include "sched.h"
- +#include <linux/sched.h>
- +
- +/*
- + * Scheduler boost is a mechanism to temporarily place tasks on CPUs
- + * with higher capacity than those where a task would have normally
- + * ended up with their load characteristics. Any entity enabling
- + * boost is responsible for disabling it as well.
- + */
- +
- +unsigned int sysctl_sched_boost;
- +
- +static bool verify_boost_params(int old_val, int new_val)
- +{
- + /*
- + * Boost can only be turned on or off. There is no possibility of
- + * switching from one boost type to another or of setting the same
- + * kind of boost several times.
- + */
- + return !(!!old_val == !!new_val);
- +}
- +
- +int sched_boost_handler(struct ctl_table *table, int write,
- + void __user *buffer, size_t *lenp,
- + loff_t *ppos)
- +{
- + int ret;
- + unsigned int *data = (unsigned int *)table->data;
- + unsigned int old_val;
- + unsigned int dsb_top_app_boost = 30;
- + unsigned int dsb_top_app_floor = 0;
- +
- + // Backup current sysctl_sched_boost value
- + old_val = *data;
- +
- + // Set new sysctl_sched_boost value
- + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- +
- + if (ret || !write)
- + goto done;
- +
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- + if (verify_boost_params(old_val, *data)) {
- + if (*data > 0)
- + do_stune_boost("top-app", dsb_top_app_boost);
- + else
- + do_stune_unboost("top-app", dsb_top_app_floor);
- + } else {
- + *data = old_val;
- + ret = -EINVAL;
- + }
- +#endif // CONFIG_DYNAMIC_STUNE_BOOST
- +
- +done:
- + return ret;
- +}
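A minimal userspace sketch of the on/off rule enforced by verify_boost_params() in the new boost.c above; the main() harness and the printed values are illustrative only and not part of the patch:

#include <stdbool.h>
#include <stdio.h>

static bool verify_boost_params(int old_val, int new_val)
{
	/* A transition is valid only when the boolean on/off state flips. */
	return !(!!old_val == !!new_val);
}

int main(void)
{
	printf("0->1: %d\n", verify_boost_params(0, 1)); /* 1: enabling allowed  */
	printf("1->0: %d\n", verify_boost_params(1, 0)); /* 1: disabling allowed */
	printf("0->0: %d\n", verify_boost_params(0, 0)); /* 0: no state change   */
	printf("1->2: %d\n", verify_boost_params(1, 2)); /* 0: already boosted   */
	return 0;
}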
- diff -Nur /home/ninez/android/marlin/kernel/sched/completion.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/completion.c
- --- /home/ninez/android/marlin/kernel/sched/completion.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/completion.c 2018-08-12 21:13:57.906629665 -0400
- @@ -30,10 +30,10 @@
- {
- unsigned long flags;
- - spin_lock_irqsave(&x->wait.lock, flags);
- + raw_spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
- - __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- - spin_unlock_irqrestore(&x->wait.lock, flags);
- + swake_up_locked(&x->wait);
- + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
- }
- EXPORT_SYMBOL(complete);
- @@ -50,10 +50,10 @@
- {
- unsigned long flags;
- - spin_lock_irqsave(&x->wait.lock, flags);
- + raw_spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
- - __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- - spin_unlock_irqrestore(&x->wait.lock, flags);
- + swake_up_all_locked(&x->wait);
- + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
- }
- EXPORT_SYMBOL(complete_all);
- @@ -62,20 +62,20 @@
- long (*action)(long), long timeout, int state)
- {
- if (!x->done) {
- - DECLARE_WAITQUEUE(wait, current);
- + DECLARE_SWAITQUEUE(wait);
- - __add_wait_queue_tail_exclusive(&x->wait, &wait);
- + __prepare_to_swait(&x->wait, &wait);
- do {
- if (signal_pending_state(state, current)) {
- timeout = -ERESTARTSYS;
- break;
- }
- __set_current_state(state);
- - spin_unlock_irq(&x->wait.lock);
- + raw_spin_unlock_irq(&x->wait.lock);
- timeout = action(timeout);
- - spin_lock_irq(&x->wait.lock);
- + raw_spin_lock_irq(&x->wait.lock);
- } while (!x->done && timeout);
- - __remove_wait_queue(&x->wait, &wait);
- + __finish_swait(&x->wait, &wait);
- if (!x->done)
- return timeout;
- }
- @@ -89,9 +89,9 @@
- {
- might_sleep();
- - spin_lock_irq(&x->wait.lock);
- + raw_spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, action, timeout, state);
- - spin_unlock_irq(&x->wait.lock);
- + raw_spin_unlock_irq(&x->wait.lock);
- return timeout;
- }
- @@ -267,12 +267,21 @@
- unsigned long flags;
- int ret = 1;
- - spin_lock_irqsave(&x->wait.lock, flags);
- + /*
- + * Since x->done will need to be locked only
- + * in the non-blocking case, we check x->done
- + * first without taking the lock so we can
- + * return early in the blocking case.
- + */
- + if (!READ_ONCE(x->done))
- + return 0;
- +
- + raw_spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- else
- x->done--;
- - spin_unlock_irqrestore(&x->wait.lock, flags);
- + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
- }
- EXPORT_SYMBOL(try_wait_for_completion);
- @@ -287,13 +296,21 @@
- */
- bool completion_done(struct completion *x)
- {
- - unsigned long flags;
- - int ret = 1;
- + if (!READ_ONCE(x->done))
- + return false;
- - spin_lock_irqsave(&x->wait.lock, flags);
- - if (!x->done)
- - ret = 0;
- - spin_unlock_irqrestore(&x->wait.lock, flags);
- - return ret;
- + /*
- + * If ->done, we need to wait for complete() to release ->wait.lock
- + * otherwise we can end up freeing the completion before complete()
- + * is done referencing it.
- + *
- + * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
- + * the loads of ->done and ->wait.lock such that we cannot observe
- + * the lock before complete() acquires it while observing the ->done
- + * after it's acquired the lock.
- + */
- + smp_rmb();
- + raw_spin_unlock_wait(&x->wait.lock);
- + return true;
- }
- EXPORT_SYMBOL(completion_done);
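The completion.c hunks above move ->wait onto a raw-spinlock-protected simple waitqueue (swait) and, in try_wait_for_completion()/completion_done(), check ->done with READ_ONCE() before ever taking ->wait.lock, so callers that would only block return early without the lock. A rough pthread/C11 analogue of that fast-path pattern (the struct and function names here are made up; this is not the kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct fake_completion {
	atomic_uint done;           /* stands in for x->done      */
	pthread_mutex_t lock;       /* stands in for x->wait.lock */
};

/* Analogue of try_wait_for_completion(): consume one pending
 * completion if there is one, otherwise report false. */
bool try_wait(struct fake_completion *x)
{
	bool ret = true;

	/* Lockless fast path, mirroring the READ_ONCE(x->done) check. */
	if (!atomic_load(&x->done))
		return false;

	pthread_mutex_lock(&x->lock);
	if (!atomic_load(&x->done))
		ret = false;
	else
		atomic_fetch_sub(&x->done, 1);
	pthread_mutex_unlock(&x->lock);

	return ret;
}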
- diff -Nur /home/ninez/android/marlin/kernel/sched/core.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/core.c
- --- /home/ninez/android/marlin/kernel/sched/core.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/core.c 2018-08-26 16:43:11.647206295 -0400
- @@ -94,7 +94,6 @@
- #define CREATE_TRACE_POINTS
- #include <trace/events/sched.h>
- #include "walt.h"
- -#include "tune.h"
- DEFINE_MUTEX(sched_domains_mutex);
- DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
- @@ -105,7 +104,9 @@
- {
- s64 delta;
- - if (rq->skip_clock_update > 0)
- + lockdep_assert_held(&rq->lock);
- +
- + if (rq->clock_skip_update & RQCF_ACT_SKIP)
- return;
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
- @@ -168,14 +169,12 @@
- static void sched_feat_disable(int i)
- {
- - if (static_key_enabled(&sched_feat_keys[i]))
- - static_key_slow_dec(&sched_feat_keys[i]);
- + static_key_disable(&sched_feat_keys[i]);
- }
- static void sched_feat_enable(int i)
- {
- - if (!static_key_enabled(&sched_feat_keys[i]))
- - static_key_slow_inc(&sched_feat_keys[i]);
- + static_key_enable(&sched_feat_keys[i]);
- }
- #else
- static void sched_feat_disable(int i) { };
- @@ -290,10 +289,40 @@
- */
- int sysctl_sched_rt_runtime = 950000;
- +/* cpus with isolated domains */
- +cpumask_var_t cpu_isolated_map;
- +
- +struct rq *
- +lock_rq_of(struct task_struct *p, struct rq_flags *rf)
- +{
- + return task_rq_lock(p, rf);
- +}
- +
- +void
- +unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
- +{
- + task_rq_unlock(rq, p, rf);
- +}
- +
- +/*
- + * this_rq_lock - lock this runqueue and disable interrupts.
- + */
- +static struct rq *this_rq_lock(void)
- + __acquires(rq->lock)
- +{
- + struct rq *rq;
- +
- + local_irq_disable();
- + rq = this_rq();
- + raw_spin_lock(&rq->lock);
- +
- + return rq;
- +}
- +
- /*
- * __task_rq_lock - lock the rq @p resides on.
- */
- -static inline struct rq *__task_rq_lock(struct task_struct *p)
- +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(rq->lock)
- {
- struct rq *rq;
- @@ -303,8 +332,10 @@
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- + rf->cookie = lockdep_pin_lock(&rq->lock);
- return rq;
- + }
- raw_spin_unlock(&rq->lock);
- while (unlikely(task_on_rq_migrating(p)))
- @@ -315,68 +346,44 @@
- /*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
- -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
- {
- struct rq *rq;
- for (;;) {
- - raw_spin_lock_irqsave(&p->pi_lock, *flags);
- + raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- + /*
- + * move_queued_task() task_rq_lock()
- + *
- + * ACQUIRE (rq->lock)
- + * [S] ->on_rq = MIGRATING [L] rq = task_rq()
- + * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
- + * [S] ->cpu = new_cpu [L] task_rq()
- + * [L] ->on_rq
- + * RELEASE (rq->lock)
- + *
- + * If we observe the old cpu in task_rq_lock, the acquire of
- + * the old rq->lock will fully serialize against the stores.
- + *
- + * If we observe the new cpu in task_rq_lock, the acquire will
- + * pair with the WMB to ensure we must then also see migrating.
- + */
- + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- + rf->cookie = lockdep_pin_lock(&rq->lock);
- return rq;
- + }
- raw_spin_unlock(&rq->lock);
- - raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
- + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
- }
- -struct rq *
- -lock_rq_of(struct task_struct *p, unsigned long *flags)
- -{
- - return task_rq_lock(p, flags);
- -}
- -
- -static void __task_rq_unlock(struct rq *rq)
- - __releases(rq->lock)
- -{
- - raw_spin_unlock(&rq->lock);
- -}
- -
- -static inline void
- -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- - __releases(rq->lock)
- - __releases(p->pi_lock)
- -{
- - raw_spin_unlock(&rq->lock);
- - raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
- -}
- -
- -void
- -unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
- -{
- - task_rq_unlock(rq, p, flags);
- -}
- -
- -/*
- - * this_rq_lock - lock this runqueue and disable interrupts.
- - */
- -static struct rq *this_rq_lock(void)
- - __acquires(rq->lock)
- -{
- - struct rq *rq;
- -
- - local_irq_disable();
- - rq = this_rq();
- - raw_spin_lock(&rq->lock);
- -
- - return rq;
- -}
- -
- #ifdef CONFIG_SCHED_HRTICK
- /*
- * Use HR-timers to deliver accurate preemption points.
- @@ -531,15 +538,19 @@
- /*
- * cmpxchg based fetch_or, macro so it works for different integer types
- */
- -#define fetch_or(ptr, val) \
- -({ typeof(*(ptr)) __old, __val = *(ptr); \
- - for (;;) { \
- - __old = cmpxchg((ptr), __val, __val | (val)); \
- - if (__old == __val) \
- - break; \
- - __val = __old; \
- - } \
- - __old; \
- +#define fetch_or(ptr, mask) \
- + ({ \
- + typeof(ptr) _ptr = (ptr); \
- + typeof(mask) _mask = (mask); \
- + typeof(*_ptr) _old, _val = *_ptr; \
- + \
- + for (;;) { \
- + _old = cmpxchg(_ptr, _val, _val | _mask); \
- + if (_old == _val) \
- + break; \
- + _val = _old; \
- + } \
- + _old; \
- })
- #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
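The rewritten fetch_or() above captures its arguments once and then loops on cmpxchg() until the OR takes effect, returning the value that was there before. A userspace C11 sketch of the same loop (illustrative only; plain userspace code would normally just call atomic_fetch_or()):

#include <stdatomic.h>
#include <stdio.h>

/* cmpxchg-style retry loop equivalent to an atomic fetch-OR. */
static unsigned int fetch_or_u32(atomic_uint *ptr, unsigned int mask)
{
	unsigned int val = atomic_load(ptr);

	/* On failure the CAS reloads 'val' with the freshly observed value. */
	while (!atomic_compare_exchange_weak(ptr, &val, val | mask))
		;

	return val;	/* the old value, like fetch_or() */
}

int main(void)
{
	atomic_uint flags = 0x1;
	unsigned int old = fetch_or_u32(&flags, 0x4);

	printf("old=0x%x new=0x%x\n", old, atomic_load(&flags)); /* old=0x1 new=0x5 */
	return 0;
}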
- @@ -593,6 +604,58 @@
- #endif
- #endif
- +void wake_q_add(struct wake_q_head *head, struct task_struct *task)
- +{
- + struct wake_q_node *node = &task->wake_q;
- +
- + /*
- + * Atomically grab the task, if ->wake_q is !nil already it means
- + * it's already queued (either by us or someone else) and will get the
- + * wakeup due to that.
- + *
- + * This cmpxchg() implies a full barrier, which pairs with the write
- + * barrier implied by the wakeup in wake_up_list().
- + */
- + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
- + return;
- +
- + head->count++;
- +
- + get_task_struct(task);
- +
- + /*
- + * The head is context local, there can be no concurrency.
- + */
- + *head->lastp = node;
- + head->lastp = &node->next;
- +}
- +
- +static int
- +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
- + int sibling_count_hint);
- +
- +void wake_up_q(struct wake_q_head *head)
- +{
- + struct wake_q_node *node = head->first;
- +
- + while (node != WAKE_Q_TAIL) {
- + struct task_struct *task;
- +
- + task = container_of(node, struct task_struct, wake_q);
- + BUG_ON(!task);
- + /* task can safely be re-inserted now */
- + node = node->next;
- + task->wake_q.next = NULL;
- +
- + /*
- + * try_to_wake_up() implies a wmb() to pair with the queueing
- + * in wake_q_add() so as not to miss wakeups.
- + */
- + try_to_wake_up(task, TASK_NORMAL, 0, head->count);
- + put_task_struct(task);
- + }
- +}
- +
- /*
- * resched_curr - mark rq's current task 'to be rescheduled now'.
- *
- @@ -629,9 +692,9 @@
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
- - raw_spin_lock_irqsave(&rq->lock, flags);
- - if (cpu_online(cpu) || cpu == smp_processor_id())
- - resched_curr(rq);
- + if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- + return;
- + resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -745,6 +808,23 @@
- bool sched_can_stop_tick(void)
- {
- /*
- + * FIFO realtime policy runs the highest priority task. Other runnable
- + * tasks are of a lower priority. The scheduler tick does nothing.
- + */
- + if (current->policy == SCHED_FIFO)
- + return true;
- +
- + /*
- + * Round-robin realtime tasks time slice with other tasks at the same
- + * realtime priority. Is this task the only one at this priority?
- + */
- + if (current->policy == SCHED_RR) {
- + struct sched_rt_entity *rt_se = &current->rt;
- +
- + return rt_se->run_list.prev == rt_se->run_list.next;
- + }
- +
- + /*
- * More than one running task need preemption.
- * nr_running update is assumed to be visible
- * after IPI is sent from wakers.
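The SCHED_RR branch added above leans on a property of the kernel's circular doubly-linked run lists: an entry whose prev and next both point at the same node is the only entry on its queue, so the tick can stop. A self-contained illustration with the list helpers reimplemented locally (not taken from the kernel):

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* The run_list.prev == run_list.next test used by sched_can_stop_tick(). */
static bool only_entry(const struct list_head *e)
{
	return e->prev == e->next;
}

int main(void)
{
	struct list_head queue, a, b;

	list_init(&queue);
	list_add_tail(&a, &queue);
	printf("one RR task:  %d\n", only_entry(&a)); /* 1: tick can stop    */
	list_add_tail(&b, &queue);
	printf("two RR tasks: %d\n", only_entry(&a)); /* 0: keep the tick on */
	return 0;
}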
- @@ -844,27 +924,29 @@
- /*
- * SCHED_IDLE tasks get minimal weight:
- */
- - if (p->policy == SCHED_IDLE) {
- + if (idle_policy(p->policy)) {
- load->weight = scale_load(WEIGHT_IDLEPRIO);
- load->inv_weight = WMULT_IDLEPRIO;
- return;
- }
- - load->weight = scale_load(prio_to_weight[prio]);
- - load->inv_weight = prio_to_wmult[prio];
- + load->weight = scale_load(sched_prio_to_weight[prio]);
- + load->inv_weight = sched_prio_to_wmult[prio];
- }
- -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
- +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
- {
- update_rq_clock(rq);
- - sched_info_queued(rq, p);
- + if (!(flags & ENQUEUE_RESTORE))
- + sched_info_queued(rq, p);
- p->sched_class->enqueue_task(rq, p, flags);
- }
- -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
- +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
- {
- update_rq_clock(rq);
- - sched_info_dequeued(rq, p);
- + if (!(flags & DEQUEUE_SAVE))
- + sched_info_dequeued(rq, p);
- p->sched_class->dequeue_task(rq, p, flags);
- }
- @@ -1069,10 +1151,37 @@
- * this case, we can save a useless back to back clock update.
- */
- if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
- - rq->skip_clock_update = 1;
- + rq_clock_skip_update(rq, true);
- }
- #ifdef CONFIG_SMP
- +
- +static inline bool is_per_cpu_kthread(struct task_struct *p)
- +{
- + if (!(p->flags & PF_KTHREAD))
- + return false;
- +
- + if (p->nr_cpus_allowed != 1)
- + return false;
- +
- + return true;
- +}
- +
- +/*
- + * Per-CPU kthreads are allowed to run on !active && online CPUs, see
- + * __set_cpus_allowed_ptr() and select_fallback_rq().
- + */
- +static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
- +{
- + if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- + return false;
- +
- + if (is_per_cpu_kthread(p))
- + return cpu_online(cpu);
- +
- + return cpu_active(cpu);
- +}
- +
- /*
- * This is how migration works:
- *
- @@ -1092,14 +1201,12 @@
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
- -static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
- +static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
- {
- - struct rq *rq = task_rq(p);
- -
- lockdep_assert_held(&rq->lock);
- - dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- + dequeue_task(rq, p, 0);
- double_lock_balance(rq, cpu_rq(new_cpu));
- set_task_cpu(p, new_cpu);
- double_unlock_balance(rq, cpu_rq(new_cpu));
- @@ -1109,8 +1216,8 @@
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- - p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- check_preempt_curr(rq, p, 0);
- return rq;
- @@ -1129,41 +1236,16 @@
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- - *
- - * Returns non-zero if task was successfully migrated.
- */
- -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
- +static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
- {
- - struct rq *rq;
- - int ret = 0;
- -
- - if (unlikely(!cpu_active(dest_cpu)))
- - return ret;
- -
- - rq = cpu_rq(src_cpu);
- -
- - raw_spin_lock(&p->pi_lock);
- - raw_spin_lock(&rq->lock);
- - /* Already moved. */
- - if (task_cpu(p) != src_cpu)
- - goto done;
- -
- /* Affinity changed (again). */
- - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- - goto fail;
- + if (!is_cpu_allowed(p, dest_cpu))
- + return rq;
- - /*
- - * If we're not on a rq, the next wake-up will ensure we're
- - * placed properly.
- - */
- - if (task_on_rq_queued(p))
- - rq = move_queued_task(p, dest_cpu);
- -done:
- - ret = 1;
- -fail:
- - raw_spin_unlock(&rq->lock);
- - raw_spin_unlock(&p->pi_lock);
- - return ret;
- + rq = move_queued_task(rq, p, dest_cpu);
- +
- + return rq;
- }
- /*
- @@ -1174,6 +1256,8 @@
- static int migration_cpu_stop(void *data)
- {
- struct migration_arg *arg = data;
- + struct task_struct *p = arg->task;
- + struct rq *rq = this_rq();
- /*
- * The original target cpu might have gone down and we might
- @@ -1186,20 +1270,77 @@
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- - __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- +
- + raw_spin_lock(&p->pi_lock);
- + raw_spin_lock(&rq->lock);
- + /*
- + * If task_rq(p) != rq, it cannot be migrated here, because we're
- + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
- + * we're holding p->pi_lock.
- + */
- + if (task_rq(p) == rq && task_on_rq_queued(p))
- + rq = __migrate_task(rq, p, arg->dest_cpu);
- + raw_spin_unlock(&rq->lock);
- + raw_spin_unlock(&p->pi_lock);
- +
- local_irq_enable();
- return 0;
- }
- +/*
- + * sched_class::set_cpus_allowed must do the below, but is not required to
- + * actually call this function.
- + */
- +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
- +{
- + cpumask_copy(&p->cpus_allowed, new_mask);
- + p->nr_cpus_allowed = cpumask_weight(new_mask);
- +}
- +
- +static const struct cpumask *get_adjusted_cpumask(const struct task_struct *p,
- + const struct cpumask *req_mask)
- +{
- + /* Force all performance-critical kthreads onto the big cluster */
- + if (p->flags & PF_PERF_CRITICAL)
- + return cpu_perf_mask;
- +
- + /* Force all trivial, unbound kthreads onto the little cluster */
- + if (p->flags & PF_KTHREAD && p->pid != 1 &&
- + cpumask_equal(req_mask, cpu_all_mask))
- + return cpu_lp_mask;
- +
- + return req_mask;
- +}
- +
- void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
- {
- + struct rq *rq = task_rq(p);
- + bool queued, running;
- +
- + new_mask = get_adjusted_cpumask(p, new_mask);
- +
- lockdep_assert_held(&p->pi_lock);
- - if (p->sched_class->set_cpus_allowed)
- - p->sched_class->set_cpus_allowed(p, new_mask);
- + queued = task_on_rq_queued(p);
- + running = task_current(rq, p);
- - cpumask_copy(&p->cpus_allowed, new_mask);
- - p->nr_cpus_allowed = cpumask_weight(new_mask);
- + if (queued) {
- + /*
- + * Because __kthread_bind() calls this on blocked tasks without
- + * holding rq->lock.
- + */
- + lockdep_assert_held(&rq->lock);
- + dequeue_task(rq, p, DEQUEUE_SAVE);
- + }
- + if (running)
- + put_prev_task(rq, p);
- +
- + p->sched_class->set_cpus_allowed(p, new_mask);
- +
- + if (queued)
- + enqueue_task(rq, p, ENQUEUE_RESTORE);
- + if (running)
- + set_curr_task(rq, p);
- }
- /*
- @@ -1214,12 +1355,23 @@
- static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
- {
- - unsigned long flags;
- - struct rq *rq;
- + const struct cpumask *cpu_valid_mask = cpu_active_mask;
- unsigned int dest_cpu;
- + struct rq_flags rf;
- + struct rq *rq;
- int ret = 0;
- - rq = task_rq_lock(p, &flags);
- + new_mask = get_adjusted_cpumask(p, new_mask);
- +
- + rq = task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- +
- + if (p->flags & PF_KTHREAD) {
- + /*
- + * Kernel threads are allowed on online && !active CPUs
- + */
- + cpu_valid_mask = cpu_online_mask;
- + }
- /*
- * Must re-check here, to close a race against __kthread_bind(),
- @@ -1233,29 +1385,46 @@
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
- - if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- + if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
- ret = -EINVAL;
- goto out;
- }
- do_set_cpus_allowed(p, new_mask);
- + if (p->flags & PF_KTHREAD) {
- + /*
- + * For kernel threads that do indeed end up on online &&
- + * !active we want to ensure they are strict per-cpu threads.
- + */
- + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
- + !cpumask_intersects(new_mask, cpu_active_mask) &&
- + p->nr_cpus_allowed != 1);
- + }
- +
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
- - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- - } else if (task_on_rq_queued(p))
- - rq = move_queued_task(p, dest_cpu);
- + } else if (task_on_rq_queued(p)) {
- + /*
- + * OK, since we're going to drop the lock immediately
- + * afterwards anyway.
- + */
- + lockdep_unpin_lock(&rq->lock, rf.cookie);
- + rq = move_queued_task(rq, p, dest_cpu);
- + lockdep_repin_lock(&rq->lock, rf.cookie);
- + }
- out:
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return ret;
- }
- @@ -1274,7 +1443,16 @@
- * ttwu() will sort out the placement.
- */
- WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- - !(task_preempt_count(p) & PREEMPT_ACTIVE));
- + !p->on_rq);
- +
- + /*
- + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
- + * because schedstat_wait_{start,end} rebase migrating task's wait_start
- + * time relying on p->on_rq.
- + */
- + WARN_ON_ONCE(p->state == TASK_RUNNING &&
- + p->sched_class == &fair_sched_class &&
- + (p->on_rq && !task_on_rq_migrating(p)));
- #ifdef CONFIG_LOCKDEP
- /*
- @@ -1296,7 +1474,7 @@
- if (task_cpu(p) != new_cpu) {
- if (p->sched_class->migrate_task_rq)
- - p->sched_class->migrate_task_rq(p, new_cpu);
- + p->sched_class->migrate_task_rq(p);
- p->se.nr_migrations++;
- perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
- @@ -1314,9 +1492,13 @@
- src_rq = task_rq(p);
- dst_rq = cpu_rq(cpu);
- + p->on_rq = TASK_ON_RQ_MIGRATING;
- deactivate_task(src_rq, p, 0);
- + p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, cpu);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(dst_rq, p, 0);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- check_preempt_curr(dst_rq, p, 0);
- } else {
- /*
- @@ -1339,12 +1521,16 @@
- struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
- + if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
- + return -EAGAIN;
- +
- src_rq = cpu_rq(arg->src_cpu);
- dst_rq = cpu_rq(arg->dst_cpu);
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
- +
- if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
- @@ -1426,8 +1612,8 @@
- */
- unsigned long wait_task_inactive(struct task_struct *p, long match_state)
- {
- - unsigned long flags;
- int running, queued;
- + struct rq_flags rf;
- unsigned long ncsw;
- struct rq *rq;
- @@ -1462,14 +1648,14 @@
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- trace_sched_wait_task(p);
- running = task_running(rq, p);
- queued = task_on_rq_queued(p);
- ncsw = 0;
- if (!match_state || p->state == match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- /*
- * If it changed from the expected state, bail out now.
- @@ -1543,6 +1729,25 @@
- /*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
- + *
- + * A few notes on cpu_active vs cpu_online:
- + *
- + * - cpu_active must be a subset of cpu_online
- + *
- + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
- + * see __set_cpus_allowed_ptr(). At this point the newly online
- + * cpu isn't yet part of the sched domains, and balancing will not
- + * see it.
- + *
- + * - on cpu-down we clear cpu_active() to mask the sched domains and
- + * avoid the load balancer to place new tasks on the to be removed
- + * cpu. Existing tasks will remain running there and will be taken
- + * off.
- + *
- + * This means that fallback selection must not select !active CPUs.
- + * And can assume that any active CPU must be online. Conversely
- + * select_task_rq() below may allow selection of !active CPUs in order
- + * to satisfy the above rules.
- */
- static int select_fallback_rq(int cpu, struct task_struct *p)
- {
- @@ -1561,8 +1766,6 @@
- /* Look for allowed, online CPU in same node. */
- for_each_cpu(dest_cpu, nodemask) {
- - if (!cpu_online(dest_cpu))
- - continue;
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- @@ -1573,20 +1776,21 @@
- for (;;) {
- /* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
- - if (!cpu_online(dest_cpu))
- - continue;
- - if (!cpu_active(dest_cpu))
- + if (!is_cpu_allowed(p, dest_cpu))
- continue;
- +
- goto out;
- }
- + /* No more Mr. Nice Guy. */
- switch (state) {
- case cpuset:
- - /* No more Mr. Nice Guy. */
- - cpuset_cpus_allowed_fallback(p);
- - state = possible;
- - break;
- -
- + if (IS_ENABLED(CONFIG_CPUSETS)) {
- + cpuset_cpus_allowed_fallback(p);
- + state = possible;
- + break;
- + }
- + /* fall-through */
- case possible:
- do_set_cpus_allowed(p, cpu_possible_mask);
- state = fail;
- @@ -1618,9 +1822,14 @@
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
- */
- static inline
- -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
- +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
- + int sibling_count_hint)
- {
- - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
- + lockdep_assert_held(&p->pi_lock);
- +
- + if (tsk_nr_cpus_allowed(p) > 1)
- + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
- + sibling_count_hint);
- /*
- * In order not to call set_task_cpu() on a blocking task we need
- @@ -1658,23 +1867,25 @@
- static void
- ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
- {
- -#ifdef CONFIG_SCHEDSTATS
- - struct rq *rq = this_rq();
- + struct rq *rq;
- -#ifdef CONFIG_SMP
- - int this_cpu = smp_processor_id();
- + if (!schedstat_enabled())
- + return;
- +
- + rq = this_rq();
- - if (cpu == this_cpu) {
- - schedstat_inc(rq, ttwu_local);
- - schedstat_inc(p, se.statistics.nr_wakeups_local);
- +#ifdef CONFIG_SMP
- + if (cpu == rq->cpu) {
- + schedstat_inc(rq->ttwu_local);
- + schedstat_inc(p->se.statistics.nr_wakeups_local);
- } else {
- struct sched_domain *sd;
- - schedstat_inc(p, se.statistics.nr_wakeups_remote);
- + schedstat_inc(p->se.statistics.nr_wakeups_remote);
- rcu_read_lock();
- - for_each_domain(this_cpu, sd) {
- + for_each_domain(rq->cpu, sd) {
- if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- - schedstat_inc(sd, ttwu_wake_remote);
- + schedstat_inc(sd->ttwu_wake_remote);
- break;
- }
- }
- @@ -1682,34 +1893,27 @@
- }
- if (wake_flags & WF_MIGRATED)
- - schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- -
- + schedstat_inc(p->se.statistics.nr_wakeups_migrate);
- #endif /* CONFIG_SMP */
- - schedstat_inc(rq, ttwu_count);
- - schedstat_inc(p, se.statistics.nr_wakeups);
- + schedstat_inc(rq->ttwu_count);
- + schedstat_inc(p->se.statistics.nr_wakeups);
- if (wake_flags & WF_SYNC)
- - schedstat_inc(p, se.statistics.nr_wakeups_sync);
- -
- -#endif /* CONFIG_SCHEDSTATS */
- + schedstat_inc(p->se.statistics.nr_wakeups_sync);
- }
- static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
- {
- activate_task(rq, p, en_flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
- -
- - /* if a worker is waking up, notify workqueue */
- - if (p->flags & PF_WQ_WORKER)
- - wq_worker_waking_up(p, cpu_of(rq));
- }
- /*
- * Mark the task runnable and perform wakeup-preemption.
- */
- -static void
- -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
- +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- + struct pin_cookie cookie)
- {
- check_preempt_curr(rq, p, wake_flags);
- p->state = TASK_RUNNING;
- @@ -1718,9 +1922,12 @@
- #ifdef CONFIG_SMP
- if (p->sched_class->task_woken) {
- /*
- - * XXX can drop rq->lock; most likely ok.
- + * Our task @p is fully woken up and running; so it's safe to
- + * drop the rq->lock, hereafter rq is only used for statistics.
- */
- + lockdep_unpin_lock(&rq->lock, cookie);
- p->sched_class->task_woken(rq, p);
- + lockdep_repin_lock(&rq->lock, cookie);
- }
- if (rq->idle_stamp) {
- @@ -1738,15 +1945,18 @@
- }
- static void
- -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
- +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- + struct pin_cookie cookie)
- {
- + lockdep_assert_held(&rq->lock);
- +
- #ifdef CONFIG_SMP
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
- #endif
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
- - ttwu_do_wakeup(rq, p, wake_flags);
- + ttwu_do_wakeup(rq, p, wake_flags, cookie);
- }
- /*
- @@ -1757,17 +1967,18 @@
- */
- static int ttwu_remote(struct task_struct *p, int wake_flags)
- {
- + struct rq_flags rf;
- struct rq *rq;
- int ret = 0;
- - rq = __task_rq_lock(p);
- + rq = __task_rq_lock(p, &rf);
- if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
- - ttwu_do_wakeup(rq, p, wake_flags);
- + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
- ret = 1;
- }
- - __task_rq_unlock(rq);
- + __task_rq_unlock(rq, &rf);
- return ret;
- }
- @@ -1777,6 +1988,7 @@
- {
- struct rq *rq = this_rq();
- struct llist_node *llist = llist_del_all(&rq->wake_list);
- + struct pin_cookie cookie;
- struct task_struct *p;
- unsigned long flags;
- @@ -1784,13 +1996,15 @@
- return;
- raw_spin_lock_irqsave(&rq->lock, flags);
- + cookie = lockdep_pin_lock(&rq->lock);
- while (llist) {
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
- - ttwu_do_activate(rq, p, 0);
- + ttwu_do_activate(rq, p, 0, cookie);
- }
- + lockdep_unpin_lock(&rq->lock, cookie);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -1877,6 +2091,7 @@
- static void ttwu_queue(struct task_struct *p, int cpu)
- {
- struct rq *rq = cpu_rq(cpu);
- + struct pin_cookie cookie;
- #if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- @@ -1887,15 +2102,110 @@
- #endif
- raw_spin_lock(&rq->lock);
- - ttwu_do_activate(rq, p, 0);
- + cookie = lockdep_pin_lock(&rq->lock);
- + ttwu_do_activate(rq, p, 0, cookie);
- + lockdep_unpin_lock(&rq->lock, cookie);
- raw_spin_unlock(&rq->lock);
- }
- +/*
- + * Notes on Program-Order guarantees on SMP systems.
- + *
- + * MIGRATION
- + *
- + * The basic program-order guarantee on SMP systems is that when a task [t]
- + * migrates, all its activity on its old cpu [c0] happens-before any subsequent
- + * execution on its new cpu [c1].
- + *
- + * For migration (of runnable tasks) this is provided by the following means:
- + *
- + * A) UNLOCK of the rq(c0)->lock scheduling out task t
- + * B) migration for t is required to synchronize *both* rq(c0)->lock and
- + * rq(c1)->lock (if not at the same time, then in that order).
- + * C) LOCK of the rq(c1)->lock scheduling in task
- + *
- + * Transitivity guarantees that B happens after A and C after B.
- + * Note: we only require RCpc transitivity.
- + * Note: the cpu doing B need not be c0 or c1
- + *
- + * Example:
- + *
- + * CPU0 CPU1 CPU2
- + *
- + * LOCK rq(0)->lock
- + * sched-out X
- + * sched-in Y
- + * UNLOCK rq(0)->lock
- + *
- + * LOCK rq(0)->lock // orders against CPU0
- + * dequeue X
- + * UNLOCK rq(0)->lock
- + *
- + * LOCK rq(1)->lock
- + * enqueue X
- + * UNLOCK rq(1)->lock
- + *
- + * LOCK rq(1)->lock // orders against CPU2
- + * sched-out Z
- + * sched-in X
- + * UNLOCK rq(1)->lock
- + *
- + *
- + * BLOCKING -- aka. SLEEP + WAKEUP
- + *
- + * For blocking we (obviously) need to provide the same guarantee as for
- + * migration. However the means are completely different as there is no lock
- + * chain to provide order. Instead we do:
- + *
- + * 1) smp_store_release(X->on_cpu, 0)
- + * 2) smp_cond_load_acquire(!X->on_cpu)
- + *
- + * Example:
- + *
- + * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
- + *
- + * LOCK rq(0)->lock LOCK X->pi_lock
- + * dequeue X
- + * sched-out X
- + * smp_store_release(X->on_cpu, 0);
- + *
- + * smp_cond_load_acquire(&X->on_cpu, !VAL);
- + * X->state = WAKING
- + * set_task_cpu(X,2)
- + *
- + * LOCK rq(2)->lock
- + * enqueue X
- + * X->state = RUNNING
- + * UNLOCK rq(2)->lock
- + *
- + * LOCK rq(2)->lock // orders against CPU1
- + * sched-out Z
- + * sched-in X
- + * UNLOCK rq(2)->lock
- + *
- + * UNLOCK X->pi_lock
- + * UNLOCK rq(0)->lock
- + *
- + *
- + * However; for wakeups there is a second guarantee we must provide, namely we
- + * must observe the state that led to our wakeup. That is, not only must our
- + * task observe its own prior state, it must also observe the stores prior to
- + * its wakeup.
- + *
- + * This means that any means of doing remote wakeups must order the CPU doing
- + * the wakeup against the CPU the task is going to end up running on. This,
- + * however, is already required for the regular Program-Order guarantee above,
- + * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
- + *
- + */
- +
- /**
- * try_to_wake_up - wake up a thread
- * @p: the thread to be awakened
- * @state: the mask of task states that can be woken
- * @wake_flags: wake modifier flags (WF_*)
- + * @sibling_count_hint: A hint at the number of threads that are being woken up
- + * in this event.
- *
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- @@ -1907,7 +2217,8 @@
- * or @state didn't match @p's state.
- */
- static int
- -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
- +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
- + int sibling_count_hint)
- {
- unsigned long flags;
- int cpu, success = 0;
- @@ -1959,15 +2270,34 @@
- #ifdef CONFIG_SMP
- /*
- - * If the owning (remote) cpu is still in the middle of schedule() with
- - * this task as prev, wait until its done referencing the task.
- + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
- + * possible to, falsely, observe p->on_cpu == 0.
- + *
- + * One must be running (->on_cpu == 1) in order to remove oneself
- + * from the runqueue.
- + *
- + * [S] ->on_cpu = 1; [L] ->on_rq
- + * UNLOCK rq->lock
- + * RMB
- + * LOCK rq->lock
- + * [S] ->on_rq = 0; [L] ->on_cpu
- + *
- + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
- + * from the consecutive calls to schedule(); the first switching to our
- + * task, the second putting it to sleep.
- */
- - while (p->on_cpu)
- - cpu_relax();
- + smp_rmb();
- +
- /*
- - * Pairs with the smp_wmb() in finish_lock_switch().
- + * If the owning (remote) cpu is still in the middle of schedule() with
- + * this task as prev, wait until it's done referencing the task.
- + *
- + * Pairs with the smp_store_release() in finish_lock_switch().
- + *
- + * This ensures that tasks getting woken will be fully ordered against
- + * their previous state and preserve Program Order.
- */
- - smp_rmb();
- + smp_cond_load_acquire(&p->on_cpu, !VAL);
- rq = cpu_rq(task_cpu(p));
- @@ -1983,8 +2313,8 @@
- if (p->sched_class->task_waking)
- p->sched_class->task_waking(p);
- - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
- -
- + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
- + sibling_count_hint);
- if (task_cpu(p) != cpu) {
- wake_flags |= WF_MIGRATED;
- set_task_cpu(p, cpu);
- @@ -2002,47 +2332,6 @@
- }
- /**
- - * try_to_wake_up_local - try to wake up a local task with rq lock held
- - * @p: the thread to be awakened
- - *
- - * Put @p on the run-queue if it's not already there. The caller must
- - * ensure that this_rq() is locked, @p is bound to this_rq() and not
- - * the current task.
- - */
- -static void try_to_wake_up_local(struct task_struct *p)
- -{
- - struct rq *rq = task_rq(p);
- -
- - if (WARN_ON_ONCE(rq != this_rq()) ||
- - WARN_ON_ONCE(p == current))
- - return;
- -
- - lockdep_assert_held(&rq->lock);
- -
- - if (!raw_spin_trylock(&p->pi_lock)) {
- - raw_spin_unlock(&rq->lock);
- - raw_spin_lock(&p->pi_lock);
- - raw_spin_lock(&rq->lock);
- - }
- -
- - if (!(p->state & TASK_NORMAL))
- - goto out;
- -
- - if (!task_on_rq_queued(p)) {
- - u64 wallclock = walt_ktime_clock();
- -
- - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
- - walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
- - ttwu_activate(rq, p, ENQUEUE_WAKEUP);
- - }
- -
- - ttwu_do_wakeup(rq, p, 0);
- - ttwu_stat(p, smp_processor_id(), 0);
- -out:
- - raw_spin_unlock(&p->pi_lock);
- -}
- -
- -/**
- * wake_up_process - Wake up a specific process
- * @p: The process to be woken up.
- *
- @@ -2056,34 +2345,13 @@
- */
- int wake_up_process(struct task_struct *p)
- {
- - WARN_ON(task_is_stopped_or_traced(p));
- - return try_to_wake_up(p, TASK_NORMAL, 0);
- + return try_to_wake_up(p, TASK_NORMAL, 0, 1);
- }
- EXPORT_SYMBOL(wake_up_process);
- -/**
- - * wake_up_process_no_notif - Wake up a specific process without notifying
- - * governor
- - * @p: The process to be woken up.
- - *
- - * Attempt to wake up the nominated process and move it to the set of runnable
- - * processes.
- - *
- - * Return: 1 if the process was woken up, 0 if it was already running.
- - *
- - * It may be assumed that this function implies a write memory barrier before
- - * changing the task state if and only if any tasks are woken up.
- - */
- -int wake_up_process_no_notif(struct task_struct *p)
- -{
- - WARN_ON(task_is_stopped_or_traced(p));
- - return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER);
- -}
- -EXPORT_SYMBOL(wake_up_process_no_notif);
- -
- int wake_up_state(struct task_struct *p, unsigned int state)
- {
- - return try_to_wake_up(p, state, 0);
- + return try_to_wake_up(p, state, 0, 1);
- }
- /*
- @@ -2120,6 +2388,10 @@
- p->se.prev_sum_exec_runtime = 0;
- p->se.nr_migrations = 0;
- p->se.vruntime = 0;
- +#ifdef CONFIG_SCHED_WALT
- + p->last_sleep_ts = 0;
- +#endif
- +
- INIT_LIST_HEAD(&p->se.group_node);
- walt_init_new_task_load(p);
- @@ -2128,19 +2400,19 @@
- #endif
- #ifdef CONFIG_SCHEDSTATS
- + /* Even if schedstat is disabled, there should not be garbage */
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
- #endif
- -#ifdef CONFIG_CPU_FREQ_STAT
- - cpufreq_task_stats_init(p);
- -#endif
- -
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- __dl_clear_params(p);
- - init_rt_schedtune_timer(&p->rt);
- INIT_LIST_HEAD(&p->rt.run_list);
- + p->rt.timeout = 0;
- + p->rt.time_slice = sched_rr_timeslice;
- + p->rt.on_rq = 0;
- + p->rt.on_list = 0;
- #ifdef CONFIG_PREEMPT_NOTIFIERS
- INIT_HLIST_HEAD(&p->preempt_notifiers);
- @@ -2171,31 +2443,88 @@
- #endif /* CONFIG_NUMA_BALANCING */
- }
- +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
- +
- #ifdef CONFIG_NUMA_BALANCING
- -#ifdef CONFIG_SCHED_DEBUG
- +
- void set_numabalancing_state(bool enabled)
- {
- if (enabled)
- - sched_feat_set("NUMA");
- + static_branch_enable(&sched_numa_balancing);
- else
- - sched_feat_set("NO_NUMA");
- + static_branch_disable(&sched_numa_balancing);
- }
- -#else
- -__read_mostly bool numabalancing_enabled;
- -void set_numabalancing_state(bool enabled)
- +#ifdef CONFIG_PROC_SYSCTL
- +int sysctl_numa_balancing(struct ctl_table *table, int write,
- + void __user *buffer, size_t *lenp, loff_t *ppos)
- {
- - numabalancing_enabled = enabled;
- + struct ctl_table t;
- + int err;
- + int state = static_branch_likely(&sched_numa_balancing);
- +
- + if (write && !capable(CAP_SYS_ADMIN))
- + return -EPERM;
- +
- + t = *table;
- + t.data = &state;
- + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
- + if (err < 0)
- + return err;
- + if (write)
- + set_numabalancing_state(state);
- + return err;
- }
- -#endif /* CONFIG_SCHED_DEBUG */
- +#endif
- +#endif
- +
- +DEFINE_STATIC_KEY_FALSE(sched_schedstats);
- +
- +#ifdef CONFIG_SCHEDSTATS
- +static void set_schedstats(bool enabled)
- +{
- + if (enabled)
- + static_branch_enable(&sched_schedstats);
- + else
- + static_branch_disable(&sched_schedstats);
- +}
- +
- +void force_schedstat_enabled(void)
- +{
- + if (!schedstat_enabled()) {
- + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
- + static_branch_enable(&sched_schedstats);
- + }
- +}
- +
- +static int __init setup_schedstats(char *str)
- +{
- + int ret = 0;
- + if (!str)
- + goto out;
- +
- + if (!strcmp(str, "enable")) {
- + set_schedstats(true);
- + ret = 1;
- + } else if (!strcmp(str, "disable")) {
- + set_schedstats(false);
- + ret = 1;
- + }
- +out:
- + if (!ret)
- + pr_warn("Unable to parse schedstats=\n");
- +
- + return ret;
- +}
- +__setup("schedstats=", setup_schedstats);
- #ifdef CONFIG_PROC_SYSCTL
- -int sysctl_numa_balancing(struct ctl_table *table, int write,
- +int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
- {
- struct ctl_table t;
- int err;
- - int state = numabalancing_enabled;
- + int state = static_branch_likely(&sched_schedstats);
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
- @@ -2206,7 +2535,7 @@
- if (err < 0)
- return err;
- if (write)
- - set_numabalancing_state(state);
- + set_schedstats(state);
- return err;
- }
- #endif
- @@ -2220,12 +2549,11 @@
- unsigned long flags;
- int cpu = get_cpu();
- - __sched_fork(clone_flags, p);
- -
- #ifdef CONFIG_CPU_FREQ_STAT
- - cpufreq_task_stats_alloc(p);
- + cpufreq_task_stats_init(p);
- #endif
- + __sched_fork(clone_flags, p);
- /*
- * We mark the process as running here. This guarantees that
- * nobody will actually run it, and a signal or other external
- @@ -2268,8 +2596,7 @@
- p->sched_class = &fair_sched_class;
- }
- - if (p->sched_class->task_fork)
- - p->sched_class->task_fork(p);
- + init_entity_runnable_average(&p->se);
- /*
- * The child is not yet in the pid-hash so no cgroup attach races,
- @@ -2279,7 +2606,13 @@
- * Silence PROVE_RCU.
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- - set_task_cpu(p, cpu);
- + /*
- + * We're setting the cpu for the first time, we don't migrate,
- + * so use __set_task_cpu().
- + */
- + __set_task_cpu(p, cpu);
- + if (p->sched_class->task_fork)
- + p->sched_class->task_fork(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
- @@ -2318,8 +2651,8 @@
- #ifdef CONFIG_SMP
- inline struct dl_bw *dl_bw_of(int i)
- {
- - rcu_lockdep_assert(rcu_read_lock_sched_held(),
- - "sched RCU must be held");
- + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- + "sched RCU must be held");
- return &cpu_rq(i)->rd->dl_bw;
- }
- @@ -2328,8 +2661,8 @@
- struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
- - rcu_lockdep_assert(rcu_read_lock_sched_held(),
- - "sched RCU must be held");
- + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- + "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask)
- cpus++;
- @@ -2347,25 +2680,6 @@
- }
- #endif
- -static inline
- -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
- -{
- - dl_b->total_bw -= tsk_bw;
- -}
- -
- -static inline
- -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
- -{
- - dl_b->total_bw += tsk_bw;
- -}
- -
- -static inline
- -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
- -{
- - return dl_b->bw != -1 &&
- - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
- -}
- -
- /*
- * We must be sure that accepting a new task (or allowing changing the
- * parameters of an existing one) is consistent with the bandwidth
- @@ -2387,7 +2701,8 @@
- u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
- - if (new_bw == p->dl.dl_bw)
- + /* !deadline task may carry old deadline bandwidth */
- + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
- return 0;
- /*
- @@ -2426,45 +2741,76 @@
- */
- void wake_up_new_task(struct task_struct *p)
- {
- - unsigned long flags;
- + struct rq_flags rf;
- struct rq *rq;
- - raw_spin_lock_irqsave(&p->pi_lock, flags);
- + /* Initialize new task's runnable average */
- + init_entity_runnable_average(&p->se);
- + raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- walt_init_new_task_load(p);
- - /* Initialize new task's runnable average */
- - init_entity_runnable_average(&p->se);
- + p->state = TASK_RUNNING;
- #ifdef CONFIG_SMP
- /*
- * Fork balancing, do it here and not earlier because:
- * - cpus_allowed can change in the fork path
- * - any previously selected cpu might disappear through hotplug
- + *
- + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
- + * as we're not fully set-up yet.
- */
- - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
- + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
- #endif
- + rq = __task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- + post_init_entity_util_avg(&p->se);
- - rq = __task_rq_lock(p);
- walt_mark_task_starting(p);
- +
- activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
- p->on_rq = TASK_ON_RQ_QUEUED;
- trace_sched_wakeup_new(p);
- check_preempt_curr(rq, p, WF_FORK);
- #ifdef CONFIG_SMP
- - if (p->sched_class->task_woken)
- + if (p->sched_class->task_woken) {
- + /*
- + * Nothing relies on rq->lock after this, so its fine to
- + * drop it.
- + */
- + lockdep_unpin_lock(&rq->lock, rf.cookie);
- p->sched_class->task_woken(rq, p);
- + lockdep_repin_lock(&rq->lock, rf.cookie);
- + }
- #endif
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- }
- #ifdef CONFIG_PREEMPT_NOTIFIERS
- +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
- +
- +void preempt_notifier_inc(void)
- +{
- + static_key_slow_inc(&preempt_notifier_key);
- +}
- +EXPORT_SYMBOL_GPL(preempt_notifier_inc);
- +
- +void preempt_notifier_dec(void)
- +{
- + static_key_slow_dec(&preempt_notifier_key);
- +}
- +EXPORT_SYMBOL_GPL(preempt_notifier_dec);
- +
- /**
- * preempt_notifier_register - tell me when current is being preempted & rescheduled
- * @notifier: notifier struct to register
- */
- void preempt_notifier_register(struct preempt_notifier *notifier)
- {
- + if (!static_key_false(&preempt_notifier_key))
- + WARN(1, "registering preempt_notifier while notifiers disabled\n");
- +
- + hlist_add_head(&notifier->link, &current->preempt_notifiers);
- }
- EXPORT_SYMBOL_GPL(preempt_notifier_register);
- @@ -2473,7 +2819,7 @@
- * preempt_notifier_unregister - no longer interested in preemption notifications
- * @notifier: notifier struct to unregister
- *
- - * This is safe to call from within a preemption notifier.
- + * This is *not* safe to call from within a preemption notifier.
- */
- void preempt_notifier_unregister(struct preempt_notifier *notifier)
- {
- @@ -2481,7 +2827,7 @@
- }
- EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
- -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
- +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
- {
- struct preempt_notifier *notifier;
- @@ -2489,9 +2835,15 @@
- notifier->ops->sched_in(notifier, raw_smp_processor_id());
- }
- +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
- +{
- + if (static_key_false(&preempt_notifier_key))
- + __fire_sched_in_preempt_notifiers(curr);
- +}
- +
- static void
- -fire_sched_out_preempt_notifiers(struct task_struct *curr,
- - struct task_struct *next)
- +__fire_sched_out_preempt_notifiers(struct task_struct *curr,
- + struct task_struct *next)
- {
- struct preempt_notifier *notifier;
- @@ -2499,13 +2851,21 @@
- notifier->ops->sched_out(notifier, next);
- }
- +static __always_inline void
- +fire_sched_out_preempt_notifiers(struct task_struct *curr,
- + struct task_struct *next)
- +{
- + if (static_key_false(&preempt_notifier_key))
- + __fire_sched_out_preempt_notifiers(curr, next);
- +}
- +
- #else /* !CONFIG_PREEMPT_NOTIFIERS */
- -static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
- +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
- {
- }
- -static void
- +static inline void
- fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
- {
- @@ -2530,7 +2890,6 @@
- prepare_task_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
- {
- - trace_sched_switch(prev, next);
- sched_info_switch(rq, prev, next);
- perf_event_task_sched_out(prev, next);
- fire_sched_out_preempt_notifiers(prev, next);
- @@ -2545,7 +2904,6 @@
- /**
- * finish_task_switch - clean up after a task-switch
- - * @rq: runqueue associated with task-switch
- * @prev: the thread we just switched away from.
- *
- * finish_task_switch must be called after the context switch, paired
- @@ -2557,13 +2915,35 @@
- * so, we finish that here outside of the runqueue lock. (Doing it
- * with the lock held can cause deadlocks; see schedule() for
- * details.)
- + *
- + * The context switch have flipped the stack from under us and restored the
- + * local variables which were saved when this task called schedule() in the
- + * past. prev == current is still correct but we need to recalculate this_rq
- + * because prev may have moved to another CPU.
- */
- -static void finish_task_switch(struct rq *rq, struct task_struct *prev)
- +static struct rq *finish_task_switch(struct task_struct *prev)
- __releases(rq->lock)
- {
- + struct rq *rq = this_rq();
- struct mm_struct *mm = rq->prev_mm;
- long prev_state;
- + /*
- + * The previous task will have left us with a preempt_count of 2
- + * because it left us after:
- + *
- + * schedule()
- + * preempt_disable(); // 1
- + * __schedule()
- + * raw_spin_lock_irq(&rq->lock) // 2
- + *
- + * Also, see FORK_PREEMPT_COUNT.
- + */
- + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
- + "corrupted preempt_count: %s/%d/0x%x\n",
- + current->comm, current->pid, preempt_count()))
- + preempt_count_set(FORK_PREEMPT_COUNT);
- +
- rq->prev_mm = NULL;
- /*
- @@ -2579,7 +2959,6 @@
- */
- prev_state = prev->state;
- vtime_task_switch(prev);
- - finish_arch_switch(prev);
- perf_event_task_sched_in(prev, current);
- finish_lock_switch(rq, prev);
- finish_arch_post_lock_switch();
- @@ -2596,10 +2975,15 @@
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
- +
- + /* Task is done with its stack. */
- + put_task_stack(prev);
- +
- put_task_struct(prev);
- }
- tick_nohz_task_switch(current);
- + return rq;
- }
- #ifdef CONFIG_SMP
- @@ -2646,27 +3030,31 @@
- asmlinkage __visible void schedule_tail(struct task_struct *prev)
- __releases(rq->lock)
- {
- - struct rq *rq = this_rq();
- -
- - finish_task_switch(rq, prev);
- + struct rq *rq;
- /*
- - * FIXME: do we need to worry about rq being invalidated by the
- - * task_switch?
- + * New tasks start with FORK_PREEMPT_COUNT, see there and
- + * finish_task_switch() for details.
- + *
- + * finish_task_switch() will drop rq->lock() and lower preempt_count
- + * and the preempt_enable() will end up enabling preemption (on
- + * PREEMPT_COUNT kernels).
- */
- +
- + rq = finish_task_switch(prev);
- balance_callback(rq);
- + preempt_enable();
- if (current->set_child_tid)
- put_user(task_pid_vnr(current), current->set_child_tid);
- }
- /*
- - * context_switch - switch to the new MM and the new
- - * thread's register state.
- + * context_switch - switch to the new MM and the new thread's register state.
- */
- -static inline void
- +static inline struct rq *
- context_switch(struct rq *rq, struct task_struct *prev,
- - struct task_struct *next)
- + struct task_struct *next, struct pin_cookie cookie)
- {
- struct mm_struct *mm, *oldmm;
- @@ -2698,19 +3086,15 @@
- * of the scheduler it's an obvious special-case), so we
- * do an early lockdep release here:
- */
- + lockdep_unpin_lock(&rq->lock, cookie);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- context_tracking_task_switch(prev, next);
- /* Here we just switch the register state and the stack. */
- switch_to(prev, next, prev);
- -
- barrier();
- - /*
- - * this_rq must be evaluated again because prev may have moved
- - * CPUs since it called schedule(), thus the 'rq' on its stack
- - * frame will be invalid.
- - */
- - finish_task_switch(this_rq(), prev);
- +
- + return finish_task_switch(prev);
- }
- /*
- @@ -2775,6 +3159,36 @@
- return atomic_read(&this->nr_iowait);
- }
- +#ifdef CONFIG_CPU_QUIET
- +u64 nr_running_integral(unsigned int cpu)
- +{
- + unsigned int seqcnt;
- + u64 integral;
- + struct rq *q;
- +
- + if (cpu >= nr_cpu_ids)
- + return 0;
- +
- + q = cpu_rq(cpu);
- +
- + /*
- + * Update average to avoid reading a stale value if there were
- + * no run-queue changes for a long time. On the other hand if
- + * the changes are happening right now, just read current value
- + * directly.
- + */
- +
- + seqcnt = read_seqcount_begin(&q->ave_seqcnt);
- + integral = do_nr_running_integral(q);
- + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
- + read_seqcount_begin(&q->ave_seqcnt);
- + integral = q->nr_running_integral;
- + }
- +
- + return integral;
- +}
- +#endif
- +
- void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
- {
- struct rq *rq = this_rq();
- @@ -2795,7 +3209,7 @@
- int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
- + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
- if (dest_cpu == smp_processor_id())
- goto unlock;
- @@ -2825,7 +3239,7 @@
- */
- unsigned long long task_sched_runtime(struct task_struct *p)
- {
- - unsigned long flags;
- + struct rq_flags rf;
- struct rq *rq;
- u64 ns;
- @@ -2845,7 +3259,7 @@
- return p->se.sum_exec_runtime;
- #endif
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- /*
- * Must be ->curr _and_ ->on_rq. If dequeued, we would
- * project cycles that may never be accounted to this
- @@ -2856,7 +3270,7 @@
- p->sched_class->update_curr(rq);
- }
- ns = p->se.sum_exec_runtime;
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return ns;
- }
- @@ -2879,6 +3293,7 @@
- return total += scr->dl;
- }
- +unsigned long boosted_cpu_util(int cpu);
- static void sched_freq_tick_pelt(int cpu)
- {
- unsigned long cpu_utilization = boosted_cpu_util(cpu);
- @@ -2889,47 +3304,45 @@
- if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
- return;
- - if (!use_util_est())
- - cpu_utilization = capacity_max;
- -
- /*
- * To make free room for a task that is building up its "real"
- * utilization and to harm its performance the least, request
- * a jump to a higher OPP as soon as the margin of free capacity
- * is impacted (specified by capacity_margin).
- + * Remember CPU utilization in sched_capacity_reqs should be normalised.
- */
- + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
- set_cfs_cpu_capacity(cpu, true, cpu_utilization);
- }
- #ifdef CONFIG_SCHED_WALT
- static void sched_freq_tick_walt(int cpu)
- {
- - unsigned long cpu_utilization = cpu_util(cpu, UTIL_EST);
- + unsigned long cpu_utilization = cpu_util_freq(cpu);
- unsigned long capacity_curr = capacity_curr_of(cpu);
- if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
- return sched_freq_tick_pelt(cpu);
- /*
- - * Add a margin to the WALT utilization.
- + * Add a margin to the WALT utilization to check if we will need to
- + * increase frequency.
- * NOTE: WALT tracks a single CPU signal for all the scheduling
- * classes, thus this margin is going to be added to the DL class as
- * well, which is something we do not do in sched_freq_tick_pelt case.
- - *
- - * TODO:
- - * Here we're adding margin, but we're also adding margin in cpufreq.
- - * There shouldn't be a double addition.
- */
- - cpu_utilization = add_capacity_margin(cpu_utilization);
- - if (cpu_utilization <= capacity_curr)
- + if (add_capacity_margin(cpu_utilization) <= capacity_curr)
- return;
- /*
- * It is likely that the load is growing so we
- * keep the added margin in our request as an
- * extra boost.
- + * Remember CPU utilization in sched_capacity_reqs should be normalised.
- */
- + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
- set_cfs_cpu_capacity(cpu, true, cpu_utilization);
- +
- }
- #define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
- #else
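- A quick worked example of the normalisation added to both tick paths above (the
- numbers are hypothetical and chosen only to show the arithmetic):
-
-     /* little CPU: capacity_orig_of(cpu) == 512, SCHED_CAPACITY_SCALE == 1024 */
-     raw cpu_utilization = 300
-     normalised          = 300 * 1024 / 512 = 600
-
- so the value handed to set_cfs_cpu_capacity() is always on the common 0..1024
- scale expected by sched_capacity_reqs, regardless of the CPU's native capacity.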
- @@ -2938,16 +3351,9 @@
- static void sched_freq_tick(int cpu)
- {
- - unsigned long capacity_orig, capacity_curr;
- -
- if (!sched_freq())
- return;
- - capacity_orig = capacity_orig_of(cpu);
- - capacity_curr = capacity_curr_of(cpu);
- - if (capacity_curr == capacity_orig)
- - return;
- -
- _sched_freq_tick(cpu);
- }
- #else
- @@ -2968,11 +3374,11 @@
- raw_spin_lock(&rq->lock);
- walt_set_window_start(rq);
- + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- + walt_ktime_clock(), 0);
- update_rq_clock(rq);
- curr->sched_class->task_tick(rq, curr, 0);
- update_cpu_load_active(rq);
- - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- - walt_ktime_clock(), 0);
- calc_global_load_tick(rq);
- sched_freq_tick(cpu);
- raw_spin_unlock(&rq->lock);
- @@ -3115,25 +3521,23 @@
- if (task_stack_end_corrupted(prev))
- panic("corrupted stack end detected inside scheduler\n");
- #endif
- - /*
- - * Test if we are atomic. Since do_exit() needs to call into
- - * schedule() atomically, we ignore that path. Otherwise whine
- - * if we are scheduling when we should not.
- - */
- - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
- +
- + if (unlikely(in_atomic_preempt_off())) {
- __schedule_bug(prev);
- + preempt_count_set(PREEMPT_DISABLED);
- + }
- rcu_sleep_check();
- profile_hit(SCHED_PROFILING, __builtin_return_address(0));
- - schedstat_inc(this_rq(), sched_count);
- + schedstat_inc(this_rq()->sched_count);
- }
- /*
- * Pick up the highest-prio task:
- */
- static inline struct task_struct *
- -pick_next_task(struct rq *rq, struct task_struct *prev)
- +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- const struct sched_class *class = &fair_sched_class;
- struct task_struct *p;
- @@ -3144,20 +3548,21 @@
- */
- if (likely(prev->sched_class == class &&
- rq->nr_running == rq->cfs.h_nr_running)) {
- - p = fair_sched_class.pick_next_task(rq, prev);
- + p = fair_sched_class.pick_next_task(rq, prev, cookie);
- if (unlikely(p == RETRY_TASK))
- goto again;
- /* assumes fair_sched_class->next == idle_sched_class */
- if (unlikely(!p))
- - p = idle_sched_class.pick_next_task(rq, prev);
- + p = idle_sched_class.pick_next_task(rq, prev, cookie);
- - return p;
- + if (likely(p != RETRY_TASK))
- + return p;
- }
- again:
- for_each_class(class) {
- - p = class->pick_next_task(rq, prev);
- + p = class->pick_next_task(rq, prev, cookie);
- if (p) {
- if (unlikely(p == RETRY_TASK))
- goto again;
- @@ -3204,20 +3609,20 @@
- * - explicit schedule() call
- * - return from syscall or exception to user-space
- * - return from interrupt-handler to user-space
- + *
- + * WARNING: must be called with preemption disabled!
- */
- -static void __sched __schedule(void)
- +static void __sched notrace __schedule(bool preempt)
- {
- struct task_struct *prev, *next;
- unsigned long *switch_count;
- + struct pin_cookie cookie;
- struct rq *rq;
- int cpu;
- u64 wallclock;
- -need_resched:
- - preempt_disable();
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
- - rcu_note_context_switch(cpu);
- prev = rq->curr;
- schedule_debug(prev);
- @@ -3225,77 +3630,105 @@
- if (sched_feat(HRTICK))
- hrtick_clear(rq);
- + local_irq_disable();
- + rcu_note_context_switch();
- +
- /*
- * Make sure that signal_pending_state()->signal_pending() below
- * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
- * done by the caller to avoid the race with signal_wake_up().
- */
- smp_mb__before_spinlock();
- - raw_spin_lock_irq(&rq->lock);
- + raw_spin_lock(&rq->lock);
- + cookie = lockdep_pin_lock(&rq->lock);
- +
- + rq->clock_skip_update <<= 1; /* promote REQ to ACT */
- switch_count = &prev->nivcsw;
- - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- + if (!preempt && prev->state) {
- if (unlikely(signal_pending_state(prev->state, prev))) {
- prev->state = TASK_RUNNING;
- } else {
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
- prev->on_rq = 0;
- - /*
- - * If a worker went to sleep, notify and ask workqueue
- - * whether it wants to wake up a task to maintain
- - * concurrency.
- - */
- - if (prev->flags & PF_WQ_WORKER) {
- - struct task_struct *to_wakeup;
- -
- - to_wakeup = wq_worker_sleeping(prev, cpu);
- - if (to_wakeup)
- - try_to_wake_up_local(to_wakeup);
- - }
- }
- switch_count = &prev->nvcsw;
- }
- - if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
- + if (task_on_rq_queued(prev))
- update_rq_clock(rq);
- - next = pick_next_task(rq, prev);
- + next = pick_next_task(rq, prev, cookie);
- wallclock = walt_ktime_clock();
- walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
- walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
- clear_tsk_need_resched(prev);
- clear_preempt_need_resched();
- - rq->skip_clock_update = 0;
- + rq->clock_skip_update = 0;
- if (likely(prev != next)) {
- +#ifdef CONFIG_SCHED_WALT
- + if (!prev->on_rq)
- + prev->last_sleep_ts = wallclock;
- +#endif
- rq->nr_switches++;
- rq->curr = next;
- ++*switch_count;
- - context_switch(rq, prev, next); /* unlocks the rq */
- - /*
- - * The context switch have flipped the stack from under us
- - * and restored the local variables which were saved when
- - * this task called schedule() in the past. prev == current
- - * is still correct, but it can be moved to another cpu/rq.
- - */
- - cpu = smp_processor_id();
- - rq = cpu_rq(cpu);
- - } else
- + //trace_sched_switch(preempt, prev, next);
- + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
- + cpu = cpu_of(rq);
- + } else {
- + lockdep_unpin_lock(&rq->lock, cookie);
- raw_spin_unlock_irq(&rq->lock);
- + }
- balance_callback(rq);
- +}
- - sched_preempt_enable_no_resched();
- - if (need_resched())
- - goto need_resched;
- +void __noreturn do_task_dead(void)
- +{
- + /*
- + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
- + * when the following two conditions become true.
- + * - There is race condition of mmap_sem (It is acquired by
- + * exit_mm()), and
- + * - SMI occurs before setting TASK_RUNINNG.
- + * (or hypervisor of virtual machine switches to other guest)
- + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
- + *
- + * To avoid it, we have to wait for releasing tsk->pi_lock which
- + * is held by try_to_wake_up()
- + */
- + smp_mb();
- +	raw_spin_unlock_wait(&current->pi_lock);
- +
- + /* causes final put_task_struct in finish_task_switch(). */
- + __set_current_state(TASK_DEAD);
- + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
- + __schedule(false);
- + BUG();
- + /* Avoid "noreturn function does return". */
- + for (;;)
- + cpu_relax(); /* For when BUG is null */
- }
- static inline void sched_submit_work(struct task_struct *tsk)
- {
- - if (!tsk->state || tsk_is_pi_blocked(tsk))
- + if (!tsk->state)
- + return;
- + /*
- + * If a worker went to sleep, notify and ask workqueue whether
- + * it wants to wake up a task to maintain concurrency.
- + */
- + if (tsk->flags & PF_WQ_WORKER)
- + wq_worker_sleeping(tsk);
- +
- +
- + if (tsk_is_pi_blocked(tsk))
- return;
- +
- /*
- * If we are going to sleep and we have plugged IO queued,
- * make sure to submit it to avoid deadlocks.
- @@ -3304,12 +3737,23 @@
- blk_schedule_flush_plug(tsk);
- }
- +static void sched_update_worker(struct task_struct *tsk)
- +{
- + if (tsk->flags & PF_WQ_WORKER)
- + wq_worker_running(tsk);
- +}
- +
- asmlinkage __visible void __sched schedule(void)
- {
- struct task_struct *tsk = current;
- sched_submit_work(tsk);
- - __schedule();
- + do {
- + preempt_disable();
- + __schedule(false);
- + sched_preempt_enable_no_resched();
- + } while (need_resched());
- + sched_update_worker(tsk);
- }
- EXPORT_SYMBOL(schedule);
- @@ -3344,6 +3788,20 @@
- preempt_disable();
- }
- +static void preempt_schedule_common(void)
- +{
- + do {
- + preempt_disable_notrace();
- + __schedule(true);
- + preempt_enable_no_resched_notrace();
- +
- + /*
- + * Check again in case we missed a preemption opportunity
- + * between schedule and now.
- + */
- + } while (need_resched());
- +}
- +
- #ifdef CONFIG_PREEMPT
- /*
- * this is the entry point to schedule() from in-kernel preemption
- @@ -3359,24 +3817,13 @@
- if (likely(!preemptible()))
- return;
- - do {
- - __preempt_count_add(PREEMPT_ACTIVE);
- - __schedule();
- - __preempt_count_sub(PREEMPT_ACTIVE);
- -
- - /*
- - * Check again in case we missed a preemption opportunity
- - * between schedule and now.
- - */
- - barrier();
- - } while (need_resched());
- + preempt_schedule_common();
- }
- NOKPROBE_SYMBOL(preempt_schedule);
- EXPORT_SYMBOL(preempt_schedule);
- -#ifdef CONFIG_CONTEXT_TRACKING
- /**
- - * preempt_schedule_context - preempt_schedule called by tracing
- + * preempt_schedule_notrace - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- @@ -3389,7 +3836,7 @@
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
- -asmlinkage __visible void __sched notrace preempt_schedule_context(void)
- +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
- {
- enum ctx_state prev_ctx;
- @@ -3397,22 +3844,20 @@
- return;
- do {
- - __preempt_count_add(PREEMPT_ACTIVE);
- + preempt_disable_notrace();
- /*
- * Needs preempt disabled in case user_exit() is traced
- * and the tracer calls preempt_enable_notrace() causing
- * an infinite recursion.
- */
- prev_ctx = exception_enter();
- - __schedule();
- + __schedule(true);
- exception_exit(prev_ctx);
- - __preempt_count_sub(PREEMPT_ACTIVE);
- - barrier();
- + preempt_enable_no_resched_notrace();
- } while (need_resched());
- }
- -EXPORT_SYMBOL_GPL(preempt_schedule_context);
- -#endif /* CONFIG_CONTEXT_TRACKING */
- +EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
- #endif /* CONFIG_PREEMPT */
- @@ -3432,17 +3877,11 @@
- prev_state = exception_enter();
- do {
- - __preempt_count_add(PREEMPT_ACTIVE);
- + preempt_disable();
- local_irq_enable();
- - __schedule();
- + __schedule(true);
- local_irq_disable();
- - __preempt_count_sub(PREEMPT_ACTIVE);
- -
- - /*
- - * Check again in case we missed a preemption opportunity
- - * between schedule and now.
- - */
- - barrier();
- + sched_preempt_enable_no_resched();
- } while (need_resched());
- exception_exit(prev_state);
- @@ -3451,7 +3890,7 @@
- int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
- void *key)
- {
- - return try_to_wake_up(curr->private, mode, wake_flags);
- + return try_to_wake_up(curr->private, mode, wake_flags, 1);
- }
- EXPORT_SYMBOL(default_wake_function);
- @@ -3470,13 +3909,15 @@
- */
- void rt_mutex_setprio(struct task_struct *p, int prio)
- {
- - int oldprio, queued, running, enqueue_flag = 0;
- - struct rq *rq;
- + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
- const struct sched_class *prev_class;
- + struct rq_flags rf;
- + struct rq *rq;
- BUG_ON(prio > MAX_PRIO);
- - rq = __task_rq_lock(p);
- + rq = __task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- /*
- * Idle task boosting is a nono in general. There is one
- @@ -3498,11 +3939,15 @@
- trace_sched_pi_setprio(p, prio);
- oldprio = p->prio;
- +
- + if (oldprio == prio)
- + queue_flag &= ~DEQUEUE_MOVE;
- +
- prev_class = p->sched_class;
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- - dequeue_task(rq, p, 0);
- + dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
- @@ -3520,8 +3965,7 @@
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.dl_boosted = 1;
- - p->dl.dl_throttled = 0;
- - enqueue_flag = ENQUEUE_REPLENISH;
- + queue_flag |= ENQUEUE_REPLENISH;
- } else
- p->dl.dl_boosted = 0;
- p->sched_class = &dl_sched_class;
- @@ -3529,7 +3973,7 @@
- if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
- if (oldprio < prio)
- - enqueue_flag = ENQUEUE_HEAD;
- + queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
- } else {
- if (dl_prio(oldprio))
- @@ -3541,15 +3985,15 @@
- p->prio = prio;
- - if (running)
- - p->sched_class->set_curr_task(rq);
- if (queued)
- - enqueue_task(rq, p, enqueue_flag);
- + enqueue_task(rq, p, queue_flag);
- + if (running)
- + set_curr_task(rq, p);
- check_class_changed(rq, p, prev_class, oldprio);
- out_unlock:
- preempt_disable(); /* avoid rq from going away on us */
- - __task_rq_unlock(rq);
- + __task_rq_unlock(rq, &rf);
- balance_callback(rq);
- preempt_enable();
- @@ -3558,8 +4002,9 @@
- void set_user_nice(struct task_struct *p, long nice)
- {
- - int old_prio, delta, queued;
- - unsigned long flags;
- + bool queued, running;
- + int old_prio, delta;
- + struct rq_flags rf;
- struct rq *rq;
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
- @@ -3568,7 +4013,9 @@
- * We have to be careful, if called from sys_setpriority(),
- * the task might be in the middle of scheduling on another CPU.
- */
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- +
- /*
- * The RT priorities are set via sched_setscheduler(), but we still
- * allow the 'normal' nice value to be set - but as expected
- @@ -3580,8 +4027,11 @@
- goto out_unlock;
- }
- queued = task_on_rq_queued(p);
- + running = task_current(rq, p);
- if (queued)
- - dequeue_task(rq, p, 0);
- + dequeue_task(rq, p, DEQUEUE_SAVE);
- + if (running)
- + put_prev_task(rq, p);
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
- @@ -3590,7 +4040,7 @@
- delta = p->prio - old_prio;
- if (queued) {
- - enqueue_task(rq, p, 0);
- + enqueue_task(rq, p, ENQUEUE_RESTORE);
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- @@ -3598,8 +4048,10 @@
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_curr(rq);
- }
- + if (running)
- + set_curr_task(rq, p);
- out_unlock:
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- }
- EXPORT_SYMBOL(set_user_nice);
- @@ -3874,18 +4326,33 @@
- return match;
- }
- +static bool dl_param_changed(struct task_struct *p,
- + const struct sched_attr *attr)
- +{
- + struct sched_dl_entity *dl_se = &p->dl;
- +
- + if (dl_se->dl_runtime != attr->sched_runtime ||
- + dl_se->dl_deadline != attr->sched_deadline ||
- + dl_se->dl_period != attr->sched_period ||
- + dl_se->flags != attr->sched_flags)
- + return true;
- +
- + return false;
- +}
- +
- static int __sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- - bool user)
- + bool user, bool pi)
- {
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
- - unsigned long flags;
- const struct sched_class *prev_class;
- - struct rq *rq;
- + struct rq_flags rf;
- int reset_on_fork;
- + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
- + struct rq *rq;
- /* may grab non-irq protected spin_locks */
- BUG_ON(in_interrupt());
- @@ -3897,10 +4364,7 @@
- } else {
- reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
- - if (policy != SCHED_DEADLINE &&
- - policy != SCHED_FIFO && policy != SCHED_RR &&
- - policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- - policy != SCHED_IDLE)
- + if (!valid_policy(policy))
- return -EINVAL;
- }
- @@ -3956,7 +4420,7 @@
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
- + if (idle_policy(p->policy) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
- @@ -3983,13 +4447,14 @@
- * To be able to change p->policy safely, the appropriate
- * runqueue lock must be held.
- */
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- + update_rq_clock(rq);
- /*
- * Changing the policy of the stop threads its a very bad idea
- */
- if (p == rq->stop) {
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return -EINVAL;
- }
- @@ -4002,11 +4467,11 @@
- goto change;
- if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
- goto change;
- - if (dl_policy(policy))
- + if (dl_policy(policy) && dl_param_changed(p, attr))
- goto change;
- p->sched_reset_on_fork = reset_on_fork;
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return 0;
- }
- change:
- @@ -4020,7 +4485,7 @@
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
- !task_group_is_autogroup(task_group(p))) {
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return -EPERM;
- }
- #endif
- @@ -4035,7 +4500,7 @@
- */
- if (!cpumask_subset(span, &p->cpus_allowed) ||
- rq->rd->dl_bw.bw == 0) {
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return -EPERM;
- }
- }
- @@ -4045,7 +4510,7 @@
- /* recheck policy now with rq lock held */
- if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
- policy = oldpolicy = -1;
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- goto recheck;
- }
- @@ -4055,52 +4520,55 @@
- * is available.
- */
- if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- return -EBUSY;
- }
- p->sched_reset_on_fork = reset_on_fork;
- oldprio = p->prio;
- - /*
- - * Take priority boosted tasks into account. If the new
- - * effective priority is unchanged, we just store the new
- - * normal parameters and do not touch the scheduler class and
- - * the runqueue. This will be done when the task deboost
- - * itself.
- - */
- - new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- - if (new_effective_prio == oldprio) {
- - __setscheduler_params(p, attr);
- - task_rq_unlock(rq, p, &flags);
- - return 0;
- + if (pi) {
- + /*
- + * Take priority boosted tasks into account. If the new
- + * effective priority is unchanged, we just store the new
- + * normal parameters and do not touch the scheduler class and
- + * the runqueue. This will be done when the task deboost
- + * itself.
- + */
- + new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- + if (new_effective_prio == oldprio)
- + queue_flags &= ~DEQUEUE_MOVE;
- }
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- - dequeue_task(rq, p, 0);
- + dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
- prev_class = p->sched_class;
- - __setscheduler(rq, p, attr, true);
- + __setscheduler(rq, p, attr, pi);
- - if (running)
- - p->sched_class->set_curr_task(rq);
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- - enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
- + if (oldprio < p->prio)
- + queue_flags |= ENQUEUE_HEAD;
- +
- + enqueue_task(rq, p, queue_flags);
- }
- + if (running)
- + set_curr_task(rq, p);
- check_class_changed(rq, p, prev_class, oldprio);
- preempt_disable(); /* avoid rq from going away on us */
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- - rt_mutex_adjust_pi(p);
- + if (pi)
- + rt_mutex_adjust_pi(p);
- /*
- * Run balance callbacks after we've adjusted the PI chain.
- @@ -4127,7 +4595,7 @@
- attr.sched_policy = policy;
- }
- - return __sched_setscheduler(p, &attr, check);
- + return __sched_setscheduler(p, &attr, check, true);
- }
- /**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- @@ -4148,7 +4616,7 @@
- int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
- {
- - return __sched_setscheduler(p, attr, true);
- + return __sched_setscheduler(p, attr, true, true);
- }
- EXPORT_SYMBOL_GPL(sched_setattr);
- @@ -4170,6 +4638,7 @@
- {
- return _sched_setscheduler(p, policy, param, false);
- }
- +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
- static int
- do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
- @@ -4683,7 +5152,7 @@
- {
- struct rq *rq = this_rq_lock();
- - schedstat_inc(rq, yld_count);
- + schedstat_inc(rq->yld_count);
- current->sched_class->yield_task(rq);
- /*
- @@ -4700,22 +5169,17 @@
- return 0;
- }
- -static void __cond_resched(void)
- -{
- - __preempt_count_add(PREEMPT_ACTIVE);
- - __schedule();
- - __preempt_count_sub(PREEMPT_ACTIVE);
- -}
- -
- +#ifndef CONFIG_PREEMPT
- int __sched _cond_resched(void)
- {
- - if (should_resched()) {
- - __cond_resched();
- + if (should_resched(0)) {
- + preempt_schedule_common();
- return 1;
- }
- return 0;
- }
- EXPORT_SYMBOL(_cond_resched);
- +#endif
- /*
- * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
- @@ -4727,7 +5191,7 @@
- */
- int __cond_resched_lock(spinlock_t *lock)
- {
- - int resched = should_resched();
- + int resched = should_resched(PREEMPT_LOCK_OFFSET);
- int ret = 0;
- lockdep_assert_held(lock);
- @@ -4735,7 +5199,7 @@
- if (spin_needbreak(lock) || resched) {
- spin_unlock(lock);
- if (resched)
- - __cond_resched();
- + preempt_schedule_common();
- else
- cpu_relax();
- ret = 1;
- @@ -4749,9 +5213,9 @@
- {
- BUG_ON(!in_softirq());
- - if (should_resched()) {
- + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
- local_bh_enable();
- - __cond_resched();
- + preempt_schedule_common();
- local_bh_disable();
- return 1;
- }
- @@ -4841,7 +5305,7 @@
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
- if (yielded) {
- - schedstat_inc(rq, yld_count);
- + schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- @@ -4866,36 +5330,26 @@
- * This task is about to go to sleep on IO. Increment rq->nr_iowait so
- * that process accounting knows that this is a task in IO wait state.
- */
- -void __sched io_schedule(void)
- -{
- - struct rq *rq = raw_rq();
- -
- - delayacct_blkio_start();
- - atomic_inc(&rq->nr_iowait);
- - blk_flush_plug(current);
- - current->in_iowait = 1;
- - schedule();
- - current->in_iowait = 0;
- - atomic_dec(&rq->nr_iowait);
- - delayacct_blkio_end();
- -}
- -EXPORT_SYMBOL(io_schedule);
- -
- long __sched io_schedule_timeout(long timeout)
- {
- - struct rq *rq = raw_rq();
- + int old_iowait = current->in_iowait;
- + struct rq *rq;
- long ret;
- + current->in_iowait = 1;
- + blk_schedule_flush_plug(current);
- +
- delayacct_blkio_start();
- + rq = raw_rq();
- atomic_inc(&rq->nr_iowait);
- - blk_flush_plug(current);
- - current->in_iowait = 1;
- ret = schedule_timeout(timeout);
- - current->in_iowait = 0;
- + current->in_iowait = old_iowait;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
- +
- return ret;
- }
- +EXPORT_SYMBOL(io_schedule_timeout);
- /**
- * sys_sched_get_priority_max - return maximum RT priority.
- @@ -4966,10 +5420,10 @@
- {
- struct task_struct *p;
- unsigned int time_slice;
- - unsigned long flags;
- + struct rq_flags rf;
- + struct timespec t;
- struct rq *rq;
- int retval;
- - struct timespec t;
- if (pid < 0)
- return -EINVAL;
- @@ -4984,11 +5438,11 @@
- if (retval)
- goto out_unlock;
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- - task_rq_unlock(rq, p, &flags);
- + task_rq_unlock(rq, p, &rf);
- rcu_read_unlock();
- jiffies_to_timespec(time_slice, &t);
- @@ -5006,9 +5460,12 @@
- {
- unsigned long free = 0;
- int ppid;
- - unsigned state;
- + unsigned long state = p->state;
- - state = p->state ? __ffs(p->state) + 1 : 0;
- + if (!try_get_task_stack(p))
- + return;
- + if (state)
- + state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
- #if BITS_PER_LONG == 32
- @@ -5025,8 +5482,10 @@
- #ifdef CONFIG_DEBUG_STACK_USAGE
- free = stack_not_used(p);
- #endif
- + ppid = 0;
- rcu_read_lock();
- - ppid = task_pid_nr(rcu_dereference(p->real_parent));
- + if (pid_alive(p))
- + ppid = task_pid_nr(rcu_dereference(p->real_parent));
- rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
- @@ -5034,6 +5493,7 @@
- print_worker_info(KERN_INFO, p);
- show_stack(p, NULL);
- + put_task_stack(p);
- }
- void show_state_filter(unsigned long state_filter)
- @@ -5095,7 +5555,6 @@
- raw_spin_lock(&rq->lock);
- __sched_fork(0, idle);
- -
- idle->state = TASK_RUNNING;
- idle->se.exec_start = sched_clock();
- @@ -5163,26 +5622,26 @@
- */
- void sched_setnuma(struct task_struct *p, int nid)
- {
- - struct rq *rq;
- - unsigned long flags;
- bool queued, running;
- + struct rq_flags rf;
- + struct rq *rq;
- - rq = task_rq_lock(p, &flags);
- + rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- - dequeue_task(rq, p, 0);
- + dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
- p->numa_preferred_nid = nid;
- - if (running)
- - p->sched_class->set_curr_task(rq);
- if (queued)
- - enqueue_task(rq, p, 0);
- - task_rq_unlock(rq, p, &flags);
- + enqueue_task(rq, p, ENQUEUE_RESTORE);
- + if (running)
- + set_curr_task(rq, p);
- + task_rq_unlock(rq, p, &rf);
- }
- #endif /* CONFIG_NUMA_BALANCING */
- @@ -5242,10 +5701,11 @@
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
- -static void migrate_tasks(unsigned int dead_cpu)
- +static void migrate_tasks(struct rq *dead_rq)
- {
- - struct rq *rq = cpu_rq(dead_cpu);
- + struct rq *rq = dead_rq;
- struct task_struct *next, *stop = rq->stop;
- + struct pin_cookie cookie;
- int dest_cpu;
- /*
- @@ -5266,7 +5726,7 @@
- */
- update_rq_clock(rq);
- - for ( ; ; ) {
- + for (;;) {
- /*
- * There's this thread running, bail when that's the only
- * remaining thread.
- @@ -5274,17 +5734,48 @@
- if (rq->nr_running == 1)
- break;
- - next = pick_next_task(rq, &fake_task);
- + /*
- + * pick_next_task assumes pinned rq->lock.
- + */
- + cookie = lockdep_pin_lock(&rq->lock);
- + next = pick_next_task(rq, &fake_task, cookie);
- BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
- - /* Find suitable destination for @next, with force if needed. */
- - dest_cpu = select_fallback_rq(dead_cpu, next);
- + /*
- + * Rules for changing task_struct::cpus_allowed are holding
- + * both pi_lock and rq->lock, such that holding either
- + * stabilizes the mask.
- + *
- + * Drop rq->lock is not quite as disastrous as it usually is
- + * because !cpu_active at this point, which means load-balance
- + * will not interfere. Also, stop-machine.
- + */
- + lockdep_unpin_lock(&rq->lock, cookie);
- raw_spin_unlock(&rq->lock);
- + raw_spin_lock(&next->pi_lock);
- + raw_spin_lock(&rq->lock);
- +
- + /*
- + * Since we're inside stop-machine, _nothing_ should have
- + * changed the task, WARN if weird stuff happened, because in
- + * that case the above rq->lock drop is a fail too.
- + */
- + if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
- + raw_spin_unlock(&next->pi_lock);
- + continue;
- + }
- - __migrate_task(next, dead_cpu, dest_cpu);
- + /* Find suitable destination for @next, with force if needed. */
- + dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- - raw_spin_lock(&rq->lock);
- + rq = __migrate_task(rq, next, dest_cpu);
- + if (rq != dead_rq) {
- + raw_spin_unlock(&rq->lock);
- + rq = dead_rq;
- + raw_spin_lock(&rq->lock);
- + }
- + raw_spin_unlock(&next->pi_lock);
- }
- rq->stop = stop;
- @@ -5517,8 +6008,7 @@
- /* may be called multiple times per register */
- static void unregister_sched_domain_sysctl(void)
- {
- - if (sd_sysctl_header)
- - unregister_sysctl_table(sd_sysctl_header);
- + unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
- @@ -5603,7 +6093,7 @@
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- - migrate_tasks(cpu);
- + migrate_tasks(rq);
- BUG_ON(rq->nr_running != 1); /* the migration thread */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- break;
- @@ -5629,7 +6119,7 @@
- .priority = CPU_PRI_MIGRATION,
- };
- -static void __cpuinit set_cpu_rq_start_time(void)
- +static void set_cpu_rq_start_time(void)
- {
- int cpu = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
- @@ -5745,9 +6235,6 @@
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- - if (sd->parent)
- - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- - " has parent");
- return -1;
- }
- @@ -5840,8 +6327,12 @@
- static int sd_degenerate(struct sched_domain *sd)
- {
- - if (cpumask_weight(sched_domain_span(sd)) == 1)
- - return 1;
- + if (cpumask_weight(sched_domain_span(sd)) == 1) {
- + if (sd->groups->sge)
- + sd->flags &= ~SD_LOAD_BALANCE;
- + else
- + return 1;
- + }
- /* Following flags need at least 2 groups */
- if (sd->flags & (SD_LOAD_BALANCE |
- @@ -5849,6 +6340,7 @@
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- + SD_ASYM_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN |
- SD_SHARE_CAP_STATES)) {
- @@ -5880,11 +6372,16 @@
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- + SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN |
- SD_SHARE_CAP_STATES);
- + if (parent->groups->sge) {
- + parent->flags &= ~SD_LOAD_BALANCE;
- + return 0;
- + }
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
- @@ -5944,6 +6441,19 @@
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
- }
- +void sched_get_rd(struct root_domain *rd)
- +{
- + atomic_inc(&rd->refcount);
- +}
- +
- +void sched_put_rd(struct root_domain *rd)
- +{
- + if (!atomic_dec_and_test(&rd->refcount))
- + return;
- +
- + call_rcu_sched(&rd->rcu, free_rootdomain);
- +}
- +
- static int init_rootdomain(struct root_domain *rd)
- {
- memset(rd, 0, sizeof(*rd));
- @@ -5957,6 +6467,12 @@
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
- goto free_dlo_mask;
- +#ifdef HAVE_RT_PUSH_IPI
- + rd->rto_cpu = -1;
- + raw_spin_lock_init(&rd->rto_lock);
- + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
- +#endif
- +
- init_dl_bw(&rd->dl_bw);
- if (cpudl_init(&rd->cpudl) != 0)
- goto free_dlo_mask;
- @@ -5965,6 +6481,9 @@
- goto free_rto_mask;
- init_max_cpu_capacity(&rd->max_cpu_capacity);
- +
- + rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
- +
- return 0;
- free_rto_mask:
- @@ -6027,10 +6546,8 @@
- } while (sg != first);
- }
- -static void free_sched_domain(struct rcu_head *rcu)
- +static void destroy_sched_domain(struct sched_domain *sd)
- {
- - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- -
- /*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
- @@ -6041,18 +6558,26 @@
- kfree(sd->groups->sgc);
- kfree(sd->groups);
- }
- + if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
- + kfree(sd->shared);
- kfree(sd);
- }
- -static void destroy_sched_domain(struct sched_domain *sd, int cpu)
- +static void destroy_sched_domains_rcu(struct rcu_head *rcu)
- {
- - call_rcu(&sd->rcu, free_sched_domain);
- + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
- +
- + while (sd) {
- + struct sched_domain *parent = sd->parent;
- + destroy_sched_domain(sd);
- + sd = parent;
- + }
- }
- -static void destroy_sched_domains(struct sched_domain *sd, int cpu)
- +static void destroy_sched_domains(struct sched_domain *sd)
- {
- - for (; sd; sd = sd->parent)
- - destroy_sched_domain(sd, cpu);
- + if (sd)
- + call_rcu(&sd->rcu, destroy_sched_domains_rcu);
- }
- /*
- @@ -6067,16 +6592,17 @@
- DEFINE_PER_CPU(struct sched_domain *, sd_llc);
- DEFINE_PER_CPU(int, sd_llc_size);
- DEFINE_PER_CPU(int, sd_llc_id);
- +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
- DEFINE_PER_CPU(struct sched_domain *, sd_numa);
- -DEFINE_PER_CPU(struct sched_domain *, sd_busy);
- DEFINE_PER_CPU(struct sched_domain *, sd_asym);
- DEFINE_PER_CPU(struct sched_domain *, sd_ea);
- DEFINE_PER_CPU(struct sched_domain *, sd_scs);
- static void update_top_cache_domain(int cpu)
- {
- + struct sched_domain_shared *sds = NULL;
- struct sched_domain *sd;
- - struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
- + struct sched_domain *ea_sd = NULL;
- int id = cpu;
- int size = 1;
- @@ -6084,13 +6610,13 @@
- if (sd) {
- id = cpumask_first(sched_domain_span(sd));
- size = cpumask_weight(sched_domain_span(sd));
- - busy_sd = sd->parent; /* sd_busy */
- + sds = sd->shared;
- }
- - rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
- rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
- per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
- + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
- sd = lowest_flag_domain(cpu, SD_NUMA);
- rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
- @@ -6137,7 +6663,7 @@
- */
- if (parent->flags & SD_PREFER_SIBLING)
- tmp->flags |= SD_PREFER_SIBLING;
- - destroy_sched_domain(parent, cpu);
- + destroy_sched_domain(parent);
- } else
- tmp = tmp->parent;
- }
- @@ -6145,7 +6671,7 @@
- if (sd && sd_degenerate(sd)) {
- tmp = sd;
- sd = sd->parent;
- - destroy_sched_domain(tmp, cpu);
- + destroy_sched_domain(tmp);
- if (sd)
- sd->child = NULL;
- }
- @@ -6155,14 +6681,11 @@
- rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- - destroy_sched_domains(tmp, cpu);
- + destroy_sched_domains(tmp);
- update_top_cache_domain(cpu);
- }
- -/* cpus with isolated domains */
- -static cpumask_var_t cpu_isolated_map;
- -
- /* Setup the mask of cpus configured for isolated domains */
- static int __init isolated_cpu_setup(char *str)
- {
- @@ -6288,6 +6811,7 @@
- */
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
- + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
- /*
- * Make sure the first group of this domain contains the
- @@ -6413,7 +6937,6 @@
- return;
- update_group_capacity(sd, cpu);
- - atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
- }
- /*
- @@ -6476,28 +6999,6 @@
- sd->groups->sge = fn(cpu);
- }
- -#ifdef CONFIG_SCHED_DEBUG
- -void set_energy_aware()
- -{
- - sched_feat_set("ENERGY_AWARE");
- -}
- -void clear_energy_aware()
- -{
- - sched_feat_set("NO_ENERGY_AWARE");
- -}
- -#else
- -struct static_key __read_mostly __energy_aware = STATIC_KEY_INIT_FALSE;
- -
- -void set_energy_aware()
- -{
- - static_key_slow_inc(&__energy_aware);
- -}
- -void clear_energy_aware()
- -{
- - static_key_slow_dec(&__energy_aware);
- -}
- -#endif /* CONFIG_SCHED_DEBUG */
- -
- /*
- * Initializers for schedule domains
- * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
- @@ -6583,6 +7084,9 @@
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
- *per_cpu_ptr(sdd->sd, cpu) = NULL;
- + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
- + *per_cpu_ptr(sdd->sds, cpu) = NULL;
- +
- if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
- *per_cpu_ptr(sdd->sg, cpu) = NULL;
- @@ -6600,13 +7104,20 @@
- /*
- * SD_flags allowed in topology descriptions.
- *
- - * SD_SHARE_CPUCAPACITY - describes SMT topologies
- - * SD_SHARE_PKG_RESOURCES - describes shared caches
- - * SD_NUMA - describes NUMA topologies
- - * SD_SHARE_POWERDOMAIN - describes shared power domain
- - * SD_SHARE_CAP_STATES - describes shared capacity states
- + * These flags are purely descriptive of the topology and do not prescribe
- + * behaviour. Behaviour is artificial and mapped in the below sd_init()
- + * function:
- + *
- + * SD_SHARE_CPUCAPACITY - describes SMT topologies
- + * SD_SHARE_PKG_RESOURCES - describes shared caches
- + * SD_NUMA - describes NUMA topologies
- + * SD_SHARE_POWERDOMAIN - describes shared power domain
- + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
- + * SD_SHARE_CAP_STATES - describes shared capacity states
- + *
- + * Odd one out, which beside describing the topology has a quirk also
- + * prescribes the desired behaviour that goes along with it:
- *
- - * Odd one out:
- * SD_ASYM_PACKING - describes SMT quirks
- */
- #define TOPOLOGY_SD_FLAGS \
- @@ -6614,14 +7125,18 @@
- SD_SHARE_PKG_RESOURCES | \
- SD_NUMA | \
- SD_ASYM_PACKING | \
- + SD_ASYM_CPUCAPACITY | \
- SD_SHARE_POWERDOMAIN | \
- SD_SHARE_CAP_STATES)
- static struct sched_domain *
- -sd_init(struct sched_domain_topology_level *tl, int cpu)
- +sd_init(struct sched_domain_topology_level *tl,
- + const struct cpumask *cpu_map,
- + struct sched_domain *child, int cpu)
- {
- - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- - int sd_weight, sd_flags = 0;
- + struct sd_data *sdd = &tl->data;
- + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
- + int sd_id, sd_weight, sd_flags = 0;
- #ifdef CONFIG_NUMA
- /*
- @@ -6670,15 +7185,26 @@
- .smt_gain = 0,
- .max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
- + .child = child,
- #ifdef CONFIG_SCHED_DEBUG
- .name = tl->name,
- #endif
- };
- + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- + sd_id = cpumask_first(sched_domain_span(sd));
- +
- /*
- * Convert topological properties into behaviour.
- */
- + if (sd->flags & SD_ASYM_CPUCAPACITY) {
- + struct sched_domain *t = sd;
- +
- + for_each_lower_domain(t)
- + t->flags |= SD_BALANCE_WAKE;
- + }
- +
- if (sd->flags & SD_SHARE_CPUCAPACITY) {
- sd->flags |= SD_PREFER_SIBLING;
- sd->imbalance_pct = 110;
- @@ -6710,7 +7236,17 @@
- sd->idle_idx = 1;
- }
- - sd->private = &tl->data;
- + /*
- + * For all levels sharing cache; connect a sched_domain_shared
- + * instance.
- + */
- + if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- + sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
- + atomic_inc(&sd->shared->ref);
- + atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
- + }
- +
- + sd->private = sdd;
- return sd;
- }
- @@ -6729,7 +7265,8 @@
- { NULL, },
- };
- -struct sched_domain_topology_level *sched_domain_topology = default_topology;
- +static struct sched_domain_topology_level *sched_domain_topology =
- + default_topology;
- #define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->mask; tl++)
- @@ -6992,6 +7529,10 @@
- if (!sdd->sd)
- return -ENOMEM;
- + sdd->sds = alloc_percpu(struct sched_domain_shared *);
- + if (!sdd->sds)
- + return -ENOMEM;
- +
- sdd->sg = alloc_percpu(struct sched_group *);
- if (!sdd->sg)
- return -ENOMEM;
- @@ -7002,6 +7543,7 @@
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
- + struct sched_domain_shared *sds;
- struct sched_group *sg;
- struct sched_group_capacity *sgc;
- @@ -7012,6 +7554,13 @@
- *per_cpu_ptr(sdd->sd, j) = sd;
- + sds = kzalloc_node(sizeof(struct sched_domain_shared),
- + GFP_KERNEL, cpu_to_node(j));
- + if (!sds)
- + return -ENOMEM;
- +
- + *per_cpu_ptr(sdd->sds, j) = sds;
- +
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sg)
- @@ -7051,6 +7600,8 @@
- kfree(*per_cpu_ptr(sdd->sd, j));
- }
- + if (sdd->sds)
- + kfree(*per_cpu_ptr(sdd->sds, j));
- if (sdd->sg)
- kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgc)
- @@ -7058,6 +7609,8 @@
- }
- free_percpu(sdd->sd);
- sdd->sd = NULL;
- + free_percpu(sdd->sds);
- + sdd->sds = NULL;
- free_percpu(sdd->sg);
- sdd->sg = NULL;
- free_percpu(sdd->sgc);
- @@ -7069,16 +7622,12 @@
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int cpu)
- {
- - struct sched_domain *sd = sd_init(tl, cpu);
- - if (!sd)
- - return child;
- + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
- - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- if (child) {
- sd->level = child->level + 1;
- sched_domain_level_max = max(sched_domain_level_max, sd->level);
- child->parent = sd;
- - sd->child = child;
- if (!cpumask_subset(sched_domain_span(child),
- sched_domain_span(sd))) {
- @@ -7109,7 +7658,6 @@
- enum s_alloc alloc_state;
- struct sched_domain *sd;
- struct s_data d;
- - struct rq *rq = NULL;
- int i, ret = -ENOMEM;
- alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
- @@ -7127,8 +7675,6 @@
- *per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
- sd->flags |= SD_OVERLAP;
- - if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- - break;
- }
- }
- @@ -7163,8 +7709,19 @@
- /* Attach the domains */
- rcu_read_lock();
- for_each_cpu(i, cpu_map) {
- - rq = cpu_rq(i);
- + int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
- + int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
- +
- + if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
- + cpu_rq(max_cpu)->cpu_capacity_orig))
- + WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
- +
- + if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
- + cpu_rq(min_cpu)->cpu_capacity_orig))
- + WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
- +
- sd = *per_cpu_ptr(d.sd, i);
- +
- cpu_attach_domain(sd, d.rd, i);
- }
- rcu_read_unlock();
- @@ -7385,17 +7942,16 @@
- * operation in the resume sequence, just build a single sched
- * domain, ignoring cpusets.
- */
- - num_cpus_frozen--;
- - if (likely(num_cpus_frozen)) {
- - partition_sched_domains(1, NULL, NULL);
- + partition_sched_domains(1, NULL, NULL);
- + if (--num_cpus_frozen)
- break;
- - }
- /*
- * This is the last CPU online operation. So fall through and
- * restore the original sched domains by considering the
- * cpuset configurations.
- */
- + cpuset_force_rebuild();
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- @@ -7428,7 +7984,6 @@
- {
- cpumask_var_t non_isolated_cpus;
- - walt_init_cpu_efficiency();
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- @@ -7490,6 +8045,7 @@
- #endif
- DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
- +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
- void __init sched_init(void)
- {
- @@ -7528,6 +8084,8 @@
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (void *)ptr;
- ptr += cpumask_size();
- + per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
- + cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
- #endif /* CONFIG_CPUMASK_OFFSTACK */
- }
- @@ -7553,6 +8111,7 @@
- INIT_LIST_HEAD(&root_task_group.children);
- INIT_LIST_HEAD(&root_task_group.siblings);
- autogroup_init(&init_task);
- +
- #endif /* CONFIG_CGROUP_SCHED */
- for_each_possible_cpu(i) {
- @@ -7564,11 +8123,12 @@
- rq->calc_load_active = 0;
- rq->calc_load_update = jiffies + LOAD_FREQ;
- init_cfs_rq(&rq->cfs);
- - init_rt_rq(&rq->rt, rq);
- - init_dl_rq(&rq->dl, rq);
- + init_rt_rq(&rq->rt);
- + init_dl_rq(&rq->dl);
- #ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
- + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- /*
- * How much cpu bandwidth does root_task_group get?
- *
- @@ -7610,6 +8170,7 @@
- rq->active_balance = 0;
- rq->next_balance = jiffies;
- rq->push_cpu = 0;
- + rq->push_task = NULL;
- rq->cpu = i;
- rq->online = 0;
- rq->idle_stamp = 0;
- @@ -7695,15 +8256,34 @@
- void __might_sleep(const char *file, int line, int preempt_offset)
- {
- + /*
- + * Blocking primitives will set (and therefore destroy) current->state,
- + * since we will exit with TASK_RUNNING make sure we enter with it,
- + * otherwise we will destroy state.
- + */
- + if (WARN_ONCE(current->state != TASK_RUNNING,
- + "do not call blocking ops when !TASK_RUNNING; "
- + "state=%lx set at [<%p>] %pS\n",
- + current->state,
- + (void *)current->task_state_change,
- + (void *)current->task_state_change))
- + __set_current_state(TASK_RUNNING);
- +
- + ___might_sleep(file, line, preempt_offset);
- +}
- +EXPORT_SYMBOL(__might_sleep);
- +
- +void ___might_sleep(const char *file, int line, int preempt_offset)
- +{
- static unsigned long prev_jiffy; /* ratelimiting */
- rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- - !is_idle_task(current)) || oops_in_progress)
- - return;
- - if (system_state != SYSTEM_RUNNING &&
- - (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
- + !is_idle_task(current)) ||
- + system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
- + oops_in_progress)
- return;
- +
- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
- return;
- prev_jiffy = jiffies;
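- The WARN_ONCE added to __might_sleep() above targets the classic bug of calling a
- blocking primitive after the task has already published a sleeping state. A
- minimal sketch of the pattern it catches (condition_ready() and some_lock are
- hypothetical names):
-
-     set_current_state(TASK_INTERRUPTIBLE);
-     if (!condition_ready())
-         mutex_lock(&some_lock);   /* may sleep and clobber TASK_INTERRUPTIBLE,
-                                      so the later schedule() might not sleep at all */
-     schedule();
-
- On a hit the task state is forced back to TASK_RUNNING so the box keeps running
- after the one-time report.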
- @@ -7716,6 +8296,9 @@
- in_atomic(), irqs_disabled(),
- current->pid, current->comm);
- + if (task_stack_end_corrupted(current))
- + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
- +
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
- @@ -7728,36 +8311,16 @@
- #endif
- dump_stack();
- }
- -EXPORT_SYMBOL(__might_sleep);
- +EXPORT_SYMBOL(___might_sleep);
- #endif
- #ifdef CONFIG_MAGIC_SYSRQ
- -static void normalize_task(struct rq *rq, struct task_struct *p)
- +void normalize_rt_tasks(void)
- {
- - const struct sched_class *prev_class = p->sched_class;
- + struct task_struct *g, *p;
- struct sched_attr attr = {
- .sched_policy = SCHED_NORMAL,
- };
- - int old_prio = p->prio;
- - int queued;
- -
- - queued = task_on_rq_queued(p);
- - if (queued)
- - dequeue_task(rq, p, 0);
- - __setscheduler(rq, p, &attr, false);
- - if (queued) {
- - enqueue_task(rq, p, 0);
- - resched_curr(rq);
- - }
- -
- - check_class_changed(rq, p, prev_class, old_prio);
- -}
- -
- -void normalize_rt_tasks(void)
- -{
- - struct task_struct *g, *p;
- - unsigned long flags;
- - struct rq *rq;
- read_lock(&tasklist_lock);
- for_each_process_thread(g, p) {
- @@ -7767,12 +8330,10 @@
- if (p->flags & PF_KTHREAD)
- continue;
- - p->se.exec_start = 0;
- -#ifdef CONFIG_SCHEDSTATS
- - p->se.statistics.wait_start = 0;
- - p->se.statistics.sleep_start = 0;
- - p->se.statistics.block_start = 0;
- -#endif
- + p->se.exec_start = 0;
- + schedstat_set(p->se.statistics.wait_start, 0);
- + schedstat_set(p->se.statistics.sleep_start, 0);
- + schedstat_set(p->se.statistics.block_start, 0);
- if (!dl_task(p) && !rt_task(p)) {
- /*
- @@ -7784,9 +8345,7 @@
- continue;
- }
- - rq = task_rq_lock(p, &flags);
- - normalize_task(rq, p);
- - task_rq_unlock(rq, p, &flags);
- + __sched_setscheduler(p, &attr, false, false);
- }
- read_unlock(&tasklist_lock);
- }
- @@ -7920,27 +8479,9 @@
- spin_unlock_irqrestore(&task_group_lock, flags);
- }
- -/* change task's runqueue when it moves between groups.
- - * The caller of this function should have put the task in its new group
- - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- - * reflect its new group.
- - */
- -void sched_move_task(struct task_struct *tsk)
- +static void sched_change_group(struct task_struct *tsk, int type)
- {
- struct task_group *tg;
- - int queued, running;
- - unsigned long flags;
- - struct rq *rq;
- -
- - rq = task_rq_lock(tsk, &flags);
- -
- - running = task_current(rq, tsk);
- - queued = task_on_rq_queued(tsk);
- -
- - if (queued)
- - dequeue_task(rq, tsk, 0);
- - if (unlikely(running))
- - put_prev_task(rq, tsk);
- /*
- * All callers are synchronized by task_rq_lock(); we do not use RCU
- @@ -7953,18 +8494,45 @@
- tsk->sched_task_group = tg;
- #ifdef CONFIG_FAIR_GROUP_SCHED
- - if (tsk->sched_class->task_move_group)
- - tsk->sched_class->task_move_group(tsk);
- + if (tsk->sched_class->task_change_group)
- + tsk->sched_class->task_change_group(tsk, type);
- else
- #endif
- set_task_rq(tsk, task_cpu(tsk));
- +}
- +
- +/*
- + * Change task's runqueue when it moves between groups.
- + *
- + * The caller of this function should have put the task in its new group by
- + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
- + * its new group.
- + */
- +void sched_move_task(struct task_struct *tsk)
- +{
- + int queued, running;
- + struct rq_flags rf;
- + struct rq *rq;
- + rq = task_rq_lock(tsk, &rf);
- + update_rq_clock(rq);
- +
- + running = task_current(rq, tsk);
- + queued = task_on_rq_queued(tsk);
- +
- + if (queued)
- + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- - tsk->sched_class->set_curr_task(rq);
- + put_prev_task(rq, tsk);
- +
- + sched_change_group(tsk, TASK_MOVE_GROUP);
- +
- if (queued)
- - enqueue_task(rq, tsk, 0);
- + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
- + if (unlikely(running))
- + set_curr_task(rq, tsk);
- - task_rq_unlock(rq, tsk, &flags);
- + task_rq_unlock(rq, tsk, &rf);
- }
- #endif /* CONFIG_CGROUP_SCHED */
- @@ -8077,6 +8645,17 @@
- {
- int i, err = 0;
- + /*
- + * Disallowing the root group RT runtime is BAD, it would disallow the
- + * kernel creating (and or operating) RT threads.
- + */
- + if (tg == &root_task_group && rt_runtime == 0)
- + return -EINVAL;
- +
- + /* No period doesn't make any sense. */
- + if (rt_period == 0)
- + return -EINVAL;
- +
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- err = __rt_schedulable(tg, rt_period, rt_runtime);
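- The two new checks guard the group RT throttle, which is simply the ratio of the
- two knobs. A worked example with the mainline defaults (treat the exact default
- values as an assumption for this tree):
-
-     cap = rt_runtime / rt_period = 950000us / 1000000us = 0.95   /* ~95% of each period */
-     rt_runtime == 0  ->  cap = 0%   (would starve the kernel's own RT threads)
-     rt_period  == 0  ->  cap undefined
-
- which is why a zero runtime on the root group and a zero period on any group now
- fail with -EINVAL before any bandwidth recomputation happens.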
- @@ -8126,16 +8705,13 @@
- return rt_runtime_us;
- }
- -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
- +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
- {
- u64 rt_runtime, rt_period;
- - rt_period = (u64)rt_period_us * NSEC_PER_USEC;
- + rt_period = rt_period_us * NSEC_PER_USEC;
- rt_runtime = tg->rt_bandwidth.rt_runtime;
- - if (rt_period == 0)
- - return -EINVAL;
- -
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
- }
- @@ -8382,9 +8958,21 @@
- sched_offline_group(tg);
- }
- +/*
- + * This is called before wake_up_new_task(), therefore we really only
- + * have to set its group bits, all the other stuff does not apply.
- + */
- static void cpu_cgroup_fork(struct task_struct *task)
- {
- - sched_move_task(task);
- + struct rq_flags rf;
- + struct rq *rq;
- +
- + rq = task_rq_lock(task, &rf);
- +
- + update_rq_clock(rq);
- + sched_change_group(task, TASK_SET_GROUP);
- +
- + task_rq_unlock(rq, task, &rf);
- }
- static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
- @@ -8765,3 +9353,44 @@
- pr_info("Task dump for CPU %d:\n", cpu);
- sched_show_task(cpu_curr(cpu));
- }
- +
- +/*
- + * Nice levels are multiplicative, with a gentle 10% change for every
- + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- + * nice 1, it will get ~10% less CPU time than another CPU-bound task
- + * that remained on nice 0.
- + *
- + * The "10% effect" is relative and cumulative: from _any_ nice level,
- + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- + * If a task goes up by ~10% and another task goes down by ~10% then
- + * the relative distance between them is ~25%.)
- + */
- +const int sched_prio_to_weight[40] = {
- + /* -20 */ 88761, 71755, 56483, 46273, 36291,
- + /* -15 */ 29154, 23254, 18705, 14949, 11916,
- + /* -10 */ 9548, 7620, 6100, 4904, 3906,
- + /* -5 */ 3121, 2501, 1991, 1586, 1277,
- + /* 0 */ 1024, 820, 655, 526, 423,
- + /* 5 */ 335, 272, 215, 172, 137,
- + /* 10 */ 110, 87, 70, 56, 45,
- + /* 15 */ 36, 29, 23, 18, 15,
- +};
- +
- +/*
- + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
- + *
- + * In cases where the weight does not change often, we can use the
- + * precalculated inverse to speed up arithmetics by turning divisions
- + * into multiplications:
- + */
- +const u32 sched_prio_to_wmult[40] = {
- + /* -20 */ 48388, 59856, 76040, 92818, 118348,
- + /* -15 */ 147320, 184698, 229616, 287308, 360437,
- + /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
- +};
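- A short worked example of how the two tables are meant to be read (the weights
- are taken directly from the arrays above; the closing formula follows the usual
- CFS delta scaling and should be read as a sketch, not a quote of this tree):
-
-     /* two always-runnable tasks sharing one CPU */
-     share(nice 0) = 1024 / (1024 + 820) ~= 55.5%
-     share(nice 1) =  820 / (1024 + 820) ~= 44.5%   /* the documented ~10% step,
-                                                       ~25% relative distance */
-
-     /* sched_prio_to_wmult[] holds 2^32 / weight, so dividing by a weight becomes */
-     delta_vruntime ~= (delta_exec * NICE_0_LOAD * sched_prio_to_wmult[nice + 20]) >> 32
-
- which is the "divisions into multiplications" trick the comment above refers to.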
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpudeadline.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.c
- --- /home/ninez/android/marlin/kernel/sched/cpudeadline.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.c 2018-08-11 23:57:17.128607487 -0400
- @@ -31,11 +31,6 @@
- return (i << 1) + 2;
- }
- -static inline int dl_time_before(u64 a, u64 b)
- -{
- - return (s64)(a - b) < 0;
- -}
- -
- static void cpudl_exchange(struct cpudl *cp, int a, int b)
- {
- int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
- @@ -107,7 +102,9 @@
- int best_cpu = -1;
- const struct sched_dl_entity *dl_se = &p->dl;
- - if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
- + if (later_mask &&
- + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed) &&
- + cpumask_and(later_mask, later_mask, cpu_active_mask)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
- @@ -186,6 +183,26 @@
- }
- /*
- + * cpudl_set_freecpu - Set the cpudl.free_cpus
- + * @cp: the cpudl max-heap context
- + * @cpu: rd attached cpu
- + */
- +void cpudl_set_freecpu(struct cpudl *cp, int cpu)
- +{
- + cpumask_set_cpu(cpu, cp->free_cpus);
- +}
- +
- +/*
- + * cpudl_clear_freecpu - Clear the cpudl.free_cpus
- + * @cp: the cpudl max-heap context
- + * @cpu: rd attached cpu
- + */
- +void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
- +{
- + cpumask_clear_cpu(cpu, cp->free_cpus);
- +}
- +
- +/*
- * cpudl_init - initialize the cpudl structure
- * @cp: the cpudl max-heap context
- */
- @@ -203,7 +220,7 @@
- if (!cp->elements)
- return -ENOMEM;
- - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
- + if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
- kfree(cp->elements);
- return -ENOMEM;
- }
- @@ -211,8 +228,6 @@
- for_each_possible_cpu(i)
- cp->elements[i].idx = IDX_INVALID;
- - cpumask_setall(cp->free_cpus);
- -
- return 0;
- }
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpudeadline.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.h
- --- /home/ninez/android/marlin/kernel/sched/cpudeadline.h 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpudeadline.h 2018-08-11 23:57:17.128607487 -0400
- @@ -2,6 +2,7 @@
- #define _LINUX_CPUDL_H
- #include <linux/sched.h>
- +#include <linux/sched/deadline.h>
- #define IDX_INVALID -1
- @@ -24,6 +25,8 @@
- struct cpumask *later_mask);
- void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
- int cpudl_init(struct cpudl *cp);
- +void cpudl_set_freecpu(struct cpudl *cp, int cpu);
- +void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
- void cpudl_cleanup(struct cpudl *cp);
- #else
- #define cpudl_set(cp, cpu, dl) do { } while (0)
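- The two new free_cpus helpers are only declared here; their intended consumers are
- the deadline class's root-domain online/offline callbacks. A reduced sketch of how
- mainline wires them up (these call sites are an assumption and are not part of the
- hunks in this diff):
-
-     static void rq_online_dl(struct rq *rq)
-     {
-             cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);    /* CPU joins the free set */
-     }
-
-     static void rq_offline_dl(struct rq *rq)
-     {
-             cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);  /* CPU leaves the free set */
-     }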
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq.c
- --- /home/ninez/android/marlin/kernel/sched/cpufreq.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq.c 2018-08-11 23:57:17.128607487 -0400
- @@ -0,0 +1,63 @@
- +/*
- + * Scheduler code and data structures related to cpufreq.
- + *
- + * Copyright (C) 2016, Intel Corporation
- + * Author: Rafael J. Wysocki <[email protected]>
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License version 2 as
- + * published by the Free Software Foundation.
- + */
- +
- +#include "sched.h"
- +
- +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
- +
- +/**
- + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
- + * @cpu: The CPU to set the pointer for.
- + * @data: New pointer value.
- + * @func: Callback function to set for the CPU.
- + *
- + * Set and publish the update_util_data pointer for the given CPU.
- + *
- + * The update_util_data pointer of @cpu is set to @data and the callback
- + * function pointer in the target struct update_util_data is set to @func.
- + * That function will be called by cpufreq_update_util() from RCU-sched
- + * read-side critical sections, so it must not sleep. @data will always be
- + * passed to it as the first argument which allows the function to get to the
- + * target update_util_data structure and its container.
- + *
- + * The update_util_data pointer of @cpu must be NULL when this function is
- + * called or it will WARN() and return with no effect.
- + */
- +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
- + void (*func)(struct update_util_data *data, u64 time,
- + unsigned int flags))
- +{
- + if (WARN_ON(!data || !func))
- + return;
- +
- + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
- + return;
- +
- + data->func = func;
- + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
- +}
- +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
- +
- +/**
- + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
- + * @cpu: The CPU to clear the pointer for.
- + *
- + * Clear the update_util_data pointer for the given CPU.
- + *
- + * Callers must use RCU-sched callbacks to free any memory that might be
- + * accessed via the old update_util_data pointer or invoke synchronize_sched()
- + * right after this function to avoid use-after-free.
- + */
- +void cpufreq_remove_update_util_hook(int cpu)
- +{
- + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
- +}
- +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
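- The two exported helpers above are the whole consumer-facing API of this new file: a governor publishes one callback per CPU and later tears it down. A minimal sketch of such a consumer, with illustrative names (my_gov_data, my_update_util) that are not part of the patch:
- struct my_gov_data {
-         struct update_util_data update_util;
-         unsigned int last_freq;
- };
- 
- static DEFINE_PER_CPU(struct my_gov_data, my_gov_data);
- 
- /* Runs in scheduler context under RCU-sched: must not sleep. */
- static void my_update_util(struct update_util_data *data, u64 time,
-                            unsigned int flags)
- {
-         struct my_gov_data *gd = container_of(data, struct my_gov_data,
-                                               update_util);
-         (void)gd;       /* compute and request a frequency here */
- }
- 
- static void my_gov_start_cpu(int cpu)
- {
-         cpufreq_add_update_util_hook(cpu, &per_cpu(my_gov_data, cpu).update_util,
-                                      my_update_util);
- }
- 
- static void my_gov_stop_cpu(int cpu)
- {
-         cpufreq_remove_update_util_hook(cpu);
-         synchronize_sched();    /* flush in-flight callbacks before freeing */
- }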
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq_sched.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_sched.c
- --- /home/ninez/android/marlin/kernel/sched/cpufreq_sched.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_sched.c 2018-08-11 23:57:17.128607487 -0400
- @@ -32,6 +32,12 @@
- static DEFINE_PER_CPU(unsigned long, enabled);
- DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
- +struct gov_tunables {
- + struct gov_attr_set attr_set;
- + unsigned int up_throttle_nsec;
- + unsigned int down_throttle_nsec;
- +};
- +
- /**
- * gov_data - per-policy data internal to the governor
- * @up_throttle: next throttling period expiry if increasing OPP
- @@ -53,8 +59,8 @@
- struct gov_data {
- ktime_t up_throttle;
- ktime_t down_throttle;
- - unsigned int up_throttle_nsec;
- - unsigned int down_throttle_nsec;
- + struct gov_tunables *tunables;
- + struct list_head tunables_hook;
- struct task_struct *task;
- struct irq_work irq_work;
- unsigned int requested_freq;
- @@ -71,8 +77,10 @@
- __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
- - gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
- - gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
- + gd->up_throttle = ktime_add_ns(ktime_get(),
- + gd->tunables->up_throttle_nsec);
- + gd->down_throttle = ktime_add_ns(ktime_get(),
- + gd->tunables->down_throttle_nsec);
- up_write(&policy->rwsem);
- }
- @@ -194,7 +202,7 @@
- }
- /* Convert the new maximum capacity request into a cpu frequency */
- - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
- + freq_new = capacity * policy->cpuinfo.max_freq >> SCHED_CAPACITY_SHIFT;
- if (cpufreq_frequency_table_target(policy, policy->freq_table,
- freq_new, CPUFREQ_RELATION_L,
- &index_new))
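- The one-line change above computes the target frequency from the hardware maximum (policy->cpuinfo.max_freq) instead of the possibly user-clamped policy->max, so a temporary policy cap no longer skews the capacity-to-frequency conversion. A small worked example with illustrative numbers (fmax = 2150400 kHz, capacity 512 out of 1024):
- /* Illustration only: capacity is on the 0..1024 SCHED_CAPACITY scale,
-  * frequencies are in kHz. */
- static unsigned int capacity_to_freq(unsigned long capacity,
-                                      unsigned int cpuinfo_max_freq)
- {
-         return capacity * cpuinfo_max_freq >> 10;       /* SCHED_CAPACITY_SHIFT */
- }
- 
- /* capacity_to_freq(512, 2150400) == 1075200, i.e. half of fmax, even if
-  * policy->max is temporarily clamped below that. */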
- @@ -227,6 +235,18 @@
- cpufreq_cpu_put(policy);
- }
- +#ifdef CONFIG_SCHED_WALT
- +static inline unsigned long
- +requested_capacity(struct sched_capacity_reqs *scr)
- +{
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
- + return scr->cfs;
- + return scr->cfs + scr->rt;
- +}
- +#else
- +#define requested_capacity(scr) (scr->cfs + scr->rt)
- +#endif
- +
- void update_cpu_capacity_request(int cpu, bool request)
- {
- unsigned long new_capacity;
- @@ -237,25 +257,10 @@
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- -#ifdef CONFIG_SCHED_WALT
- - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
- - /*
- - * Same WALT signal is set at different places, take the max
- - * reported utilization
- - */
- - new_capacity = max(scr->cfs, scr->rt);
- - new_capacity = max(new_capacity, scr->dl);
- - } else {
- - /*
- - * For PELT, utilization is aggregated
- - */
- - new_capacity = scr->cfs + scr->rt + scr->dl;
- - }
- -#else
- - new_capacity = scr->cfs + scr->rt + scr->dl;
- -#endif
- + new_capacity = requested_capacity(scr);
- new_capacity = new_capacity * capacity_margin
- / SCHED_CAPACITY_SCALE;
- + new_capacity += scr->dl;
- if (new_capacity == scr->total)
- return;
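- With requested_capacity() factored out, the update path above applies the capacity_margin headroom only to the CFS/RT portion and then adds the deadline reservation on top. A worked example with assumed values (capacity_margin = 1280, i.e. 1.25x headroom):
- /*
-  * requested_capacity(scr) = scr->cfs + scr->rt = 300 + 100 = 400
-  * new_capacity = 400 * 1280 / 1024 = 500
-  * new_capacity += scr->dl (say 60) -> 560
-  *
-  * The deadline contribution is a hard reservation, so it is added after
-  * the margin rather than being inflated by it.
-  */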
- @@ -277,12 +282,70 @@
- static_key_slow_dec(&__sched_freq);
- }
- -static struct attribute_group sched_attr_group_gov_pol;
- -static struct attribute_group *get_sysfs_attr(void)
- +/* Tunables */
- +static struct gov_tunables *global_tunables;
- +
- +static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set)
- +{
- + return container_of(attr_set, struct gov_tunables, attr_set);
- +}
- +
- +static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
- +{
- + struct gov_tunables *tunables = to_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->up_throttle_nsec);
- +}
- +
- +static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- {
- - return &sched_attr_group_gov_pol;
- + struct gov_tunables *tunables = to_tunables(attr_set);
- + int ret;
- + long unsigned int val;
- +
- + ret = kstrtoul(buf, 0, &val);
- + if (ret < 0)
- + return ret;
- + tunables->up_throttle_nsec = val;
- + return count;
- }
- +static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
- +{
- + struct gov_tunables *tunables = to_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->down_throttle_nsec);
- +}
- +
- +static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- +{
- + struct gov_tunables *tunables = to_tunables(attr_set);
- + int ret;
- + long unsigned int val;
- +
- + ret = kstrtoul(buf, 0, &val);
- + if (ret < 0)
- + return ret;
- + tunables->down_throttle_nsec = val;
- + return count;
- +}
- +
- +static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec);
- +static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec);
- +
- +static struct attribute *schedfreq_attributes[] = {
- + &up_throttle_nsec.attr,
- + &down_throttle_nsec.attr,
- + NULL
- +};
- +
- +static struct kobj_type tunables_ktype = {
- + .default_attrs = schedfreq_attributes,
- + .sysfs_ops = &governor_sysfs_ops,
- +};
- +
- static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
- {
- struct gov_data *gd;
- @@ -297,20 +360,40 @@
- if (!gd)
- return -ENOMEM;
- - gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
- - policy->cpuinfo.transition_latency :
- - THROTTLE_UP_NSEC;
- - gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
- - pr_debug("%s: throttle threshold = %u [ns]\n",
- - __func__, gd->up_throttle_nsec);
- -
- policy->governor_data = gd;
- - rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
- - if (rc) {
- - pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
- - goto err;
- - }
- + if (!global_tunables) {
- + gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
- + if (!gd->tunables)
- + goto free_gd;
- +
- + gd->tunables->up_throttle_nsec =
- + policy->cpuinfo.transition_latency ?
- + policy->cpuinfo.transition_latency :
- + THROTTLE_UP_NSEC;
- + gd->tunables->down_throttle_nsec =
- + THROTTLE_DOWN_NSEC;
- +
- + rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,
- + &tunables_ktype,
- + get_governor_parent_kobj(policy),
- + "%s", cpufreq_gov_sched.name);
- + if (rc)
- + goto free_tunables;
- +
- + gov_attr_set_init(&gd->tunables->attr_set,
- + &gd->tunables_hook);
- +
- + pr_debug("%s: throttle_threshold = %u [ns]\n",
- + __func__, gd->tunables->up_throttle_nsec);
- +
- + if (!have_governor_per_policy())
- + global_tunables = gd->tunables;
- + } else {
- + gd->tunables = global_tunables;
- + gov_attr_set_get(&global_tunables->attr_set,
- + &gd->tunables_hook);
- + }
- if (cpufreq_driver_is_slow()) {
- cpufreq_driver_slow = true;
- @@ -320,7 +403,7 @@
- if (IS_ERR_OR_NULL(gd->task)) {
- pr_err("%s: failed to create kschedfreq thread\n",
- __func__);
- - goto err;
- + goto free_tunables;
- }
- get_task_struct(gd->task);
- kthread_bind_mask(gd->task, policy->related_cpus);
- @@ -332,7 +415,9 @@
- return 0;
- -err:
- +free_tunables:
- + kfree(gd->tunables);
- +free_gd:
- policy->governor_data = NULL;
- kfree(gd);
- return -ENOMEM;
- @@ -340,6 +425,7 @@
- static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
- {
- + unsigned int count;
- struct gov_data *gd = policy->governor_data;
- if (!gd)
- @@ -351,7 +437,12 @@
- put_task_struct(gd->task);
- }
- - sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr());
- + count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook);
- + if (!count) {
- + if (!have_governor_per_policy())
- + global_tunables = NULL;
- + kfree(gd->tunables);
- + }
- policy->governor_data = NULL;
- @@ -413,88 +504,6 @@
- return 0;
- }
- -/* Tunables */
- -static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
- -{
- - return sprintf(buf, "%u\n", gd->up_throttle_nsec);
- -}
- -
- -static ssize_t store_up_throttle_nsec(struct gov_data *gd,
- - const char *buf, size_t count)
- -{
- - int ret;
- - long unsigned int val;
- -
- - ret = kstrtoul(buf, 0, &val);
- - if (ret < 0)
- - return ret;
- - gd->up_throttle_nsec = val;
- - return count;
- -}
- -
- -static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
- -{
- - return sprintf(buf, "%u\n", gd->down_throttle_nsec);
- -}
- -
- -static ssize_t store_down_throttle_nsec(struct gov_data *gd,
- - const char *buf, size_t count)
- -{
- - int ret;
- - long unsigned int val;
- -
- - ret = kstrtoul(buf, 0, &val);
- - if (ret < 0)
- - return ret;
- - gd->down_throttle_nsec = val;
- - return count;
- -}
- -
- -/*
- - * Create show/store routines
- - * - sys: One governor instance for complete SYSTEM
- - * - pol: One governor instance per struct cpufreq_policy
- - */
- -#define show_gov_pol_sys(file_name) \
- -static ssize_t show_##file_name##_gov_pol \
- -(struct cpufreq_policy *policy, char *buf) \
- -{ \
- - return show_##file_name(policy->governor_data, buf); \
- -}
- -
- -#define store_gov_pol_sys(file_name) \
- -static ssize_t store_##file_name##_gov_pol \
- -(struct cpufreq_policy *policy, const char *buf, size_t count) \
- -{ \
- - return store_##file_name(policy->governor_data, buf, count); \
- -}
- -
- -#define gov_pol_attr_rw(_name) \
- - static struct freq_attr _name##_gov_pol = \
- - __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
- -
- -#define show_store_gov_pol_sys(file_name) \
- - show_gov_pol_sys(file_name); \
- - store_gov_pol_sys(file_name)
- -#define tunable_handlers(file_name) \
- - show_gov_pol_sys(file_name); \
- - store_gov_pol_sys(file_name); \
- - gov_pol_attr_rw(file_name)
- -
- -tunable_handlers(down_throttle_nsec);
- -tunable_handlers(up_throttle_nsec);
- -
- -/* Per policy governor instance */
- -static struct attribute *sched_attributes_gov_pol[] = {
- - &up_throttle_nsec_gov_pol.attr,
- - &down_throttle_nsec_gov_pol.attr,
- - NULL,
- -};
- -
- -static struct attribute_group sched_attr_group_gov_pol = {
- - .attrs = sched_attributes_gov_pol,
- - .name = "sched",
- -};
- #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
- static
- diff -Nur /home/ninez/android/marlin/kernel/sched/cpufreq_schedutil.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_schedutil.c
- --- /home/ninez/android/marlin/kernel/sched/cpufreq_schedutil.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cpufreq_schedutil.c 2018-08-21 13:56:47.913412345 -0400
- @@ -0,0 +1,874 @@
- +/*
- + * CPUFreq governor based on scheduler-provided CPU utilization data.
- + *
- + * Copyright (C) 2016, Intel Corporation
- + * Author: Rafael J. Wysocki <[email protected]>
- + *
- + * This program is free software; you can redistribute it and/or modify
- + * it under the terms of the GNU General Public License version 2 as
- + * published by the Free Software Foundation.
- + */
- +
- +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
- +
- +#include <linux/cpufreq.h>
- +#include <linux/kthread.h>
- +#include <linux/slab.h>
- +#include <trace/events/power.h>
- +
- +#include "sched.h"
- +#include "tune.h"
- +
- +unsigned long boosted_cpu_util(int cpu);
- +
- +/* Stub out fast switch routines present on mainline to reduce the backport
- + * overhead. */
- +#define cpufreq_driver_fast_switch(x, y) 0
- +#define cpufreq_enable_fast_switch(x)
- +#define cpufreq_disable_fast_switch(x)
- +#define LATENCY_MULTIPLIER (1000)
- +#define SUGOV_KTHREAD_PRIORITY 80
- +
- +struct sugov_tunables {
- + struct gov_attr_set attr_set;
- + unsigned int up_rate_limit_us;
- + unsigned int down_rate_limit_us;
- + bool iowait_boost_enable;
- +};
- +
- +struct sugov_policy {
- + struct cpufreq_policy *policy;
- +
- + struct sugov_tunables *tunables;
- + struct list_head tunables_hook;
- +
- + raw_spinlock_t update_lock; /* For shared policies */
- + u64 last_freq_update_time;
- + s64 min_rate_limit_ns;
- + s64 up_rate_delay_ns;
- + s64 down_rate_delay_ns;
- + unsigned int next_freq;
- + unsigned int cached_raw_freq;
- +
- + /* The next fields are only needed if fast switch cannot be used. */
- + struct irq_work irq_work;
- + struct kthread_work work;
- + struct mutex work_lock;
- + struct kthread_worker worker;
- + struct task_struct *thread;
- + bool work_in_progress;
- +
- + bool need_freq_update;
- +};
- +
- +struct sugov_cpu {
- + struct update_util_data update_util;
- + struct sugov_policy *sg_policy;
- +
- + bool iowait_boost_pending;
- + unsigned int iowait_boost;
- + unsigned int iowait_boost_max;
- + u64 last_update;
- +
- + /* The fields below are only needed when sharing a policy. */
- + unsigned long util;
- + unsigned long max;
- + unsigned int flags;
- +
- + /* The field below is for single-CPU policies only. */
- +#ifdef CONFIG_NO_HZ_COMMON
- + unsigned long saved_idle_calls;
- +#endif
- +};
- +
- +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
- +
- +/************************ Governor internals ***********************/
- +
- +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
- +{
- + s64 delta_ns;
- +
- + if (unlikely(sg_policy->need_freq_update))
- + return true;
- +
- + delta_ns = time - sg_policy->last_freq_update_time;
- +
- + /* No need to recalculate next freq for min_rate_limit_us at least */
- + return delta_ns >= sg_policy->min_rate_limit_ns;
- +}
- +
- +static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
- + unsigned int next_freq)
- +{
- + s64 delta_ns;
- +
- + delta_ns = time - sg_policy->last_freq_update_time;
- +
- + if (next_freq > sg_policy->next_freq &&
- + delta_ns < sg_policy->up_rate_delay_ns)
- + return true;
- +
- + if (next_freq < sg_policy->next_freq &&
- + delta_ns < sg_policy->down_rate_delay_ns)
- + return true;
- +
- + return false;
- +}
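- Together, these two checks mean a frequency is recomputed at most once per min_rate_limit_ns, and a computed value is then only committed if it also clears the direction-specific limit. A short illustration with assumed tunable values (up_rate_limit_us = 500, down_rate_limit_us = 20000):
- /*
-  * up_rate_delay_ns   =   500 * NSEC_PER_USEC =   500000 ns
-  * down_rate_delay_ns = 20000 * NSEC_PER_USEC = 20000000 ns
-  * min_rate_limit_ns  = min(500000, 20000000) =   500000 ns
-  *
-  * An increase may therefore be committed every 0.5 ms, while a decrease
-  * is held back until 20 ms have passed since the last frequency change.
-  */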
- +
- +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
- + unsigned int next_freq)
- +{
- + struct cpufreq_policy *policy = sg_policy->policy;
- +
- + if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) {
- + /* Reset cached freq as next_freq isn't changed */
- + sg_policy->cached_raw_freq = 0;
- + return;
- + }
- +
- + if (sg_policy->next_freq == next_freq)
- + return;
- +
- + sg_policy->next_freq = next_freq;
- + sg_policy->last_freq_update_time = time;
- +
- + if (policy->fast_switch_enabled) {
- + next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- + if (next_freq == CPUFREQ_ENTRY_INVALID)
- + return;
- +
- + policy->cur = next_freq;
- + trace_cpu_frequency(next_freq, smp_processor_id());
- + } else if (!sg_policy->work_in_progress) {
- + sg_policy->work_in_progress = true;
- + irq_work_queue(&sg_policy->irq_work);
- + }
- +}
- +
- +/**
- + * get_next_freq - Compute a new frequency for a given cpufreq policy.
- + * @sg_policy: schedutil policy object to compute the new frequency for.
- + * @util: Current CPU utilization.
- + * @max: CPU capacity.
- + *
- + * If the utilization is frequency-invariant, choose the new frequency to be
- + * proportional to it, that is
- + *
- + * next_freq = C * max_freq * util / max
- + *
- + * Otherwise, approximate the would-be frequency-invariant utilization by
- + * util_raw * (curr_freq / max_freq) which leads to
- + *
- + * next_freq = C * curr_freq * util_raw / max
- + *
- + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
- + *
- + * The lowest driver-supported frequency which is equal or greater than the raw
- + * next_freq (as calculated above) is returned, subject to policy min/max and
- + * cpufreq driver limitations.
- + */
- +static unsigned int get_next_freq(struct sugov_policy *sg_policy,
- + unsigned long util, unsigned long max)
- +{
- + struct cpufreq_policy *policy = sg_policy->policy;
- + unsigned int freq = arch_scale_freq_invariant() ?
- + policy->cpuinfo.max_freq : policy->cur;
- +
- + freq = (freq + (freq >> 2)) * util / max;
- +
- + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
- + return sg_policy->next_freq;
- +
- + sg_policy->need_freq_update = false;
- + sg_policy->cached_raw_freq = freq;
- + return cpufreq_driver_resolve_freq(policy, freq);
- +}
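- A worked instance of the formula documented above, with assumed numbers (frequency-invariant utilization, cpuinfo.max_freq = 2000000 kHz, util = 512, max = 1024):
- /*
-  * freq = (2000000 + 2000000 / 4) * 512 / 1024
-  *      = 2500000 * 512 / 1024
-  *      = 1250000 kHz
-  *
-  * i.e. 1.25 * fmax * (util / max); once util exceeds 0.8 * max the raw
-  * request reaches fmax, and cpufreq_driver_resolve_freq() then rounds the
-  * result up to the next driver-supported step.
-  */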
- +
- +static inline bool use_pelt(void)
- +{
- +#ifdef CONFIG_SCHED_WALT
- + return (!sysctl_sched_use_walt_cpu_util || walt_disabled);
- +#else
- + return true;
- +#endif
- +}
- +
- +static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
- +{
- + int cpu = smp_processor_id();
- + struct rq *rq = cpu_rq(cpu);
- + unsigned long max_cap, rt;
- + s64 delta;
- +
- + max_cap = arch_scale_cpu_capacity(NULL, cpu);
- +
- + sched_avg_update(rq);
- + delta = time - rq->age_stamp;
- + if (unlikely(delta < 0))
- + delta = 0;
- + rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
- + rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
- +
- + *util = boosted_cpu_util(cpu);
- + if (use_pelt())
- + *util = *util + rt;
- +
- + *util = min(*util, max_cap);
- + *max = max_cap;
- +}
- +
- +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
- + unsigned int flags)
- +{
- + struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- +
- + if (!sg_policy->tunables->iowait_boost_enable)
- + return;
- +
- + /* Clear iowait_boost if the CPU appears to have been idle. */
- + if (sg_cpu->iowait_boost) {
- + s64 delta_ns = time - sg_cpu->last_update;
- +
- + if (delta_ns > TICK_NSEC) {
- + sg_cpu->iowait_boost = 0;
- + sg_cpu->iowait_boost_pending = false;
- + }
- + }
- +
- + if (flags & SCHED_CPUFREQ_IOWAIT) {
- + if (sg_cpu->iowait_boost_pending)
- + return;
- +
- + sg_cpu->iowait_boost_pending = true;
- +
- + if (sg_cpu->iowait_boost) {
- + sg_cpu->iowait_boost <<= 1;
- + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
- + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
- + } else {
- + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
- + }
- + }
- +}
- +
- +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
- + unsigned long *max)
- +{
- + unsigned int boost_util, boost_max;
- +
- + if (!sg_cpu->iowait_boost)
- + return;
- +
- + if (sg_cpu->iowait_boost_pending) {
- + sg_cpu->iowait_boost_pending = false;
- + } else {
- + sg_cpu->iowait_boost >>= 1;
- + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
- + sg_cpu->iowait_boost = 0;
- + return;
- + }
- + }
- +
- + boost_util = sg_cpu->iowait_boost;
- + boost_max = sg_cpu->iowait_boost_max;
- +
- + if (*util * boost_max < *max * boost_util) {
- + *util = boost_util;
- + *max = boost_max;
- + }
- +}
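- The two helpers above implement an exponential ramp: each update flagged SCHED_CPUFREQ_IOWAIT doubles the boost starting from policy->min, and every boost consumed without a fresh iowait wakeup halves it until it drops below policy->min. A sketch of the resulting sequence with assumed limits (policy->min = 300000 kHz, cpuinfo.max_freq = 2000000 kHz):
- /*
-  * iowait wakeups: 300000 -> 600000 -> 1200000 -> 2000000 (clamped to max)
-  * quiet updates:  2000000 -> 1000000 -> 500000 -> cleared (250000 < min)
-  *
-  * The boost only matters when boost_util / boost_max exceeds util / max,
-  * in which case the boosted pair is handed to get_next_freq() instead.
-  */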
- +
- +#ifdef CONFIG_NO_HZ_COMMON
- +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
- +{
- + unsigned long idle_calls = tick_nohz_get_idle_calls();
- + bool ret = idle_calls == sg_cpu->saved_idle_calls;
- +
- + sg_cpu->saved_idle_calls = idle_calls;
- + return ret;
- +}
- +#else
- +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
- +#endif /* CONFIG_NO_HZ_COMMON */
- +
- +static void sugov_update_single(struct update_util_data *hook, u64 time,
- + unsigned int flags)
- +{
- + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
- + struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- + struct cpufreq_policy *policy = sg_policy->policy;
- + unsigned long util, max;
- + unsigned int next_f;
- + bool busy;
- +
- + sugov_set_iowait_boost(sg_cpu, time, flags);
- + sg_cpu->last_update = time;
- +
- + /*
- + * On slow-switch systems, a single-policy request cannot proceed while
- + * an update is already in progress, short of taking update_lock here.

- + */
- + if (sg_policy->work_in_progress)
- + return;
- +
- + if (!sugov_should_update_freq(sg_policy, time))
- + return;
- +
- + busy = sugov_cpu_is_busy(sg_cpu);
- +
- + if (flags & SCHED_CPUFREQ_DL) {
- + next_f = policy->cpuinfo.max_freq;
- + } else {
- + sugov_get_util(&util, &max, time);
- + sugov_iowait_boost(sg_cpu, &util, &max);
- + next_f = get_next_freq(sg_policy, util, max);
- + /*
- + * Do not reduce the frequency if the CPU has not been idle
- + * recently, as the reduction is likely to be premature then.
- + */
- + if (busy && next_f < sg_policy->next_freq) {
- + next_f = sg_policy->next_freq;
- +
- + /* Reset cached freq as next_freq has changed */
- + sg_policy->cached_raw_freq = 0;
- + }
- + }
- + sugov_update_commit(sg_policy, time, next_f);
- +}
- +
- +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
- +{
- + struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- + struct cpufreq_policy *policy = sg_policy->policy;
- + unsigned long util = 0, max = 1;
- + unsigned int j;
- +
- + for_each_cpu(j, policy->cpus) {
- + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- + unsigned long j_util, j_max;
- + s64 delta_ns;
- +
- + /*
- + * If the CPU utilization was last updated before the previous
- + * frequency update and the time elapsed between the last update
- + * of the CPU utilization and the last frequency update is long
- + * enough, don't take the CPU into account as it probably is
- + * idle now (and clear iowait_boost for it).
- + */
- + delta_ns = time - j_sg_cpu->last_update;
- + if (delta_ns > TICK_NSEC) {
- + j_sg_cpu->iowait_boost = 0;
- + j_sg_cpu->iowait_boost_pending = false;
- + continue;
- + }
- + if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
- + return policy->cpuinfo.max_freq;
- +
- + j_util = j_sg_cpu->util;
- + j_max = j_sg_cpu->max;
- + if (j_util * max > j_max * util) {
- + util = j_util;
- + max = j_max;
- + }
- +
- + sugov_iowait_boost(j_sg_cpu, &util, &max);
- + }
- +
- + return get_next_freq(sg_policy, util, max);
- +}
- +
- +static void sugov_update_shared(struct update_util_data *hook, u64 time,
- + unsigned int flags)
- +{
- + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
- + struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- + unsigned long util, max;
- + unsigned int next_f;
- +
- + sugov_get_util(&util, &max, time);
- +
- + raw_spin_lock(&sg_policy->update_lock);
- +
- + sg_cpu->util = util;
- + sg_cpu->max = max;
- + sg_cpu->flags = flags;
- +
- + sugov_set_iowait_boost(sg_cpu, time, flags);
- + sg_cpu->last_update = time;
- +
- + if (sugov_should_update_freq(sg_policy, time)) {
- + if (flags & SCHED_CPUFREQ_DL)
- + next_f = sg_policy->policy->cpuinfo.max_freq;
- + else
- + next_f = sugov_next_freq_shared(sg_cpu, time);
- +
- + sugov_update_commit(sg_policy, time, next_f);
- + }
- +
- + raw_spin_unlock(&sg_policy->update_lock);
- +}
- +
- +static void sugov_work(struct kthread_work *work)
- +{
- + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
- + unsigned int freq;
- + unsigned long flags;
- +
- + /*
- + * Hold sg_policy->update_lock briefly to handle the case where
- + * sg_policy->next_freq is read here and then updated by
- + * sugov_update_shared() just before work_in_progress is set to false
- + * here; otherwise we may miss queueing the new update.
- + *
- + * Note: if a work item is queued after the update_lock is released,
- + * sugov_work() will just be called again by the kthread_work code; the
- + * request will be processed before the sugov thread sleeps.
- + */
- + raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
- + freq = sg_policy->next_freq;
- + sg_policy->work_in_progress = false;
- + raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
- +
- + mutex_lock(&sg_policy->work_lock);
- + __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
- + mutex_unlock(&sg_policy->work_lock);
- +}
- +
- +static void sugov_irq_work(struct irq_work *irq_work)
- +{
- + struct sugov_policy *sg_policy;
- +
- + sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- +
- + /*
- + * For RT and deadline tasks, the schedutil governor shoots the
- + * frequency to maximum. Special care must be taken to ensure that this
- + * kthread doesn't result in the same behavior.
- + *
- + * This is (mostly) guaranteed by the work_in_progress flag. The flag is
- + * updated only at the end of the sugov_work() function and before that
- + * the schedutil governor rejects all other frequency scaling requests.
- + *
- + * There is a very rare case though, where the RT thread yields right
- + * after the work_in_progress flag is cleared. The effects of that are
- + * neglected for now.
- + */
- + kthread_queue_work(&sg_policy->worker, &sg_policy->work);
- +}
- +
- +/************************** sysfs interface ************************/
- +
- +static struct sugov_tunables *global_tunables;
- +static DEFINE_MUTEX(global_tunables_lock);
- +
- +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
- +{
- + return container_of(attr_set, struct sugov_tunables, attr_set);
- +}
- +
- +static DEFINE_MUTEX(min_rate_lock);
- +
- +static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
- +{
- + mutex_lock(&min_rate_lock);
- + sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
- + sg_policy->down_rate_delay_ns);
- + mutex_unlock(&min_rate_lock);
- +}
- +
- +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
- +}
- +
- +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
- +}
- +
- +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- + struct sugov_policy *sg_policy;
- + unsigned int rate_limit_us;
- +
- + if (kstrtouint(buf, 10, &rate_limit_us))
- + return -EINVAL;
- +
- + tunables->up_rate_limit_us = rate_limit_us;
- +
- + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
- + sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
- + update_min_rate_limit_us(sg_policy);
- + }
- +
- + return count;
- +}
- +
- +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- + struct sugov_policy *sg_policy;
- + unsigned int rate_limit_us;
- +
- + if (kstrtouint(buf, 10, &rate_limit_us))
- + return -EINVAL;
- +
- + tunables->down_rate_limit_us = rate_limit_us;
- +
- + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
- + sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
- + update_min_rate_limit_us(sg_policy);
- + }
- +
- + return count;
- +}
- +
- +static ssize_t iowait_boost_enable_show(struct gov_attr_set *attr_set,
- + char *buf)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- +
- + return sprintf(buf, "%u\n", tunables->iowait_boost_enable);
- +}
- +
- +static ssize_t iowait_boost_enable_store(struct gov_attr_set *attr_set,
- + const char *buf, size_t count)
- +{
- + struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
- + bool enable;
- +
- + if (kstrtobool(buf, &enable))
- + return -EINVAL;
- +
- + tunables->iowait_boost_enable = enable;
- +
- + return count;
- +}
- +
- +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
- +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
- +static struct governor_attr iowait_boost_enable = __ATTR_RW(iowait_boost_enable);
- +
- +static struct attribute *sugov_attributes[] = {
- + &up_rate_limit_us.attr,
- + &down_rate_limit_us.attr,
- + &iowait_boost_enable.attr,
- + NULL
- +};
- +
- +static struct kobj_type sugov_tunables_ktype = {
- + .default_attrs = sugov_attributes,
- + .sysfs_ops = &governor_sysfs_ops,
- +};
- +
- +/********************** cpufreq governor interface *********************/
- +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
- +static
- +#endif
- +struct cpufreq_governor cpufreq_gov_schedutil;
- +
- +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy;
- +
- + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
- + if (!sg_policy)
- + return NULL;
- +
- + sg_policy->policy = policy;
- + raw_spin_lock_init(&sg_policy->update_lock);
- + return sg_policy;
- +}
- +
- +static void sugov_policy_free(struct sugov_policy *sg_policy)
- +{
- + kfree(sg_policy);
- +}
- +
- +static int sugov_kthread_create(struct sugov_policy *sg_policy)
- +{
- + struct task_struct *thread;
- + struct sched_param param = { .sched_priority = 80 };
- + struct cpufreq_policy *policy = sg_policy->policy;
- + int ret;
- +
- + /* kthread only required for slow path */
- + if (policy->fast_switch_enabled)
- + return 0;
- +
- + kthread_init_work(&sg_policy->work, sugov_work);
- + kthread_init_worker(&sg_policy->worker);
- + thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
- + "sugov:%d",
- + cpumask_first(policy->related_cpus));
- + if (IS_ERR(thread)) {
- + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
- + return PTR_ERR(thread);
- + }
- +
- + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m);
- + if (ret) {
- + kthread_stop(thread);
- + pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
- + return ret;
- + }
- +
- + sg_policy->thread = thread;
- + kthread_bind_mask(thread, policy->related_cpus);
- + init_irq_work(&sg_policy->irq_work, sugov_irq_work);
- + mutex_init(&sg_policy->work_lock);
- +
- + wake_up_process(thread);
- +
- + return 0;
- +}
- +
- +static void sugov_kthread_stop(struct sugov_policy *sg_policy)
- +{
- + /* kthread only required for slow path */
- + if (sg_policy->policy->fast_switch_enabled)
- + return;
- +
- + kthread_flush_worker(&sg_policy->worker);
- + kthread_stop(sg_policy->thread);
- + mutex_destroy(&sg_policy->work_lock);
- +}
- +
- +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
- +{
- + struct sugov_tunables *tunables;
- +
- + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
- + if (tunables) {
- + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
- + if (!have_governor_per_policy())
- + global_tunables = tunables;
- + }
- + return tunables;
- +}
- +
- +static void sugov_tunables_free(struct sugov_tunables *tunables)
- +{
- + if (!have_governor_per_policy())
- + global_tunables = NULL;
- +
- + kfree(tunables);
- +}
- +
- +static int sugov_init(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy;
- + struct sugov_tunables *tunables;
- + int ret = 0;
- +
- + /* State should be equivalent to EXIT */
- + if (policy->governor_data)
- + return -EBUSY;
- +
- + cpufreq_enable_fast_switch(policy);
- +
- + sg_policy = sugov_policy_alloc(policy);
- + if (!sg_policy) {
- + ret = -ENOMEM;
- + goto disable_fast_switch;
- + }
- +
- + ret = sugov_kthread_create(sg_policy);
- + if (ret)
- + goto free_sg_policy;
- +
- + mutex_lock(&global_tunables_lock);
- +
- + if (global_tunables) {
- + if (WARN_ON(have_governor_per_policy())) {
- + ret = -EINVAL;
- + goto stop_kthread;
- + }
- + policy->governor_data = sg_policy;
- + sg_policy->tunables = global_tunables;
- +
- + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
- + goto out;
- + }
- +
- + tunables = sugov_tunables_alloc(sg_policy);
- + if (!tunables) {
- + ret = -ENOMEM;
- + goto stop_kthread;
- + }
- +
- + if (policy->up_transition_delay_us && policy->down_transition_delay_us) {
- + tunables->up_rate_limit_us = policy->up_transition_delay_us;
- + tunables->down_rate_limit_us = policy->down_transition_delay_us;
- + } else {
- + unsigned int lat;
- +
- + tunables->up_rate_limit_us = LATENCY_MULTIPLIER;
- + tunables->down_rate_limit_us = LATENCY_MULTIPLIER;
- + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
- + if (lat) {
- + tunables->up_rate_limit_us *= lat;
- + tunables->down_rate_limit_us *= lat;
- + }
- + }
- +
- + tunables->iowait_boost_enable = policy->iowait_boost_enable;
- +
- + policy->governor_data = sg_policy;
- + sg_policy->tunables = tunables;
- +
- + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
- + get_governor_parent_kobj(policy), "%s",
- + cpufreq_gov_schedutil.name);
- + if (ret)
- + goto fail;
- +
- +out:
- + mutex_unlock(&global_tunables_lock);
- + return 0;
- +
- +fail:
- + policy->governor_data = NULL;
- + sugov_tunables_free(tunables);
- +
- +stop_kthread:
- + sugov_kthread_stop(sg_policy);
- +
- +free_sg_policy:
- + mutex_unlock(&global_tunables_lock);
- +
- + sugov_policy_free(sg_policy);
- +
- +disable_fast_switch:
- + cpufreq_disable_fast_switch(policy);
- +
- + pr_err("initialization failed (error %d)\n", ret);
- + return ret;
- +}
- +
- +static int sugov_exit(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy = policy->governor_data;
- + struct sugov_tunables *tunables = sg_policy->tunables;
- + unsigned int count;
- +
- + mutex_lock(&global_tunables_lock);
- +
- + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
- + policy->governor_data = NULL;
- + if (!count)
- + sugov_tunables_free(tunables);
- +
- + mutex_unlock(&global_tunables_lock);
- +
- + sugov_kthread_stop(sg_policy);
- + sugov_policy_free(sg_policy);
- +
- + cpufreq_disable_fast_switch(policy);
- + return 0;
- +}
- +
- +static int sugov_start(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy = policy->governor_data;
- + unsigned int cpu;
- +
- + sg_policy->up_rate_delay_ns =
- + sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
- + sg_policy->down_rate_delay_ns =
- + sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
- + update_min_rate_limit_us(sg_policy);
- + sg_policy->last_freq_update_time = 0;
- + sg_policy->next_freq = 0;
- + sg_policy->work_in_progress = false;
- + sg_policy->need_freq_update = false;
- + sg_policy->cached_raw_freq = 0;
- +
- + for_each_cpu(cpu, policy->cpus) {
- + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
- +
- + memset(sg_cpu, 0, sizeof(*sg_cpu));
- + sg_cpu->sg_policy = sg_policy;
- + sg_cpu->flags = SCHED_CPUFREQ_DL;
- + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
- + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- + policy_is_shared(policy) ?
- + sugov_update_shared :
- + sugov_update_single);
- + }
- + return 0;
- +}
- +
- +static int sugov_stop(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy = policy->governor_data;
- + unsigned int cpu;
- +
- + for_each_cpu(cpu, policy->cpus)
- + cpufreq_remove_update_util_hook(cpu);
- +
- + synchronize_sched();
- +
- + if (!policy->fast_switch_enabled) {
- + irq_work_sync(&sg_policy->irq_work);
- + kthread_cancel_work_sync(&sg_policy->work);
- + }
- + return 0;
- +}
- +
- +static int sugov_limits(struct cpufreq_policy *policy)
- +{
- + struct sugov_policy *sg_policy = policy->governor_data;
- +
- + if (!policy->fast_switch_enabled) {
- + mutex_lock(&sg_policy->work_lock);
- + cpufreq_policy_apply_limits(policy);
- + mutex_unlock(&sg_policy->work_lock);
- + }
- +
- + sg_policy->need_freq_update = true;
- +
- + return 0;
- +}
- +
- +static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
- + unsigned int event)
- +{
- + switch(event) {
- + case CPUFREQ_GOV_POLICY_INIT:
- + return sugov_init(policy);
- + case CPUFREQ_GOV_POLICY_EXIT:
- + return sugov_exit(policy);
- + case CPUFREQ_GOV_START:
- + return sugov_start(policy);
- + case CPUFREQ_GOV_STOP:
- + return sugov_stop(policy);
- + case CPUFREQ_GOV_LIMITS:
- + return sugov_limits(policy);
- + default:
- + BUG();
- + }
- +}
- +
- +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
- +static
- +#endif
- +struct cpufreq_governor cpufreq_gov_schedutil = {
- + .name = "schedutil",
- + .governor = cpufreq_schedutil_cb,
- + .owner = THIS_MODULE,
- +};
- +
- +static int __init sugov_register(void)
- +{
- + return cpufreq_register_governor(&cpufreq_gov_schedutil);
- +}
- +fs_initcall(sugov_register);
- diff -Nur /home/ninez/android/marlin/kernel/sched/cputime.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cputime.c
- --- /home/ninez/android/marlin/kernel/sched/cputime.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/cputime.c 2018-08-11 23:57:17.128607487 -0400
- @@ -306,6 +306,26 @@
- return false;
- }
- +#ifdef CONFIG_64BIT
- +static inline u64 read_sum_exec_runtime(struct task_struct *t)
- +{
- + return t->se.sum_exec_runtime;
- +}
- +#else
- +static u64 read_sum_exec_runtime(struct task_struct *t)
- +{
- + u64 ns;
- + struct rq_flags rf;
- + struct rq *rq;
- +
- + rq = task_rq_lock(t, &rf);
- + ns = t->se.sum_exec_runtime;
- + task_rq_unlock(rq, t, &rf);
- +
- + return ns;
- +}
- +#endif
- +
- /*
- * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
- * tasks (sum on group iteration) belonging to @tsk's group.
- @@ -318,6 +338,17 @@
- unsigned int seq, nextseq;
- unsigned long flags;
- + /*
- + * Update the current task's runtime to account for pending time since the
- + * last scheduler action or thread_group_cputime() call. This thread group
- + * might have other running tasks on different CPUs, but updating their
- + * runtime can affect syscall performance, so we skip accounting those
- + * pending times and rely only on values updated on tick or by other
- + * scheduler action.
- + */
- + if (same_thread_group(current, tsk))
- + (void) task_sched_runtime(current);
- +
- rcu_read_lock();
- /* Attempt a lockless read on the first round. */
- nextseq = 0;
- @@ -332,7 +363,7 @@
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- - times->sum_exec_runtime += task_sched_runtime(t);
- + times->sum_exec_runtime += read_sum_exec_runtime(t);
- }
- /* If lockless access failed, take the lock. */
- nextseq = 1;
- @@ -582,48 +613,43 @@
- }
- /*
- - * Atomically advance counter to the new value. Interrupts, vcpu
- - * scheduling, and scaling inaccuracies can cause cputime_advance
- - * to be occasionally called with a new value smaller than counter.
- - * Let's enforce atomicity.
- + * Adjust tick based cputime random precision against scheduler runtime
- + * accounting.
- *
- - * Normally a caller will only go through this loop once, or not
- - * at all in case a previous caller updated counter the same jiffy.
- - */
- -static void cputime_advance(cputime_t *counter, cputime_t new)
- -{
- - cputime_t old;
- -
- - while (new > (old = READ_ONCE(*counter)))
- - cmpxchg_cputime(counter, old, new);
- -}
- -
- -/*
- - * Adjust tick based cputime random precision against scheduler
- - * runtime accounting.
- + * Tick-based cputime accounting depends on whether a task's scheduling
- + * timeslices happen to be interrupted by the timer. Depending on these
- + * circumstances, the number of such interrupts may over- or under-estimate
- + * the real user and system cputime, so the tick-based values match reality
- + * only with variable precision.
- + *
- + * Fix this by scaling these tick based values against the total runtime
- + * accounted by the CFS scheduler.
- + *
- + * This code provides the following guarantees:
- + *
- + * stime + utime == rtime
- + * stime_i+1 >= stime_i, utime_i+1 >= utime_i
- + *
- + * Assuming that rtime_i+1 >= rtime_i.
- */
- static void cputime_adjust(struct task_cputime *curr,
- - struct cputime *prev,
- + struct prev_cputime *prev,
- cputime_t *ut, cputime_t *st)
- {
- cputime_t rtime, stime, utime;
- + unsigned long flags;
- - /*
- - * Tick based cputime accounting depend on random scheduling
- - * timeslices of a task to be interrupted or not by the timer.
- - * Depending on these circumstances, the number of these interrupts
- - * may be over or under-optimistic, matching the real user and system
- - * cputime with a variable precision.
- - *
- - * Fix this by scaling these tick based values against the total
- - * runtime accounted by the CFS scheduler.
- - */
- + /* Serialize concurrent callers such that we can honour our guarantees */
- + raw_spin_lock_irqsave(&prev->lock, flags);
- rtime = nsecs_to_cputime(curr->sum_exec_runtime);
- /*
- - * Update userspace visible utime/stime values only if actual execution
- - * time is bigger than already exported. Note that can happen, that we
- - * provided bigger values due to scaling inaccuracy on big numbers.
- + * This is possible under two circumstances:
- + * - rtime isn't monotonic after all (a bug);
- + * - we got reordered by the lock.
- + *
- + * In both cases this acts as a filter such that the rest of the code
- + * can assume it is monotonic regardless of anything else.
- */
- if (prev->stime + prev->utime >= rtime)
- goto out;
- @@ -633,22 +659,46 @@
- if (utime == 0) {
- stime = rtime;
- - } else if (stime == 0) {
- - utime = rtime;
- - } else {
- - cputime_t total = stime + utime;
- + goto update;
- + }
- - stime = scale_stime((__force u64)stime,
- - (__force u64)rtime, (__force u64)total);
- - utime = rtime - stime;
- + if (stime == 0) {
- + utime = rtime;
- + goto update;
- }
- - cputime_advance(&prev->stime, stime);
- - cputime_advance(&prev->utime, utime);
- + stime = scale_stime((__force u64)stime, (__force u64)rtime,
- + (__force u64)(stime + utime));
- +
- + /*
- + * Make sure stime doesn't go backwards; this preserves monotonicity
- + * for utime because rtime is monotonic.
- + *
- + * utime_i+1 = rtime_i+1 - stime_i
- + * = rtime_i+1 - (rtime_i - utime_i)
- + * = (rtime_i+1 - rtime_i) + utime_i
- + * >= utime_i
- + */
- + if (stime < prev->stime)
- + stime = prev->stime;
- + utime = rtime - stime;
- +
- + /*
- + * Make sure utime doesn't go backwards; this still preserves
- + * monotonicity for stime, analogous argument to above.
- + */
- + if (utime < prev->utime) {
- + utime = prev->utime;
- + stime = rtime - utime;
- + }
- +update:
- + prev->stime = stime;
- + prev->utime = utime;
- out:
- *ut = prev->utime;
- *st = prev->stime;
- + raw_spin_unlock_irqrestore(&prev->lock, flags);
- }
- void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
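- The rewritten cputime_adjust() above replaces the cmpxchg-based cputime_advance() with a lock plus explicit clamping, which is what delivers the stime + utime == rtime and per-field monotonicity guarantees stated in its comment. A worked example with made-up cputime values:
- /*
-  * previously reported: prev->stime = 60, prev->utime = 40  (rtime = 100)
-  * new tick samples:    stime = 55, utime = 55, rtime = 112
-  *
-  * scaled stime = 55 * 112 / (55 + 55) = 56
-  * 56 < prev->stime, so stime is clamped to 60
-  * utime = rtime - stime = 112 - 60 = 52  (>= prev->utime)
-  *
-  * The reported pair becomes (60, 52): both fields stay monotonic and they
-  * still sum to rtime.
-  */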
- diff -Nur /home/ninez/android/marlin/kernel/sched/deadline.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/deadline.c
- --- /home/ninez/android/marlin/kernel/sched/deadline.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/deadline.c 2018-08-26 16:43:11.647206295 -0400
- @@ -18,6 +18,8 @@
- #include <linux/slab.h>
- +#include "walt.h"
- +
- struct dl_bandwidth def_dl_bandwidth;
- static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
- @@ -87,7 +89,7 @@
- dl_b->total_bw = 0;
- }
- -void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
- +void init_dl_rq(struct dl_rq *dl_rq)
- {
- dl_rq->rb_root = RB_ROOT;
- @@ -152,7 +154,7 @@
- {
- struct task_struct *p = dl_task_of(dl_se);
- - if (p->nr_cpus_allowed > 1)
- + if (tsk_nr_cpus_allowed(p) > 1)
- dl_rq->dl_nr_migratory++;
- update_dl_migration(dl_rq);
- @@ -162,7 +164,7 @@
- {
- struct task_struct *p = dl_task_of(dl_se);
- - if (p->nr_cpus_allowed > 1)
- + if (tsk_nr_cpus_allowed(p) > 1)
- dl_rq->dl_nr_migratory--;
- update_dl_migration(dl_rq);
- @@ -231,17 +233,23 @@
- return dl_task(prev);
- }
- -static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
- +static DEFINE_PER_CPU(struct callback_head, dl_push_head);
- +static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
- static void push_dl_tasks(struct rq *);
- +static void pull_dl_task(struct rq *);
- static inline void queue_push_tasks(struct rq *rq)
- {
- if (!has_pushable_dl_tasks(rq))
- return;
- - queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu),
- - push_dl_tasks);
- + queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
- +}
- +
- +static inline void queue_pull_task(struct rq *rq)
- +{
- + queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
- }
- static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
- @@ -322,14 +330,17 @@
- return false;
- }
- -static inline int pull_dl_task(struct rq *rq)
- +static inline void pull_dl_task(struct rq *rq)
- {
- - return 0;
- }
- static inline void queue_push_tasks(struct rq *rq)
- {
- }
- +
- +static inline void queue_pull_task(struct rq *rq)
- +{
- +}
- #endif /* CONFIG_SMP */
- static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
- @@ -450,13 +461,13 @@
- *
- * This function returns true if:
- *
- - * runtime / (deadline - t) > dl_runtime / dl_deadline ,
- + * runtime / (deadline - t) > dl_runtime / dl_period ,
- *
- * IOW we can't recycle current parameters.
- *
- - * Notice that the bandwidth check is done against the deadline. For
- + * Notice that the bandwidth check is done against the period. For
- * task with deadline equal to period this is the same of using
- - * dl_period instead of dl_deadline in the equation above.
- + * dl_deadline instead of dl_period in the equation above.
- */
- static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, u64 t)
- @@ -481,7 +492,7 @@
- * of anything below microseconds resolution is actually fiction
- * (but still we want to give the user that illusion >;).
- */
- - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
- + left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
- right = ((dl_se->deadline - t) >> DL_SCALE) *
- (pi_se->dl_runtime >> DL_SCALE);
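- The doc-comment fix above matters because the code really does cross-multiply with dl_period: overflow is declared when runtime / (deadline - t) > dl_runtime / dl_period. A worked example with assumed parameters (dl_runtime = 10 ms, dl_period = 100 ms, 4 ms of runtime left, 30 ms until the current deadline):
- /*
-  * left  ~ dl_period * runtime         = 100 * 4  = 400
-  * right ~ (deadline - t) * dl_runtime =  30 * 10 = 300
-  *
-  * left > right, i.e. 4/30 > 10/100: the remaining bandwidth is denser
-  * than the reserved one, so the entity gets fresh deadline/runtime
-  * parameters instead of recycling the current ones.
-  */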
- @@ -596,16 +607,10 @@
- struct sched_dl_entity,
- dl_timer);
- struct task_struct *p = dl_task_of(dl_se);
- + struct rq_flags rf;
- struct rq *rq;
- -again:
- - rq = task_rq(p);
- - raw_spin_lock(&rq->lock);
- - if (rq != task_rq(p)) {
- - /* Task was moved, retrying. */
- - raw_spin_unlock(&rq->lock);
- - goto again;
- - }
- + rq = task_rq_lock(p, &rf);
- /*
- * The task might have changed its scheduling policy to something
- @@ -686,12 +691,19 @@
- * Queueing this task back might have overloaded rq, check if we need
- * to kick someone away.
- */
- - if (has_pushable_dl_tasks(rq))
- + if (has_pushable_dl_tasks(rq)) {
- + /*
- + * Nothing relies on rq->lock after this, so it's safe to drop
- + * rq->lock.
- + */
- + lockdep_unpin_lock(&rq->lock, rf.cookie);
- push_dl_task(rq);
- + lockdep_repin_lock(&rq->lock, rf.cookie);
- + }
- #endif
- unlock:
- - raw_spin_unlock(&rq->lock);
- + task_rq_unlock(rq, p, &rf);
- /*
- * This can free the task_struct, including this hrtimer, do not touch
- @@ -711,7 +723,7 @@
- }
- static
- -int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
- +int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
- {
- return (dl_se->runtime <= 0);
- }
- @@ -743,6 +755,9 @@
- if (unlikely((s64)delta_exec <= 0))
- return;
- + /* kick cpufreq (see the comment in kernel/sched/sched.h). */
- + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
- +
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
- @@ -753,7 +768,7 @@
- cpuacct_charge(curr, delta_exec);
- dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- - if (dl_runtime_exceeded(rq, dl_se)) {
- + if (dl_runtime_exceeded(dl_se)) {
- dl_se->dl_throttled = 1;
- __dequeue_task_dl(rq, curr, 0);
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
- @@ -869,6 +884,7 @@
- WARN_ON(!dl_prio(prio));
- dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
- + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
- inc_dl_deadline(dl_rq, deadline);
- inc_dl_migration(dl_se, dl_rq);
- @@ -883,6 +899,7 @@
- WARN_ON(!dl_rq->dl_nr_running);
- dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
- + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
- dec_dl_deadline(dl_rq, dl_se->deadline);
- dec_dl_migration(dl_se, dl_rq);
- @@ -969,7 +986,7 @@
- /*
- * Use the scheduling parameters of the top pi-waiter
- - * task if we have one and its (relative) deadline is
- + * task if we have one and its (absolute) deadline is
- * smaller than our one... OTW we keep our runtime and
- * deadline.
- */
- @@ -998,7 +1015,7 @@
- enqueue_dl_entity(&p->dl, pi_se, flags);
- - if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
- + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
- enqueue_pushable_dl_task(rq, p);
- }
- @@ -1038,7 +1055,14 @@
- rq->curr->dl.dl_yielded = 1;
- p->dl.runtime = 0;
- }
- + update_rq_clock(rq);
- update_curr_dl(rq);
- + /*
- + * Tell update_rq_clock() that we've just updated,
- + * so we don't do microscopic update in schedule()
- + * and double the fastpath cost.
- + */
- + rq_clock_skip_update(rq, true);
- }
- #ifdef CONFIG_SMP
- @@ -1046,12 +1070,13 @@
- static int find_later_rq(struct task_struct *task);
- static int
- -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
- +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
- + int sibling_count_hint)
- {
- struct task_struct *curr;
- struct rq *rq;
- - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
- + if (sd_flag != SD_BALANCE_WAKE)
- goto out;
- rq = cpu_rq(cpu);
- @@ -1069,12 +1094,15 @@
- * try to make it stay here, it might be important.
- */
- if (unlikely(dl_task(curr)) &&
- - (curr->nr_cpus_allowed < 2 ||
- + (tsk_nr_cpus_allowed(curr) < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- - (p->nr_cpus_allowed > 1)) {
- + (tsk_nr_cpus_allowed(p) > 1)) {
- int target = find_later_rq(p);
- - if (target != -1)
- + if (target != -1 &&
- + (dl_time_before(p->dl.deadline,
- + cpu_rq(target)->dl.earliest_dl.curr) ||
- + (cpu_rq(target)->dl.dl_nr_running == 0)))
- cpu = target;
- }
- rcu_read_unlock();
- @@ -1089,7 +1117,7 @@
- * Current can't be migrated, useless to reschedule,
- * let's hope p can move out.
- */
- - if (rq->curr->nr_cpus_allowed == 1 ||
- + if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
- return;
- @@ -1097,15 +1125,13 @@
- * p is migratable, so let's not schedule it and
- * see if it is pushed or pulled somewhere else.
- */
- - if (p->nr_cpus_allowed != 1 &&
- + if (tsk_nr_cpus_allowed(p) != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
- return;
- resched_curr(rq);
- }
- -static int pull_dl_task(struct rq *this_rq);
- -
- #endif /* CONFIG_SMP */
- /*
- @@ -1136,6 +1162,10 @@
- {
- hrtick_start(rq, p->dl.runtime);
- }
- +#else /* !CONFIG_SCHED_HRTICK */
- +static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
- +{
- +}
- #endif
- static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
- @@ -1149,7 +1179,8 @@
- return rb_entry(left, struct sched_dl_entity, rb_node);
- }
- -struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
- +struct task_struct *
- +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- struct sched_dl_entity *dl_se;
- struct task_struct *p;
- @@ -1158,7 +1189,15 @@
- dl_rq = &rq->dl;
- if (need_pull_dl_task(rq, prev)) {
- + /*
- + * This is OK because current is on_cpu, which keeps it from being
- + * picked for load balancing; preemption/IRQs are still disabled,
- + * avoiding further scheduler activity on it, and we are careful to
- + * re-start the picking loop.
- + */
- + lockdep_unpin_lock(&rq->lock, cookie);
- pull_dl_task(rq);
- + lockdep_repin_lock(&rq->lock, cookie);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a stop task can slip in, in which case we need to
- @@ -1189,10 +1228,8 @@
- /* Running task will never be pushed. */
- dequeue_pushable_dl_task(rq, p);
- -#ifdef CONFIG_SCHED_HRTICK
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
- -#endif
- queue_push_tasks(rq);
- @@ -1203,7 +1240,7 @@
- {
- update_curr_dl(rq);
- - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
- + if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
- enqueue_pushable_dl_task(rq, p);
- }
- @@ -1211,10 +1248,14 @@
- {
- update_curr_dl(rq);
- -#ifdef CONFIG_SCHED_HRTICK
- - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
- + /*
- + * Even when we have runtime, update_curr_dl() might have resulted in us
- + * not being the leftmost task anymore. In that case NEED_RESCHED will
- + * be set and schedule() will start a new hrtick for the next task.
- + */
- + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
- + is_leftmost(p, &rq->dl))
- start_hrtick_dl(rq, p);
- -#endif
- }
- static void task_fork_dl(struct task_struct *p)
- @@ -1287,6 +1328,32 @@
- return NULL;
- }
- +/*
- + * Return the earliest pushable rq's task, which is suitable to be executed
- + * on the CPU, NULL otherwise:
- + */
- +static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
- +{
- + struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost;
- + struct task_struct *p = NULL;
- +
- + if (!has_pushable_dl_tasks(rq))
- + return NULL;
- +
- +next_node:
- + if (next_node) {
- + p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
- +
- + if (pick_dl_task(rq, p, cpu))
- + return p;
- +
- + next_node = rb_next(next_node);
- + goto next_node;
- + }
- +
- + return NULL;
- +}
- +
- static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
- static int find_later_rq(struct task_struct *task)
- @@ -1300,16 +1367,13 @@
- if (unlikely(!later_mask))
- return -1;
- - if (task->nr_cpus_allowed == 1)
- + if (tsk_nr_cpus_allowed(task) == 1)
- return -1;
- /*
- * We have to consider system topology and task affinity
- * first, then we can look for a suitable cpu.
- */
- - cpumask_copy(later_mask, task_rq(task)->rd->span);
- - cpumask_and(later_mask, later_mask, cpu_active_mask);
- - cpumask_and(later_mask, later_mask, &task->cpus_allowed);
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
- @@ -1393,6 +1457,18 @@
- later_rq = cpu_rq(cpu);
- + if (later_rq->dl.dl_nr_running &&
- + !dl_time_before(task->dl.deadline,
- + later_rq->dl.earliest_dl.curr)) {
- + /*
- + * The target rq has tasks of equal or earlier deadline;
- + * retrying does not release any lock and is unlikely
- + * to yield a different result.
- + */
- + later_rq = NULL;
- + break;
- + }
- +
- /* Retry if something changed. */
- if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
- @@ -1436,7 +1512,7 @@
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- - BUG_ON(p->nr_cpus_allowed <= 1);
- + BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!dl_task(p));
- @@ -1453,6 +1529,7 @@
- {
- struct task_struct *next_task;
- struct rq *later_rq;
- + int ret = 0;
- if (!rq->dl.overloaded)
- return 0;
- @@ -1474,7 +1551,7 @@
- */
- if (dl_task(rq->curr) &&
- dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
- - rq->curr->nr_cpus_allowed > 1) {
- + tsk_nr_cpus_allowed(rq->curr) > 1) {
- resched_curr(rq);
- return 0;
- }
- @@ -1498,7 +1575,6 @@
- * The task is still there. We don't try
- * again, some other cpu will pull it when ready.
- */
- - dequeue_pushable_dl_task(rq, next_task);
- goto out;
- }
- @@ -1513,9 +1589,12 @@
- deactivate_task(rq, next_task, 0);
- clear_average_bw(&next_task->dl, &rq->dl);
- + next_task->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(next_task, later_rq->cpu);
- + next_task->on_rq = TASK_ON_RQ_QUEUED;
- add_average_bw(&next_task->dl, &later_rq->dl);
- activate_task(later_rq, next_task, 0);
- + ret = 1;
- resched_curr(later_rq);
- @@ -1524,25 +1603,26 @@
- out:
- put_task_struct(next_task);
- - return 1;
- + return ret;
- }
- static void push_dl_tasks(struct rq *rq)
- {
- - /* Terminates as it moves a -deadline task */
- + /* push_dl_task() will return true if it moved a -deadline task */
- while (push_dl_task(rq))
- ;
- }
- -static int pull_dl_task(struct rq *this_rq)
- +static void pull_dl_task(struct rq *this_rq)
- {
- - int this_cpu = this_rq->cpu, ret = 0, cpu;
- + int this_cpu = this_rq->cpu, cpu;
- struct task_struct *p;
- + bool resched = false;
- struct rq *src_rq;
- u64 dmin = LONG_MAX;
- if (likely(!dl_overloaded(this_rq)))
- - return 0;
- + return;
- /*
- * Match the barrier from dl_set_overloaded; this guarantees that if we
- @@ -1575,7 +1655,7 @@
- if (src_rq->dl.dl_nr_running <= 1)
- goto skip;
- - p = pick_next_earliest_dl_task(src_rq, this_cpu);
- + p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
- /*
- * We found a task to be pulled if:
- @@ -1597,11 +1677,13 @@
- src_rq->curr->dl.deadline))
- goto skip;
- - ret = 1;
- + resched = true;
- deactivate_task(src_rq, p, 0);
- clear_average_bw(&p->dl, &src_rq->dl);
- + p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, this_cpu);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- add_average_bw(&p->dl, &this_rq->dl);
- activate_task(this_rq, p, 0);
- dmin = p->dl.deadline;
- @@ -1612,7 +1694,8 @@
- double_unlock_balance(this_rq, src_rq);
- }
- - return ret;
- + if (resched)
- + resched_curr(this_rq);
- }
- /*
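- Both the push and pull paths above now bracket set_task_cpu() with p->on_rq = TASK_ON_RQ_MIGRATING / TASK_ON_RQ_QUEUED. That lets code running under either rq lock tell a task in transit apart from one that is genuinely queued; the check used elsewhere (for instance by the wait-time rework in the fair.c portion of this diff) is simply:
- static inline int task_on_rq_migrating(struct task_struct *p)
- {
-         return p->on_rq == TASK_ON_RQ_MIGRATING;
- }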
- @@ -1623,11 +1706,10 @@
- {
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- - has_pushable_dl_tasks(rq) &&
- - p->nr_cpus_allowed > 1 &&
- + tsk_nr_cpus_allowed(p) > 1 &&
- dl_task(rq->curr) &&
- - (rq->curr->nr_cpus_allowed < 2 ||
- - dl_entity_preempt(&rq->curr->dl, &p->dl))) {
- + (tsk_nr_cpus_allowed(rq->curr) < 2 ||
- + !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
- push_dl_tasks(rq);
- }
- }
- @@ -1635,44 +1717,34 @@
- static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask)
- {
- + struct root_domain *src_rd;
- struct rq *rq;
- - int weight;
- BUG_ON(!dl_task(p));
- - /*
- - * Update only if the task is actually running (i.e.,
- - * it is on the rq AND it is not throttled).
- - */
- - if (!on_dl_rq(&p->dl))
- - return;
- -
- - weight = cpumask_weight(new_mask);
- -
- - /*
- - * Only update if the process changes its state from whether it
- - * can migrate or not.
- - */
- - if ((p->nr_cpus_allowed > 1) == (weight > 1))
- - return;
- -
- rq = task_rq(p);
- -
- + src_rd = rq->rd;
- /*
- - * The process used to be able to migrate OR it can now migrate
- + * Migrating a SCHED_DEADLINE task between exclusive
- + * cpusets (different root_domains) entails a bandwidth
- + * update. We already made space for us in the destination
- + * domain (see cpuset_can_attach()).
- */
- - if (weight <= 1) {
- - if (!task_current(rq, p))
- - dequeue_pushable_dl_task(rq, p);
- - BUG_ON(!rq->dl.dl_nr_migratory);
- - rq->dl.dl_nr_migratory--;
- - } else {
- - if (!task_current(rq, p))
- - enqueue_pushable_dl_task(rq, p);
- - rq->dl.dl_nr_migratory++;
- + if (!cpumask_intersects(src_rd->span, new_mask)) {
- + struct dl_bw *src_dl_b;
- +
- + src_dl_b = dl_bw_of(cpu_of(rq));
- + /*
- + * We now free resources of the root_domain we are migrating
- + * off. In the worst case, sched_setattr() may temporary fail
- + * until we complete the update.
- + */
- + raw_spin_lock(&src_dl_b->lock);
- + __dl_clear(src_dl_b, p->dl.dl_bw);
- + raw_spin_unlock(&src_dl_b->lock);
- }
- - update_dl_migration(&rq->dl);
- + set_cpus_allowed_common(p, new_mask);
- }
- /* Assumes rq->lock is held */
- @@ -1681,6 +1753,7 @@
- if (rq->dl.overloaded)
- dl_set_overload(rq);
- + cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
- if (rq->dl.dl_nr_running > 0)
- cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
- }
- @@ -1692,9 +1765,10 @@
- dl_clear_overload(rq);
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
- + cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
- }
- -void init_sched_dl_class(void)
- +void __init init_sched_dl_class(void)
- {
- unsigned int i;
- @@ -1726,8 +1800,7 @@
- if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
- return;
- - if (pull_dl_task(rq))
- - resched_curr(rq);
- + queue_pull_task(rq);
- }
- /*
- @@ -1736,28 +1809,15 @@
- */
- static void switched_to_dl(struct rq *rq, struct task_struct *p)
- {
- - int check_resched = 1;
- -
- - /*
- - * If p is throttled, don't consider the possibility
- - * of preempting rq->curr, the check will be done right
- - * after its runtime will get replenished.
- - */
- - if (unlikely(p->dl.dl_throttled))
- - return;
- -
- if (task_on_rq_queued(p) && rq->curr != p) {
- #ifdef CONFIG_SMP
- - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
- - /* Only reschedule if pushing failed */
- - check_resched = 0;
- -#endif /* CONFIG_SMP */
- - if (check_resched) {
- - if (dl_task(rq->curr))
- - check_preempt_curr_dl(rq, p, 0);
- - else
- - resched_curr(rq);
- - }
- + if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
- + queue_push_tasks(rq);
- +#endif
- + if (dl_task(rq->curr))
- + check_preempt_curr_dl(rq, p, 0);
- + else
- + resched_curr(rq);
- }
- }
- @@ -1777,15 +1837,14 @@
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- - pull_dl_task(rq);
- + queue_pull_task(rq);
- /*
- * If we now have a earlier deadline task than p,
- * then reschedule, provided p is still on this
- * runqueue.
- */
- - if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
- - rq->curr == p)
- + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
- resched_curr(rq);
- #else
- /*
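- The switched_from_dl()/prio_changed_dl() changes above stop calling pull_dl_task()/push_dl_task() directly under the rq lock and instead go through queue_pull_task()/queue_push_tasks(). Those wrappers live in kernel/sched/sched.h (outside this diff) and defer the work onto the rq's balance_callback list, which runs once the lock is about to be released. A rough sketch of the underlying helper, assuming this tree follows the upstream layout:
- static inline void
- queue_balance_callback(struct rq *rq, struct callback_head *head,
-                        void (*func)(struct rq *rq))
- {
-         lockdep_assert_held(&rq->lock);
- 
-         /* nothing to do if this callback is already queued */
-         if (unlikely(head->next))
-                 return;
- 
-         head->func = (void (*)(struct callback_head *))func;
-         head->next = rq->balance_callback;
-         rq->balance_callback = head;
- }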
- diff -Nur /home/ninez/android/marlin/kernel/sched/debug.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/debug.c
- --- /home/ninez/android/marlin/kernel/sched/debug.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/debug.c 2018-08-26 16:43:11.647206295 -0400
- @@ -65,8 +65,12 @@
- #define P(F) \
- SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
- +#define P_SCHEDSTAT(F) \
- + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
- #define PN(F) \
- SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
- +#define PN_SCHEDSTAT(F) \
- + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
- if (!se)
- return;
- @@ -74,25 +78,27 @@
- PN(se->exec_start);
- PN(se->vruntime);
- PN(se->sum_exec_runtime);
- -#ifdef CONFIG_SCHEDSTATS
- - PN(se->statistics.wait_start);
- - PN(se->statistics.sleep_start);
- - PN(se->statistics.block_start);
- - PN(se->statistics.sleep_max);
- - PN(se->statistics.block_max);
- - PN(se->statistics.exec_max);
- - PN(se->statistics.slice_max);
- - PN(se->statistics.wait_max);
- - PN(se->statistics.wait_sum);
- - P(se->statistics.wait_count);
- -#endif
- + if (schedstat_enabled()) {
- + PN_SCHEDSTAT(se->statistics.wait_start);
- + PN_SCHEDSTAT(se->statistics.sleep_start);
- + PN_SCHEDSTAT(se->statistics.block_start);
- + PN_SCHEDSTAT(se->statistics.sleep_max);
- + PN_SCHEDSTAT(se->statistics.block_max);
- + PN_SCHEDSTAT(se->statistics.exec_max);
- + PN_SCHEDSTAT(se->statistics.slice_max);
- + PN_SCHEDSTAT(se->statistics.wait_max);
- + PN_SCHEDSTAT(se->statistics.wait_sum);
- + P_SCHEDSTAT(se->statistics.wait_count);
- + }
- P(se->load.weight);
- #ifdef CONFIG_SMP
- P(se->avg.load_avg);
- P(se->avg.util_avg);
- - P(se->avg.util_est);
- #endif
- +
- +#undef PN_SCHEDSTAT
- #undef PN
- +#undef P_SCHEDSTAT
- #undef P
- }
- #endif
- @@ -123,13 +129,17 @@
- (long long)(p->nvcsw + p->nivcsw),
- p->prio);
- #ifdef CONFIG_SCHEDSTATS
- + if (schedstat_enabled()) {
- + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- + SPLIT_NS(p->se.statistics.wait_sum),
- + SPLIT_NS(p->se.sum_exec_runtime),
- + SPLIT_NS(p->se.statistics.sum_sleep_runtime));
- + }
- +#else
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- - SPLIT_NS(p->se.vruntime),
- + 0LL, 0L,
- SPLIT_NS(p->se.sum_exec_runtime),
- - SPLIT_NS(p->se.statistics.sum_sleep_runtime));
- -#else
- - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
- - 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
- + 0LL, 0L);
- #endif
- #ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d", task_node(p));
- @@ -148,7 +158,7 @@
- SEQ_printf(m,
- "\nrunnable tasks:\n"
- " task PID tree-key switches prio"
- - " exec-runtime sum-exec sum-sleep\n"
- + " wait-time sum-exec sum-sleep\n"
- "------------------------------------------------------"
- "----------------------------------------------------\n");
- @@ -210,8 +220,6 @@
- cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
- cfs_rq->avg.util_avg);
- - SEQ_printf(m, " .%-30s: %lu\n", "util_est",
- - cfs_rq->avg.util_est);
- SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
- atomic_long_read(&cfs_rq->removed_load_avg));
- SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
- @@ -297,6 +305,7 @@
- PN(next_balance);
- SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
- PN(clock);
- + PN(clock_task);
- P(cpu_load[0]);
- P(cpu_load[1]);
- P(cpu_load[2]);
- @@ -305,25 +314,23 @@
- #undef P
- #undef PN
- -#ifdef CONFIG_SCHEDSTATS
- -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
- -#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
- -
- - P(yld_count);
- -
- - P(sched_count);
- - P(sched_goidle);
- #ifdef CONFIG_SMP
- +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
- P64(avg_idle);
- P64(max_idle_balance_cost);
- +#undef P64
- #endif
- - P(ttwu_count);
- - P(ttwu_local);
- -
- +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
- + if (schedstat_enabled()) {
- + P(yld_count);
- + P(sched_count);
- + P(sched_goidle);
- + P(ttwu_count);
- + P(ttwu_local);
- + }
- #undef P
- -#undef P64
- -#endif
- +
- spin_lock_irqsave(&sched_debug_lock, flags);
- print_cfs_stats(m, cpu);
- print_rt_stats(m, cpu);
- @@ -556,10 +563,14 @@
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
- #define P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
- +#define P_SCHEDSTAT(F) \
- + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
- #define __PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
- #define PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
- +#define PN_SCHEDSTAT(F) \
- + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
- PN(se.exec_start);
- PN(se.vruntime);
- @@ -567,38 +578,66 @@
- nr_switches = p->nvcsw + p->nivcsw;
- -#ifdef CONFIG_SCHEDSTATS
- - PN(se.statistics.wait_start);
- - PN(se.statistics.sleep_start);
- - PN(se.statistics.block_start);
- - PN(se.statistics.sleep_max);
- - PN(se.statistics.block_max);
- - PN(se.statistics.exec_max);
- - PN(se.statistics.slice_max);
- - PN(se.statistics.wait_max);
- - PN(se.statistics.wait_sum);
- - P(se.statistics.wait_count);
- - PN(se.statistics.iowait_sum);
- - P(se.statistics.iowait_count);
- +
- P(se.nr_migrations);
- - P(se.statistics.nr_migrations_cold);
- - P(se.statistics.nr_failed_migrations_affine);
- - P(se.statistics.nr_failed_migrations_running);
- - P(se.statistics.nr_failed_migrations_hot);
- - P(se.statistics.nr_forced_migrations);
- - P(se.statistics.nr_wakeups);
- - P(se.statistics.nr_wakeups_sync);
- - P(se.statistics.nr_wakeups_migrate);
- - P(se.statistics.nr_wakeups_local);
- - P(se.statistics.nr_wakeups_remote);
- - P(se.statistics.nr_wakeups_affine);
- - P(se.statistics.nr_wakeups_affine_attempts);
- - P(se.statistics.nr_wakeups_passive);
- - P(se.statistics.nr_wakeups_idle);
- - {
- + if (schedstat_enabled()) {
- u64 avg_atom, avg_per_cpu;
- + PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
- + PN_SCHEDSTAT(se.statistics.wait_start);
- + PN_SCHEDSTAT(se.statistics.sleep_start);
- + PN_SCHEDSTAT(se.statistics.block_start);
- + PN_SCHEDSTAT(se.statistics.sleep_max);
- + PN_SCHEDSTAT(se.statistics.block_max);
- + PN_SCHEDSTAT(se.statistics.exec_max);
- + PN_SCHEDSTAT(se.statistics.slice_max);
- + PN_SCHEDSTAT(se.statistics.wait_max);
- + PN_SCHEDSTAT(se.statistics.wait_sum);
- + P_SCHEDSTAT(se.statistics.wait_count);
- + PN_SCHEDSTAT(se.statistics.iowait_sum);
- + P_SCHEDSTAT(se.statistics.iowait_count);
- + P_SCHEDSTAT(se.statistics.nr_migrations_cold);
- + P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
- + P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
- + P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
- + P_SCHEDSTAT(se.statistics.nr_forced_migrations);
- + P_SCHEDSTAT(se.statistics.nr_wakeups);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_local);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
- + /* eas */
- + /* select_idle_sibling() */
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_cache_affine);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_suff_cap);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_idle_cpu);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_sis_count);
- + /* select_energy_cpu_brute() */
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_sync);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_idle_bt);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_insuff_cap);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_no_nrg_sav);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_nrg_sav);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_secb_count);
- + /* find_best_target() */
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_cpu);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_no_sd);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_pref_idle);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_fbt_count);
- + /* cas */
- + /* select_task_rq_fair() */
- + P_SCHEDSTAT(se.statistics.nr_wakeups_cas_attempts);
- + P_SCHEDSTAT(se.statistics.nr_wakeups_cas_count);
- +
- avg_atom = p->se.sum_exec_runtime;
- if (nr_switches)
- avg_atom = div64_ul(avg_atom, nr_switches);
- @@ -616,7 +655,7 @@
- __PN(avg_atom);
- __PN(avg_per_cpu);
- }
- -#endif
- +
- __P(nr_switches);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "nr_voluntary_switches", (long long)p->nvcsw);
- @@ -629,13 +668,14 @@
- P(se.avg.util_sum);
- P(se.avg.load_avg);
- P(se.avg.util_avg);
- - P(se.avg.util_est);
- P(se.avg.last_update_time);
- #endif
- P(policy);
- P(prio);
- +#undef PN_SCHEDSTAT
- #undef PN
- #undef __PN
- +#undef P_SCHEDSTAT
- #undef P
- #undef __P
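- The debug.c rework leans on schedstat_enabled() and schedstat_val(), which keep the statistics compiled in but skippable at run time behind a static key. They are defined in kernel/sched/stats.h rather than in this diff; upstream the definitions are roughly as below (the exact variant in this tree may differ slightly):
- #ifdef CONFIG_SCHEDSTATS
- extern struct static_key_false sched_schedstats;
- #define schedstat_enabled()      static_branch_unlikely(&sched_schedstats)
- #define schedstat_inc(var)       do { if (schedstat_enabled()) { var++; } } while (0)
- #define schedstat_add(var, amt)  do { if (schedstat_enabled()) { var += (amt); } } while (0)
- #define schedstat_set(var, val)  do { if (schedstat_enabled()) { var = (val); } } while (0)
- #define schedstat_val(var)       (var)
- #else /* !CONFIG_SCHEDSTATS */
- #define schedstat_enabled()      0
- #define schedstat_inc(var)       do { } while (0)
- #define schedstat_add(var, amt)  do { } while (0)
- #define schedstat_set(var, val)  do { } while (0)
- #define schedstat_val(var)       0
- #endif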
- diff -Nur /home/ninez/android/marlin/kernel/sched/energy.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/energy.c
- --- /home/ninez/android/marlin/kernel/sched/energy.c 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/energy.c 2018-08-11 23:57:17.128607487 -0400
- @@ -27,8 +27,6 @@
- #include <linux/sched_energy.h>
- #include <linux/stddef.h>
- -#include "sched.h"
- -
- struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
- static void free_resources(void)
- @@ -61,14 +59,12 @@
- for_each_possible_cpu(cpu) {
- cn = of_get_cpu_node(cpu, NULL);
- if (!cn) {
- - if (sched_feat(ENERGY_AWARE))
- - pr_warn("CPU device node missing for CPU %d\n", cpu);
- + pr_warn("CPU device node missing for CPU %d\n", cpu);
- return;
- }
- if (!of_find_property(cn, "sched-energy-costs", NULL)) {
- - if (sched_feat(ENERGY_AWARE))
- - pr_warn("CPU device node has no sched-energy-costs\n");
- + pr_warn("CPU device node has no sched-energy-costs\n");
- return;
- }
- @@ -79,8 +75,7 @@
- prop = of_find_property(cp, "busy-cost-data", NULL);
- if (!prop || !prop->value) {
- - if (sched_feat(ENERGY_AWARE))
- - pr_warn("No busy-cost data, skipping sched_energy init\n");
- + pr_warn("No busy-cost data, skipping sched_energy init\n");
- goto out;
- }
- @@ -102,8 +97,7 @@
- prop = of_find_property(cp, "idle-cost-data", NULL);
- if (!prop || !prop->value) {
- - if (sched_feat(ENERGY_AWARE))
- - pr_warn("No idle-cost data, skipping sched_energy init\n");
- + pr_warn("No idle-cost data, skipping sched_energy init\n");
- goto out;
- }
- @@ -123,7 +117,6 @@
- }
- pr_info("Sched-energy-costs installed from DT\n");
- - set_energy_aware();
- return;
- out:
- diff -Nur /home/ninez/android/marlin/kernel/sched/fair.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/fair.c
- --- /home/ninez/android/marlin/kernel/sched/fair.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/fair.c 2018-08-26 16:43:11.647206295 -0400
- @@ -20,8 +20,8 @@
- */
- -#include <linux/latencytop.h>
- #include <linux/sched.h>
- +#include <linux/latencytop.h>
- #include <linux/cpumask.h>
- #include <linux/cpuidle.h>
- #include <linux/slab.h>
- @@ -53,14 +53,18 @@
- unsigned int sysctl_sched_latency = 6000000ULL;
- unsigned int normalized_sysctl_sched_latency = 6000000ULL;
- -unsigned int sysctl_sched_is_big_little = 0;
- unsigned int sysctl_sched_sync_hint_enable = 1;
- unsigned int sysctl_sched_initial_task_util = 0;
- unsigned int sysctl_sched_cstate_aware = 1;
- #ifdef CONFIG_SCHED_WALT
- +#ifdef CONFIG_SCHED_WALT_DEFAULT
- unsigned int sysctl_sched_use_walt_cpu_util = 1;
- unsigned int sysctl_sched_use_walt_task_util = 1;
- +#else
- +unsigned int sysctl_sched_use_walt_cpu_util = 0;
- +unsigned int sysctl_sched_use_walt_task_util = 0;
- +#endif
- __read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
- (10 * NSEC_PER_MSEC);
- #endif
- @@ -128,6 +132,12 @@
- unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
- #endif
- +/*
- + * The margin used when comparing utilization with CPU capacity:
- + * util * margin < capacity * 1024
- + */
- +unsigned int capacity_margin = 1280; /* ~20% */
- +
- static inline void update_load_add(struct load_weight *lw, unsigned long inc)
- {
- lw->weight += inc;
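- The new capacity_margin constant encodes roughly 20% of headroom: utilization u is treated as fitting a CPU of capacity c only when u * 1280 < c * 1024, i.e. u < 0.8 * c, so on a 1024-capacity core anything above ~819 is considered too big. fair.c open-codes that comparison at its call sites; a hypothetical helper, named here purely for illustration, would read:
- static inline bool fits_with_margin(unsigned long util, unsigned long cap)
- {
-         /* util * margin < capacity * 1024, as per the comment above */
-         return util * capacity_margin < cap * SCHED_CAPACITY_SCALE;
- }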
- @@ -155,9 +165,9 @@
- *
- * This idea comes from the SD scheduler of Con Kolivas:
- */
- -static int get_update_sysctl_factor(void)
- +static unsigned int get_update_sysctl_factor(void)
- {
- - unsigned int cpus = min_t(int, num_online_cpus(), 8);
- + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
- unsigned int factor;
- switch (sysctl_sched_tunable_scaling) {
- @@ -270,9 +280,7 @@
- static inline struct task_struct *task_of(struct sched_entity *se)
- {
- -#ifdef CONFIG_SCHED_DEBUG
- - WARN_ON_ONCE(!entity_is_task(se));
- -#endif
- + SCHED_WARN_ON(!entity_is_task(se));
- return container_of(se, struct task_struct, se);
- }
- @@ -300,19 +308,59 @@
- static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
- {
- if (!cfs_rq->on_list) {
- + struct rq *rq = rq_of(cfs_rq);
- + int cpu = cpu_of(rq);
- /*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- - * enqueued. The fact that we always enqueue bottom-up
- - * reduces this to two cases.
- + * enqueued. The fact that we always enqueue bottom-up
- + * reduces this to two cases and a special case for the root
- + * cfs_rq. Furthermore, it also means that we will always reset
- + * tmp_alone_branch either when the branch is connected
- + * to a tree or when we reach the beg of the tree
- */
- if (cfs_rq->tg->parent &&
- - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- - list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- - &rq_of(cfs_rq)->leaf_cfs_rq_list);
- - } else {
- + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
- + /*
- + * If parent is already on the list, we add the child
- + * just before. Thanks to circular linked property of
- + * the list, this means to put the child at the tail
- + * of the list that starts by parent.
- + */
- + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
- + /*
- + * The branch is now connected to its tree so we can
- + * reset tmp_alone_branch to the beginning of the
- + * list.
- + */
- + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- + } else if (!cfs_rq->tg->parent) {
- + /*
- + * cfs rq without parent should be put
- + * at the tail of the list.
- + */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- - &rq_of(cfs_rq)->leaf_cfs_rq_list);
- + &rq->leaf_cfs_rq_list);
- + /*
- + * We have reach the beg of a tree so we can reset
- + * tmp_alone_branch to the beginning of the list.
- + */
- + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- + } else {
- + /*
- + * The parent has not already been added so we want to
- + * make sure that it will be put after us.
- + * tmp_alone_branch points to the beg of the branch
- + * where we will add parent.
- + */
- + list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- + rq->tmp_alone_branch);
- + /*
- + * update tmp_alone_branch to points to the new beg
- + * of the branch
- + */
- + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
- }
- cfs_rq->on_list = 1;
- @@ -470,17 +518,23 @@
- static void update_min_vruntime(struct cfs_rq *cfs_rq)
- {
- + struct sched_entity *curr = cfs_rq->curr;
- +
- u64 vruntime = cfs_rq->min_vruntime;
- - if (cfs_rq->curr)
- - vruntime = cfs_rq->curr->vruntime;
- + if (curr) {
- + if (curr->on_rq)
- + vruntime = curr->vruntime;
- + else
- + curr = NULL;
- + }
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
- - if (!cfs_rq->curr)
- + if (!curr)
- vruntime = se->vruntime;
- else
- vruntime = min_vruntime(vruntime, se->vruntime);
- @@ -585,7 +639,7 @@
- loff_t *ppos)
- {
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- - int factor = get_update_sysctl_factor();
- + unsigned int factor = get_update_sysctl_factor();
- if (ret || !write)
- return ret;
- @@ -670,16 +724,17 @@
- }
- #ifdef CONFIG_SMP
- -static int select_idle_sibling(struct task_struct *p, int cpu);
- +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
- static unsigned long task_h_load(struct task_struct *p);
- /*
- * We choose a half-life close to 1 scheduling period.
- - * Note: The tables below are dependent on this value.
- + * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
- + * dependent on this value.
- */
- -#define LOAD_AVG_PERIOD 16
- -#define LOAD_AVG_MAX 24117 /* maximum possible load avg */
- -#define LOAD_AVG_MAX_N 172 /* number of full periods to produce LOAD_AVG_MAX */
- +#define LOAD_AVG_PERIOD 32
- +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
- +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
- /* Give new sched_entity start runnable values to heavy its load in infant time */
- void init_entity_runnable_average(struct sched_entity *se)
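- These constants restore the upstream 32-period PELT series: the decay factor y is defined by y^32 = 1/2, so a contribution halves every 32 periods of roughly 1 ms. In exact arithmetic the saturated sum would be
-     y^32 = 1/2   =>   y = 2^(-1/32) ≈ 0.97857
-     1024 * (1 + y + y^2 + ...) = 1024 / (1 - y) ≈ 47.8k
- and with the kernel's 32-bit fixed-point decay the series settles at LOAD_AVG_MAX = 47742, reached (to within rounding) after LOAD_AVG_MAX_N = 345 full periods. The previous 16/24117/172 values correspond to the same series with a shorter, 16-period half-life.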
- @@ -693,23 +748,117 @@
- * will definitely be update (after enqueue).
- */
- sa->period_contrib = 1023;
- - sa->load_avg = scale_load_down(se->load.weight);
- + /*
- + * Tasks are intialized with full load to be seen as heavy tasks until
- + * they get a chance to stabilize to their real load level.
- + * Group entities are intialized with zero load to reflect the fact that
- + * nothing has been attached to the task group yet.
- + */
- + if (entity_is_task(se))
- + sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- - sa->util_avg = sched_freq() ?
- - sysctl_sched_initial_task_util :
- - scale_load_down(SCHED_LOAD_SCALE);
- - sa->util_est = sa->util_avg;
- - sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
- + /*
- + * In previous Android versions, we used to have:
- + * sa->util_avg = sched_freq() ?
- + * sysctl_sched_initial_task_util :
- + * scale_load_down(SCHED_LOAD_SCALE);
- + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
- + * However, that functionality has been moved to enqueue.
- + * It is unclear if we should restore this in enqueue.
- + */
- + /*
- + * At this point, util_avg won't be used in select_task_rq_fair anyway
- + */
- + sa->util_avg = 0;
- + sa->util_sum = 0;
- /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
- }
- -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
- -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
- +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
- +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
- +static void attach_entity_cfs_rq(struct sched_entity *se);
- +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
- +
- +/*
- + * With new tasks being created, their initial util_avgs are extrapolated
- + * based on the cfs_rq's current util_avg:
- + *
- + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
- + *
- + * However, in many cases, the above util_avg does not give a desired
- + * value. Moreover, the sum of the util_avgs may be divergent, such
- + * as when the series is a harmonic series.
- + *
- + * To solve this problem, we also cap the util_avg of successive tasks to
- + * only 1/2 of the left utilization budget:
- + *
- + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
- + *
- + * where n denotes the nth task.
- + *
- + * For example, a simplest series from the beginning would be like:
- + *
- + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
- + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
- + *
- + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
- + * if util_avg > util_avg_cap.
- + */
- +void post_init_entity_util_avg(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = cfs_rq_of(se);
- + struct sched_avg *sa = &se->avg;
- + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
- +
- + if (cap > 0) {
- + if (cfs_rq->avg.util_avg != 0) {
- + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- + sa->util_avg /= (cfs_rq->avg.load_avg + 1);
- +
- + if (sa->util_avg > cap)
- + sa->util_avg = cap;
- + } else {
- + sa->util_avg = cap;
- + }
- + /*
- + * If we wish to restore tuning via setting initial util,
- + * this is where we should do it.
- + */
- + sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
- + }
- +
- + if (entity_is_task(se)) {
- + struct task_struct *p = task_of(se);
- + if (p->sched_class != &fair_sched_class) {
- + /*
- + * For !fair tasks do:
- + *
- + update_cfs_rq_load_avg(now, cfs_rq, false);
- + attach_entity_load_avg(cfs_rq, se);
- + switched_from_fair(rq, p);
- + *
- + * such that the next switched_to_fair() has the
- + * expected state.
- + */
- + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
- + return;
- + }
- + }
- +
- + attach_entity_cfs_rq(se);
- +}
- +
- #else
- void init_entity_runnable_average(struct sched_entity *se)
- {
- }
- -#endif
- +void post_init_entity_util_avg(struct sched_entity *se)
- +{
- +}
- +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
- +{
- +}
- +#endif /* CONFIG_SMP */
- /*
- * Update the current task's runtime statistics.
- @@ -733,7 +882,7 @@
- max(delta_exec, curr->statistics.exec_max));
- curr->sum_exec_runtime += delta_exec;
- - schedstat_add(cfs_rq, exec_clock, delta_exec);
- + schedstat_add(cfs_rq->exec_clock, delta_exec);
- curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_min_vruntime(cfs_rq);
- @@ -757,48 +906,165 @@
- static inline void
- update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
- + u64 wait_start, prev_wait_start;
- +
- + if (!schedstat_enabled())
- + return;
- +
- + wait_start = rq_clock(rq_of(cfs_rq));
- + prev_wait_start = schedstat_val(se->statistics.wait_start);
- +
- + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- + likely(wait_start > prev_wait_start))
- + wait_start -= prev_wait_start;
- +
- + schedstat_set(se->statistics.wait_start, wait_start);
- +}
- +
- +static inline void
- +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +{
- + struct task_struct *p;
- + u64 delta;
- +
- + if (!schedstat_enabled())
- + return;
- +
- + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
- +
- + if (entity_is_task(se)) {
- + p = task_of(se);
- + if (task_on_rq_migrating(p)) {
- + /*
- + * Preserve migrating task's wait time so wait_start
- + * time stamp can be adjusted to accumulate wait time
- + * prior to migration.
- + */
- + schedstat_set(se->statistics.wait_start, delta);
- + return;
- + }
- + trace_sched_stat_wait(p, delta);
- + }
- +
- + schedstat_set(se->statistics.wait_max,
- + max(schedstat_val(se->statistics.wait_max), delta));
- + schedstat_inc(se->statistics.wait_count);
- + schedstat_add(se->statistics.wait_sum, delta);
- + schedstat_set(se->statistics.wait_start, 0);
- +}
- +
- +static inline void
- +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +{
- + struct task_struct *tsk = NULL;
- + u64 sleep_start, block_start;
- +
- + if (!schedstat_enabled())
- + return;
- +
- + sleep_start = schedstat_val(se->statistics.sleep_start);
- + block_start = schedstat_val(se->statistics.block_start);
- +
- + if (entity_is_task(se))
- + tsk = task_of(se);
- +
- + if (sleep_start) {
- + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
- +
- + if ((s64)delta < 0)
- + delta = 0;
- +
- + if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- + schedstat_set(se->statistics.sleep_max, delta);
- +
- + schedstat_set(se->statistics.sleep_start, 0);
- + schedstat_add(se->statistics.sum_sleep_runtime, delta);
- +
- + if (tsk) {
- + account_scheduler_latency(tsk, delta >> 10, 1);
- + trace_sched_stat_sleep(tsk, delta);
- + }
- + }
- + if (block_start) {
- + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
- +
- + if ((s64)delta < 0)
- + delta = 0;
- +
- + if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- + schedstat_set(se->statistics.block_max, delta);
- +
- + schedstat_set(se->statistics.block_start, 0);
- + schedstat_add(se->statistics.sum_sleep_runtime, delta);
- +
- + if (tsk) {
- + if (tsk->in_iowait) {
- + schedstat_add(se->statistics.iowait_sum, delta);
- + schedstat_inc(se->statistics.iowait_count);
- + trace_sched_stat_iowait(tsk, delta);
- + }
- +
- + trace_sched_stat_blocked(tsk, delta);
- +
- + /*
- + * Blocking time is in units of nanosecs, so shift by
- + * 20 to get a milliseconds-range estimation of the
- + * amount of time that the task spent sleeping:
- + */
- + if (unlikely(prof_on == SLEEP_PROFILING)) {
- + profile_hits(SLEEP_PROFILING,
- + (void *)get_wchan(tsk),
- + delta >> 20);
- + }
- + account_scheduler_latency(tsk, delta >> 10, 0);
- + }
- + }
- }
- /*
- * Task is being enqueued - update stats:
- */
- -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +static inline void
- +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
- + if (!schedstat_enabled())
- + return;
- +
- /*
- * Are we enqueueing a waiting task? (for current tasks
- * a dequeue/enqueue event is a NOP)
- */
- if (se != cfs_rq->curr)
- update_stats_wait_start(cfs_rq, se);
- -}
- -static void
- -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
- - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- -#ifdef CONFIG_SCHEDSTATS
- - if (entity_is_task(se)) {
- - trace_sched_stat_wait(task_of(se),
- - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- - }
- -#endif
- - schedstat_set(se->statistics.wait_start, 0);
- + if (flags & ENQUEUE_WAKEUP)
- + update_stats_enqueue_sleeper(cfs_rq, se);
- }
- static inline void
- -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
- +
- + if (!schedstat_enabled())
- + return;
- +
- /*
- * Mark the end of the wait period if dequeueing a
- * waiting task:
- */
- if (se != cfs_rq->curr)
- update_stats_wait_end(cfs_rq, se);
- +
- + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
- + struct task_struct *tsk = task_of(se);
- +
- + if (tsk->state & TASK_INTERRUPTIBLE)
- + schedstat_set(se->statistics.sleep_start,
- + rq_clock(rq_of(cfs_rq)));
- + if (tsk->state & TASK_UNINTERRUPTIBLE)
- + schedstat_set(se->statistics.block_start,
- + rq_clock(rq_of(cfs_rq)));
- + }
- }
- /*
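- One subtlety in the wait-time accounting above is how a runnable task migrated by the load balancer is handled. Worked example: the task starts waiting at t = 100 on CPU0 and is detached at t = 130; update_stats_wait_end() sees task_on_rq_migrating() and stashes the accumulated delta (30) back into wait_start instead of charging it. When the task is enqueued on CPU1 at t = 135, update_stats_wait_start() again sees the migrating state and records wait_start = 135 - 30 = 105, so if the wait ends at t = 150 the full 45 units land in wait_sum rather than only the 15 spent on CPU1.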
- @@ -1309,8 +1575,16 @@
- * One idle CPU per node is evaluated for a task numa move.
- * Call select_idle_sibling to maybe find a better one.
- */
- - if (!cur)
- - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
- + if (!cur) {
- + /*
- + * select_idle_siblings() uses an per-cpu cpumask that
- + * can be used from IRQ context.
- + */
- + local_irq_disable();
- + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
- + env->dst_cpu);
- + local_irq_enable();
- + }
- assign:
- task_numa_assign(env, cur, imp);
- @@ -1612,6 +1886,11 @@
- u64 runtime, period;
- spinlock_t *group_lock = NULL;
- + /*
- + * The p->mm->numa_scan_seq field gets updated without
- + * exclusive access. Use READ_ONCE() here to ensure
- + * that the field is read in a single access:
- + */
- seq = READ_ONCE(p->mm->numa_scan_seq);
- if (p->numa_scan_seq == seq)
- return;
- @@ -1857,7 +2136,7 @@
- int local = !!(flags & TNF_FAULT_LOCAL);
- int priv;
- - if (!numabalancing_enabled)
- + if (!static_branch_likely(&sched_numa_balancing))
- return;
- /* for example, ksmd faulting in a user's mm */
- @@ -1929,6 +2208,14 @@
- static void reset_ptenuma_scan(struct task_struct *p)
- {
- + /*
- + * We only did a read acquisition of the mmap sem, so
- + * p->mm->numa_scan_seq is written to without exclusive access
- + * and the update is not guaranteed to be atomic. That's not
- + * much of an issue though, since this is just used for
- + * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
- + * expensive, to avoid any form of compiler optimizations:
- + */
- WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
- p->mm->numa_scan_offset = 0;
- }
- @@ -1945,9 +2232,9 @@
- struct vm_area_struct *vma;
- unsigned long start, end;
- unsigned long nr_pte_updates = 0;
- - long pages;
- + long pages, virtpages;
- - WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
- + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
- work->next = work; /* protect against double add */
- /*
- @@ -1991,9 +2278,11 @@
- start = mm->numa_scan_offset;
- pages = sysctl_numa_balancing_scan_size;
- pages <<= 20 - PAGE_SHIFT; /* MB in pages */
- + virtpages = pages * 8; /* Scan up to this much virtual space */
- if (!pages)
- return;
- +
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, start);
- if (!vma) {
- @@ -2003,7 +2292,7 @@
- }
- for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
- - is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
- + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
- continue;
- }
- @@ -2028,18 +2317,22 @@
- start = max(start, vma->vm_start);
- end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
- end = min(end, vma->vm_end);
- - nr_pte_updates += change_prot_numa(vma, start, end);
- + nr_pte_updates = change_prot_numa(vma, start, end);
- /*
- - * Scan sysctl_numa_balancing_scan_size but ensure that
- - * at least one PTE is updated so that unused virtual
- - * address space is quickly skipped.
- + * Try to scan sysctl_numa_balancing_size worth of
- + * hpages that have at least one present PTE that
- + * is not already pte-numa. If the VMA contains
- + * areas that are unused or already full of prot_numa
- + * PTEs, scan up to virtpages, to skip through those
- + * areas faster.
- */
- if (nr_pte_updates)
- pages -= (end - start) >> PAGE_SHIFT;
- + virtpages -= (end - start) >> PAGE_SHIFT;
- start = end;
- - if (pages <= 0)
- + if (pages <= 0 || virtpages <= 0)
- goto out;
- cond_resched();
- @@ -2140,28 +2433,22 @@
- #ifdef CONFIG_FAIR_GROUP_SCHED
- # ifdef CONFIG_SMP
- -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
- +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
- {
- - long tg_weight;
- + long tg_weight, load, shares;
- /*
- - * Use this CPU's real-time load instead of the last load contribution
- - * as the updating of the contribution is delayed, and we will use the
- - * the real-time load to calc the share. See update_tg_load_avg().
- + * This really should be: cfs_rq->avg.load_avg, but instead we use
- + * cfs_rq->load.weight, which is its upper bound. This helps ramp up
- + * the shares for small weight interactive tasks.
- */
- - tg_weight = atomic_long_read(&tg->load_avg);
- - tg_weight -= cfs_rq->tg_load_avg_contrib;
- - tg_weight += cfs_rq->load.weight;
- + load = scale_load_down(cfs_rq->load.weight);
- - return tg_weight;
- -}
- -
- -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
- -{
- - long tg_weight, load, shares;
- + tg_weight = atomic_long_read(&tg->load_avg);
- - tg_weight = calc_tg_weight(tg, cfs_rq);
- - load = cfs_rq->load.weight;
- + /* Ensure tg_weight >= load */
- + tg_weight -= cfs_rq->tg_load_avg_contrib;
- + tg_weight += load;
- shares = (tg->shares * load);
- if (tg_weight)
- @@ -2198,16 +2485,20 @@
- static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
- -static void update_cfs_shares(struct cfs_rq *cfs_rq)
- +static void update_cfs_shares(struct sched_entity *se)
- {
- + struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg;
- - struct sched_entity *se;
- long shares;
- - tg = cfs_rq->tg;
- - se = tg->se[cpu_of(rq_of(cfs_rq))];
- - if (!se || throttled_hierarchy(cfs_rq))
- + if (!cfs_rq)
- + return;
- +
- + if (throttled_hierarchy(cfs_rq))
- return;
- +
- + tg = cfs_rq->tg;
- +
- #ifndef CONFIG_SMP
- if (likely(se->load.weight == tg->shares))
- return;
- @@ -2216,8 +2507,9 @@
- reweight_entity(cfs_rq_of(se), se, shares);
- }
- +
- #else /* CONFIG_FAIR_GROUP_SCHED */
- -static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
- +static inline void update_cfs_shares(struct sched_entity *se)
- {
- }
- #endif /* CONFIG_FAIR_GROUP_SCHED */
- @@ -2225,8 +2517,12 @@
- #ifdef CONFIG_SMP
- /* Precomputed fixed inverse multiplies for multiplication by y^n */
- static const u32 runnable_avg_yN_inv[] = {
- - 0xffff, 0xf524, 0xeabf, 0xe0cb, 0xd744, 0xce23, 0xc566, 0xbd07,
- - 0xb504, 0xad57, 0xa5fe, 0x9ef4, 0x9837, 0x91c3, 0x8b95, 0x85aa,
- + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
- + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
- + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
- + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
- + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
- + 0x85aac367, 0x82cd8698,
- };
- /*
- @@ -2234,8 +2530,19 @@
- * over-estimates when re-combining.
- */
- static const u32 runnable_avg_yN_sum[] = {
- - 0, 980, 1919, 2818, 3679, 4503, 5292, 6048, 6772, 7465, 8129,
- - 8764, 9373, 9956,10514,11048,11560,
- + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
- + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
- + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
- +};
- +
- +/*
- + * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- + * lower integers. See Documentation/scheduler/sched-avg.txt how these
- + * were generated:
- + */
- +static const u32 __accumulated_sum_N32[] = {
- + 0, 23371, 35056, 40899, 43820, 45281,
- + 46011, 46376, 46559, 46650, 46696, 46719,
- };
- /*
- @@ -2266,8 +2573,7 @@
- local_n %= LOAD_AVG_PERIOD;
- }
- - val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n],
- - LOAD_AVG_PERIOD);
- + val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
- return val;
- }
- @@ -2287,22 +2593,13 @@
- else if (unlikely(n >= LOAD_AVG_MAX_N))
- return LOAD_AVG_MAX;
- - /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
- - do {
- - contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
- - contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
- -
- - n -= LOAD_AVG_PERIOD;
- - } while (n > LOAD_AVG_PERIOD);
- -
- + /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- + contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- + n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
- }
- -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
- -#error "load tracking assumes 2^10 as unit"
- -#endif
- -
- #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
- /*
- @@ -2439,10 +2736,42 @@
- return decayed;
- }
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- - * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- - * and effective_load (which is not done because it is too costly).
- + * Signed add and clamp on underflow.
- + *
- + * Explicitly do a load-store to ensure the intermediate value never hits
- + * memory. This allows lockless observations without ever seeing the negative
- + * values.
- + */
- +#define add_positive(_ptr, _val) do { \
- + typeof(_ptr) ptr = (_ptr); \
- + typeof(_val) val = (_val); \
- + typeof(*ptr) res, var = READ_ONCE(*ptr); \
- + \
- + res = var + val; \
- + \
- + if (val < 0 && res > var) \
- + res = 0; \
- + \
- + WRITE_ONCE(*ptr, res); \
- +} while (0)
- +
- +#ifdef CONFIG_FAIR_GROUP_SCHED
- +/**
- + * update_tg_load_avg - update the tg's load avg
- + * @cfs_rq: the cfs_rq whose avg changed
- + * @force: update regardless of how small the difference
- + *
- + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
- + * However, because tg->load_avg is a global value there are performance
- + * considerations.
- + *
- + * In order to avoid having to look at the other cfs_rq's, we use a
- + * differential update where we store the last value we propagated. This in
- + * turn allows skipping updates if the differential is 'small'.
- + *
- + * Updating tg's load_avg is necessary before update_cfs_share() (which is
- + * done) and effective_load() (which is not done because it is too costly).
- */
- static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
- {
- @@ -2506,29 +2835,249 @@
- se->avg.last_update_time = n_last_update_time;
- }
- }
- +
- +/* Take into account change of utilization of a child task group */
- +static inline void
- +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +{
- + struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- +
- + /* Nothing to update */
- + if (!delta)
- + return;
- +
- + /* Set new sched_entity's utilization */
- + se->avg.util_avg = gcfs_rq->avg.util_avg;
- + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
- +
- + /* Update parent cfs_rq utilization */
- + add_positive(&cfs_rq->avg.util_avg, delta);
- + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
- +}
- +
- +/* Take into account change of load of a child task group */
- +static inline void
- +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
- +{
- + struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- + long delta, load = gcfs_rq->avg.load_avg;
- +
- + /*
- + * If the load of group cfs_rq is null, the load of the
- + * sched_entity will also be null so we can skip the formula
- + */
- + if (load) {
- + long tg_load;
- +
- + /* Get tg's load and ensure tg_load > 0 */
- + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
- +
- + /* Ensure tg_load >= load and updated with current load*/
- + tg_load -= gcfs_rq->tg_load_avg_contrib;
- + tg_load += load;
- +
- + /*
- + * We need to compute a correction term in the case that the
- + * task group is consuming more CPU than a task of equal
- + * weight. A task with a weight equals to tg->shares will have
- + * a load less or equal to scale_load_down(tg->shares).
- + * Similarly, the sched_entities that represent the task group
- + * at parent level, can't have a load higher than
- + * scale_load_down(tg->shares). And the Sum of sched_entities'
- + * load must be <= scale_load_down(tg->shares).
- + */
- + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- + /* scale gcfs_rq's load into tg's shares*/
- + load *= scale_load_down(gcfs_rq->tg->shares);
- + load /= tg_load;
- + }
- + }
- +
- + delta = load - se->avg.load_avg;
- +
- + /* Nothing to update */
- + if (!delta)
- + return;
- +
- + /* Set new sched_entity's load */
- + se->avg.load_avg = load;
- + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
- +
- + /* Update parent cfs_rq load */
- + add_positive(&cfs_rq->avg.load_avg, delta);
- + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
- +
- + /*
- + * If the sched_entity is already enqueued, we also have to update the
- + * runnable load avg.
- + */
- + if (se->on_rq) {
- + /* Update parent cfs_rq runnable_load_avg */
- + add_positive(&cfs_rq->runnable_load_avg, delta);
- + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
- + }
- +}
- +
- +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
- +{
- + cfs_rq->propagate_avg = 1;
- +}
- +
- +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = group_cfs_rq(se);
- +
- + if (!cfs_rq->propagate_avg)
- + return 0;
- +
- + cfs_rq->propagate_avg = 0;
- + return 1;
- +}
- +
- +/* Update task and its cfs_rq load average */
- +static inline int propagate_entity_load_avg(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq;
- +
- + if (entity_is_task(se))
- + return 0;
- +
- + if (!test_and_clear_tg_cfs_propagate(se))
- + return 0;
- +
- + cfs_rq = cfs_rq_of(se);
- +
- + set_tg_cfs_propagate(cfs_rq);
- +
- + update_tg_cfs_util(cfs_rq, se);
- + update_tg_cfs_load(cfs_rq, se);
- +
- + return 1;
- +}
- +
- +/*
- + * Check if we need to update the load and the utilization of a blocked
- + * group_entity:
- + */
- +static inline bool skip_blocked_update(struct sched_entity *se)
- +{
- + struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- +
- + /*
- + * If sched_entity still have not zero load or utilization, we have to
- + * decay it:
- + */
- + if (se->avg.load_avg || se->avg.util_avg)
- + return false;
- +
- + /*
- + * If there is a pending propagation, we have to update the load and
- + * the utilization of the sched_entity:
- + */
- + if (gcfs_rq->propagate_avg)
- + return false;
- +
- + /*
- + * Otherwise, the load and the utilization of the sched_entity is
- + * already zero and there is no pending propagation, so it will be a
- + * waste of time to try to decay it:
- + */
- + return true;
- +}
- +
- #else /* CONFIG_FAIR_GROUP_SCHED */
- +
- static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
- +
- +static inline int propagate_entity_load_avg(struct sched_entity *se)
- +{
- + return 0;
- +}
- +
- +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
- +
- #endif /* CONFIG_FAIR_GROUP_SCHED */
- +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
- +{
- + if (&this_rq()->cfs == cfs_rq) {
- + /*
- + * There are a few boundary cases this might miss but it should
- + * get called often enough that that should (hopefully) not be
- + * a real problem -- added to that it only calls on the local
- + * CPU, so if we enqueue remotely we'll miss an update, but
- + * the next tick/schedule should update.
- + *
- + * It will not get called when we go idle, because the idle
- + * thread is a different class (!fair), nor will the utilization
- + * number include things like RT tasks.
- + *
- + * As is, the util number is not freq-invariant (we'd have to
- + * implement arch_scale_freq_capacity() for that).
- + *
- + * See cpu_util().
- + */
- + cpufreq_update_util(rq_of(cfs_rq), 0);
- + }
- +}
- +
- static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
- -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
- -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
- +/*
- + * Unsigned subtract and clamp on underflow.
- + *
- + * Explicitly do a load-store to ensure the intermediate value never hits
- + * memory. This allows lockless observations without ever seeing the negative
- + * values.
- + */
- +#define sub_positive(_ptr, _val) do { \
- + typeof(_ptr) ptr = (_ptr); \
- + typeof(*ptr) val = (_val); \
- + typeof(*ptr) res, var = READ_ONCE(*ptr); \
- + res = var - val; \
- + if (res > var) \
- + res = 0; \
- + WRITE_ONCE(*ptr, res); \
- +} while (0)
- +
- +/**
- + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- + * @now: current time, as per cfs_rq_clock_task()
- + * @cfs_rq: cfs_rq to update
- + * @update_freq: should we call cfs_rq_util_change() or will the call do so
- + *
- + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
- + * avg. The immediate corollary is that all (fair) tasks must be attached, see
- + * post_init_entity_util_avg().
- + *
- + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
- + *
- + * Returns true if the load decayed or we removed load.
- + *
- + * Since both these conditions indicate a changed cfs_rq->avg.load we should
- + * call update_tg_load_avg() when this function returns true.
- + */
- +static inline int
- +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
- {
- struct sched_avg *sa = &cfs_rq->avg;
- - int decayed, removed = 0;
- + int decayed, removed = 0, removed_util = 0;
- if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- - sa->load_avg = max_t(long, sa->load_avg - r, 0);
- - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
- + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- + sub_positive(&sa->load_avg, r);
- + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
- removed = 1;
- + set_tg_cfs_propagate(cfs_rq);
- }
- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- - sa->util_avg = max_t(long, sa->util_avg - r, 0);
- - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
- + sub_positive(&sa->util_avg, r);
- + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
- + removed_util = 1;
- + set_tg_cfs_propagate(cfs_rq);
- }
- decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- @@ -2539,81 +3088,94 @@
- cfs_rq->load_last_update_time_copy = sa->last_update_time;
- #endif
- + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
- + if (cfs_rq == &rq_of(cfs_rq)->cfs)
- + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
- +
- + if (update_freq && (decayed || removed_util))
- + cfs_rq_util_change(cfs_rq);
- +
- return decayed || removed;
- }
- -static inline unsigned long task_util_est(struct task_struct *p)
- -{
- - return p->se.avg.util_est;
- -}
- +/*
- + * Optional action to be done while updating the load average
- + */
- +#define UPDATE_TG 0x1
- +#define SKIP_AGE_LOAD 0x2
- +#define SKIP_CPUFREQ 0x4
- /* Update task and its cfs_rq load average */
- -static inline void update_load_avg(struct sched_entity *se, int update_tg)
- +static inline void update_load_avg(struct sched_entity *se, int flags)
- {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- int cpu = cpu_of(rq_of(cfs_rq));
- + int decayed;
- + void *ptr = NULL;
- /*
- * Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
- */
- - __update_load_avg(now, cpu, &se->avg,
- + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
- + __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- + }
- - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
- - update_tg_load_avg(cfs_rq, 0);
- + decayed = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ));
- + decayed |= propagate_entity_load_avg(se);
- - if (entity_is_task(se))
- - trace_sched_load_avg_task(task_of(se), &se->avg);
- - trace_sched_load_avg_cpu(cpu, cfs_rq);
- + if (decayed && (flags & UPDATE_TG))
- + update_tg_load_avg(cfs_rq, 0);
- - /* Update task estimated utilization */
- - if (se->avg.util_est < se->avg.util_avg) {
- - cfs_rq->avg.util_est += (se->avg.util_avg - se->avg.util_est);
- - se->avg.util_est = se->avg.util_avg;
- + if (entity_is_task(se)) {
- +#ifdef CONFIG_SCHED_WALT
- + ptr = (void *)&(task_of(se)->ravg);
- +#endif
- + trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
- }
- -
- }
- +/**
- + * attach_entity_load_avg - attach this entity to its cfs_rq load avg
- + * @cfs_rq: cfs_rq to attach to
- + * @se: sched_entity to attach
- + *
- + * Must call update_cfs_rq_load_avg() before this, since we rely on
- + * cfs_rq->avg.last_update_time being current.
- + */
- static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- - if (!sched_feat(ATTACH_AGE_LOAD))
- - goto skip_aging;
- -
- - /*
- - * If we got migrated (either between CPUs or between cgroups) we'll
- - * have aged the average right before clearing @last_update_time.
- - */
- - if (se->avg.last_update_time) {
- - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- - &se->avg, 0, 0, NULL);
- -
- - /*
- - * XXX: we could have just aged the entire load away if we've been
- - * absent from the fair class for too long.
- - */
- - }
- -
- -skip_aging:
- se->avg.last_update_time = cfs_rq->avg.last_update_time;
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se->avg.load_sum;
- cfs_rq->avg.util_avg += se->avg.util_avg;
- cfs_rq->avg.util_sum += se->avg.util_sum;
- + set_tg_cfs_propagate(cfs_rq);
- +
- + cfs_rq_util_change(cfs_rq);
- }
- +/**
- + * detach_entity_load_avg - detach this entity from its cfs_rq load avg
- + * @cfs_rq: cfs_rq to detach from
- + * @se: sched_entity to detach
- + *
- + * Must call update_cfs_rq_load_avg() before this, since we rely on
- + * cfs_rq->avg.last_update_time being current.
- + */
- static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- - &se->avg, se->on_rq * scale_load_down(se->load.weight),
- - cfs_rq->curr == se, NULL);
- - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
- - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
- - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
- - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
- + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
- + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- + set_tg_cfs_propagate(cfs_rq);
- +
- + cfs_rq_util_change(cfs_rq);
- }
- /* Add the load generated by se into cfs_rq's load average */
- @@ -2621,34 +3183,20 @@
- enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- struct sched_avg *sa = &se->avg;
- - u64 now = cfs_rq_clock_task(cfs_rq);
- - int migrated, decayed;
- -
- - migrated = !sa->last_update_time;
- - if (!migrated) {
- - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- - se->on_rq * scale_load_down(se->load.weight),
- - cfs_rq->curr == se, NULL);
- - }
- -
- - decayed = update_cfs_rq_load_avg(now, cfs_rq);
- cfs_rq->runnable_load_avg += sa->load_avg;
- cfs_rq->runnable_load_sum += sa->load_sum;
- - if (migrated)
- + if (!sa->last_update_time) {
- attach_entity_load_avg(cfs_rq, se);
- -
- - if (decayed || migrated)
- update_tg_load_avg(cfs_rq, 0);
- + }
- }
- /* Remove the runnable load generated by se from cfs_rq's runnable load average */
- static inline void
- dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- - update_load_avg(se, 1);
- -
- cfs_rq->runnable_load_avg =
- max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
- cfs_rq->runnable_load_sum =
- @@ -2677,46 +3225,36 @@
- #endif
- /*
- + * Synchronize entity load avg of dequeued entity without locking
- + * the previous rq.
- + */
- +void sync_entity_load_avg(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = cfs_rq_of(se);
- + u64 last_update_time;
- +
- + last_update_time = cfs_rq_last_update_time(cfs_rq);
- + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
- +}
- +
- +/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
- void remove_entity_load_avg(struct sched_entity *se)
- {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- - u64 last_update_time;
- /*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
- */
- - if (se->avg.last_update_time == 0)
- - return;
- -
- - last_update_time = cfs_rq_last_update_time(cfs_rq);
- - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
- + sync_entity_load_avg(se);
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
- }
- -/*
- - * Update the rq's load with the elapsed running time before entering
- - * idle. if the last scheduled task is not a CFS task, idle_enter will
- - * be the only way to update the runnable statistic.
- - */
- -void idle_enter_fair(struct rq *this_rq)
- -{
- -}
- -
- -/*
- - * Update the rq's load with the elapsed idle time before a task is
- - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
- - * be the only way to update the runnable statistic.
- - */
- -void idle_exit_fair(struct rq *this_rq)
- -{
- -}
- -
- static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
- {
- return cfs_rq->runnable_load_avg;
- @@ -2731,7 +3269,17 @@
- #else /* CONFIG_SMP */
- -static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
- +static inline int
- +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
- +{
- + return 0;
- +}
- +
- +#define UPDATE_TG 0x0
- +#define SKIP_AGE_LOAD 0x0
- +#define SKIP_CPUFREQ 0x0
- +
- +static inline void update_load_avg(struct sched_entity *se, int not_used1){}
- static inline void
- enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
- static inline void
- @@ -2750,69 +3298,6 @@
- #endif /* CONFIG_SMP */
- -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
- -{
- -#ifdef CONFIG_SCHEDSTATS
- - struct task_struct *tsk = NULL;
- -
- - if (entity_is_task(se))
- - tsk = task_of(se);
- -
- - if (se->statistics.sleep_start) {
- - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
- -
- - if ((s64)delta < 0)
- - delta = 0;
- -
- - if (unlikely(delta > se->statistics.sleep_max))
- - se->statistics.sleep_max = delta;
- -
- - se->statistics.sleep_start = 0;
- - se->statistics.sum_sleep_runtime += delta;
- -
- - if (tsk) {
- - account_scheduler_latency(tsk, delta >> 10, 1);
- - trace_sched_stat_sleep(tsk, delta);
- - }
- - }
- - if (se->statistics.block_start) {
- - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
- -
- - if ((s64)delta < 0)
- - delta = 0;
- -
- - if (unlikely(delta > se->statistics.block_max))
- - se->statistics.block_max = delta;
- -
- - se->statistics.block_start = 0;
- - se->statistics.sum_sleep_runtime += delta;
- -
- - if (tsk) {
- - if (tsk->in_iowait) {
- - se->statistics.iowait_sum += delta;
- - se->statistics.iowait_count++;
- - trace_sched_stat_iowait(tsk, delta);
- - }
- -
- - trace_sched_stat_blocked(tsk, delta);
- - trace_sched_blocked_reason(tsk);
- -
- - /*
- - * Blocking time is in units of nanosecs, so shift by
- - * 20 to get a milliseconds-range estimation of the
- - * amount of time that the task spent sleeping:
- - */
- - if (unlikely(prof_on == SLEEP_PROFILING)) {
- - profile_hits(SLEEP_PROFILING,
- - (void *)get_wchan(tsk),
- - delta >> 20);
- - }
- - account_scheduler_latency(tsk, delta >> 10, 0);
- - }
- - }
- -#endif
- -}
- -
- static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
- #ifdef CONFIG_SCHED_DEBUG
- @@ -2822,7 +3307,7 @@
- d = -d;
- if (d > 3*sysctl_sched_latency)
- - schedstat_inc(cfs_rq, nr_spread_over);
- + schedstat_inc(cfs_rq->nr_spread_over);
- #endif
- }
- @@ -2860,6 +3345,26 @@
- static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
- +static inline void check_schedstat_required(void)
- +{
- +#ifdef CONFIG_SCHEDSTATS
- + if (schedstat_enabled())
- + return;
- +
- + /* Force schedstat enabled if a dependent tracepoint is active */
- + if (trace_sched_stat_wait_enabled() ||
- + trace_sched_stat_sleep_enabled() ||
- + trace_sched_stat_iowait_enabled() ||
- + trace_sched_stat_blocked_enabled() ||
- + trace_sched_stat_runtime_enabled()) {
- + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
- + "stat_blocked and stat_runtime require the "
- + "kernel parameter schedstats=enabled or "
- + "kernel.sched_schedstats=1\n");
- + }
- +#endif
- +}
- +
- static void
- enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
- @@ -2874,16 +3379,16 @@
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- + update_load_avg(se, UPDATE_TG);
- enqueue_entity_load_avg(cfs_rq, se);
- + update_cfs_shares(se);
- account_entity_enqueue(cfs_rq, se);
- - update_cfs_shares(cfs_rq);
- - if (flags & ENQUEUE_WAKEUP) {
- + if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
- - enqueue_sleeper(cfs_rq, se);
- - }
- - update_stats_enqueue(cfs_rq, se);
- + check_schedstat_required();
- + update_stats_enqueue(cfs_rq, se, flags);
- check_spread(cfs_rq, se);
- if (se != cfs_rq->curr)
- __enqueue_entity(cfs_rq, se);
- @@ -2945,25 +3450,30 @@
- static void
- dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
- + int update_flags;
- +
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- - dequeue_entity_load_avg(cfs_rq, se);
- - update_stats_dequeue(cfs_rq, se);
- - if (flags & DEQUEUE_SLEEP) {
- -#ifdef CONFIG_SCHEDSTATS
- - if (entity_is_task(se)) {
- - struct task_struct *tsk = task_of(se);
- + /*
- + * When dequeuing a sched_entity, we must:
- + * - Update loads to have both entity and cfs_rq synced with now.
- + * - Subtract its load from the cfs_rq->runnable_avg.
- + * - Subtract its previous weight from cfs_rq->load.weight.
- + * - For group entity, update its weight to reflect the new share
- + * of its group cfs_rq.
- + */
- + update_flags = UPDATE_TG;
- - if (tsk->state & TASK_INTERRUPTIBLE)
- - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- - if (tsk->state & TASK_UNINTERRUPTIBLE)
- - se->statistics.block_start = rq_clock(rq_of(cfs_rq));
- - }
- -#endif
- - }
- + if (flags & DEQUEUE_IDLE)
- + update_flags |= SKIP_CPUFREQ;
- +
- + update_load_avg(se, update_flags);
- + dequeue_entity_load_avg(cfs_rq, se);
- +
- + update_stats_dequeue(cfs_rq, se, flags);
- clear_buddies(cfs_rq, se);
- @@ -2983,8 +3493,16 @@
- /* return excess runtime on last dequeue */
- return_cfs_rq_runtime(cfs_rq);
- - update_min_vruntime(cfs_rq);
- - update_cfs_shares(cfs_rq);
- + update_cfs_shares(se);
- +
- + /*
- + * Now advance min_vruntime if @se was the entity holding it back,
- + * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- + * put back on, and if we advance min_vruntime, we'll be placed back
- + * further than we started -- ie. we'll be penalized.
- + */
- + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
- + update_min_vruntime(cfs_rq);
- }
- /*
- @@ -3039,22 +3557,23 @@
- */
- update_stats_wait_end(cfs_rq, se);
- __dequeue_entity(cfs_rq, se);
- - update_load_avg(se, 1);
- + update_load_avg(se, UPDATE_TG);
- }
- update_stats_curr_start(cfs_rq, se);
- cfs_rq->curr = se;
- -#ifdef CONFIG_SCHEDSTATS
- +
- /*
- * Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
- * when there are only lesser-weight tasks around):
- */
- - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- - se->statistics.slice_max = max(se->statistics.slice_max,
- - se->sum_exec_runtime - se->prev_sum_exec_runtime);
- + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- + schedstat_set(se->statistics.slice_max,
- + max((u64)schedstat_val(se->statistics.slice_max),
- + se->sum_exec_runtime - se->prev_sum_exec_runtime));
- }
- -#endif
- +
- se->prev_sum_exec_runtime = se->sum_exec_runtime;
- }
- @@ -3134,6 +3653,7 @@
- check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
- +
- if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
- /* Put 'current' back into the tree. */
- @@ -3155,8 +3675,8 @@
- /*
- * Ensure that runnable average is periodically updated.
- */
- - update_load_avg(curr, 1);
- - update_cfs_shares(cfs_rq);
- + update_load_avg(curr, UPDATE_TG);
- + update_cfs_shares(curr);
- #ifdef CONFIG_SCHED_HRTICK
- /*
- @@ -3255,7 +3775,7 @@
- static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
- {
- if (unlikely(cfs_rq->throttle_count))
- - return cfs_rq->throttled_clock_task;
- + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
- }
- @@ -3393,13 +3913,11 @@
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- cfs_rq->throttle_count--;
- -#ifdef CONFIG_SMP
- if (!cfs_rq->throttle_count) {
- /* adjust cfs_rq_clock_task() */
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
- }
- -#endif
- return 0;
- }
- @@ -3766,6 +4284,23 @@
- throttle_cfs_rq(cfs_rq);
- }
- +static void sync_throttle(struct task_group *tg, int cpu)
- +{
- + struct cfs_rq *pcfs_rq, *cfs_rq;
- +
- + if (!cfs_bandwidth_used())
- + return;
- +
- + if (!tg->parent)
- + return;
- +
- + cfs_rq = tg->cfs_rq[cpu];
- + pcfs_rq = tg->parent->cfs_rq[cpu];
- +
- + cfs_rq->throttle_count = pcfs_rq->throttle_count;
- + pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
- +}
- +
- /* conditionally throttle active cfs_rq's from put_prev_entity() */
- static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
- {
- @@ -3851,6 +4386,10 @@
- static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
- {
- + /* init_cfs_bandwidth() was not called */
- + if (!cfs_b->throttled_cfs_rq.next)
- + return;
- +
- hrtimer_cancel(&cfs_b->period_timer);
- hrtimer_cancel(&cfs_b->slack_timer);
- }
- @@ -3901,6 +4440,7 @@
- static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
- static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
- static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
- +static inline void sync_throttle(struct task_group *tg, int cpu) {}
- static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
- static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
- @@ -3945,9 +4485,9 @@
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- - WARN_ON(task_rq(p) != rq);
- + SCHED_WARN_ON(task_rq(p) != rq);
- - if (cfs_rq->nr_running > 1) {
- + if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
- u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
- s64 delta = slice - ran;
- @@ -3988,21 +4528,25 @@
- #endif
- #ifdef CONFIG_SMP
- +static bool __cpu_overutilized(int cpu, int delta);
- static bool cpu_overutilized(int cpu);
- +unsigned long boosted_cpu_util(int cpu);
- +#else
- +#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
- #endif
- -#ifdef CONFIG_CPU_FREQ_GOV_SCHED
- -static void update_capacity_of(int cpu, bool request)
- +#ifdef CONFIG_SMP
- +static void update_capacity_of(int cpu)
- {
- unsigned long req_cap;
- if (!sched_freq())
- return;
- - /* Convert scale-invariant capacity to cpu. */
- + /* Normalize scale-invariant capacity to cpu. */
- req_cap = boosted_cpu_util(cpu);
- req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
- - set_cfs_cpu_capacity(cpu, request, req_cap);
- + set_cfs_cpu_capacity(cpu, true, req_cap);
- }
- #endif
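- As a side note on the hunk above: update_capacity_of() normalizes the boosted utilization against the CPU's original capacity before requesting it from the cpufreq hook. A tiny standalone sketch of that arithmetic (not patch code; the function name and numbers are illustrative, SCHED_CAPACITY_SCALE = 1024 as in the kernel):
- #include <stdio.h>
- 
- #define SCHED_CAPACITY_SCALE 1024UL
- 
- /* Normalize a scale-invariant utilization against the CPU's original capacity. */
- static unsigned long norm_cap_request(unsigned long boosted_util,
-                                       unsigned long capacity_orig)
- {
-         return boosted_util * SCHED_CAPACITY_SCALE / capacity_orig;
- }
- 
- int main(void)
- {
-         /* e.g. a little CPU with capacity_orig 430 and a boosted util of 215 */
-         printf("req_cap = %lu\n", norm_cap_request(215, 430)); /* -> 512, half of full scale */
-         return 0;
- }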
- @@ -4019,8 +4563,35 @@
- #ifdef CONFIG_SMP
- int task_new = flags & ENQUEUE_WAKEUP_NEW;
- int task_wakeup = flags & ENQUEUE_WAKEUP;
- +
- + /*
- + * Update SchedTune accounting.
- + *
- + * We do it before updating the CPU capacity to ensure the
- + * boost value of the current task is accounted for in the
- + * selection of the OPP.
- + *
- + * We do it also in the case where we enqueue a throttled task;
- + * we could argue that a throttled task should not boost a CPU,
- + * however:
- + * a) properly implementing CPU boosting considering throttled
- + * tasks will increase a lot the complexity of the solution
- + * b) it's not easy to quantify the benefits introduced by
- + * such a more complex solution.
- + * Thus, for the time being we go for the simple solution and boost
- + * also for throttled RQs.
- + */
- + schedtune_enqueue_task(p, cpu_of(rq));
- #endif
- + /*
- + * If in_iowait is set, the code below may not trigger any cpufreq
- + * utilization updates, so do it here explicitly with the IOWAIT flag
- + * passed.
- + */
- + if (p->in_iowait)
- + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
- +
- for_each_sched_entity(se) {
- if (se->on_rq)
- break;
- @@ -4032,7 +4603,7 @@
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running increment below.
- - */
- + */
- if (cfs_rq_throttled(cfs_rq))
- break;
- cfs_rq->h_nr_running++;
- @@ -4049,34 +4620,14 @@
- if (cfs_rq_throttled(cfs_rq))
- break;
- - update_load_avg(se, 1);
- - update_cfs_shares(cfs_rq);
- + update_load_avg(se, UPDATE_TG);
- + update_cfs_shares(se);
- }
- if (!se)
- add_nr_running(rq, 1);
- #ifdef CONFIG_SMP
- -
- - /*
- - * Update SchedTune accouting.
- - *
- - * We do it before updating the CPU capacity to ensure the
- - * boost value of the current task is accounted for in the
- - * selection of the OPP.
- - *
- - * We do it also in the case where we enqueue a trottled task;
- - * we could argue that a throttled task should not boost a CPU,
- - * however:
- - * a) properly implementing CPU boosting considering throttled
- - * tasks will increase a lot the complexity of the solution
- - * b) it's not easy to quantify the benefits introduced by
- - * such a more complex solution.
- - * Thus, for the time being we go for the simple solution and boost
- - * also for throttled RQs.
- - */
- - schedtune_enqueue_task(p, cpu_of(rq));
- -
- if (!se) {
- walt_inc_cumulative_runnable_avg(rq, p);
- if (!task_new && !rq->rd->overutilized &&
- @@ -4093,17 +4644,10 @@
- * request after load balancing is done.
- */
- if (task_new || task_wakeup)
- - update_capacity_of(cpu_of(rq), true);
- + update_capacity_of(cpu_of(rq));
- }
- - /* Get the top level CFS RQ for the task CPU */
- - cfs_rq = &(task_rq(p)->cfs);
- -
- - /* Update RQ estimated utilization */
- - cfs_rq->avg.util_est += task_util_est(p);
- -
- #endif /* CONFIG_SMP */
- -
- hrtick_update(rq);
- }
- @@ -4120,6 +4664,20 @@
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
- + if (task_sleep && rq->nr_running == 1)
- + flags |= DEQUEUE_IDLE;
- +
- +#ifdef CONFIG_SMP
- + /*
- + * Update SchedTune accounting
- + *
- + * We do it before updating the CPU capacity to ensure the
- + * boost value of the current task is accounted for in the
- + * selection of the OPP.
- + */
- + schedtune_dequeue_task(p, cpu_of(rq));
- +#endif
- +
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, flags);
- @@ -4137,21 +4695,22 @@
- /* Don't dequeue parent if it has other entities besides us */
- if (cfs_rq->load.weight) {
- + /* Avoid re-evaluating load for this entity: */
- + se = parent_entity(se);
- /*
- * Bias pick_next to pick a task from this cfs_rq, as
- * p is sleeping when it is within its sched_slice.
- */
- - if (task_sleep && parent_entity(se))
- - set_next_buddy(parent_entity(se));
- -
- - /* avoid re-evaluating load for this entity */
- - se = parent_entity(se);
- + if (task_sleep && se && !throttled_hierarchy(cfs_rq))
- + set_next_buddy(se);
- break;
- }
- flags |= DEQUEUE_SLEEP;
- }
- for_each_sched_entity(se) {
- + int update_flags;
- +
- cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_running--;
- walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
- @@ -4159,24 +4718,19 @@
- if (cfs_rq_throttled(cfs_rq))
- break;
- - update_load_avg(se, 1);
- - update_cfs_shares(cfs_rq);
- + update_flags = UPDATE_TG;
- +
- + if (flags & DEQUEUE_IDLE)
- + update_flags |= SKIP_CPUFREQ;
- +
- + update_load_avg(se, update_flags);
- + update_cfs_shares(se);
- }
- if (!se)
- sub_nr_running(rq, 1);
- #ifdef CONFIG_SMP
- -
- - /*
- - * Update SchedTune accouting
- - *
- - * We do it before updating the CPU capacity to ensure the
- - * boost value of the current task is accounted for in the
- - * selection of the OPP.
- - */
- - schedtune_dequeue_task(p, cpu_of(rq));
- -
- if (!se) {
- walt_dec_cumulative_runnable_avg(rq, p);
- @@ -4190,26 +4744,12 @@
- */
- if (task_sleep) {
- if (rq->cfs.nr_running)
- - update_capacity_of(cpu_of(rq), true);
- + update_capacity_of(cpu_of(rq));
- else if (sched_freq())
- - update_capacity_of(cpu_of(rq), false);
- + set_cfs_cpu_capacity(cpu_of(rq), false, 0); /* no normalization required for 0 */
- }
- }
- - /* Get the top level CFS RQ for the task CPU */
- - cfs_rq = &(task_rq(p)->cfs);
- -
- - /* Update RQ estimated utilization */
- - if (cfs_rq->avg.util_est >= task_util_est(p))
- - cfs_rq->avg.util_est -= task_util_est(p);
- - else
- - cfs_rq->avg.util_est = 0;
- -
- -
- - /* Update estimated utilization */
- - if (task_sleep)
- - p->se.avg.util_est = p->se.avg.util_avg;
- -
- #endif /* CONFIG_SMP */
- hrtick_update(rq);
- @@ -4545,25 +5085,30 @@
- return wl;
- for_each_sched_entity(se) {
- - long w, W;
- + struct cfs_rq *cfs_rq = se->my_q;
- + long W, w = cfs_rq_load_avg(cfs_rq);
- - tg = se->my_q->tg;
- + tg = cfs_rq->tg;
- /*
- * W = @wg + \Sum rw_j
- */
- - W = wg + calc_tg_weight(tg, se->my_q);
- + W = wg + atomic_long_read(&tg->load_avg);
- +
- + /* Ensure \Sum rw_j >= rw_i */
- + W -= cfs_rq->tg_load_avg_contrib;
- + W += w;
- /*
- * w = rw_i + @wl
- */
- - w = cfs_rq_load_avg(se->my_q) + wl;
- + w += wl;
- /*
- * wl = S * s'_i; see (2)
- */
- if (W > 0 && w < W)
- - wl = (w * tg->shares) / W;
- + wl = (w * (long)tg->shares) / W;
- else
- wl = tg->shares;
- @@ -4612,16 +5157,95 @@
- >> SCHED_CAPACITY_SHIFT;
- }
- +/*
- + * Returns the current capacity of cpu after applying both
- + * cpu and min freq scaling.
- + */
- +unsigned long capacity_min_of(int cpu)
- +{
- + if (!sched_feat(MIN_CAPACITY_CAPPING))
- + return 0;
- + return arch_scale_cpu_capacity(NULL, cpu) *
- + arch_scale_min_freq_capacity(NULL, cpu)
- + >> SCHED_CAPACITY_SHIFT;
- +}
- +
- +
- static inline bool energy_aware(void)
- {
- return sched_feat(ENERGY_AWARE);
- }
- /*
- + * CPU candidates.
- + *
- + * These are labels to reference CPU candidates for an energy_diff.
- + * Currently we support only two possible candidates: the task's previous CPU
- + * and another candidate CPU.
- + * More advanced/aggressive EAS selection policies can consider more
- + * candidates.
- + */
- +#define EAS_CPU_PRV 0
- +#define EAS_CPU_NXT 1
- +#define EAS_CPU_BKP 2
- +#define EAS_CPU_CNT 3
- +
- +/*
- + * energy_diff - supports the computation of the estimated energy impact in
- + * moving a "task"'s "util_delta" between different CPU candidates.
- + */
- +struct energy_env {
- + /* Utilization to move */
- + struct task_struct *p;
- + int util_delta;
- +
- + /* Mask of CPUs candidates to evaluate */
- + cpumask_t cpus_mask;
- +
- + /* CPU candidates to evaluate */
- + struct {
- +
- + /* CPU ID, must be in cpus_mask */
- + int cpu_id;
- +
- + /*
- + * Index (into sched_group_energy::cap_states) of the OPP the
- + * CPU needs to run at if the task is placed on it.
- + * This includes both the active and blocked load, due to
- + * other tasks on this CPU, as well as the task's own
- + * utilization.
- + */
- + int cap_idx;
- + int cap;
- +
- + /* Estimated system energy */
- + unsigned int energy;
- +
- + /* Estimated energy variation wrt EAS_CPU_PRV */
- + int nrg_delta;
- +
- + } cpu[EAS_CPU_CNT];
- +
- + /*
- + * Index (into energy_env::cpu) of the most energy efficient CPU for
- + * the specified energy_env::task
- + */
- + int next_idx;
- +
- + /* Support data */
- + struct sched_group *sg_top;
- + struct sched_group *sg_cap;
- + struct sched_group *sg;
- +};
- +
- +static int cpu_util_wake(int cpu, struct task_struct *p);
- +
- +/*
- * __cpu_norm_util() returns the cpu util relative to a specific capacity,
- - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
- - * energy calculations. Using the scale-invariant util returned by
- - * cpu_util() and approximating scale-invariant util by:
- + * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
- + * energy calculations.
- + *
- + * Since util is a scale-invariant utilization defined as:
- *
- * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
- *
- @@ -4631,34 +5255,41 @@
- *
- * norm_util = running_time/time ~ util/capacity
- */
- -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
- +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
- {
- - int util = __cpu_util(cpu, delta, UTIL_EST);
- -
- if (util >= capacity)
- return SCHED_CAPACITY_SCALE;
- return (util << SCHED_CAPACITY_SHIFT)/capacity;
- }
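- The busy-ratio computation in __cpu_norm_util() above is easy to see with concrete numbers. A minimal standalone sketch (illustration only, not patch code; constants mirror the kernel's SCHED_CAPACITY_SHIFT/SCALE):
- #include <stdio.h>
- 
- #define SCHED_CAPACITY_SHIFT 10
- #define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)
- 
- /* Busy ratio of 'util' against 'capacity', saturated at full scale. */
- static unsigned long cpu_norm_util_sketch(unsigned long util, unsigned long capacity)
- {
-         if (util >= capacity)
-                 return SCHED_CAPACITY_SCALE;
-         return (util << SCHED_CAPACITY_SHIFT) / capacity;
- }
- 
- int main(void)
- {
-         /* util 100 on a capacity-400 OPP -> 256, i.e. a 25% busy ratio */
-         printf("%lu\n", cpu_norm_util_sketch(100, 400));
-         return 0;
- }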
- -static int calc_util_delta(struct energy_env *eenv, int cpu)
- -{
- - if (cpu == eenv->src_cpu)
- - return -eenv->util_delta;
- - if (cpu == eenv->dst_cpu)
- - return eenv->util_delta;
- - return 0;
- -}
- -
- -static
- -unsigned long group_max_util(struct energy_env *eenv)
- +static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
- {
- - int i, delta;
- unsigned long max_util = 0;
- + unsigned long util;
- + int cpu;
- +
- + for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
- + util = cpu_util_wake(cpu, eenv->p);
- +
- + /*
- + * If we are looking at the target CPU specified by the eenv,
- + * then we should add the (estimated) utilization of the task
- + * assuming we will wake it up on that CPU.
- + */
- + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
- + util += eenv->util_delta;
- +
- + max_util = max(max_util, util);
- +
- + /*
- + * Take into account any minimum frequency imposed
- + * elsewhere which limits the energy states available
- + * If the MIN_CAPACITY_CAPPING feature is not enabled
- + * capacity_min_of will return 0 (not capped).
- + */
- + max_util = max(max_util, capacity_min_of(cpu));
- - for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
- - delta = calc_util_delta(eenv, i);
- - max_util = max(max_util, __cpu_util(i, delta, UTIL_EST));
- }
- return max_util;
- @@ -4666,93 +5297,67 @@
- /*
- * group_norm_util() returns the approximated group util relative to it's
- - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
- - * energy calculations. Since task executions may or may not overlap in time in
- - * the group the true normalized util is between max(cpu_norm_util(i)) and
- - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
- - * latter is used as the estimate as it leads to a more pessimistic energy
- + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
- + * in energy calculations.
- + *
- + * Since task executions may or may not overlap in time in the group the true
- + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
- + * when iterating over all CPUs in the group.
- + * The latter estimate is used as it leads to a more pessimistic energy
- * estimate (more busy).
- */
- static unsigned
- -long group_norm_util(struct energy_env *eenv)
- +long group_norm_util(struct energy_env *eenv, int cpu_idx)
- {
- - int i, delta;
- - unsigned long util_sum = 0;
- - struct sched_group *sg = eenv->sg;
- - unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
- + unsigned long capacity = eenv->cpu[cpu_idx].cap;
- + unsigned long util, util_sum = 0;
- + int cpu;
- - for_each_cpu(i, sched_group_cpus(sg)) {
- - delta = calc_util_delta(eenv, i);
- - util_sum += __cpu_norm_util(i, capacity, delta);
- - }
- + for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
- + util = cpu_util_wake(cpu, eenv->p);
- - if (util_sum > SCHED_CAPACITY_SCALE)
- - return SCHED_CAPACITY_SCALE;
- - return util_sum;
- -}
- + /*
- + * If we are looking at the target CPU specified by the eenv,
- + * then we should add the (estimated) utilization of the task
- + * assuming we will wake it up on that CPU.
- + */
- + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
- + util += eenv->util_delta;
- -#ifdef CONFIG_SCHED_TUNE
- -static inline int
- -find_min_capacity(struct energy_env *eenv)
- -{
- - const struct sched_group_energy const *sge = eenv->sg->sge;
- - unsigned long min_capacity, cur_capacity;
- - int min_cap_idx, cap_idx;
- - unsigned long min_util;
- -
- - /* Non boosted tasks do not affect the minimum capacity */
- - if (!schedtune_task_boost(eenv->task))
- - return eenv->cap_idx;
- -
- - /* Find minimum capacity to satify the task boost value */
- - min_util = boosted_task_util(eenv->task);
- - for (min_cap_idx = 0; min_cap_idx < (sge->nr_cap_states-1); min_cap_idx++) {
- - if (sge->cap_states[min_cap_idx].cap >= min_util)
- - break;
- + util_sum += __cpu_norm_util(util, capacity);
- }
- - min_capacity = sge->cap_states[min_cap_idx].cap;
- -
- - /* The current capacity is the one computed by the caller */
- - cur_capacity = sge->cap_states[eenv->cap_idx].cap;
- -
- - /*
- - * Compute the minumum CPU capacity required to support task boosting
- - * within this SG.
- - */
- - cur_capacity = max(min_capacity, cur_capacity);
- - cap_idx = max(eenv->cap_idx, min_cap_idx);
- - return cap_idx;
- + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
- }
- -#else
- -#define find_min_capacity(eenv) eenv->cap_idx
- -#endif /* CONFIG_SCHED_TUNE */
- -static int find_new_capacity(struct energy_env *eenv)
- +static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
- {
- - const struct sched_group_energy const *sge = eenv->sg->sge;
- + const struct sched_group_energy *sge = eenv->sg->sge;
- int idx, max_idx = sge->nr_cap_states - 1;
- - unsigned long util = group_max_util(eenv);
- + unsigned long util = group_max_util(eenv, cpu_idx);
- /* default is max_cap if we don't find a match */
- - eenv->cap_idx = max_idx;
- + eenv->cpu[cpu_idx].cap_idx = max_idx;
- + eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
- for (idx = 0; idx < sge->nr_cap_states; idx++) {
- if (sge->cap_states[idx].cap >= util) {
- - /* Keep track of SG's capacity index */
- - eenv->cap_idx = idx;
- + /* Keep track of SG's capacity */
- + eenv->cpu[cpu_idx].cap_idx = idx;
- + eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
- break;
- }
- }
- - /* Update SG's capacity based on boost value of the current task */
- - eenv->cap_idx = find_min_capacity(eenv);
- - return eenv->cap_idx;
- + return eenv->cpu[cpu_idx].cap_idx;
- }
- -static int group_idle_state(struct sched_group *sg)
- +static int group_idle_state(struct energy_env *eenv, int cpu_idx)
- {
- + struct sched_group *sg = eenv->sg;
- int i, state = INT_MAX;
- + int src_in_grp, dst_in_grp;
- + long grp_util = 0;
- /* Find the shallowest idle state in the sched group. */
- for_each_cpu(i, sched_group_cpus(sg))
- @@ -4761,114 +5366,161 @@
- /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
- state++;
- + src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
- + sched_group_cpus(sg));
- + dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
- + sched_group_cpus(sg));
- + if (src_in_grp == dst_in_grp) {
- + /* both CPUs under consideration are in the same group or not in
- + * either group, migration should leave idle state the same.
- + */
- + goto end;
- + }
- +
- + /*
- + * Try to estimate if a deeper idle state is
- + * achievable when we move the task.
- + */
- + for_each_cpu(i, sched_group_cpus(sg)) {
- + grp_util += cpu_util_wake(i, eenv->p);
- + if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
- + grp_util += eenv->util_delta;
- + }
- +
- + if (grp_util <=
- + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
- + /* after moving, this group is at most partly
- + * occupied, so it should have some idle time.
- + */
- + int max_idle_state_idx = sg->sge->nr_idle_states - 2;
- + int new_state = grp_util * max_idle_state_idx;
- + if (grp_util <= 0)
- + /* group will have no util, use lowest state */
- + new_state = max_idle_state_idx + 1;
- + else {
- + /* for partially idle, linearly map util to idle
- + * states, excluding the lowest one. This does not
- + * correspond to the state we expect to enter in
- + * reality, but an indication of what might happen.
- + */
- + new_state = min(max_idle_state_idx, (int)
- + (new_state / sg->sgc->max_capacity));
- + new_state = max_idle_state_idx - new_state;
- + }
- + state = new_state;
- + } else {
- + /* After moving, the group will be fully occupied
- + * so assume it will not be idle at all.
- + */
- + state = 0;
- + }
- +end:
- return state;
- }
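- The partially-idle branch of group_idle_state() above linearly maps residual group utilization onto the available idle-state indexes: more utilization means a shallower estimated state. A standalone sketch of just that mapping (not patch code; function name and numbers are illustrative, and the full-group capacity check is omitted):
- #include <stdio.h>
- 
- /* Map residual group utilization to an estimated idle-state index. */
- static int estimate_idle_state(long grp_util, long max_capacity, int nr_idle_states)
- {
-         int max_idle_state_idx = nr_idle_states - 2;
-         int new_state;
- 
-         if (grp_util <= 0)
-                 return max_idle_state_idx + 1;  /* no utilization: deepest state */
- 
-         new_state = grp_util * max_idle_state_idx / max_capacity;
-         if (new_state > max_idle_state_idx)
-                 new_state = max_idle_state_idx;
-         return max_idle_state_idx - new_state;  /* more util -> shallower state */
- }
- 
- int main(void)
- {
-         /* 4 idle states, per-CPU max_capacity 1024: a half-utilized group -> index 1 */
-         printf("%d\n", estimate_idle_state(512, 1024, 4));
-         return 0;
- }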
- /*
- - * Compute energy for the eenv's SG (i.e. eenv->sg).
- + * sched_group_energy(): Computes the absolute energy consumption of cpus
- + * belonging to the sched_group including shared resources shared only by
- + * members of the group. Iterates over all cpus in the hierarchy below the
- + * sched_group starting from the bottom working it's way up before going to
- + * the next cpu until all cpus are covered at all levels. The current
- + * implementation is likely to gather the same util statistics multiple times.
- + * This can probably be done in a faster but more complex way.
- + * Note: sched_group_energy() may fail when racing with sched_domain updates.
- + * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
- + *
- + * This works in iterations to compute the SG's energy for each CPU
- + * candidate defined by the energy_env's cpu array.
- *
- - * This works in two iterations:
- - * first iteration, before moving the utilization, i.e.
- - * util_delta == 0
- - * second iteration, after moving the utilization, i.e.
- - * util_delta != 0
- + * NOTE: in the following computations for busy_energy and idle_energy we do
- + * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
- + * The required scaling will be performed just one time, by the calling
- + * functions, once we have accumulated the contributions for all the SGs.
- */
- -static void before_after_energy(struct energy_env *eenv)
- +static void calc_sg_energy(struct energy_env *eenv)
- {
- -
- - int sg_busy_energy, sg_idle_energy;
- struct sched_group *sg = eenv->sg;
- - unsigned long util_delta;
- - unsigned long group_util;
- + int busy_energy, idle_energy;
- + unsigned int busy_power;
- + unsigned int idle_power;
- + unsigned long sg_util;
- int cap_idx, idle_idx;
- int total_energy = 0;
- - unsigned int cap;
- - bool after;
- -
- - util_delta = eenv->util_delta;
- - eenv->util_delta = 0;
- - after = false;
- -
- -compute_after:
- + int cpu_idx;
- - idle_idx = group_idle_state(sg);
- + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
- - cap_idx = find_new_capacity(eenv);
- - group_util = group_norm_util(eenv);
- - cap = sg->sge->cap_states[cap_idx].cap;
- - sg_busy_energy = group_util * sg->sge->cap_states[cap_idx].power;
- - sg_busy_energy >>= SCHED_CAPACITY_SHIFT;
- + if (eenv->cpu[cpu_idx].cpu_id == -1)
- + continue;
- + /* Compute ACTIVE energy */
- + cap_idx = find_new_capacity(eenv, cpu_idx);
- + busy_power = sg->sge->cap_states[cap_idx].power;
- + /*
- + * in order to calculate cpu_norm_util, we need to know which
- + * capacity level the group will be at, so calculate that first
- + */
- + sg_util = group_norm_util(eenv, cpu_idx);
- - sg_idle_energy = SCHED_CAPACITY_SCALE - group_util;
- - sg_idle_energy *= sg->sge->idle_states[idle_idx].power;
- - sg_idle_energy >>= SCHED_CAPACITY_SHIFT;
- + busy_energy = sg_util * busy_power;
- - total_energy = sg_busy_energy + sg_idle_energy;
- + /* Compute IDLE energy */
- + idle_idx = group_idle_state(eenv, cpu_idx);
- + idle_power = sg->sge->idle_states[idle_idx].power;
- - /* Account for "after" metrics */
- - if (after) {
- - if (sg->group_weight == 1 &&
- - cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
- - eenv->after.utilization = group_util;
- - eenv->after.capacity = cap;
- - }
- - eenv->after.energy += total_energy;
- - return;
- - }
- + idle_energy = SCHED_CAPACITY_SCALE - sg_util;
- + idle_energy *= idle_power;
- - /* Account for "before" metrics */
- - if (sg->group_weight == 1 &&
- - cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
- - eenv->after.utilization = group_util;
- - eenv->before.capacity = cap;
- + total_energy = busy_energy + idle_energy;
- + eenv->cpu[cpu_idx].energy += total_energy;
- }
- - eenv->before.energy += total_energy;
- -
- - /* Setup eenv for the "after" case */
- - eenv->util_delta = util_delta;
- - after = true;
- -
- - goto compute_after;
- -
- }
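- The NOTE in the calc_sg_energy() comment above can be shown with plain integers: busy and idle energy are accumulated unscaled and shifted only once by the caller, so only one rounding step is introduced. A standalone sketch (not patch code; all values are illustrative):
- #include <stdio.h>
- 
- #define SCHED_CAPACITY_SHIFT 10
- #define SCHED_CAPACITY_SCALE (1 << SCHED_CAPACITY_SHIFT)
- 
- int main(void)
- {
-         /* illustrative per-group numbers: normalized util, busy and idle power */
-         int sg_util = 300, busy_power = 500, idle_power = 20;
-         int busy_energy = sg_util * busy_power;
-         int idle_energy = (SCHED_CAPACITY_SCALE - sg_util) * idle_power;
-         int total_unscaled = busy_energy + idle_energy;
- 
-         /* a single shift at the very end keeps the rounding error to one step */
-         printf("energy = %d\n", total_unscaled >> SCHED_CAPACITY_SHIFT);
-         return 0;
- }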
- /*
- - * sched_group_energy(): Computes the absolute energy consumption of cpus
- - * belonging to the sched_group including shared resources shared only by
- - * members of the group. Iterates over all cpus in the hierarchy below the
- - * sched_group starting from the bottom working it's way up before going to
- - * the next cpu until all cpus are covered at all levels. The current
- - * implementation is likely to gather the same util statistics multiple times.
- - * This can probably be done in a faster but more complex way.
- - * Note: sched_group_energy() may fail when racing with sched_domain updates.
- + * compute_energy() computes the absolute variation in energy consumption by
- + * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
- + *
- + * NOTE: compute_energy() may fail when racing with sched_domain updates, in
- + * which case we abort by returning -EINVAL.
- */
- -static int sched_group_energy(struct energy_env *eenv)
- +static int compute_energy(struct energy_env *eenv)
- {
- - struct sched_domain *sd;
- struct cpumask visit_cpus;
- - struct sched_group *sg;
- - int cpu;
- + int cpu_count;
- WARN_ON(!eenv->sg_top->sge);
- cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
- + /* If a cpu is hotplugged in while we are in this function,
- + * it does not appear in the existing visit_cpus mask
- + * which came from the sched_group pointer of the
- + * sched_domain pointed at by sd_ea for either the prev
- + * or next cpu and was dereferenced in __energy_diff.
- + * Since we will dereference sd_scs later as we iterate
- + * through the CPUs we expect to visit, new CPUs can
- + * be present which are not in the visit_cpus mask.
- + * Guard this with cpu_count.
- + */
- + cpu_count = cpumask_weight(&visit_cpus);
- while (!cpumask_empty(&visit_cpus)) {
- struct sched_group *sg_shared_cap = NULL;
- -
- - cpu = cpumask_first(&visit_cpus);
- + int cpu = cpumask_first(&visit_cpus);
- + struct sched_domain *sd;
- /*
- * Is the group utilization affected by cpus outside this
- * sched_group?
- + * This sd may have groups with cpus which were not present
- + * when we took visit_cpus.
- */
- sd = rcu_dereference(per_cpu(sd_scs, cpu));
- +
- if (sd && sd->parent)
- sg_shared_cap = sd->parent->groups;
- for_each_domain(cpu, sd) {
- - sg = sd->groups;
- + struct sched_group *sg = sd->groups;
- /* Has this sched_domain already been visited? */
- if (sd->child && group_first_cpu(sg) != cpu)
- @@ -4878,18 +5530,52 @@
- eenv->sg_cap = sg;
- if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
- eenv->sg_cap = sg_shared_cap;
- + else
- + eenv->sg_cap = sg;
- + /*
- + * Compute the energy for all the candidate
- + * CPUs in the current visited SG.
- + */
- eenv->sg = sg;
- - before_after_energy(eenv);
- -
- - if (!sd->child)
- + calc_sg_energy(eenv);
- +
- + /* remove CPUs we have just visited */
- + if (!sd->child) {
- + /*
- + * cpu_count here is the number of
- + * cpus we expect to visit in this
- + * calculation. If we race against
- + * hotplug, we can have extra cpus
- + * added to the groups we are
- + * iterating which do not appear in
- + * the visit_cpus mask. In that case
- + * we are not able to calculate energy
- + * without restarting so we will bail
- + * out and use prev_cpu this time.
- + */
- + if (!cpu_count)
- + return -EINVAL;
- cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
- + cpu_count--;
- + }
- if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
- goto next_cpu;
- } while (sg = sg->next, sg != sd->groups);
- }
- +
- + /*
- + * If we raced with hotplug and got an sd NULL-pointer;
- + * returning a wrong energy estimation is better than
- + * entering an infinite loop.
- + * Specifically: If a cpu is unplugged after we took
- + * the visit_cpus mask, it no longer has an sd_scs
- + * pointer, so when we dereference it, we get NULL.
- + */
- + if (cpumask_test_cpu(cpu, &visit_cpus))
- + return -EINVAL;
- next_cpu:
- cpumask_clear_cpu(cpu, &visit_cpus);
- continue;
- @@ -4903,168 +5589,103 @@
- return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
- }
- -static inline int normalize_energy(int energy_diff);
- -
- -#define eenv_before(__X) eenv->before.__X
- -#define eenv_after(__X) eenv->after.__X
- -#define eenv_delta(__X) eenv->after.__X - eenv->before.__X
- -
- -static inline void
- -__update_perf_energy_deltas(struct energy_env *eenv)
- -{
- - unsigned long task_util = eenv->util_delta;
- -
- - /*
- - * SpeedUp Index
- - *
- - * SPI := cpu_capacity - task_util
- - *
- - * which estimate how sooner a task will complete when running
- - * on an higher OPP wrt the minimum required.
- - */
- - eenv_before(speedup_idx) = eenv_before(capacity) - task_util;
- - eenv_after(speedup_idx) = eenv_after(capacity) - task_util;
- -
- - /*
- - * Delay Index
- - *
- - * DLI := 1024 * (cpu_util - task_util) / cpu_util
- - *
- - * which represents the "fraction" of CPU bandwidth consumed by other
- - * tasks in the worst case, i.e. assuming all other tasks runs before.
- - *
- - * NOTE: in the above formula we assume that "cpu_util" includes
- - * already the task utilization.
- - */
- - eenv_before(delay_idx) = SCHED_CAPACITY_SCALE;
- - eenv_before(delay_idx) *= (eenv_before(utilization) - task_util);
- - eenv_before(delay_idx) /= eenv_before(utilization);
- - eenv_after(delay_idx) = SCHED_CAPACITY_SCALE;
- - eenv_after(delay_idx) *= (eenv_after(utilization) - task_util);
- - eenv_after(delay_idx) /= eenv_after(utilization);
- -
- - /* Performance Variation */
- - eenv->prf_delta = eenv_delta(speedup_idx) - eenv_delta(delay_idx);
- -
- - /* Energy Variation */
- - eenv->nrg_delta = normalize_energy(eenv_delta(energy));
- -
- -}
- -
- /*
- - * energy_diff(): Estimate the energy impact of changing the utilization
- - * distribution. eenv specifies the change: utilisation amount, source, and
- - * destination cpu. Source or destination cpu may be -1 in which case the
- - * utilization is removed from or added to the system (e.g. task wake-up). If
- - * both are specified, the utilization is migrated.
- + * select_energy_cpu_idx(): estimate the energy impact of changing the
- + * utilization distribution.
- + *
- + * The eenv parameter specifies the changes: utilisation amount and a pair of
- + * possible CPU candidates (the previous CPU and a different target CPU).
- + *
- + * This function returns the index of a CPU candidate specified by the
- + * energy_env which corresponds to the first CPU saving energy.
- + * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
- + * efficient than running on prev_cpu. This is also the value returned in case
- + * of abort due to error conditions during the computations.
- + * A value greater than zero means that the first energy-efficient CPU is the
- + * one represented by eenv->cpu[eenv->next_idx].cpu_id.
- */
- -static inline int __energy_diff(struct energy_env *eenv)
- +static inline int select_energy_cpu_idx(struct energy_env *eenv)
- {
- struct sched_domain *sd;
- struct sched_group *sg;
- int sd_cpu = -1;
- + int cpu_idx;
- + int margin;
- - if (eenv->src_cpu == eenv->dst_cpu)
- - return 0;
- -
- - sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
- + sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
- sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
- +
- if (!sd)
- - return 0; /* Error */
- + return EAS_CPU_PRV;
- +
- + cpumask_clear(&eenv->cpus_mask);
- + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
- + int cpu = eenv->cpu[cpu_idx].cpu_id;
- +
- + if (cpu < 0)
- + continue;
- + cpumask_set_cpu(cpu, &eenv->cpus_mask);
- + }
- sg = sd->groups;
- +
- do {
- - if (!cpu_in_sg(sg, eenv->src_cpu) &&
- - !cpu_in_sg(sg, eenv->dst_cpu))
- + /* Skip SGs which do not contains a candidate CPU */
- + if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
- continue;
- eenv->sg_top = sg;
- - if (sched_group_energy(eenv))
- - return 0; /* Invalid result abort */
- + /* energy is unscaled to reduce rounding errors */
- + if (compute_energy(eenv) == -EINVAL)
- + return EAS_CPU_PRV;
- } while (sg = sg->next, sg != sd->groups);
- - __update_perf_energy_deltas(eenv);
- -
- - trace_sched_energy_diff(eenv);
- - trace_sched_energy_perf_deltas(eenv);
- -
- - return eenv->nrg_delta;
- -}
- + /* Scale energy before comparisons */
- + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
- + eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
- -#ifdef CONFIG_SCHED_TUNE
- -
- -struct target_nrg schedtune_target_nrg;
- -
- -/*
- - * System energy normalization
- - * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
- - * corresponding to the specified energy variation.
- - */
- -static inline int
- -normalize_energy(int energy_diff)
- -{
- - u32 normalized_nrg;
- -#ifdef CONFIG_SCHED_DEBUG
- - int max_delta;
- -
- - /* Check for boundaries */
- - max_delta = schedtune_target_nrg.max_power;
- - max_delta -= schedtune_target_nrg.min_power;
- - WARN_ON(abs(energy_diff) >= max_delta);
- -#endif
- -
- - /* Do scaling using positive numbers to increase the range */
- - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
- -
- - /* Scale by energy magnitude */
- - normalized_nrg <<= SCHED_CAPACITY_SHIFT;
- -
- - /* Normalize on max energy for target platform */
- - normalized_nrg = reciprocal_divide(
- - normalized_nrg, schedtune_target_nrg.rdiv);
- -
- - return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
- -}
- -
- -static inline bool filter_energy(void)
- -{
- - return sched_feat(ENERGY_FILTER);
- -}
- -
- -static inline int
- -energy_diff(struct energy_env *eenv)
- -{
- - int boost;
- + /*
- + * Compute the dead-zone margin used to prevent too many task
- + * migrations with negligible energy savings.
- + * An energy saving is considered meaningful if it reduces the energy
- + * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56%
- + */
- + margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
- - /* Conpute "absolute" energy diff */
- - __energy_diff(eenv);
- - if (!filter_energy())
- - return eenv->nrg_delta;
- + /*
- + * By default the EAS_CPU_PRV CPU is considered the most energy
- + * efficient, with a 0 energy variation.
- + */
- + eenv->next_idx = EAS_CPU_PRV;
- - /* Return energy diff when boost margin is 0 */
- - boost = schedtune_task_boost(eenv->task);
- - if (boost == 0)
- - return eenv->nrg_delta;
- + /*
- + * Compare the other CPU candidates to find a CPU which can be
- + * more energy efficient than EAS_CPU_PRV
- + */
- + for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
- + /* Skip not valid scheduled candidates */
- + if (eenv->cpu[cpu_idx].cpu_id < 0)
- + continue;
- + /* Compute energy delta wrt EAS_CPU_PRV */
- + eenv->cpu[cpu_idx].nrg_delta =
- + eenv->cpu[cpu_idx].energy -
- + eenv->cpu[EAS_CPU_PRV].energy;
- + /* filter energy variations within the dead-zone margin */
- + if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
- + eenv->cpu[cpu_idx].nrg_delta = 0;
- + /* update the schedule candidate with min(nrg_delta) */
- + if (eenv->cpu[cpu_idx].nrg_delta <
- + eenv->cpu[eenv->next_idx].nrg_delta) {
- + eenv->next_idx = cpu_idx;
- + if (sched_feat(FBT_STRICT_ORDER))
- + break;
- + }
- + }
- - eenv->payoff = schedtune_accept_deltas(
- - eenv->nrg_delta,
- - eenv->prf_delta,
- - eenv->task);
- -
- - /*
- - * When SchedTune is enabled, the energy_diff() function will return
- - * the computed energy payoff value. Since the energy_diff() return
- - * value is expected to be negative by its callers, this evaluation
- - * function return a negative value each time the evaluation return a
- - * positive payoff, which is the condition for the acceptance of
- - * a scheduling decision
- - */
- - return -eenv->payoff;
- + return eenv->next_idx;
- +
- }
- -#else /* CONFIG_SCHED_TUNE */
- -#define energy_diff(eenv) __energy_diff(eenv)
- -#endif
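- For the dead-zone filter in select_energy_cpu_idx() above, energy >> 6 is a divide by 64, i.e. roughly a 1.56% threshold on the previous CPU's estimated energy. A quick standalone sketch (illustrative values, not patch code):
- #include <stdio.h>
- #include <stdlib.h>
- 
- int main(void)
- {
-         int prv_energy = 2000, nxt_energy = 1980;       /* illustrative scaled energies */
-         int margin = prv_energy >> 6;                   /* 2000 / 64 = 31, ~1.56% of prv */
-         int nrg_delta = nxt_energy - prv_energy;        /* -20 */
- 
-         /* deltas smaller than the margin are treated as "no saving" */
-         if (abs(nrg_delta) < margin)
-                 nrg_delta = 0;
-         printf("margin=%d delta=%d\n", margin, nrg_delta);
-         return 0;
- }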
- /*
- * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
- @@ -5078,31 +5699,34 @@
- * being client/server, worker/dispatcher, interrupt source or whatever is
- * irrelevant, spread criteria is apparent partner count exceeds socket size.
- */
- -static int wake_wide(struct task_struct *p)
- +static int wake_wide(struct task_struct *p, int sibling_count_hint)
- {
- unsigned int master = current->wakee_flips;
- unsigned int slave = p->wakee_flips;
- - int factor = this_cpu_read(sd_llc_size);
- + int llc_size = this_cpu_read(sd_llc_size);
- +
- + if (sibling_count_hint >= llc_size)
- + return 1;
- if (master < slave)
- swap(master, slave);
- - if (slave < factor || master < slave * factor)
- + if (slave < llc_size || master < slave * llc_size)
- return 0;
- return 1;
- }
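- The wake_wide() heuristic above declares a wakeup "wide" when the sibling count hint already reaches the LLC size, or when the ordered wakee-flip counts suggest more apparent partners than the LLC can hold. A standalone sketch of that decision (not patch code; the function name and numbers are illustrative):
- #include <stdio.h>
- 
- static int wake_wide_sketch(unsigned int master_flips, unsigned int slave_flips,
-                             unsigned int llc_size, int sibling_count_hint)
- {
-         if (sibling_count_hint >= (int)llc_size)
-                 return 1;
-         if (master_flips < slave_flips) {       /* keep master as the larger count */
-                 unsigned int tmp = master_flips;
-                 master_flips = slave_flips;
-                 slave_flips = tmp;
-         }
-         if (slave_flips < llc_size || master_flips < slave_flips * llc_size)
-                 return 0;
-         return 1;
- }
- 
- int main(void)
- {
-         /* llc_size 4: a dispatcher with 40 flips waking a worker with 5 -> spread (1) */
-         printf("%d\n", wake_wide_sketch(40, 5, 4, 1));
-         return 0;
- }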
- -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
- +static int wake_affine(struct sched_domain *sd, struct task_struct *p,
- + int prev_cpu, int sync)
- {
- s64 this_load, load;
- s64 this_eff_load, prev_eff_load;
- - int idx, this_cpu, prev_cpu;
- + int idx, this_cpu;
- struct task_group *tg;
- unsigned long weight;
- int balanced;
- idx = sd->wake_idx;
- this_cpu = smp_processor_id();
- - prev_cpu = task_cpu(p);
- load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
- @@ -5146,18 +5770,29 @@
- balanced = this_eff_load <= prev_eff_load;
- - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
- + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
- if (!balanced)
- return 0;
- - schedstat_inc(sd, ttwu_move_affine);
- - schedstat_inc(p, se.statistics.nr_wakeups_affine);
- + schedstat_inc(sd->ttwu_move_affine);
- + schedstat_inc(p->se.statistics.nr_wakeups_affine);
- return 1;
- }
- -unsigned int capacity_margin = 1280; /* ~20% margin */
- +static inline unsigned long task_util(struct task_struct *p)
- +{
- +#ifdef CONFIG_SCHED_WALT
- + if (!walt_disabled && sysctl_sched_use_walt_task_util) {
- + unsigned long demand = p->ravg.demand;
- + return (demand << 10) / walt_ravg_window;
- + }
- +#endif
- + return p->se.avg.util_avg;
- +}
- +
- +static inline unsigned long boosted_task_util(struct task_struct *p);
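- With CONFIG_SCHED_WALT, task_util() above scales the WALT demand into the [0..1024] capacity range via (demand << 10) / walt_ravg_window. A minimal sketch of that conversion (not patch code; it simplifies demand to plain busy time inside the window, and the window length is illustrative):
- #include <stdio.h>
- 
- /* Scale a WALT demand within the window to the 0..1024 util range. */
- static unsigned long long walt_task_util_sketch(unsigned long long demand_ns,
-                                                 unsigned long long window_ns)
- {
-         return (demand_ns << 10) / window_ns;
- }
- 
- int main(void)
- {
-         /* 5ms of demand inside a 20ms window -> util 256, one quarter of full scale */
-         printf("%llu\n", walt_task_util_sketch(5000000ULL, 20000000ULL));
-         return 0;
- }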
- static inline bool __task_fits(struct task_struct *p, int cpu, int util)
- {
- @@ -5182,29 +5817,131 @@
- return __task_fits(p, cpu, 0);
- }
- -static inline bool task_fits_spare(struct task_struct *p, int cpu)
- +static bool __cpu_overutilized(int cpu, int delta)
- {
- - return __task_fits(p, cpu, cpu_util(cpu, UTIL_EST));
- + return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
- }
- static bool cpu_overutilized(int cpu)
- {
- - return (capacity_of(cpu) * 1024) < (cpu_util(cpu, UTIL_AVG) * capacity_margin);
- + return __cpu_overutilized(cpu, 0);
- +}
- +
- +#ifdef CONFIG_SCHED_TUNE
- +
- +struct reciprocal_value schedtune_spc_rdiv;
- +
- +static long
- +schedtune_margin(unsigned long signal, long boost)
- +{
- + long long margin = 0;
- +
- + /*
- + * Signal proportional compensation (SPC)
- + *
- + * The Boost (B) value is used to compute a Margin (M) which is
- + * proportional to the complement of the original Signal (S):
- + * M = B * (SCHED_CAPACITY_SCALE - S)
- + * The obtained M could be used by the caller to "boost" S.
- + */
- + if (boost >= 0) {
- + margin = SCHED_CAPACITY_SCALE - signal;
- + margin *= boost;
- + } else {
- + margin = -signal * boost;
- + }
- +
- + margin = reciprocal_divide(margin, schedtune_spc_rdiv);
- + if (boost < 0)
- + margin *= -1;
- +
- + return margin;
- +}
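- The SPC formula in schedtune_margin() above, M = B * (SCHED_CAPACITY_SCALE - S), is easy to check by hand. A standalone sketch (not patch code), under the assumption that boost is a percentage in [-100..100] and schedtune_spc_rdiv encodes a division by 100 as set up in schedtune.c:
- #include <stdio.h>
- 
- #define SCHED_CAPACITY_SCALE 1024LL
- 
- /* M = B% * (SCHED_CAPACITY_SCALE - S) for positive boost, -S * B% for negative. */
- static long long spc_margin(long long signal, long long boost_pct)
- {
-         long long margin;
- 
-         if (boost_pct >= 0)
-                 margin = (SCHED_CAPACITY_SCALE - signal) * boost_pct;
-         else
-                 margin = -signal * boost_pct;
-         margin /= 100;          /* stands in for reciprocal_divide(.., schedtune_spc_rdiv) */
-         return boost_pct < 0 ? -margin : margin;
- }
- 
- int main(void)
- {
-         /* util 200 with a 50% boost -> margin 412, so the boosted util is 612 */
-         printf("%lld\n", spc_margin(200, 50));
-         return 0;
- }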
- +
- +static inline int
- +schedtune_cpu_margin(unsigned long util, int cpu)
- +{
- + int boost = schedtune_cpu_boost(cpu);
- +
- + if (boost == 0)
- + return 0;
- +
- + return schedtune_margin(util, boost);
- +}
- +
- +static inline long
- +schedtune_task_margin(struct task_struct *p)
- +{
- + int boost = schedtune_task_boost(p);
- + unsigned long util;
- + long margin;
- +
- + if (boost == 0)
- + return 0;
- +
- + util = task_util(p);
- + margin = schedtune_margin(util, boost);
- +
- + return margin;
- +}
- +
- +#else /* CONFIG_SCHED_TUNE */
- +
- +static inline int
- +schedtune_cpu_margin(unsigned long util, int cpu)
- +{
- + return 0;
- +}
- +
- +static inline int
- +schedtune_task_margin(struct task_struct *p)
- +{
- + return 0;
- +}
- +
- +#endif /* CONFIG_SCHED_TUNE */
- +
- +unsigned long
- +boosted_cpu_util(int cpu)
- +{
- + unsigned long util = cpu_util_freq(cpu);
- + long margin = schedtune_cpu_margin(util, cpu);
- +
- + trace_sched_boost_cpu(cpu, util, margin);
- +
- + return util + margin;
- +}
- +
- +static inline unsigned long
- +boosted_task_util(struct task_struct *p)
- +{
- + unsigned long util = task_util(p);
- + long margin = schedtune_task_margin(p);
- +
- + trace_sched_boost_task(p, util, margin);
- +
- + return util + margin;
- +}
- +
- +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
- +{
- + return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
- }
- /*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- + *
- + * Assumes p is allowed on at least one CPU in sd.
- */
- static struct sched_group *
- find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
- {
- struct sched_group *idlest = NULL, *group = sd->groups;
- - struct sched_group *fit_group = NULL, *spare_group = NULL;
- - unsigned long min_load = ULONG_MAX, this_load = 0;
- - unsigned long fit_capacity = ULONG_MAX;
- - unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
- + struct sched_group *most_spare_sg = NULL;
- + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
- + unsigned long most_spare = 0, this_spare = 0;
- int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
- @@ -5212,7 +5949,7 @@
- load_idx = sd->wake_idx;
- do {
- - unsigned long load, avg_load, spare_capacity;
- + unsigned long load, avg_load, spare_cap, max_spare_cap;
- int local_group;
- int i;
- @@ -5224,8 +5961,12 @@
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
- - /* Tally up the load of all CPUs in the group */
- + /*
- + * Tally up the load of all CPUs in the group and find
- + * the group containing the CPU with most spare capacity.
- + */
- avg_load = 0;
- + max_spare_cap = 0;
- for_each_cpu(i, sched_group_cpus(group)) {
- /* Bias balancing toward cpus of our domain */
- @@ -5236,24 +5977,10 @@
- avg_load += load;
- - /*
- - * Look for most energy-efficient group that can fit
- - * that can fit the task.
- - */
- - if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
- - fit_capacity = capacity_of(i);
- - fit_group = group;
- - }
- + spare_cap = capacity_spare_wake(i, p);
- - /*
- - * Look for group which has most spare capacity on a
- - * single cpu.
- - */
- - spare_capacity = capacity_of(i) - cpu_util(i, UTIL_EST);
- - if (spare_capacity > max_spare_capacity) {
- - max_spare_capacity = spare_capacity;
- - spare_group = group;
- - }
- + if (spare_cap > max_spare_cap)
- + max_spare_cap = spare_cap;
- }
- /* Adjust by relative CPU capacity of the group */
- @@ -5261,28 +5988,51 @@
- if (local_group) {
- this_load = avg_load;
- - } else if (avg_load < min_load) {
- - min_load = avg_load;
- - idlest = group;
- + this_spare = max_spare_cap;
- + } else {
- + if (avg_load < min_load) {
- + min_load = avg_load;
- + idlest = group;
- + }
- +
- + if (most_spare < max_spare_cap) {
- + most_spare = max_spare_cap;
- + most_spare_sg = group;
- + }
- }
- } while (group = group->next, group != sd->groups);
- - if (fit_group)
- - return fit_group;
- + /*
- + * The cross-over point between using spare capacity or least load
- + * is too conservative for high utilization tasks on partially
- + * utilized systems if we require spare_capacity > task_util(p)
- + * so we allow for some task stuffing by using
- + * spare_capacity > task_util(p)/2.
- + *
- + * Spare capacity can't be used for fork because the utilization has
- + * not been set yet, we must first select a rq to compute the initial
- + * utilization.
- + */
- + if (sd_flag & SD_BALANCE_FORK)
- + goto skip_spare;
- - if (spare_group)
- - return spare_group;
- + if (this_spare > task_util(p) / 2 &&
- + imbalance*this_spare > 100*most_spare)
- + return NULL;
- + else if (most_spare > task_util(p) / 2)
- + return most_spare_sg;
- +skip_spare:
- if (!idlest || 100*this_load < imbalance*min_load)
- return NULL;
- return idlest;
- }
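- The spare-capacity vs. least-load cross-over at the end of find_idlest_group() above prefers the local group (returns NULL) when its spare capacity covers at least half of the task's utilization and beats the best remote spare by the imbalance percentage. A small standalone sketch of that decision (not patch code; names and numbers are illustrative):
- #include <stdio.h>
- 
- /* Decision helper mirroring the tail of find_idlest_group() (sketch only). */
- static const char *pick_strategy(unsigned long task_util,
-                                  unsigned long this_spare,
-                                  unsigned long most_spare,
-                                  unsigned long imbalance /* 100 + (imbalance_pct-100)/2 */)
- {
-         if (this_spare > task_util / 2 && imbalance * this_spare > 100UL * most_spare)
-                 return "stay local";
-         if (most_spare > task_util / 2)
-                 return "use most-spare group";
-         return "fall back to least-loaded group";
- }
- 
- int main(void)
- {
-         /* task_util 300, local spare 400, best remote spare 300, imbalance 112 */
-         printf("%s\n", pick_strategy(300, 400, 300, 112));
-         return 0;
- }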
- /*
- - * find_idlest_cpu - find the idlest cpu among the cpus in group.
- + * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
- */
- static int
- -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
- +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
- {
- unsigned long load, min_load = ULONG_MAX;
- unsigned int min_exit_latency = UINT_MAX;
- @@ -5291,9 +6041,13 @@
- int shallowest_idle_cpu = -1;
- int i;
- + /* Check if we have any choice: */
- + if (group->group_weight == 1)
- + return cpumask_first(sched_group_cpus(group));
- +
- /* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- - if (task_fits_spare(p, i)) {
- + if (idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
- struct cpuidle_state *idle = idle_get_state(rq);
- if (idle && idle->exit_latency < min_exit_latency) {
- @@ -5305,8 +6059,7 @@
- min_exit_latency = idle->exit_latency;
- latest_idle_timestamp = rq->idle_stamp;
- shallowest_idle_cpu = i;
- - } else if (idle_cpu(i) &&
- - (!idle || idle->exit_latency == min_exit_latency) &&
- + } else if ((!idle || idle->exit_latency == min_exit_latency) &&
- rq->idle_stamp > latest_idle_timestamp) {
- /*
- * If equal or no active idle state, then
- @@ -5315,15 +6068,8 @@
- */
- latest_idle_timestamp = rq->idle_stamp;
- shallowest_idle_cpu = i;
- - } else if (shallowest_idle_cpu == -1) {
- - /*
- - * If we haven't found an idle CPU yet
- - * pick a non-idle one that can fit the task as
- - * fallback.
- - */
- - shallowest_idle_cpu = i;
- }
- - } else {
- + } else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
- if (load < min_load || (load == min_load && i == this_cpu)) {
- min_load = load;
- @@ -5333,29 +6079,99 @@
- }
- return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
- + }
- +
- +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
- + int cpu, int prev_cpu, int sd_flag)
- +{
- + int new_cpu = cpu;
- + int wu = sd_flag & SD_BALANCE_WAKE;
- + int cas_cpu = -1;
- +
- + if (wu) {
- + schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts);
- + schedstat_inc(this_rq()->eas_stats.cas_attempts);
- + }
- +
- + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
- + return prev_cpu;
- +
- + while (sd) {
- + struct sched_group *group;
- + struct sched_domain *tmp;
- + int weight;
- +
- + if (wu)
- + schedstat_inc(sd->eas_stats.cas_attempts);
- +
- + if (!(sd->flags & sd_flag)) {
- + sd = sd->child;
- + continue;
- + }
- +
- + group = find_idlest_group(sd, p, cpu, sd_flag);
- + if (!group) {
- + sd = sd->child;
- + continue;
- + }
- +
- + new_cpu = find_idlest_group_cpu(group, p, cpu);
- + if (new_cpu == cpu) {
- + /* Now try balancing at a lower domain level of cpu */
- + sd = sd->child;
- + continue;
- + }
- +
- + /* Now try balancing at a lower domain level of new_cpu */
- + cpu = cas_cpu = new_cpu;
- + weight = sd->span_weight;
- + sd = NULL;
- + for_each_domain(cpu, tmp) {
- + if (weight <= tmp->span_weight)
- + break;
- + if (tmp->flags & sd_flag)
- + sd = tmp;
- + }
- + /* while loop will break here if sd == NULL */
- + }
- +
- + if (wu && (cas_cpu >= 0)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_cas_count);
- + schedstat_inc(this_rq()->eas_stats.cas_count);
- + }
- +
- + return new_cpu;
- }
- /*
- * Try and locate an idle CPU in the sched_domain.
- */
- -static int select_idle_sibling(struct task_struct *p, int target)
- +static int select_idle_sibling(struct task_struct *p, int prev, int target)
- {
- struct sched_domain *sd;
- struct sched_group *sg;
- - int i = task_cpu(p);
- - int best_idle = -1;
- - int best_idle_cstate = -1;
- - int best_idle_capacity = INT_MAX;
- + int best_idle_cpu = -1;
- + int best_idle_cstate = INT_MAX;
- + unsigned long best_idle_capacity = ULONG_MAX;
- +
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_attempts);
- + schedstat_inc(this_rq()->eas_stats.sis_attempts);
- if (!sysctl_sched_cstate_aware) {
- - if (idle_cpu(target))
- + if (idle_cpu(target)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_idle);
- + schedstat_inc(this_rq()->eas_stats.sis_idle);
- return target;
- + }
- /*
- * If the previous cpu is cache affine and idle, don't be stupid.
- */
- - if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- - return i;
- + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine);
- + schedstat_inc(this_rq()->eas_stats.sis_cache_affine);
- + return prev;
- + }
- }
- /*
- @@ -5365,24 +6181,30 @@
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- + int i;
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
- if (sysctl_sched_cstate_aware) {
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
- - struct rq *rq = cpu_rq(i);
- - int idle_idx = idle_get_state_idx(rq);
- + int idle_idx = idle_get_state_idx(cpu_rq(i));
- unsigned long new_usage = boosted_task_util(p);
- unsigned long capacity_orig = capacity_orig_of(i);
- +
- if (new_usage > capacity_orig || !idle_cpu(i))
- goto next;
- - if (i == target && new_usage <= capacity_curr_of(target))
- + if (i == target && new_usage <= capacity_curr_of(target)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_suff_cap);
- + schedstat_inc(this_rq()->eas_stats.sis_suff_cap);
- + schedstat_inc(sd->eas_stats.sis_suff_cap);
- return target;
- + }
- - if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
- - best_idle = i;
- + if (idle_idx < best_idle_cstate &&
- + capacity_orig <= best_idle_capacity) {
- + best_idle_cpu = i;
- best_idle_cstate = idle_idx;
- best_idle_capacity = capacity_orig;
- }
- @@ -5395,6 +6217,9 @@
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_idle_cpu);
- + schedstat_inc(this_rq()->eas_stats.sis_idle_cpu);
- + schedstat_inc(sd->eas_stats.sis_idle_cpu);
- goto done;
- }
- next:
- @@ -5402,171 +6227,112 @@
- } while (sg != sd->groups);
- }
- - if (best_idle > 0)
- - target = best_idle;
- + if (best_idle_cpu >= 0)
- + target = best_idle_cpu;
- done:
- + schedstat_inc(p->se.statistics.nr_wakeups_sis_count);
- + schedstat_inc(this_rq()->eas_stats.sis_count);
- +
- return target;
- }
- -static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
- +/*
- + * cpu_util_wake: Compute cpu utilization with any contributions from
- + * the waking task p removed. check_for_migration() looks for a better CPU of
- + * rq->curr. For that case we should return cpu util with contributions from
- + * currently running task p removed.
- + */
- +static int cpu_util_wake(int cpu, struct task_struct *p)
- {
- - int iter_cpu;
- - int target_cpu = -1;
- - int target_util = 0;
- - int backup_capacity = 0;
- - int best_idle_cpu = -1;
- - int best_idle_cstate = INT_MAX;
- - int backup_cpu = -1;
- - unsigned long min_util;
- - unsigned long new_util;
- -
- - min_util = boosted_task_util(p);
- - for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
- - int cur_capacity;
- - struct rq *rq;
- - int idle_idx;
- -
- - /*
- - * Iterate from higher cpus for boosted tasks.
- - */
- - int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
- -
- - if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
- - continue;
- -
- - /*
- - * p's blocked utilization is still accounted for on prev_cpu
- - * so prev_cpu will receive a negative bias due to the double
- - * accounting. However, the blocked utilization may be zero.
- - */
- - new_util = cpu_util(i, UTIL_EST) + task_util(p, UTIL_EST);
- -
- - /*
- - * Ensure minimum capacity to grant the required boost.
- - * The target CPU can be already at a capacity level higher
- - * than the one required to boost the task.
- - */
- - new_util = max(min_util, new_util);
- - if (new_util > capacity_orig_of(i))
- - continue;
- + unsigned long util, capacity;
- #ifdef CONFIG_SCHED_WALT
- - if (walt_cpu_high_irqload(i))
- - continue;
- + /*
- + * WALT does not decay idle tasks in the same manner
- + * as PELT, so it makes little sense to subtract task
- + * utilization from cpu utilization. Instead just use
- + * cpu_util for this case.
- + */
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
- + p->state == TASK_WAKING)
- + return cpu_util(cpu);
- #endif
- + /* Task has no contribution or is new */
- + if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
- + return cpu_util(cpu);
- - /*
- - * Unconditionally favoring tasks that prefer idle cpus to
- - * improve latency.
- - */
- - if (idle_cpu(i) && prefer_idle) {
- - if (best_idle_cpu < 0)
- - best_idle_cpu = i;
- - continue;
- - }
- -
- - cur_capacity = capacity_curr_of(i);
- - rq = cpu_rq(i);
- - idle_idx = idle_get_state_idx(rq);
- + capacity = capacity_orig_of(cpu);
- + util = max_t(long, cpu_util(cpu) - task_util(p), 0);
- - if (new_util < cur_capacity) {
- - if (cpu_rq(i)->nr_running) {
- - if (!prefer_idle) {
- - /* Find a target cpu with highest
- - * utilization.
- - */
- - if (target_util == 0 ||
- - target_util < new_util) {
- - target_cpu = i;
- - target_util = new_util;
- - }
- - } else {
- - /* Find a target cpu with lowest
- - * utilization.
- - */
- - if (target_util == 0 ||
- - target_util > new_util) {
- - target_cpu = i;
- - target_util = new_util;
- - }
- - }
- - } else if (!prefer_idle) {
- - if (best_idle_cpu < 0 ||
- - (sysctl_sched_cstate_aware &&
- - best_idle_cstate > idle_idx)) {
- - best_idle_cstate = idle_idx;
- - best_idle_cpu = i;
- - }
- - }
- - } else if (backup_capacity == 0 ||
- - backup_capacity > cur_capacity) {
- - // Find a backup cpu with least capacity.
- - backup_capacity = cur_capacity;
- - backup_cpu = i;
- - }
- - }
- + return (util >= capacity) ? capacity : util;
- +}
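Stripped of the WALT and last_update_time special cases handled above, cpu_util_wake() is a clamped subtraction. The standalone sketch below models only that arithmetic with made-up utilization figures; it is not kernel code.

#include <stdio.h>

/* Remove the waking task's contribution from a CPU's utilization,
 * never going negative and never exceeding the CPU's capacity. */
static long cpu_util_wake_model(long cpu_util, long task_util, long capacity)
{
	long util = cpu_util - task_util;

	if (util < 0)
		util = 0;
	return (util >= capacity) ? capacity : util;
}

int main(void)
{
	printf("%ld\n", cpu_util_wake_model(600, 150, 1024));	/* 450  */
	printf("%ld\n", cpu_util_wake_model(100, 150, 1024));	/* 0    */
	printf("%ld\n", cpu_util_wake_model(2000, 150, 1024));	/* 1024 */
	return 0;
}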
- - if (prefer_idle && best_idle_cpu >= 0)
- - target_cpu = best_idle_cpu;
- - else if (target_cpu < 0)
- - target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
- +static int start_cpu(bool boosted)
- +{
- + struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- - return target_cpu;
- + return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
- }
- -static int energy_aware_wake_cpu(struct task_struct *p, int target)
- +static inline int find_best_target(struct task_struct *p, int *backup_cpu,
- + bool boosted, bool prefer_idle)
- {
- + unsigned long min_util = boosted_task_util(p);
- + unsigned long target_capacity = ULONG_MAX;
- + unsigned long min_wake_util = ULONG_MAX;
- + unsigned long target_max_spare_cap = 0;
- + unsigned long best_active_util = ULONG_MAX;
- + int best_idle_cstate = INT_MAX;
- struct sched_domain *sd;
- - struct sched_group *sg, *sg_target;
- - int target_max_cap = INT_MAX;
- - int target_cpu = task_cpu(p);
- - unsigned long min_util;
- - unsigned long new_util;
- - int i;
- + struct sched_group *sg;
- + int best_active_cpu = -1;
- + int best_idle_cpu = -1;
- + int target_cpu = -1;
- + int cpu, i;
- - sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
- + *backup_cpu = -1;
- - if (!sd)
- - return target;
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts);
- + schedstat_inc(this_rq()->eas_stats.fbt_attempts);
- - sg = sd->groups;
- - sg_target = sg;
- + /* Find start CPU based on boost value */
- + cpu = start_cpu(boosted);
- + if (cpu < 0) {
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu);
- + schedstat_inc(this_rq()->eas_stats.fbt_no_cpu);
- + return -1;
- + }
- - if (sysctl_sched_is_big_little) {
- + /* Find SD for the start CPU */
- + sd = rcu_dereference(per_cpu(sd_ea, cpu));
- + if (!sd) {
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd);
- + schedstat_inc(this_rq()->eas_stats.fbt_no_sd);
- + return -1;
- + }
- - /*
- - * Find group with sufficient capacity. We only get here if no cpu is
- - * overutilized. We may end up overutilizing a cpu by adding the task,
- - * but that should not be any worse than select_idle_sibling().
- - * load_balance() should sort it out later as we get above the tipping
- - * point.
- - */
- - do {
- - /* Assuming all cpus are the same in group */
- - int max_cap_cpu = group_first_cpu(sg);
- + /* Scan CPUs in all SDs */
- + sg = sd->groups;
- + do {
- + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
- + unsigned long capacity_curr = capacity_curr_of(i);
- + unsigned long capacity_orig = capacity_orig_of(i);
- + unsigned long wake_util, new_util;
- - /*
- - * Assume smaller max capacity means more energy-efficient.
- - * Ideally we should query the energy model for the right
- - * answer but it easily ends up in an exhaustive search.
- - */
- - if (capacity_of(max_cap_cpu) < target_max_cap &&
- - task_fits_max(p, max_cap_cpu)) {
- - sg_target = sg;
- - target_max_cap = capacity_of(max_cap_cpu);
- - }
- - } while (sg = sg->next, sg != sd->groups);
- + if (!cpu_online(i))
- + continue;
- +
- + if (walt_cpu_high_irqload(i))
- + continue;
- - /* Find cpu with sufficient capacity */
- - min_util = boosted_task_util(p);
- - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- - new_util = cpu_util(i, UTIL_EST) + task_util(p, UTIL_EST);
- + wake_util = cpu_util_wake(i, p);
- + new_util = wake_util + task_util(p);
- /*
- * Ensure minimum capacity to grant the required boost.
- @@ -5574,49 +6340,349 @@
- * than the one required to boost the task.
- */
- new_util = max(min_util, new_util);
- - if (new_util > capacity_orig_of(i))
- + if (new_util > capacity_orig)
- + continue;
- +
- + /*
- + * Case A) Latency sensitive tasks
- + *
- + * Unconditionally favoring tasks that prefer idle CPUs to
- + * improve latency.
- + *
- + * Looking for:
- + * - an idle CPU, whatever its idle_state is, since
- + * the first CPUs we explore are more likely to be
- + * reserved for latency sensitive tasks.
- + * - a non idle CPU where the task fits in its current
- + * capacity and has the maximum spare capacity.
- + * - a non idle CPU with lower contention from other
- + * tasks and running at the lowest possible OPP.
- + *
- + * The last two goals try to favor a non idle CPU
- + * where the task can run as if it is "almost alone".
- + * A maximum spare capacity CPU is favoured since
- + * the task already fits into that CPU's capacity
- + * without waiting for an OPP change.
- + *
- + * The following code path is the only one in the CPUs
- + * exploration loop which is always used by
- + * prefer_idle tasks. It exits the loop with either a
- + * best_active_cpu or a target_cpu which should
- + * represent an optimal choice for latency sensitive
- + * tasks.
- + */
- + if (prefer_idle) {
- +
- + /*
- + * Case A.1: IDLE CPU
- + * Return the first IDLE CPU we find.
- + */
- + if (idle_cpu(i)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle);
- + schedstat_inc(this_rq()->eas_stats.fbt_pref_idle);
- +
- + trace_sched_find_best_target(p,
- + prefer_idle, min_util,
- + cpu, best_idle_cpu,
- + best_active_cpu, i);
- +
- + return i;
- + }
- +
- + /*
- + * Case A.2: Target ACTIVE CPU
- + * Favor CPUs with max spare capacity.
- + */
- + if ((capacity_curr > new_util) &&
- + (capacity_orig - new_util > target_max_spare_cap)) {
- + target_max_spare_cap = capacity_orig - new_util;
- + target_cpu = i;
- + continue;
- + }
- + if (target_cpu != -1)
- + continue;
- +
- +
- + /*
- + * Case A.3: Backup ACTIVE CPU
- + * Favor CPUs with:
- + * - lower utilization due to other tasks
- + * - lower utilization with the task in
- + */
- + if (wake_util > min_wake_util)
- + continue;
- + if (new_util > best_active_util)
- + continue;
- + min_wake_util = wake_util;
- + best_active_util = new_util;
- + best_active_cpu = i;
- continue;
- + }
- - if (new_util < capacity_curr_of(i)) {
- - target_cpu = i;
- - if (cpu_rq(i)->nr_running)
- - break;
- + /*
- + * Enforce EAS mode
- + *
- + * For non latency sensitive tasks, skip CPUs that
- + * will be overutilized by moving the task there.
- + *
- + * The goal here is to remain in EAS mode as long as
- + * possible at least for !prefer_idle tasks.
- + */
- + if ((new_util * capacity_margin) >
- + (capacity_orig * SCHED_CAPACITY_SCALE))
- + continue;
- +
- + /*
- + * Favor CPUs with smaller capacity for Non latency
- + * sensitive tasks.
- + */
- + if (capacity_orig > target_capacity)
- + continue;
- +
- + /*
- + * Case B) Non latency sensitive tasks on IDLE CPUs.
- + *
- + * Find an optimal backup IDLE CPU for non latency
- + * sensitive tasks.
- + *
- + * Looking for:
- + * - minimizing the capacity_orig,
- + * i.e. preferring LITTLE CPUs
- + * - favoring shallowest idle states
- + * i.e. avoid waking up deep-idle CPUs
- + *
- + * The following code path is used by non latency
- + * sensitive tasks if IDLE CPUs are available. If at
- + * least one such CPU is available, it sets the
- + * best_idle_cpu to the most suitable idle CPU to be
- + * selected.
- + *
- + * If idle CPUs are available, favour these CPUs to
- + * improve performance by spreading tasks.
- + * Indeed, the energy_diff() computed by the caller
- + * will take care of minimizing energy
- + * consumption without affecting performance.
- + */
- + if (idle_cpu(i)) {
- + int idle_idx = idle_get_state_idx(cpu_rq(i));
- +
- + /*
- + * Skip CPUs in deeper idle state, but only
- + * if they are also less energy efficient.
- + * IOW, prefer a deep IDLE LITTLE CPU vs a
- + * shallow idle big CPU.
- + */
- + if (sysctl_sched_cstate_aware &&
- + best_idle_cstate <= idle_idx)
- + continue;
- +
- + /* Keep track of best idle CPU */
- + target_capacity = capacity_orig;
- + best_idle_cstate = idle_idx;
- + best_idle_cpu = i;
- + continue;
- }
- - /* cpu has capacity at higher OPP, keep it as fallback */
- - if (target_cpu == task_cpu(p))
- - target_cpu = i;
- + /*
- + * Case C) Non latency sensitive tasks on ACTIVE CPUs.
- + *
- + * Pack tasks in the most energy efficient capacities.
- + *
- + * This task packing strategy prefers more energy
- + * efficient CPUs (i.e. pack on smaller maximum
- + * capacity CPUs) while also trying to spread tasks to
- + * run them all at the lower OPP.
- + *
- + * This assumes for example that it's more energy
- + * efficient to run two tasks on two CPUs at a lower
- + * OPP than packing both on a single CPU but running
- + * that CPU at a higher OPP.
- + *
- + * Thus, this case keeps track of the CPU with the
- + * smallest maximum capacity and highest spare maximum
- + * capacity.
- + */
- +
- + /* Favor CPUs with maximum spare capacity */
- + if ((capacity_orig - new_util) < target_max_spare_cap)
- + continue;
- +
- + target_max_spare_cap = capacity_orig - new_util;
- + target_capacity = capacity_orig;
- + target_cpu = i;
- }
- - } else {
- - /*
- - * Find a cpu with sufficient capacity
- - */
- - bool boosted = schedtune_task_boost(p) > 0;
- - bool prefer_idle = schedtune_prefer_idle(p) > 0;
- - int tmp_target = find_best_target(p, boosted, prefer_idle);
- - if (tmp_target >= 0) {
- - target_cpu = tmp_target;
- - if ((boosted || prefer_idle) && idle_cpu(target_cpu))
- - return target_cpu;
- +
- + } while (sg = sg->next, sg != sd->groups);
- +
- + /*
- + * For non latency sensitive tasks, cases B and C in the previous loop,
- + * we pick the best IDLE CPU only if we were not able to find a target
- + * ACTIVE CPU.
- + *
- + * Policies priorities:
- + *
- + * - prefer_idle tasks:
- + *
- + * a) IDLE CPU available, we return immediately
- + * b) ACTIVE CPU where the task fits and has the largest maximum spare
- + * capacity (i.e. target_cpu)
- + * c) ACTIVE CPU with less contention due to other tasks
- + * (i.e. best_active_cpu)
- + *
- + * - NON prefer_idle tasks:
- + *
- + * a) ACTIVE CPU: target_cpu
- + * b) IDLE CPU: best_idle_cpu
- + */
- + if (target_cpu == -1)
- + target_cpu = prefer_idle
- + ? best_active_cpu
- + : best_idle_cpu;
- + else
- + *backup_cpu = prefer_idle
- + ? best_active_cpu
- + : best_idle_cpu;
- +
- + trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
- + best_idle_cpu, best_active_cpu,
- + target_cpu);
- +
- + schedstat_inc(p->se.statistics.nr_wakeups_fbt_count);
- + schedstat_inc(this_rq()->eas_stats.fbt_count);
- +
- + return target_cpu;
- +}
- +
- +/*
- + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
- + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
- + *
- + * In that case WAKE_AFFINE doesn't make sense and we'll let
- + * BALANCE_WAKE sort things out.
- + */
- +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
- +{
- + long min_cap, max_cap;
- + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
- + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
- + /* Minimum capacity is close to max, no need to abort wake_affine */
- + if (max_cap - min_cap < max_cap >> 3)
- + return 0;
- +
- + /* Bring task utilization in sync with prev_cpu */
- + sync_entity_load_avg(&p->se);
- +
- + return min_cap * 1024 < task_util(p) * capacity_margin;
- +}
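wake_cap() above disables wake-affine placement when the task does not fit the smaller of the waking and previous CPUs with some headroom. The sketch below is a userspace model of those two checks; the capacity figures and the margin of 1280 (roughly: the task must fit within ~80% of the CPU's capacity) are assumptions for the example, since the kernel uses its own capacity_margin value.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

static int wake_cap_model(unsigned long min_cap, unsigned long max_cap,
			  unsigned long task_util, unsigned long margin)
{
	/* Capacities are within 12.5% of each other: keep wake-affine. */
	if (max_cap - min_cap < max_cap >> 3)
		return 0;

	/* Task does not fit the smaller CPU with margin: disable wake-affine. */
	return min_cap * SCHED_CAPACITY_SCALE < task_util * margin;
}

int main(void)
{
	/* Assumed big.LITTLE pair: LITTLE capacity 430, big capacity 1024. */
	printf("%d\n", wake_cap_model(430, 1024, 400, 1280));	/* 1: too big for LITTLE */
	printf("%d\n", wake_cap_model(430, 1024, 200, 1280));	/* 0: fits with margin   */
	return 0;
}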
- +
- +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
- +{
- + bool boosted, prefer_idle;
- + struct sched_domain *sd;
- + int target_cpu;
- + int backup_cpu;
- + int next_cpu;
- +
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
- + schedstat_inc(this_rq()->eas_stats.secb_attempts);
- +
- + if (sysctl_sched_sync_hint_enable && sync) {
- + int cpu = smp_processor_id();
- +
- + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_sync);
- + schedstat_inc(this_rq()->eas_stats.secb_sync);
- + return cpu;
- }
- }
- - if (target_cpu != task_cpu(p)) {
- + rcu_read_lock();
- +#ifdef CONFIG_CGROUP_SCHEDTUNE
- + boosted = schedtune_task_boost(p) > 0;
- + prefer_idle = schedtune_prefer_idle(p) > 0;
- +#else
- + boosted = get_sysctl_sched_cfs_boost() > 0;
- + prefer_idle = 0;
- +#endif
- +
- +
- +
- + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
- + if (!sd) {
- + target_cpu = prev_cpu;
- + goto unlock;
- + }
- +
- + sync_entity_load_avg(&p->se);
- +
- + /* Find a cpu with sufficient capacity */
- + next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);
- + if (next_cpu == -1) {
- + target_cpu = prev_cpu;
- + goto unlock;
- + }
- +
- + /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
- + if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
- + schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
- + target_cpu = next_cpu;
- + goto unlock;
- + }
- +
- + target_cpu = prev_cpu;
- + if (next_cpu != prev_cpu) {
- + int delta = 0;
- struct energy_env eenv = {
- - .util_delta = task_util(p, UTIL_EST),
- - .src_cpu = task_cpu(p),
- - .dst_cpu = target_cpu,
- - .task = p,
- + .p = p,
- + .util_delta = task_util(p),
- + /* Task's previous CPU candidate */
- + .cpu[EAS_CPU_PRV] = {
- + .cpu_id = prev_cpu,
- + },
- + /* Main alternative CPU candidate */
- + .cpu[EAS_CPU_NXT] = {
- + .cpu_id = next_cpu,
- + },
- + /* Backup alternative CPU candidate */
- + .cpu[EAS_CPU_BKP] = {
- + .cpu_id = backup_cpu,
- + },
- };
- +
- +#ifdef CONFIG_SCHED_WALT
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
- + p->state == TASK_WAKING)
- + delta = task_util(p);
- +#endif
- /* Not enough spare capacity on previous cpu */
- - if (cpu_overutilized(task_cpu(p)))
- - return target_cpu;
- + if (__cpu_overutilized(prev_cpu, delta)) {
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
- + schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
- + target_cpu = next_cpu;
- + goto unlock;
- + }
- - if (energy_diff(&eenv) >= 0)
- - return task_cpu(p);
- + /* Check if EAS_CPU_NXT is a more energy efficient CPU */
- + if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
- + schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
- + target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
- + goto unlock;
- + }
- +
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
- + schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
- + target_cpu = prev_cpu;
- + goto unlock;
- }
- + schedstat_inc(p->se.statistics.nr_wakeups_secb_count);
- + schedstat_inc(this_rq()->eas_stats.secb_count);
- +
- +unlock:
- + rcu_read_unlock();
- return target_cpu;
- }
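select_energy_cpu_brute() above ends up comparing three candidates: the task's previous CPU, the find_best_target() pick, and its backup. The toy model below only illustrates that comparison shape with invented energy numbers; the real decision comes from the platform energy model through select_energy_cpu_idx() and is further shaped by features such as FBT_STRICT_ORDER.

#include <stdio.h>

enum { EAS_CPU_PRV, EAS_CPU_NXT, EAS_CPU_BKP, EAS_CPU_CNT };

/* Stay with the previous CPU unless an alternative strictly saves energy. */
static int pick_candidate(const long energy[EAS_CPU_CNT])
{
	int best = EAS_CPU_PRV;
	int i;

	for (i = EAS_CPU_NXT; i < EAS_CPU_CNT; i++)
		if (energy[i] < energy[best])
			best = i;
	return best;
}

int main(void)
{
	long estimate[EAS_CPU_CNT] = { 1000, 900, 950 };	/* invented energy estimates */

	printf("winner: %d\n", pick_candidate(estimate));	/* 1 == EAS_CPU_NXT */
	return 0;
}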
- @@ -5633,7 +6699,8 @@
- * preempt must be disabled.
- */
- static int
- -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
- +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
- + int sibling_count_hint)
- {
- struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
- int cpu = smp_processor_id();
- @@ -5641,13 +6708,15 @@
- int want_affine = 0;
- int sync = wake_flags & WF_SYNC;
- - if (p->nr_cpus_allowed == 1)
- - return prev_cpu;
- + if (sd_flag & SD_BALANCE_WAKE) {
- + record_wakee(p);
- + want_affine = !wake_wide(p, sibling_count_hint) &&
- + !wake_cap(p, cpu, prev_cpu) &&
- + cpumask_test_cpu(cpu, &p->cpus_allowed);
- + }
- - if (sd_flag & SD_BALANCE_WAKE)
- - want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
- - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
- - energy_aware();
- + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
- + return select_energy_cpu_brute(p, prev_cpu, sync);
- rcu_read_lock();
- for_each_domain(cpu, tmp) {
- @@ -5672,65 +6741,25 @@
- if (affine_sd) {
- sd = NULL; /* Prefer wake_affine over balance flags */
- - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
- new_cpu = cpu;
- }
- - if (!sd) {
- - int sync_used = 0;
- - bool about_to_idle = (cpu_rq(cpu)->nr_running < 2);
- -
- - if (sysctl_sched_sync_hint_enable && sync
- - && about_to_idle) {
- - cpumask_t search_cpus;
- - cpumask_and(&search_cpus, tsk_cpus_allowed(p),
- - cpu_online_mask);
- - if (cpumask_test_cpu(cpu, &search_cpus)) {
- - sync_used = 1;
- - new_cpu = cpu;
- - }
- - }
- -
- - if (!sync_used) {
- - if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- - new_cpu = energy_aware_wake_cpu(p, prev_cpu);
- - else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- - new_cpu = select_idle_sibling(p, new_cpu);
- - }
- -
- - } else while (sd) {
- - struct sched_group *group;
- - int weight;
- -
- - if (!(sd->flags & sd_flag)) {
- - sd = sd->child;
- - continue;
- - }
- -
- - group = find_idlest_group(sd, p, cpu, sd_flag);
- - if (!group) {
- - sd = sd->child;
- - continue;
- - }
- + if (sd && !(sd_flag & SD_BALANCE_FORK)) {
- + /*
- + * We're going to need the task's util for capacity_spare_wake
- + * in find_idlest_group. Sync it up to prev_cpu's
- + * last_update_time.
- + */
- + sync_entity_load_avg(&p->se);
- + }
- - new_cpu = find_idlest_cpu(group, p, cpu);
- - if (new_cpu == -1 || new_cpu == cpu) {
- - /* Now try balancing at a lower domain level of cpu */
- - sd = sd->child;
- - continue;
- - }
- + if (!sd) {
- + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- - /* Now try balancing at a lower domain level of new_cpu */
- - cpu = new_cpu;
- - weight = sd->span_weight;
- - sd = NULL;
- - for_each_domain(cpu, tmp) {
- - if (weight <= tmp->span_weight)
- - break;
- - if (tmp->flags & sd_flag)
- - sd = tmp;
- - }
- - /* while loop will break here if sd == NULL */
- + } else {
- + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
- }
- rcu_read_unlock();
- @@ -5742,7 +6771,7 @@
- * cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
- */
- -static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
- +static void migrate_task_rq_fair(struct task_struct *p)
- {
- /*
- * We are supposed to update the task to "current" time, then its up to date
- @@ -5929,7 +6958,7 @@
- }
- static struct task_struct *
- -pick_next_task_fair(struct rq *rq, struct task_struct *prev)
- +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- struct cfs_rq *cfs_rq = &rq->cfs;
- struct sched_entity *se;
- @@ -6041,8 +7070,15 @@
- idle:
- rq->misfit_task = 0;
- -
- + /*
- + * This is OK, because current is on_cpu, which avoids it being picked
- + * for load-balance and preemption/IRQs are still disabled avoiding
- + * further scheduler activity on it and we're being very careful to
- + * re-start the picking loop.
- + */
- + lockdep_unpin_lock(&rq->lock, cookie);
- new_tasks = idle_balance(rq);
- + lockdep_repin_lock(&rq->lock, cookie);
- /*
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
- * possible for any higher priority task to appear. In that case we
- @@ -6101,7 +7137,7 @@
- * so we don't do microscopic update in schedule()
- * and double the fastpath cost.
- */
- - rq->skip_clock_update = 1;
- + rq_clock_skip_update(rq, true);
- }
- set_skip_buddy(se);
- @@ -6320,90 +7356,57 @@
- }
- #ifdef CONFIG_NUMA_BALANCING
- -/* Returns true if the destination node has incurred more faults */
- -static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
- +/*
- + * Returns 1, if task migration degrades locality
- + * Returns 0, if task migration improves locality i.e migration preferred.
- + * Returns -1, if task migration is not affected by locality.
- + */
- +static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
- {
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
- + unsigned long src_faults, dst_faults;
- int src_nid, dst_nid;
- - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
- - !(env->sd->flags & SD_NUMA)) {
- - return false;
- - }
- + if (!static_branch_likely(&sched_numa_balancing))
- + return -1;
- +
- + if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- + return -1;
- src_nid = cpu_to_node(env->src_cpu);
- dst_nid = cpu_to_node(env->dst_cpu);
- if (src_nid == dst_nid)
- - return false;
- -
- - if (numa_group) {
- - /* Task is already in the group's interleave set. */
- - if (node_isset(src_nid, numa_group->active_nodes))
- - return false;
- -
- - /* Task is moving into the group's interleave set. */
- - if (node_isset(dst_nid, numa_group->active_nodes))
- - return true;
- + return -1;
- - return group_faults(p, dst_nid) > group_faults(p, src_nid);
- + /* Migrating away from the preferred node is always bad. */
- + if (src_nid == p->numa_preferred_nid) {
- + if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
- + return 1;
- + else
- + return -1;
- }
- /* Encourage migration to the preferred node. */
- if (dst_nid == p->numa_preferred_nid)
- - return true;
- -
- - return task_faults(p, dst_nid) > task_faults(p, src_nid);
- -}
- -
- -
- -static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
- -{
- - struct numa_group *numa_group = rcu_dereference(p->numa_group);
- - int src_nid, dst_nid;
- -
- - if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
- - return false;
- -
- - if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
- - return false;
- -
- - src_nid = cpu_to_node(env->src_cpu);
- - dst_nid = cpu_to_node(env->dst_cpu);
- -
- - if (src_nid == dst_nid)
- - return false;
- + return 0;
- if (numa_group) {
- - /* Task is moving within/into the group's interleave set. */
- - if (node_isset(dst_nid, numa_group->active_nodes))
- - return false;
- -
- - /* Task is moving out of the group's interleave set. */
- - if (node_isset(src_nid, numa_group->active_nodes))
- - return true;
- -
- - return group_faults(p, dst_nid) < group_faults(p, src_nid);
- + src_faults = group_faults(p, src_nid);
- + dst_faults = group_faults(p, dst_nid);
- + } else {
- + src_faults = task_faults(p, src_nid);
- + dst_faults = task_faults(p, dst_nid);
- }
- - /* Migrating away from the preferred node is always bad. */
- - if (src_nid == p->numa_preferred_nid)
- - return true;
- -
- - return task_faults(p, dst_nid) < task_faults(p, src_nid);
- + return dst_faults < src_faults;
- }
- #else
- -static inline bool migrate_improves_locality(struct task_struct *p,
- +static inline int migrate_degrades_locality(struct task_struct *p,
- struct lb_env *env)
- {
- - return false;
- -}
- -
- -static inline bool migrate_degrades_locality(struct task_struct *p,
- - struct lb_env *env)
- -{
- - return false;
- + return -1;
- }
- #endif
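migrate_degrades_locality() now reports a tri-state result instead of a boolean: 1 degrades locality, 0 improves it, -1 means NUMA locality has nothing to say. The userspace model below shows how such a result is folded into the cache-hot decision, mirroring the can_migrate_task() hunk further down; all sample inputs are made up.

#include <stdio.h>

static int can_migrate_model(int degrades_locality, int task_hot,
			     int nr_balance_failed, int cache_nice_tries)
{
	int tsk_cache_hot = degrades_locality;

	/* Only consult cache hotness when locality gives no opinion. */
	if (tsk_cache_hot == -1)
		tsk_cache_hot = task_hot;

	/* Migrate if locality improves, the task is cold, or balancing keeps failing. */
	return tsk_cache_hot <= 0 || nr_balance_failed > cache_nice_tries;
}

int main(void)
{
	printf("%d\n", can_migrate_model(0, 1, 0, 5));	/* 1: locality improves       */
	printf("%d\n", can_migrate_model(1, 0, 0, 5));	/* 0: migration would degrade */
	printf("%d\n", can_migrate_model(-1, 1, 6, 5));	/* 1: hot, but forced anyway  */
	return 0;
}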
- @@ -6413,7 +7416,7 @@
- static
- int can_migrate_task(struct task_struct *p, struct lb_env *env)
- {
- - int tsk_cache_hot = 0;
- + int tsk_cache_hot;
- lockdep_assert_held(&env->src_rq->lock);
- @@ -6430,7 +7433,7 @@
- if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
- int cpu;
- - schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
- + schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
- env->flags |= LBF_SOME_PINNED;
- @@ -6461,7 +7464,7 @@
- env->flags &= ~LBF_ALL_PINNED;
- if (task_running(env->src_rq, p)) {
- - schedstat_inc(p, se.statistics.nr_failed_migrations_running);
- + schedstat_inc(p->se.statistics.nr_failed_migrations_running);
- return 0;
- }
- @@ -6471,20 +7474,20 @@
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
- */
- - tsk_cache_hot = task_hot(p, env);
- - if (!tsk_cache_hot)
- - tsk_cache_hot = migrate_degrades_locality(p, env);
- + tsk_cache_hot = migrate_degrades_locality(p, env);
- + if (tsk_cache_hot == -1)
- + tsk_cache_hot = task_hot(p, env);
- - if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
- + if (tsk_cache_hot <= 0 ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- - if (tsk_cache_hot) {
- - schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- - schedstat_inc(p, se.statistics.nr_forced_migrations);
- + if (tsk_cache_hot == 1) {
- + schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- + schedstat_inc(p->se.statistics.nr_forced_migrations);
- }
- return 1;
- }
- - schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
- + schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
- return 0;
- }
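A large share of the hunks in this file only track a schedstat API change: the two-argument schedstat_inc(ptr, field) form becomes a single-expression schedstat_inc(ptr->field). The sketch below models that call-convention change with deliberately simplified macro bodies; they are assumptions for illustration, not the kernel's definitions.

#include <stdio.h>

struct stats { unsigned long lb_count; };

#define OLD_SCHEDSTAT_INC(ptr, field)	do { (ptr)->field++; } while (0)
#define NEW_SCHEDSTAT_INC(var)		do { (var)++; } while (0)

int main(void)
{
	struct stats s = { 0 };

	OLD_SCHEDSTAT_INC(&s, lb_count);	/* old style: schedstat_inc(sd, lb_count[idle])  */
	NEW_SCHEDSTAT_INC(s.lb_count);		/* new style: schedstat_inc(sd->lb_count[idle]) */
	printf("%lu\n", s.lb_count);		/* 2 */
	return 0;
}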
- @@ -6495,8 +7498,8 @@
- {
- lockdep_assert_held(&env->src_rq->lock);
- - deactivate_task(env->src_rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- + deactivate_task(env->src_rq, p, 0);
- double_lock_balance(env->src_rq, env->dst_rq);
- set_task_cpu(p, env->dst_cpu);
- double_unlock_balance(env->src_rq, env->dst_rq);
- @@ -6526,7 +7529,7 @@
- * so we can safely collect stats here rather than
- * inside detach_tasks().
- */
- - schedstat_inc(env->sd, lb_gained[env->idle]);
- + schedstat_inc(env->sd->lb_gained[env->idle]);
- return p;
- }
- return NULL;
- @@ -6618,7 +7621,7 @@
- * so we can safely collect detach_one_task() stats here rather
- * than inside detach_one_task().
- */
- - schedstat_add(env->sd, lb_gained[env->idle], detached);
- + schedstat_add(env->sd->lb_gained[env->idle], detached);
- return detached;
- }
- @@ -6631,8 +7634,8 @@
- lockdep_assert_held(&rq->lock);
- BUG_ON(task_rq(p) != rq);
- - p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(rq, p, 0);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- check_preempt_curr(rq, p, 0);
- }
- @@ -6647,7 +7650,7 @@
- /*
- * We want to potentially raise target_cpu's OPP.
- */
- - update_capacity_of(cpu_of(rq), true);
- + update_capacity_of(cpu_of(rq));
- raw_spin_unlock(&rq->lock);
- }
- @@ -6672,7 +7675,7 @@
- /*
- * We want to potentially raise env.dst_cpu's OPP.
- */
- - update_capacity_of(env->dst_cpu, true);
- + update_capacity_of(env->dst_cpu);
- raw_spin_unlock(&env->dst_rq->lock);
- }
- @@ -6692,12 +7695,20 @@
- * list_add_leaf_cfs_rq() for details.
- */
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- + struct sched_entity *se;
- +
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
- - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
- + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
- + true))
- update_tg_load_avg(cfs_rq, 0);
- +
- + /* Propagate pending load changes to the parent, if any: */
- + se = cfs_rq->tg->se[cpu];
- + if (se && !skip_blocked_update(se))
- + update_load_avg(se, 0);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -6757,7 +7768,7 @@
- raw_spin_lock_irqsave(&rq->lock, flags);
- update_rq_clock(rq);
- - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
- + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -6908,6 +7919,9 @@
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
- + capacity *= arch_scale_max_freq_capacity(sd, cpu);
- + capacity >>= SCHED_CAPACITY_SHIFT;
- +
- mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
- raw_spin_lock_irqsave(&mcc->lock, flags);
- @@ -6937,13 +7951,14 @@
- cpu_rq(cpu)->cpu_capacity = capacity;
- sdg->sgc->capacity = capacity;
- sdg->sgc->max_capacity = capacity;
- + sdg->sgc->min_capacity = capacity;
- }
- void update_group_capacity(struct sched_domain *sd, int cpu)
- {
- struct sched_domain *child = sd->child;
- struct sched_group *group, *sdg = sd->groups;
- - unsigned long capacity, max_capacity;
- + unsigned long capacity, max_capacity, min_capacity;
- unsigned long interval;
- interval = msecs_to_jiffies(sd->balance_interval);
- @@ -6957,6 +7972,7 @@
- capacity = 0;
- max_capacity = 0;
- + min_capacity = ULONG_MAX;
- if (child->flags & SD_OVERLAP) {
- /*
- @@ -6987,6 +8003,7 @@
- }
- max_capacity = max(capacity, max_capacity);
- + min_capacity = min(capacity, min_capacity);
- }
- } else {
- /*
- @@ -7000,12 +8017,14 @@
- capacity += sgc->capacity;
- max_capacity = max(sgc->max_capacity, max_capacity);
- + min_capacity = min(sgc->min_capacity, min_capacity);
- group = group->next;
- } while (group != child->groups);
- }
- sdg->sgc->capacity = capacity;
- sdg->sgc->max_capacity = max_capacity;
- + sdg->sgc->min_capacity = min_capacity;
- }
- /*
- @@ -7112,9 +8131,9 @@
- ref->sgc->max_capacity;
- }
- -static enum group_type group_classify(struct lb_env *env,
- - struct sched_group *group,
- - struct sg_lb_stats *sgs)
- +static inline enum
- +group_type group_classify(struct sched_group *group,
- + struct sg_lb_stats *sgs)
- {
- if (sgs->group_no_capacity)
- return group_overloaded;
- @@ -7128,6 +8147,38 @@
- return group_other;
- }
- +#ifdef CONFIG_NO_HZ_COMMON
- +/*
- + * idle load balancing data
- + * - used by the nohz balance, but we want it available here
- + * so that we can see which CPUs have no tick.
- + */
- +static struct {
- + cpumask_var_t idle_cpus_mask;
- + atomic_t nr_cpus;
- + unsigned long next_balance; /* in jiffy units */
- +} nohz ____cacheline_aligned;
- +
- +static inline void update_cpu_stats_if_tickless(struct rq *rq)
- +{
- + /* only called from update_sg_lb_stats when irqs are disabled */
- + if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
- + /* rate limit updates to once per jiffy at most */
- + if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
- + return;
- +
- + raw_spin_lock(&rq->lock);
- + update_rq_clock(rq);
- + update_idle_cpu_load(rq);
- + update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
- + raw_spin_unlock(&rq->lock);
- + }
- +}
- +
- +#else
- +static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
- +#endif
- +
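update_cpu_stats_if_tickless() above refreshes load statistics for tick-stopped CPUs at most once per jiffy. The standalone sketch below models only that rate limit, with plain counters standing in for jiffies and rq->last_load_update_tick; in the kernel the stamp is refreshed by the load-update path itself rather than by the check.

#include <stdio.h>

static unsigned long jiffies;			/* simulated global tick counter */
static unsigned long last_load_update_tick;	/* simulated per-rq stamp */

static int should_update(void)
{
	if (jiffies <= last_load_update_tick)
		return 0;			/* already updated this jiffy */
	last_load_update_tick = jiffies;
	return 1;
}

int main(void)
{
	jiffies = 100;
	printf("%d %d\n", should_update(), should_update());	/* 1 0 */
	jiffies = 101;
	printf("%d\n", should_update());			/* 1 */
	return 0;
}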
- /**
- * update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @env: The load balancing environment.
- @@ -7151,6 +8202,12 @@
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- struct rq *rq = cpu_rq(i);
- + /* if we are entering idle and there are CPUs with
- + * their tick stopped, do an update for them
- + */
- + if (env->idle == CPU_NEWLY_IDLE)
- + update_cpu_stats_if_tickless(rq);
- +
- /* Bias balancing toward cpus of our domain */
- if (local_group)
- load = target_load(i, load_idx);
- @@ -7158,7 +8215,7 @@
- load = source_load(i, load_idx);
- sgs->group_load += load;
- - sgs->group_util += cpu_util(i, UTIL_AVG);
- + sgs->group_util += cpu_util(i);
- sgs->sum_nr_running += rq->cfs.h_nr_running;
- nr_running = rq->nr_running;
- @@ -7193,7 +8250,7 @@
- sgs->group_weight = group->group_weight;
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- - sgs->group_type = group_classify(env, group, sgs);
- + sgs->group_type = group_classify(group, sgs);
- }
- /**
- @@ -7233,18 +8290,27 @@
- if (sgs->avg_load <= busiest->avg_load)
- return false;
- + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- + goto asym_packing;
- +
- /*
- - * Candiate sg has no more than one task per cpu and has higher
- - * per-cpu capacity. No reason to pull tasks to less capable cpus.
- + * Candidate sg has no more than one task per CPU and
- + * has higher per-CPU capacity. Migrating tasks to less
- + * capable CPUs may harm throughput. We maximize throughput here;
- + * power/energy consequences are not considered.
- */
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
- return false;
- +asym_packing:
- /* This is the busiest node in its class. */
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return true;
- + /* No ASYM_PACKING if target cpu is already busy */
- + if (env->idle == CPU_NOT_IDLE)
- + return true;
- /*
- * ASYM_PACKING needs to move all the work to the lowest
- * numbered CPUs in the group, therefore mark all groups
- @@ -7254,7 +8320,8 @@
- if (!sds->busiest)
- return true;
- - if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
- + /* Prefer to pull work from the highest-numbered CPU */
- + if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
- return true;
- }
- @@ -7291,6 +8358,9 @@
- }
- #endif /* CONFIG_NUMA_BALANCING */
- +#define lb_sd_parent(sd) \
- + (sd->parent && sd->parent->groups != sd->parent->groups->next)
- +
- /**
- * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- * @env: The load balancing environment.
- @@ -7343,7 +8413,7 @@
- group_has_capacity(env, &sds->local_stat) &&
- (sgs->sum_nr_running > 1)) {
- sgs->group_no_capacity = 1;
- - sgs->group_type = group_overloaded;
- + sgs->group_type = group_classify(sg, sgs);
- }
- /*
- @@ -7373,7 +8443,7 @@
- env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- - if (!env->sd->parent) {
- + if (!lb_sd_parent(env->sd)) {
- /* update overload indicator if we are at root domain */
- if (env->dst_rq->rd->overload != overload)
- env->dst_rq->rd->overload = overload;
- @@ -7422,6 +8492,9 @@
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return 0;
- + if (env->idle == CPU_NOT_IDLE)
- + return 0;
- +
- if (!sds->busiest)
- return 0;
- @@ -7639,8 +8712,7 @@
- busiest = &sds.busiest_stat;
- /* ASYM feature bypasses nice load balance check */
- - if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
- - check_asym_packing(env, &sds))
- + if (check_asym_packing(env, &sds))
- return sds.busiest;
- /* There is no busy sibling group to pull tasks from */
- @@ -7658,8 +8730,11 @@
- if (busiest->group_type == group_imbalanced)
- goto force_balance;
- - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
- + /*
- + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
- + * capacities from resulting in underutilization due to avg_load.
- + */
- + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
- busiest->group_no_capacity)
- goto force_balance;
- @@ -7827,6 +8902,7 @@
- }
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
- + ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
- env->src_rq->cfs.h_nr_running == 1 &&
- cpu_overutilized(env->src_cpu) &&
- !cpu_overutilized(env->dst_cpu)) {
- @@ -7881,7 +8957,7 @@
- int *continue_balancing)
- {
- int ld_moved, cur_ld_moved, active_balance = 0;
- - struct sched_domain *sd_parent = sd->parent;
- + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
- struct sched_group *group;
- struct rq *busiest;
- unsigned long flags;
- @@ -7908,7 +8984,7 @@
- cpumask_copy(cpus, cpu_active_mask);
- - schedstat_inc(sd, lb_count[idle]);
- + schedstat_inc(sd->lb_count[idle]);
- redo:
- if (!should_we_balance(&env)) {
- @@ -7918,19 +8994,19 @@
- group = find_busiest_group(&env);
- if (!group) {
- - schedstat_inc(sd, lb_nobusyg[idle]);
- + schedstat_inc(sd->lb_nobusyg[idle]);
- goto out_balanced;
- }
- busiest = find_busiest_queue(&env, group);
- if (!busiest) {
- - schedstat_inc(sd, lb_nobusyq[idle]);
- + schedstat_inc(sd->lb_nobusyq[idle]);
- goto out_balanced;
- }
- BUG_ON(busiest == env.dst_rq);
- - schedstat_add(sd, lb_imbalance[idle], env.imbalance);
- + schedstat_add(sd->lb_imbalance[idle], env.imbalance);
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
- @@ -7948,6 +9024,7 @@
- more_balance:
- raw_spin_lock_irqsave(&busiest->lock, flags);
- + update_rq_clock(busiest);
- /*
- * cur_ld_moved - load moved in current iteration
- @@ -7958,7 +9035,7 @@
- * We want to potentially lower env.src_cpu's OPP.
- */
- if (cur_ld_moved)
- - update_capacity_of(env.src_cpu, true);
- + update_capacity_of(env.src_cpu);
- /*
- * We've detached some tasks from busiest_rq. Every
- @@ -8032,7 +9109,24 @@
- /* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(env.flags & LBF_ALL_PINNED)) {
- cpumask_clear_cpu(cpu_of(busiest), cpus);
- - if (!cpumask_empty(cpus)) {
- + /*
- + * dst_cpu is not a valid busiest cpu in the following
- + * check since load cannot be pulled from dst_cpu to be
- + * put on dst_cpu.
- + */
- + cpumask_clear_cpu(env.dst_cpu, cpus);
- + /*
- + * Go back to "redo" iff the load-balance cpumask
- + * contains other potential busiest cpus for the
- + * current sched domain.
- + */
- + if (cpumask_intersects(cpus, sched_domain_span(env.sd))) {
- + /*
- + * Now that the check has passed, reenable
- + * dst_cpu so that load can be calculated on
- + * it in the redo path.
- + */
- + cpumask_set_cpu(env.dst_cpu, cpus);
- env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
- goto redo;
- @@ -8042,7 +9136,7 @@
- }
- if (!ld_moved) {
- - schedstat_inc(sd, lb_failed[idle]);
- + schedstat_inc(sd->lb_failed[idle]);
- /*
- * Increment the failure counter only on periodic balance.
- * We do not want newidle balance, which can be very
- @@ -8086,10 +9180,7 @@
- &busiest->active_balance_work);
- }
- - /*
- - * We've kicked active balancing, reset the failure
- - * counter.
- - */
- + /* We've kicked active balancing, force task migration. */
- sd->nr_balance_failed = sd->cache_nice_tries+1;
- }
- } else
- @@ -8129,7 +9220,7 @@
- * we can't migrate them. Let the imbalance flag set so parent level
- * can try to migrate them.
- */
- - schedstat_inc(sd, lb_balanced[idle]);
- + schedstat_inc(sd->lb_balanced[idle]);
- sd->nr_balance_failed = 0;
- @@ -8185,8 +9276,6 @@
- u64 curr_cost = 0;
- long removed_util = 0;
- - idle_enter_fair(this_rq);
- -
- /*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
- @@ -8283,14 +9372,13 @@
- pulled_task = -1;
- if (pulled_task) {
- - idle_exit_fair(this_rq);
- this_rq->idle_stamp = 0;
- } else if (removed_util) {
- /*
- * No task pulled and someone has been migrated away.
- * Good case to trigger an OPP update.
- */
- - update_capacity_of(this_cpu, true);
- + update_capacity_of(this_cpu);
- }
- return pulled_task;
- @@ -8308,8 +9396,18 @@
- int busiest_cpu = cpu_of(busiest_rq);
- int target_cpu = busiest_rq->push_cpu;
- struct rq *target_rq = cpu_rq(target_cpu);
- - struct sched_domain *sd;
- + struct sched_domain *sd = NULL;
- struct task_struct *p = NULL;
- + struct task_struct *push_task = NULL;
- + int push_task_detached = 0;
- + struct lb_env env = {
- + .sd = sd,
- + .dst_cpu = target_cpu,
- + .dst_rq = target_rq,
- + .src_cpu = busiest_rq->cpu,
- + .src_rq = busiest_rq,
- + .idle = CPU_IDLE,
- + };
- raw_spin_lock_irq(&busiest_rq->lock);
- @@ -8329,6 +9427,17 @@
- */
- BUG_ON(busiest_rq == target_rq);
- + push_task = busiest_rq->push_task;
- + if (push_task) {
- + if (task_on_rq_queued(push_task) &&
- + task_cpu(push_task) == busiest_cpu &&
- + cpu_online(target_cpu)) {
- + detach_task(push_task, &env);
- + push_task_detached = 1;
- + }
- + goto out_unlock;
- + }
- +
- /* Search for an sd spanning us and the target CPU. */
- rcu_read_lock();
- for_each_domain(target_cpu, sd) {
- @@ -8338,33 +9447,36 @@
- }
- if (likely(sd)) {
- - struct lb_env env = {
- - .sd = sd,
- - .dst_cpu = target_cpu,
- - .dst_rq = target_rq,
- - .src_cpu = busiest_rq->cpu,
- - .src_rq = busiest_rq,
- - .idle = CPU_IDLE,
- - };
- -
- - schedstat_inc(sd, alb_count);
- + env.sd = sd;
- + schedstat_inc(sd->alb_count);
- + update_rq_clock(busiest_rq);
- p = detach_one_task(&env);
- if (p) {
- - schedstat_inc(sd, alb_pushed);
- + schedstat_inc(sd->alb_pushed);
- /*
- * We want to potentially lower env.src_cpu's OPP.
- */
- - update_capacity_of(env.src_cpu, true);
- + update_capacity_of(env.src_cpu);
- }
- else
- - schedstat_inc(sd, alb_failed);
- + schedstat_inc(sd->alb_failed);
- }
- rcu_read_unlock();
- out_unlock:
- busiest_rq->active_balance = 0;
- +
- + if (push_task)
- + busiest_rq->push_task = NULL;
- +
- raw_spin_unlock(&busiest_rq->lock);
- + if (push_task) {
- + if (push_task_detached)
- + attach_one_task(target_rq, push_task);
- + put_task_struct(push_task);
- + }
- +
- if (p)
- attach_one_task(target_rq, p);
- @@ -8385,12 +9497,6 @@
- * needed, they will kick the idle load balancer, which then does idle
- * load balancing for all the idle CPUs.
- */
- -static struct {
- - cpumask_var_t idle_cpus_mask;
- - atomic_t nr_cpus;
- - unsigned long next_balance; /* in jiffy units */
- -} nohz ____cacheline_aligned;
- -
- static inline int find_new_ilb(void)
- {
- int ilb = cpumask_first(nohz.idle_cpus_mask);
- @@ -8449,13 +9555,13 @@
- int cpu = smp_processor_id();
- rcu_read_lock();
- - sd = rcu_dereference(per_cpu(sd_busy, cpu));
- + sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd || !sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 0;
- - atomic_inc(&sd->groups->sgc->nr_busy_cpus);
- + atomic_inc(&sd->shared->nr_busy_cpus);
- unlock:
- rcu_read_unlock();
- }
- @@ -8466,13 +9572,13 @@
- int cpu = smp_processor_id();
- rcu_read_lock();
- - sd = rcu_dereference(per_cpu(sd_busy, cpu));
- + sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd || sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 1;
- - atomic_dec(&sd->groups->sgc->nr_busy_cpus);
- + atomic_dec(&sd->shared->nr_busy_cpus);
- unlock:
- rcu_read_unlock();
- }
- @@ -8711,8 +9817,8 @@
- static inline bool nohz_kick_needed(struct rq *rq)
- {
- unsigned long now = jiffies;
- + struct sched_domain_shared *sds;
- struct sched_domain *sd;
- - struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
- bool kick = false;
- @@ -8740,12 +9846,18 @@
- (!energy_aware() || cpu_overutilized(cpu)))
- return true;
- - rcu_read_lock();
- - sd = rcu_dereference(per_cpu(sd_busy, cpu));
- - if (sd && !energy_aware()) {
- - sgc = sd->groups->sgc;
- - nr_busy = atomic_read(&sgc->nr_busy_cpus);
- + /* Do idle load balance if there is a misfit task */
- + if (energy_aware())
- + return rq->misfit_task;
- + rcu_read_lock();
- + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- + if (sds) {
- + /*
- + * XXX: write a coherent comment on why we do this.
- + * See also: http://lkml.kernel.org/r/[email protected]
- + */
- + nr_busy = atomic_read(&sds->nr_busy_cpus);
- if (nr_busy > 1) {
- kick = true;
- goto unlock;
- @@ -8831,6 +9943,47 @@
- unthrottle_offline_cfs_rqs(rq);
- }
- +static inline int
- +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
- +{
- + int rc = 0;
- +
- + /* Invoke active balance to force migrate currently running task */
- + raw_spin_lock(&rq->lock);
- + if (!rq->active_balance) {
- + rq->active_balance = 1;
- + rq->push_cpu = new_cpu;
- + get_task_struct(p);
- + rq->push_task = p;
- + rc = 1;
- + }
- + raw_spin_unlock(&rq->lock);
- +
- + return rc;
- +}
- +
- +void check_for_migration(struct rq *rq, struct task_struct *p)
- +{
- + int new_cpu;
- + int active_balance;
- + int cpu = task_cpu(p);
- +
- + if (energy_aware() && rq->misfit_task) {
- + if (rq->curr->state != TASK_RUNNING ||
- + rq->curr->nr_cpus_allowed == 1)
- + return;
- +
- + new_cpu = select_energy_cpu_brute(p, cpu, 0);
- + if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
- + active_balance = kick_active_balance(rq, p, new_cpu);
- + if (active_balance)
- + stop_one_cpu_nowait(cpu,
- + active_load_balance_cpu_stop,
- + rq, &rq->active_balance_work);
- + }
- + }
- +}
- +
- #endif /* CONFIG_SMP */
- /*
- @@ -8846,7 +9999,7 @@
- entity_tick(cfs_rq, se, queued);
- }
- - if (numabalancing_enabled)
- + if (static_branch_unlikely(&sched_numa_balancing))
- task_tick_numa(rq, curr);
- #ifdef CONFIG_SMP
- @@ -8869,31 +10022,17 @@
- {
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se, *curr;
- - int this_cpu = smp_processor_id();
- struct rq *rq = this_rq();
- - unsigned long flags;
- -
- - raw_spin_lock_irqsave(&rq->lock, flags);
- + raw_spin_lock(&rq->lock);
- update_rq_clock(rq);
- cfs_rq = task_cfs_rq(current);
- curr = cfs_rq->curr;
- -
- - /*
- - * Not only the cpu but also the task_group of the parent might have
- - * been changed after parent->se.parent,cfs_rq were copied to
- - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- - * of child point to valid ones.
- - */
- - rcu_read_lock();
- - __set_task_cpu(p, this_cpu);
- - rcu_read_unlock();
- -
- - update_curr(cfs_rq);
- -
- - if (curr)
- + if (curr) {
- + update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- + }
- place_entity(cfs_rq, se, 1);
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- @@ -8906,8 +10045,7 @@
- }
- se->vruntime -= cfs_rq->min_vruntime;
- -
- - raw_spin_unlock_irqrestore(&rq->lock, flags);
- + raw_spin_unlock(&rq->lock);
- }
- /*
- @@ -8959,6 +10097,61 @@
- return false;
- }
- +#ifdef CONFIG_FAIR_GROUP_SCHED
- +/*
- + * Propagate the changes of the sched_entity across the tg tree to make it
- + * visible to the root
- + */
- +static void propagate_entity_cfs_rq(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq;
- +
- + /* Start to propagate at parent */
- + se = se->parent;
- +
- + for_each_sched_entity(se) {
- + cfs_rq = cfs_rq_of(se);
- +
- + if (cfs_rq_throttled(cfs_rq))
- + break;
- +
- + update_load_avg(se, UPDATE_TG);
- + }
- +}
- +#else
- +static void propagate_entity_cfs_rq(struct sched_entity *se) { }
- +#endif
- +
- +static void detach_entity_cfs_rq(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = cfs_rq_of(se);
- +
- + /* Catch up with the cfs_rq and remove our load when we leave */
- + update_load_avg(se, 0);
- + detach_entity_load_avg(cfs_rq, se);
- + update_tg_load_avg(cfs_rq, false);
- + propagate_entity_cfs_rq(se);
- +}
- +
- +static void attach_entity_cfs_rq(struct sched_entity *se)
- +{
- + struct cfs_rq *cfs_rq = cfs_rq_of(se);
- +
- +#ifdef CONFIG_FAIR_GROUP_SCHED
- + /*
- + * Since the real-depth could have been changed (only FAIR
- + * class maintain depth value), reset depth properly.
- + */
- + se->depth = se->parent ? se->parent->depth + 1 : 0;
- +#endif
- +
- + /* Synchronize entity with its cfs_rq */
- + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
- + attach_entity_load_avg(cfs_rq, se);
- + update_tg_load_avg(cfs_rq, false);
- + propagate_entity_cfs_rq(se);
- +}
- +
- static void detach_task_cfs_rq(struct task_struct *p)
- {
- struct sched_entity *se = &p->se;
- @@ -8973,8 +10166,7 @@
- se->vruntime -= cfs_rq->min_vruntime;
- }
- - /* Catch up with the cfs_rq and remove our load when we leave */
- - detach_entity_load_avg(cfs_rq, se);
- + detach_entity_cfs_rq(se);
- }
- static void attach_task_cfs_rq(struct task_struct *p)
- @@ -8982,16 +10174,7 @@
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- -#ifdef CONFIG_FAIR_GROUP_SCHED
- - /*
- - * Since the real-depth could have been changed (only FAIR
- - * class maintain depth value), reset depth properly.
- - */
- - se->depth = se->parent ? se->parent->depth + 1 : 0;
- -#endif
- -
- - /* Synchronize task with its cfs_rq */
- - attach_entity_load_avg(cfs_rq, se);
- + attach_entity_cfs_rq(se);
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
- @@ -9045,12 +10228,23 @@
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
- #endif
- #ifdef CONFIG_SMP
- +#ifdef CONFIG_FAIR_GROUP_SCHED
- + cfs_rq->propagate_avg = 0;
- +#endif
- atomic_long_set(&cfs_rq->removed_load_avg, 0);
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
- #endif
- }
- #ifdef CONFIG_FAIR_GROUP_SCHED
- +static void task_set_group_fair(struct task_struct *p)
- +{
- + struct sched_entity *se = &p->se;
- +
- + set_task_rq(p, task_cpu(p));
- + se->depth = se->parent ? se->parent->depth + 1 : 0;
- +}
- +
- static void task_move_group_fair(struct task_struct *p)
- {
- detach_task_cfs_rq(p);
- @@ -9063,6 +10257,19 @@
- attach_task_cfs_rq(p);
- }
- +static void task_change_group_fair(struct task_struct *p, int type)
- +{
- + switch (type) {
- + case TASK_SET_GROUP:
- + task_set_group_fair(p);
- + break;
- +
- + case TASK_MOVE_GROUP:
- + task_move_group_fair(p);
- + break;
- + }
- +}
- +
- void free_fair_sched_group(struct task_group *tg)
- {
- int i;
- @@ -9085,8 +10292,9 @@
- int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
- {
- - struct cfs_rq *cfs_rq;
- struct sched_entity *se;
- + struct cfs_rq *cfs_rq;
- + struct rq *rq;
- int i;
- tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
- @@ -9101,6 +10309,8 @@
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
- for_each_possible_cpu(i) {
- + rq = cpu_rq(i);
- +
- cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
- GFP_KERNEL, cpu_to_node(i));
- if (!cfs_rq)
- @@ -9114,6 +10324,11 @@
- init_cfs_rq(cfs_rq);
- init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
- init_entity_runnable_average(se);
- +
- + raw_spin_lock_irq(&rq->lock);
- + post_init_entity_util_avg(se);
- + sync_throttle(tg, i);
- + raw_spin_unlock_irq(&rq->lock);
- }
- return 1;
- @@ -9202,8 +10417,10 @@
- /* Possible calls to update_curr() need rq clock */
- update_rq_clock(rq);
- - for_each_sched_entity(se)
- - update_cfs_shares(group_cfs_rq(se));
- + for_each_sched_entity(se) {
- + update_load_avg(se, UPDATE_TG);
- + update_cfs_shares(se);
- + }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- @@ -9264,6 +10481,7 @@
- .task_waking = task_waking_fair,
- .task_dead = task_dead_fair,
- + .set_cpus_allowed = set_cpus_allowed_common,
- #endif
- .set_curr_task = set_curr_task_fair,
- @@ -9279,7 +10497,7 @@
- .update_curr = update_curr_fair,
- #ifdef CONFIG_FAIR_GROUP_SCHED
- - .task_move_group = task_move_group_fair,
- + .task_change_group = task_change_group_fair,
- #endif
- };
- diff -Nur /home/ninez/android/marlin/kernel/sched/features.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/features.h
- --- /home/ninez/android/marlin/kernel/sched/features.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/features.h 2018-08-15 17:51:31.901600413 -0400
- @@ -49,7 +49,7 @@
- * Queue remote wakeups on the target CPU and process them
- * using the scheduler IPI. Reduces rq->lock contention/bounces.
- */
- -SCHED_FEAT(TTWU_QUEUE, true)
- +SCHED_FEAT(TTWU_QUEUE, false)
- #ifdef HAVE_RT_PUSH_IPI
- /*
- @@ -66,48 +66,39 @@
- SCHED_FEAT(FORCE_SD_OVERLAP, false)
- SCHED_FEAT(RT_RUNTIME_SHARE, true)
- +SCHED_FEAT(RT_RUNTIME_GREED, false)
- SCHED_FEAT(LB_MIN, false)
- SCHED_FEAT(ATTACH_AGE_LOAD, true)
- /*
- - * Apply the automatic NUMA scheduling policy. Enabled automatically
- - * at runtime if running on a NUMA machine. Can be controlled via
- - * numa_balancing=
- - */
- -#ifdef CONFIG_NUMA_BALANCING
- -SCHED_FEAT(NUMA, false)
- -
- -/*
- - * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
- - * higher number of hinting faults are recorded during active load
- - * balancing.
- - */
- -SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
- -
- -/*
- - * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
- - * lower number of hinting faults have been recorded. As this has
- - * the potential to prevent a task ever migrating to a new node
- - * due to CPU overload it is disabled by default.
- + * Energy aware scheduling. Use platform energy model to guide scheduling
- + * decisions optimizing for energy efficiency.
- */
- -SCHED_FEAT(NUMA_RESIST_LOWER, false)
- -#endif
- +SCHED_FEAT(ENERGY_AWARE, true)
- /*
- - * Energy aware scheduling. Use platform energy model to guide scheduling
- - * decisions optimizing for energy efficiency.
- + * Minimum capacity capping. Keep track of minimum capacity factor when
- + * minimum frequency available to a policy is modified.
- + * If enabled, this can be used to inform the scheduler about capacity
- + * restrictions.
- */
- -SCHED_FEAT(ENERGY_AWARE, false)
- +SCHED_FEAT(MIN_CAPACITY_CAPPING, false)
- /*
- - * UtilEstimation. Use estimated CPU utiliation.
- + * Enforce the priority of candidates selected by find_best_target()
- + * ON: If the target CPU saves any energy, use that.
- + * OFF: Use whichever of target or backup saves most.
- */
- -SCHED_FEAT(UTIL_EST, false)
- +SCHED_FEAT(FBT_STRICT_ORDER, false)
- /*
- - * SchedTune. Use Performance/Energy filtering function to evaluate the
- - * trade off between energy consumption and performance impact when comparing
- - * previous and next candidate CPUs.
- + * Apply schedtune boost hold to tasks of all sched classes.
- + * If enabled, schedtune will hold the boost applied to a CPU
- + * for 50ms regardless of task activation - if the task is
- + * still running 50ms later, the boost hold expires and schedtune
- + * boost will expire immediately once the task stops.
- + * If disabled, this behaviour will only apply to tasks of the
- + * RT class.
- */
- -SCHED_FEAT(ENERGY_FILTER, true)
- +SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false)
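For reference, SCHED_FEAT() entries become bits in sysctl_sched_features and are tested with sched_feat(); the real macro appears in the sched.h hunk later in this patch. Below is a minimal, self-contained userspace model of that mechanism, covering only a few of the features above with the defaults this patch sets (it is an illustration of the X-macro/bitmask idea, not kernel code).

#include <stdio.h>

/* Mirror of the SCHED_FEAT(name, default) X-macro idea. */
#define SCHED_FEATURES(F)			\
	F(TTWU_QUEUE, 0)			\
	F(RT_RUNTIME_GREED, 0)			\
	F(ENERGY_AWARE, 1)			\
	F(SCHEDTUNE_BOOST_HOLD_ALL, 0)

enum {
#define F(name, enabled) __SCHED_FEAT_##name,
	SCHED_FEATURES(F)
#undef F
	__SCHED_FEAT_NR
};

/* Default bitmask built from the per-feature defaults. */
static unsigned long sysctl_sched_features =
#define F(name, enabled) ((unsigned long)(enabled) << __SCHED_FEAT_##name) |
	SCHED_FEATURES(F)
#undef F
	0;

#define sched_feat(x) (!!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)))

int main(void)
{
	printf("ENERGY_AWARE=%d TTWU_QUEUE=%d RT_RUNTIME_GREED=%d\n",
	       sched_feat(ENERGY_AWARE), sched_feat(TTWU_QUEUE),
	       sched_feat(RT_RUNTIME_GREED));
	return 0;
}

The real definition (with the optional jump-label fast path under SCHED_DEBUG) is visible further down in the sched.h part of this diff.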
- diff -Nur /home/ninez/android/marlin/kernel/sched/idle.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle.c
- --- /home/ninez/android/marlin/kernel/sched/idle.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle.c 2018-08-11 23:57:17.131940887 -0400
- @@ -58,7 +58,8 @@
- rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
- local_irq_enable();
- - while (!tif_need_resched())
- + while (!tif_need_resched() &&
- + (cpu_idle_force_poll || tick_check_broadcast_expired()))
- cpu_relax();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
- rcu_idle_exit();
- @@ -208,6 +209,8 @@
- goto exit_idle;
- }
- +DEFINE_PER_CPU(bool, cpu_dead_idle);
- +
- /*
- * Generic idle loop implementation
- *
- @@ -233,8 +236,13 @@
- check_pgt_cache();
- rmb();
- - if (cpu_is_offline(smp_processor_id()))
- + if (cpu_is_offline(smp_processor_id())) {
- + rcu_cpu_notify(NULL, CPU_DYING_IDLE,
- + (void *)(long)smp_processor_id());
- + smp_mb(); /* all activity before dead. */
- + this_cpu_write(cpu_dead_idle, true);
- arch_cpu_idle_dead();
- + }
- local_irq_disable();
- arch_cpu_idle_enter();
- diff -Nur /home/ninez/android/marlin/kernel/sched/idle_task.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle_task.c
- --- /home/ninez/android/marlin/kernel/sched/idle_task.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/idle_task.c 2018-08-26 16:43:11.650539699 -0400
- @@ -9,7 +9,8 @@
- #ifdef CONFIG_SMP
- static int
- -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
- +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
- + int sibling_count_hint)
- {
- return task_cpu(p); /* IDLE tasks are never migrated */
- }
- @@ -24,11 +25,16 @@
- }
- static struct task_struct *
- -pick_next_task_idle(struct rq *rq, struct task_struct *prev)
- +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- + if (sched_feat(RT_RUNTIME_GREED))
- + if (try_to_unthrottle_rt_rq(&rq->rt))
- + return RETRY_TASK;
- +
- put_prev_task(rq, prev);
- - schedstat_inc(rq, sched_goidle);
- + update_idle_core(rq);
- + schedstat_inc(rq->sched_goidle);
- return rq->idle;
- }
- @@ -47,7 +53,6 @@
- static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
- {
- - idle_exit_fair(rq);
- rq_last_tick_reset(rq);
- }
- @@ -96,6 +101,7 @@
- #ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_idle,
- + .set_cpus_allowed = set_cpus_allowed_common,
- #endif
- .set_curr_task = set_curr_task_idle,
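The RT_RUNTIME_GREED hook in pick_next_task_idle() works through the core scheduler's RETRY_TASK convention: if the idle class finds a throttled RT runqueue and nothing else runnable, it unthrottles it and asks the pick loop to start over so the RT class gets another chance. A toy single-threaded model of that retry loop (the names, the sentinel value and the simplified pick order are illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define RETRY_TASK ((const char *)-1)	/* sentinel meaning "pick again" */

static bool rt_throttled = true;	/* pretend the RT rq ran out of runtime */
static bool rt_runtime_greed = true;	/* sched_feat(RT_RUNTIME_GREED) */

static const char *pick_next_task_rt(void)
{
	return rt_throttled ? NULL : "some_rt_task";
}

static const char *pick_next_task_idle(void)
{
	if (rt_runtime_greed && rt_throttled) {
		rt_throttled = false;		/* try_to_unthrottle_rt_rq() */
		return RETRY_TASK;		/* ask the core to pick again */
	}
	return "swapper/0";
}

int main(void)
{
	const char *p;

again:						/* simplified pick_next_task() loop */
	p = pick_next_task_rt();
	if (!p)
		p = pick_next_task_idle();
	if (p == RETRY_TASK)
		goto again;

	printf("picked: %s\n", p);
	return 0;
}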
- diff -Nur /home/ninez/android/marlin/kernel/sched/loadavg.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/loadavg.c
- --- /home/ninez/android/marlin/kernel/sched/loadavg.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/loadavg.c 2018-08-11 23:57:17.131940887 -0400
- @@ -168,7 +168,7 @@
- * If the folding window started, make sure we start writing in the
- * next idle-delta.
- */
- - if (!time_before(jiffies, calc_load_update))
- + if (!time_before(jiffies, READ_ONCE(calc_load_update)))
- idx++;
- return idx & 1;
- @@ -201,8 +201,9 @@
- struct rq *this_rq = this_rq();
- /*
- - * If we're still before the sample window, we're done.
- + * If we're still before the pending sample window, we're done.
- */
- + this_rq->calc_load_update = READ_ONCE(calc_load_update);
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
- @@ -211,7 +212,6 @@
- * accounted through the nohz accounting, so skip the entire deal and
- * sync up for the next window.
- */
- - this_rq->calc_load_update = calc_load_update;
- if (time_before(jiffies, this_rq->calc_load_update + 10))
- this_rq->calc_load_update += LOAD_FREQ;
- }
- @@ -307,13 +307,15 @@
- */
- static void calc_global_nohz(void)
- {
- + unsigned long sample_window;
- long delta, active, n;
- - if (!time_before(jiffies, calc_load_update + 10)) {
- + sample_window = READ_ONCE(calc_load_update);
- + if (!time_before(jiffies, sample_window + 10)) {
- /*
- * Catch-up, fold however many we are behind still
- */
- - delta = jiffies - calc_load_update - 10;
- + delta = jiffies - sample_window - 10;
- n = 1 + (delta / LOAD_FREQ);
- active = atomic_long_read(&calc_load_tasks);
- @@ -323,7 +325,7 @@
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- - calc_load_update += n * LOAD_FREQ;
- + WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
- }
- /*
- @@ -351,9 +353,11 @@
- */
- void calc_global_load(unsigned long ticks)
- {
- + unsigned long sample_window;
- long active, delta;
- - if (time_before(jiffies, calc_load_update + 10))
- + sample_window = READ_ONCE(calc_load_update);
- + if (time_before(jiffies, sample_window + 10))
- return;
- /*
- @@ -370,7 +374,7 @@
- avenrun[1] = calc_load(avenrun[1], EXP_5, active);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active);
- - calc_load_update += LOAD_FREQ;
- + WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
- /*
- * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
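The pattern adopted throughout this file is: read calc_load_update once into a local sample_window, do every comparison against that snapshot, and publish the new window with WRITE_ONCE(), so a concurrently folding nohz CPU can never observe two different window values in one pass. A compact userspace model of the same idea (HZ here is an assumed value and the kernel's wraparound-safe time_before() is simplified to a plain compare):

#include <stdio.h>

/* GCC/Clang-style one-shot accessors, same shape as the kernel's. */
#define READ_ONCE(x)	 (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

#define HZ		250			/* assumption for this model */
#define LOAD_FREQ	(5 * HZ + 1)		/* 5s folding interval */

static unsigned long calc_load_update;		/* shared sample window */

static void calc_global_load(unsigned long jiffies_now)
{
	unsigned long sample_window = READ_ONCE(calc_load_update);

	if (jiffies_now < sample_window + 10)
		return;

	/* ... fold calc_load_tasks into avenrun[] against this window ... */

	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
}

int main(void)
{
	calc_load_update = 1000;
	calc_global_load(1005);			/* too early: window unchanged */
	calc_global_load(1011);			/* folds and advances the window */
	printf("next sample window at jiffy %lu\n", calc_load_update);
	return 0;
}

The same snapshot is what calc_global_nohz() reuses above when it catches up on several missed LOAD_FREQ windows at once.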
- diff -Nur /home/ninez/android/marlin/kernel/sched/Makefile /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/Makefile
- --- /home/ninez/android/marlin/kernel/sched/Makefile 2018-08-10 01:54:08.563395055 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/Makefile 2018-08-21 23:16:53.820436609 -0400
- @@ -2,15 +2,6 @@
- CFLAGS_REMOVE_clock.o = -pg
- endif
- -# KASAN instrumentation is temporarily disabled for energy.o due to the repeated
- -# reports that caused the kernel to not boot as seen in b/31800756. Should a fix
- -# be provided, this line can be removed again. But given that KCOV is also disabled
- -# for this module, it might be worth thinking about whether or not we should also
- -# just turn off KASAN instrumentation entirely here.
- -KASAN_SANITIZE_core.o := n
- -KASAN_SANITIZE_energy.o := n
- -KASAN_SANITIZE_fair.o := n
- -
- # These files are disabled because they produce non-interesting flaky coverage
- # that is not a function of syscall inputs. E.g. involuntary context switches.
- KCOV_INSTRUMENT := n
- @@ -26,7 +17,7 @@
- obj-y += core.o loadavg.o clock.o cputime.o
- obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
- -obj-y += wait.o completion.o idle.o
- +obj-y += wait.o swait.o swork.o completion.o idle.o
- obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
- obj-$(CONFIG_SCHED_WALT) += walt.o
- obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
- @@ -34,4 +25,7 @@
- obj-$(CONFIG_SCHED_DEBUG) += debug.o
- obj-$(CONFIG_SCHED_TUNE) += tune.o
- obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
- +obj-$(CONFIG_CPU_FREQ) += cpufreq.o
- obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
- +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
- +obj-y += boost.o
- diff -Nur /home/ninez/android/marlin/kernel/sched/rt.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/rt.c
- --- /home/ninez/android/marlin/kernel/sched/rt.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/rt.c 2018-08-26 16:43:11.650539699 -0400
- @@ -8,10 +8,9 @@
- #include <linux/interrupt.h>
- #include <linux/slab.h>
- #include <linux/irq_work.h>
- -#include <linux/hrtimer.h>
- #include "walt.h"
- -#include "tune.h"
- +#include "tune.h"
- int sched_rr_timeslice = RR_TIMESLICE;
- @@ -69,11 +68,7 @@
- raw_spin_unlock(&rt_b->rt_runtime_lock);
- }
- -#ifdef CONFIG_SMP
- -static void push_irq_work_func(struct irq_work *work);
- -#endif
- -
- -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
- +void init_rt_rq(struct rt_rq *rt_rq)
- {
- struct rt_prio_array *array;
- int i;
- @@ -92,13 +87,6 @@
- rt_rq->rt_nr_migratory = 0;
- rt_rq->overloaded = 0;
- plist_head_init(&rt_rq->pushable_tasks);
- -
- -#ifdef HAVE_RT_PUSH_IPI
- - rt_rq->push_flags = 0;
- - rt_rq->push_cpu = nr_cpu_ids;
- - raw_spin_lock_init(&rt_rq->push_lock);
- - init_irq_work(&rt_rq->push_work, push_irq_work_func);
- -#endif
- #endif /* CONFIG_SMP */
- /* We start in dequeued state, because no RT tasks are queued */
- rt_rq->rt_queued = 0;
- @@ -214,7 +202,7 @@
- if (!rt_se)
- goto err_free_rq;
- - init_rt_rq(rt_rq, cpu_rq(i));
- + init_rt_rq(rt_rq);
- rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
- }
- @@ -331,7 +319,7 @@
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
- rt_rq->rt_nr_total++;
- - if (p->nr_cpus_allowed > 1)
- + if (tsk_nr_cpus_allowed(p) > 1)
- rt_rq->rt_nr_migratory++;
- update_rt_migration(rt_rq);
- @@ -348,7 +336,7 @@
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
- rt_rq->rt_nr_total--;
- - if (p->nr_cpus_allowed > 1)
- + if (tsk_nr_cpus_allowed(p) > 1)
- rt_rq->rt_nr_migratory--;
- update_rt_migration(rt_rq);
- @@ -370,14 +358,12 @@
- if (!has_pushable_tasks(rq))
- return;
- - queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu),
- - push_rt_tasks);
- + queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
- }
- static inline void queue_pull_task(struct rq *rq)
- {
- - queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu),
- - pull_rt_task);
- + queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
- }
- static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
- @@ -443,7 +429,7 @@
- static inline int on_rt_rq(struct sched_rt_entity *rt_se)
- {
- - return !list_empty(&rt_se->run_list);
- + return rt_se->on_rq;
- }
- #ifdef CONFIG_RT_GROUP_SCHED
- @@ -489,8 +475,8 @@
- return rt_se->my_q;
- }
- -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
- -static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
- +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
- +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
- static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
- {
- @@ -506,7 +492,7 @@
- if (!rt_se)
- enqueue_top_rt_rq(rt_rq);
- else if (!on_rt_rq(rt_se))
- - enqueue_rt_entity(rt_se, false);
- + enqueue_rt_entity(rt_se, 0);
- if (rt_rq->highest_prio.curr < curr->prio)
- resched_curr(rq);
- @@ -523,7 +509,7 @@
- if (!rt_se)
- dequeue_top_rt_rq(rt_rq);
- else if (on_rt_rq(rt_se))
- - dequeue_rt_entity(rt_se);
- + dequeue_rt_entity(rt_se, 0);
- }
- static inline int rt_rq_throttled(struct rt_rq *rt_rq)
- @@ -630,6 +616,22 @@
- #endif /* CONFIG_RT_GROUP_SCHED */
- +static inline void unthrottle_rt_rq(struct rt_rq *rt_rq)
- +{
- + rt_rq->rt_time = 0;
- + rt_rq->rt_throttled = 0;
- + sched_rt_rq_enqueue(rt_rq);
- +}
- +
- +int try_to_unthrottle_rt_rq(struct rt_rq *rt_rq)
- +{
- + if (rt_rq_throttled(rt_rq)) {
- + unthrottle_rt_rq(rt_rq);
- + return 1;
- + }
- + return 0;
- +}
- +
- bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
- {
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- @@ -642,11 +644,11 @@
- /*
- * We ran out of runtime, see if we can borrow some from our neighbours.
- */
- -static int do_balance_runtime(struct rt_rq *rt_rq)
- +static void do_balance_runtime(struct rt_rq *rt_rq)
- {
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
- - int i, weight, more = 0;
- + int i, weight;
- u64 rt_period;
- weight = cpumask_weight(rd->span);
- @@ -680,7 +682,6 @@
- diff = rt_period - rt_rq->rt_runtime;
- iter->rt_runtime -= diff;
- rt_rq->rt_runtime += diff;
- - more = 1;
- if (rt_rq->rt_runtime == rt_period) {
- raw_spin_unlock(&iter->rt_runtime_lock);
- break;
- @@ -690,8 +691,6 @@
- raw_spin_unlock(&iter->rt_runtime_lock);
- }
- raw_spin_unlock(&rt_b->rt_runtime_lock);
- -
- - return more;
- }
- /*
- @@ -803,26 +802,19 @@
- }
- }
- -static int balance_runtime(struct rt_rq *rt_rq)
- +static void balance_runtime(struct rt_rq *rt_rq)
- {
- - int more = 0;
- -
- if (!sched_feat(RT_RUNTIME_SHARE))
- - return more;
- + return;
- if (rt_rq->rt_time > rt_rq->rt_runtime) {
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- - more = do_balance_runtime(rt_rq);
- + do_balance_runtime(rt_rq);
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- }
- -
- - return more;
- }
- #else /* !CONFIG_SMP */
- -static inline int balance_runtime(struct rt_rq *rt_rq)
- -{
- - return 0;
- -}
- +static inline void balance_runtime(struct rt_rq *rt_rq) {}
- #endif /* CONFIG_SMP */
- static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
- @@ -848,6 +840,17 @@
- int enqueue = 0;
- struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
- struct rq *rq = rq_of_rt_rq(rt_rq);
- + int skip;
- +
- + /*
- + * When span == cpu_online_mask, taking each rq->lock
- + * can be time-consuming. Try to avoid it when possible.
- + */
- + raw_spin_lock(&rt_rq->rt_runtime_lock);
- + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
- + raw_spin_unlock(&rt_rq->rt_runtime_lock);
- + if (skip)
- + continue;
- raw_spin_lock(&rq->lock);
- update_rq_clock(rq);
- @@ -865,11 +868,14 @@
- enqueue = 1;
- /*
- - * Force a clock update if the CPU was idle,
- - * lest wakeup -> unthrottle time accumulate.
- + * When we're idle and a woken (rt) task is
- + * throttled check_preempt_curr() will set
- + * skip_update and the time between the wakeup
- + * and this unthrottle will get accounted as
- + * 'runtime'.
- */
- if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- - rq->skip_clock_update = -1;
- + rq_clock_skip_update(rq, false);
- }
- if (rt_rq->rt_time || rt_rq->rt_nr_running)
- idle = 0;
- @@ -973,8 +979,22 @@
- * but accrue some time due to boosting.
- */
- if (likely(rt_b->rt_runtime)) {
- +
- static bool once = false;
- + if (sched_feat(RT_RUNTIME_GREED)) {
- + struct rq *rq = rq_of_rt_rq(rt_rq);
- + /*
- + * If there are no other tasks able to run
- + * on this rq, let's be greedy and reset our
- + * rt_time.
- + */
- + if (rq->nr_running == rt_rq->rt_nr_running) {
- + rt_rq->rt_time = 0;
- + return 0;
- + }
- + }
- +
- rt_rq->rt_throttled = 1;
- if (!once) {
- @@ -999,73 +1019,6 @@
- return 0;
- }
- -/* TODO: Make configurable */
- -#define RT_SCHEDTUNE_INTERVAL 50000000ULL
- -
- -static void sched_rt_update_capacity_req(struct rq *rq, bool tick);
- -
- -static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer)
- -{
- - struct sched_rt_entity *rt_se = container_of(timer,
- - struct sched_rt_entity,
- - schedtune_timer);
- - struct task_struct *p = rt_task_of(rt_se);
- - struct rq *rq = task_rq(p);
- -
- - raw_spin_lock(&rq->lock);
- -
- - /*
- - * Nothing to do if:
- - * - task has switched runqueues
- - * - task isn't RT anymore
- - */
- - if (rq != task_rq(p) || (p->sched_class != &rt_sched_class))
- - goto out;
- -
- - /*
- - * If task got enqueued back during callback time, it means we raced
- - * with the enqueue on another cpu, that's Ok, just do nothing as
- - * enqueue path would have tried to cancel us and we shouldn't run
- - * Also check the schedtune_enqueued flag as class-switch on a
- - * sleeping task may have already canceled the timer and done dq
- - */
- - if (p->on_rq || rt_se->schedtune_enqueued == false)
- - goto out;
- -
- - /*
- - * RT task is no longer active, cancel boost
- - */
- - rt_se->schedtune_enqueued = false;
- - schedtune_dequeue_task(p, cpu_of(rq));
- - sched_rt_update_capacity_req(rq, false);
- -out:
- - raw_spin_unlock(&rq->lock);
- -
- - /*
- - * This can free the task_struct if no more references.
- - */
- - put_task_struct(p);
- -
- - return HRTIMER_NORESTART;
- -}
- -
- -void init_rt_schedtune_timer(struct sched_rt_entity *rt_se)
- -{
- - struct hrtimer *timer = &rt_se->schedtune_timer;
- -
- - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- - timer->function = rt_schedtune_timer;
- - rt_se->schedtune_enqueued = false;
- -}
- -
- -static void start_schedtune_timer(struct sched_rt_entity *rt_se)
- -{
- - struct hrtimer *timer = &rt_se->schedtune_timer;
- -
- - hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL),
- - HRTIMER_MODE_REL_PINNED);
- -}
- -
- /*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
- @@ -1083,6 +1036,9 @@
- if (unlikely((s64)delta_exec <= 0))
- return;
- + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
- + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
- +
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
- @@ -1276,12 +1232,27 @@
- }
- static inline
- +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
- +{
- + struct rt_rq *group_rq = group_rt_rq(rt_se);
- + struct task_struct *tsk;
- +
- + if (group_rq)
- + return group_rq->rr_nr_running;
- +
- + tsk = rt_task_of(rt_se);
- +
- + return (tsk->policy == SCHED_RR) ? 1 : 0;
- +}
- +
- +static inline
- void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
- {
- int prio = rt_se_prio(rt_se);
- WARN_ON(!rt_prio(prio));
- rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
- + rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
- inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
- @@ -1294,13 +1265,37 @@
- WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
- + rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
- dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
- dec_rt_group(rt_se, rt_rq);
- }
- -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
- +/*
- + * Change rt_se->run_list location unless SAVE && !MOVE
- + *
- + * assumes ENQUEUE/DEQUEUE flags match
- + */
- +static inline bool move_entity(unsigned int flags)
- +{
- + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
- + return false;
- +
- + return true;
- +}
- +
- +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
- +{
- + list_del_init(&rt_se->run_list);
- +
- + if (list_empty(array->queue + rt_se_prio(rt_se)))
- + __clear_bit(rt_se_prio(rt_se), array->bitmap);
- +
- + rt_se->on_list = 0;
- +}
- +
- +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
- struct rt_prio_array *array = &rt_rq->active;
- @@ -1313,26 +1308,37 @@
- * get throttled and the current group doesn't have any other
- * active members.
- */
- - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
- + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
- + if (rt_se->on_list)
- + __delist_rt_entity(rt_se, array);
- return;
- + }
- - if (head)
- - list_add(&rt_se->run_list, queue);
- - else
- - list_add_tail(&rt_se->run_list, queue);
- - __set_bit(rt_se_prio(rt_se), array->bitmap);
- + if (move_entity(flags)) {
- + WARN_ON_ONCE(rt_se->on_list);
- + if (flags & ENQUEUE_HEAD)
- + list_add(&rt_se->run_list, queue);
- + else
- + list_add_tail(&rt_se->run_list, queue);
- +
- + __set_bit(rt_se_prio(rt_se), array->bitmap);
- + rt_se->on_list = 1;
- + }
- + rt_se->on_rq = 1;
- inc_rt_tasks(rt_se, rt_rq);
- }
- -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
- +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
- struct rt_prio_array *array = &rt_rq->active;
- - list_del_init(&rt_se->run_list);
- - if (list_empty(array->queue + rt_se_prio(rt_se)))
- - __clear_bit(rt_se_prio(rt_se), array->bitmap);
- + if (move_entity(flags)) {
- + WARN_ON_ONCE(!rt_se->on_list);
- + __delist_rt_entity(rt_se, array);
- + }
- + rt_se->on_rq = 0;
- dec_rt_tasks(rt_se, rt_rq);
- }
- @@ -1341,7 +1347,7 @@
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
- */
- -static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
- +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct sched_rt_entity *back = NULL;
- @@ -1354,35 +1360,64 @@
- for (rt_se = back; rt_se; rt_se = rt_se->back) {
- if (on_rt_rq(rt_se))
- - __dequeue_rt_entity(rt_se);
- + __dequeue_rt_entity(rt_se, flags);
- }
- }
- -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
- +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct rq *rq = rq_of_rt_se(rt_se);
- - dequeue_rt_stack(rt_se);
- + dequeue_rt_stack(rt_se, flags);
- for_each_sched_rt_entity(rt_se)
- - __enqueue_rt_entity(rt_se, head);
- + __enqueue_rt_entity(rt_se, flags);
- enqueue_top_rt_rq(&rq->rt);
- }
- -static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
- +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
- {
- struct rq *rq = rq_of_rt_se(rt_se);
- - dequeue_rt_stack(rt_se);
- + dequeue_rt_stack(rt_se, flags);
- for_each_sched_rt_entity(rt_se) {
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
- if (rt_rq && rt_rq->rt_nr_running)
- - __enqueue_rt_entity(rt_se, false);
- + __enqueue_rt_entity(rt_se, flags);
- }
- enqueue_top_rt_rq(&rq->rt);
- }
- +static void sched_rt_update_capacity_req(struct rq *rq)
- +{
- + u64 total, used, age_stamp, avg;
- + s64 delta;
- +
- + if (!sched_freq())
- + return;
- +
- + sched_avg_update(rq);
- + /*
- + * Since we're reading these variables without serialization make sure
- + * we read them once before doing sanity checks on them.
- + */
- + age_stamp = READ_ONCE(rq->age_stamp);
- + avg = READ_ONCE(rq->rt_avg);
- + delta = rq_clock(rq) - age_stamp;
- +
- + if (unlikely(delta < 0))
- + delta = 0;
- +
- + total = sched_avg_period() + delta;
- +
- + used = div_u64(avg, total);
- + if (unlikely(used > SCHED_CAPACITY_SCALE))
- + used = SCHED_CAPACITY_SCALE;
- +
- + set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
- +}
- +
- /*
- * Adding/removing a task to/from a priority array:
- */
- @@ -1391,65 +1426,37 @@
- {
- struct sched_rt_entity *rt_se = &p->rt;
- +#ifdef CONFIG_SMP
- + schedtune_enqueue_task(p, cpu_of(rq));
- +#endif
- +
- if (flags & ENQUEUE_WAKEUP)
- rt_se->timeout = 0;
- - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
- + enqueue_rt_entity(rt_se, flags);
- walt_inc_cumulative_runnable_avg(rq, p);
- - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) {
- + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
- enqueue_pushable_task(rq, p);
- - }
- - if (!schedtune_task_boost(p))
- - return;
- - /*
- - * If schedtune timer is active, that means a boost was already
- - * done, just cancel the timer so that deboost doesn't happen.
- - * Otherwise, increase the boost. If an enqueued timer was
- - * cancelled, put the task reference.
- - */
- - if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
- - put_task_struct(p);
- -
- - /*
- - * schedtune_enqueued can be true in the following situation:
- - * enqueue_task_rt grabs rq lock before timer fires
- - * or before its callback acquires rq lock
- - * schedtune_enqueued can be false if timer callback is running
- - * and timer just released rq lock, or if the timer finished
- - * running and canceling the boost
- - */
- - if (rt_se->schedtune_enqueued == true)
- - return;
- -
- - rt_se->schedtune_enqueued = true;
- - schedtune_enqueue_task(p, cpu_of(rq));
- - sched_rt_update_capacity_req(rq, false);
- + sched_rt_update_capacity_req(rq);
- }
- static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
- {
- struct sched_rt_entity *rt_se = &p->rt;
- +#ifdef CONFIG_SMP
- + schedtune_dequeue_task(p, cpu_of(rq));
- +#endif
- +
- update_curr_rt(rq);
- - dequeue_rt_entity(rt_se);
- + dequeue_rt_entity(rt_se, flags);
- walt_dec_cumulative_runnable_avg(rq, p);
- dequeue_pushable_task(rq, p);
- - if (rt_se->schedtune_enqueued == false)
- - return;
- -
- - if (flags == DEQUEUE_SLEEP) {
- - get_task_struct(p);
- - start_schedtune_timer(rt_se);
- - return;
- - }
- -
- - rt_se->schedtune_enqueued = false;
- - schedtune_dequeue_task(p, cpu_of(rq));
- - sched_rt_update_capacity_req(rq, false);
- + sched_rt_update_capacity_req(rq);
- }
- /*
- @@ -1499,20 +1506,6 @@
- return !!((pc & SOFTIRQ_MASK)>= SOFTIRQ_DISABLE_OFFSET);
- }
- -static bool is_top_app_cpu(int cpu)
- -{
- - bool boosted = (schedtune_cpu_boost(cpu) > 0);
- -
- - return boosted;
- -}
- -
- -static bool is_top_app(struct task_struct *cur)
- -{
- - bool boosted = (schedtune_task_boost(cur) > 0);
- -
- - return boosted;
- -}
- -
- /*
- * Return whether the task on the given cpu is currently non-preemptible
- * while handling a potentially long softint, or if the task is likely
- @@ -1527,14 +1520,8 @@
- struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
- int task_pc = 0;
- - if (task) {
- - if (is_top_app(task))
- - return true;
- + if (task)
- task_pc = task_preempt_count(task);
- - }
- -
- - if (is_top_app_cpu(cpu))
- - return true;
- if (softirq_masked(task_pc))
- return true;
- @@ -1544,37 +1531,12 @@
- task_pc & SOFTIRQ_MASK));
- }
- -static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p)
- -{
- - struct sched_rt_entity *rt_se = &p->rt;
- -
- - BUG_ON(!raw_spin_is_locked(&rq->lock));
- -
- - if (rt_se->schedtune_enqueued == false)
- - return;
- -
- - /*
- - * Incase of class change cancel any active timers. Otherwise, increase
- - * the boost. If an enqueued timer was cancelled, put the task ref.
- - */
- - if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
- - put_task_struct(p);
- -
- - /* schedtune_enqueued is true, deboost it */
- - rt_se->schedtune_enqueued = false;
- - schedtune_dequeue_task(p, task_cpu(p));
- - sched_rt_update_capacity_req(rq, false);
- -}
- -
- static int
- -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
- +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
- + int sibling_count_hint)
- {
- struct task_struct *curr;
- struct rq *rq;
- - bool may_not_preempt;
- -
- - if (p->nr_cpus_allowed == 1)
- - goto out;
- /* For anything but wake ups, just return the task_cpu */
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
- @@ -1586,12 +1548,7 @@
- curr = READ_ONCE(rq->curr); /* unlocked access */
- /*
- - * If the current task on @p's runqueue is a softirq task,
- - * it may run without preemption for a time that is
- - * ill-suited for a waiting RT task. Therefore, try to
- - * wake this RT task on another runqueue.
- - *
- - * Also, if the current task on @p's runqueue is an RT task, then
- + * If the current task on @p's runqueue is an RT task, then
- * try to see if we can wake this RT task up on another
- * runqueue. Otherwise simply start this RT task
- * on its current runqueue.
- @@ -1612,54 +1569,43 @@
- * This test is optimistic, if we get it wrong the load-balancer
- * will have to sort it out.
- */
- - may_not_preempt = task_may_not_preempt(curr, cpu);
- - if (curr && (may_not_preempt ||
- - (unlikely(rt_task(curr)) &&
- - (curr->nr_cpus_allowed < 2 ||
- - curr->prio <= p->prio)))) {
- + if (curr && unlikely(rt_task(curr)) &&
- + (tsk_nr_cpus_allowed(curr) < 2 ||
- + curr->prio <= p->prio)) {
- int target = find_lowest_rq(p);
- +
- /*
- - * If cpu is non-preemptible, prefer remote cpu
- - * even if it's running a higher-prio task.
- - * Otherwise: Possible race. Don't bother moving it if the
- - * destination CPU is not running a lower priority task.
- + * Don't bother moving it if the destination CPU is
- + * not running a lower priority task.
- */
- if (target != -1 &&
- - (may_not_preempt ||
- - p->prio < cpu_rq(target)->rt.highest_prio.curr))
- + p->prio < cpu_rq(target)->rt.highest_prio.curr)
- cpu = target;
- }
- rcu_read_unlock();
- out:
- - /*
- - * If previous CPU was different, make sure to cancel any active
- - * schedtune timers and deboost.
- - */
- - if (task_cpu(p) != cpu) {
- - unsigned long fl;
- - struct rq *prq = task_rq(p);
- -
- - raw_spin_lock_irqsave(&prq->lock, fl);
- - schedtune_dequeue_rt(prq, p);
- - raw_spin_unlock_irqrestore(&prq->lock, fl);
- - }
- -
- return cpu;
- }
- static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
- {
- - if (rq->curr->nr_cpus_allowed == 1)
- + /*
- + * Current can't be migrated, useless to reschedule,
- + * let's hope p can move out.
- + */
- + if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
- + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- return;
- - if (p->nr_cpus_allowed != 1
- + /*
- + * p is migratable, so let's not schedule it and
- + * see if it is pushed or pulled somewhere else.
- + */
- + if (tsk_nr_cpus_allowed(p) != 1
- && cpupri_find(&rq->rd->cpupri, p, NULL))
- return;
- - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- - return;
- -
- /*
- * There appears to be other cpus that can accept
- * current and none to run 'p', so lets reschedule
- @@ -1699,61 +1645,6 @@
- #endif
- }
- -#ifdef CONFIG_SMP
- -
- -static void sched_rt_update_capacity_req(struct rq *rq, bool tick)
- -{
- - u64 total, used, age_stamp, avg;
- - s64 delta;
- - int cpu = cpu_of(rq);
- -
- - if (!sched_freq())
- - return;
- -
- -#ifdef CONFIG_SCHED_WALT
- - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
- - unsigned long cpu_utilization = boosted_cpu_util(cpu);
- - unsigned long capacity_curr = capacity_curr_of(cpu);
- - int req = 1;
- -
- - /*
- - * During a tick, we don't throttle frequency down, just update
- - * the rt utilization.
- - */
- - if (tick && cpu_utilization <= capacity_curr)
- - req = 0;
- -
- - set_rt_cpu_capacity(cpu, req, cpu_utilization);
- -
- - return;
- - }
- -#endif
- - sched_avg_update(rq);
- - /*
- - * Since we're reading these variables without serialization make sure
- - * we read them once before doing sanity checks on them.
- - */
- - age_stamp = READ_ONCE(rq->age_stamp);
- - avg = READ_ONCE(rq->rt_avg);
- - delta = rq_clock(rq) - age_stamp;
- -
- - if (unlikely(delta < 0))
- - delta = 0;
- -
- - total = sched_avg_period() + delta;
- -
- - used = div_u64(avg, total);
- - if (unlikely(used > SCHED_CAPACITY_SCALE))
- - used = SCHED_CAPACITY_SCALE;
- -
- - set_rt_cpu_capacity(cpu, 1, (unsigned long)(used));
- -}
- -#else
- -static inline void sched_rt_update_capacity_req(struct rq *rq, bool tick)
- -{ }
- -
- -#endif
- -
- static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
- struct rt_rq *rt_rq)
- {
- @@ -1790,13 +1681,21 @@
- }
- static struct task_struct *
- -pick_next_task_rt(struct rq *rq, struct task_struct *prev)
- +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- struct task_struct *p;
- struct rt_rq *rt_rq = &rq->rt;
- if (need_pull_rt_task(rq, prev)) {
- + /*
- + * This is OK, because current is on_cpu, which avoids it being
- + * picked for load-balance and preemption/IRQs are still
- + * disabled avoiding further scheduler activity on it and we're
- + * being very careful to re-start the picking loop.
- + */
- + lockdep_unpin_lock(&rq->lock, cookie);
- pull_rt_task(rq);
- + lockdep_repin_lock(&rq->lock, cookie);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl or stop task can slip in, in which case we need
- @@ -1822,7 +1721,7 @@
- * This value will be the used as an estimation of the next
- * activity.
- */
- - sched_rt_update_capacity_req(rq, false);
- + sched_rt_update_capacity_req(rq);
- return NULL;
- }
- @@ -1846,7 +1745,7 @@
- * The previous task needs to be made eligible for pushing
- * if it is still active
- */
- - if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
- + if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
- enqueue_pushable_task(rq, p);
- }
- @@ -1896,7 +1795,7 @@
- if (unlikely(!lowest_mask))
- return -1;
- - if (task->nr_cpus_allowed == 1)
- + if (tsk_nr_cpus_allowed(task) == 1)
- return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
- @@ -1956,9 +1855,7 @@
- cpu = cpumask_any(lowest_mask);
- if (cpu < nr_cpu_ids)
- return cpu;
- -
- - cpu = -1;
- - return cpu;
- + return -1;
- }
- /* Will lock the rq it finds */
- @@ -1986,6 +1883,16 @@
- break;
- }
- + if (lowest_rq->rt.highest_prio.curr <= task->prio) {
- + /*
- + * Target rq has tasks of equal or higher priority,
- + * retrying does not release any lock and is unlikely
- + * to yield a different result.
- + */
- + lowest_rq = NULL;
- + break;
- + }
- +
- /* if the prio of this runqueue changed, try again */
- if (double_lock_balance(rq, lowest_rq)) {
- /*
- @@ -1998,6 +1905,7 @@
- !cpumask_test_cpu(lowest_rq->cpu,
- tsk_cpus_allowed(task)) ||
- task_running(rq, task) ||
- + !rt_task(task) ||
- !task_on_rq_queued(task))) {
- double_unlock_balance(rq, lowest_rq);
- @@ -2030,7 +1938,7 @@
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- - BUG_ON(p->nr_cpus_allowed <= 1);
- + BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
- @@ -2111,7 +2019,9 @@
- }
- deactivate_task(rq, next_task, 0);
- + next_task->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(next_task, lowest_rq->cpu);
- + next_task->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(lowest_rq, next_task, 0);
- ret = 1;
- @@ -2133,160 +2043,172 @@
- }
- #ifdef HAVE_RT_PUSH_IPI
- +
- /*
- - * The search for the next cpu always starts at rq->cpu and ends
- - * when we reach rq->cpu again. It will never return rq->cpu.
- - * This returns the next cpu to check, or nr_cpu_ids if the loop
- - * is complete.
- + * When a high priority task schedules out from a CPU and a lower priority
- + * task is scheduled in, a check is made to see if there's any RT tasks
- + * on other CPUs that are waiting to run because a higher priority RT task
- + * is currently running on its CPU. In this case, the CPU with multiple RT
- + * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- + * up that may be able to run one of its non-running queued RT tasks.
- + *
- + * All CPUs with overloaded RT tasks need to be notified as there is currently
- + * no way to know which of these CPUs have the highest priority task waiting
- + * to run. Instead of trying to take a spinlock on each of these CPUs,
- + * which has shown to cause large latency when done on machines with many
- + * CPUs, sending an IPI to the CPUs to have them push off the overloaded
- + * RT tasks waiting to run.
- + *
- + * Just sending an IPI to each of the CPUs is also an issue, as on large
- + * count CPU machines, this can cause an IPI storm on a CPU, especially
- + * if its the only CPU with multiple RT tasks queued, and a large number
- + * of CPUs scheduling a lower priority task at the same time.
- + *
- + * Each root domain has its own irq work function that can iterate over
- + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- + * tasks must be checked if there's one or many CPUs that are lowering
- + * their priority, there's a single irq work iterator that will try to
- + * push off RT tasks that are waiting to run.
- + *
- + * When a CPU schedules a lower priority task, it will kick off the
- + * irq work iterator that will jump to each CPU with overloaded RT tasks.
- + * As it only takes the first CPU that schedules a lower priority task
- + * to start the process, the rto_start variable is incremented and if
- + * the atomic result is one, then that CPU will try to take the rto_lock.
- + * This prevents high contention on the lock as the process handles all
- + * CPUs scheduling lower priority tasks.
- + *
- + * All CPUs that are scheduling a lower priority task will increment the
- + * rt_loop_next variable. This will make sure that the irq work iterator
- + * checks all RT overloaded CPUs whenever a CPU schedules a new lower
- + * priority task, even if the iterator is in the middle of a scan. Incrementing
- + * the rt_loop_next will cause the iterator to perform another scan.
- *
- - * rq->rt.push_cpu holds the last cpu returned by this function,
- - * or if this is the first instance, it must hold rq->cpu.
- */
- -static int rto_next_cpu(struct rq *rq)
- +static int rto_next_cpu(struct root_domain *rd)
- {
- - int prev_cpu = rq->rt.push_cpu;
- + int next;
- int cpu;
- - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
- -
- /*
- - * If the previous cpu is less than the rq's CPU, then it already
- - * passed the end of the mask, and has started from the beginning.
- - * We end if the next CPU is greater or equal to rq's CPU.
- + * When starting the IPI RT pushing, the rto_cpu is set to -1,
- + * rt_next_cpu() will simply return the first CPU found in
- + * the rto_mask.
- + *
- + * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
- + * will return the next CPU found in the rto_mask.
- + *
- + * If there are no more CPUs left in the rto_mask, then a check is made
- + * against rto_loop and rto_loop_next. rto_loop is only updated with
- + * the rto_lock held, but any CPU may increment the rto_loop_next
- + * without any locking.
- */
- - if (prev_cpu < rq->cpu) {
- - if (cpu >= rq->cpu)
- - return nr_cpu_ids;
- + for (;;) {
- - } else if (cpu >= nr_cpu_ids) {
- - /*
- - * We passed the end of the mask, start at the beginning.
- - * If the result is greater or equal to the rq's CPU, then
- - * the loop is finished.
- - */
- - cpu = cpumask_first(rq->rd->rto_mask);
- - if (cpu >= rq->cpu)
- - return nr_cpu_ids;
- - }
- - rq->rt.push_cpu = cpu;
- + /* When rto_cpu is -1 this acts like cpumask_first() */
- + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- - /* Return cpu to let the caller know if the loop is finished or not */
- - return cpu;
- -}
- + rd->rto_cpu = cpu;
- -static int find_next_push_cpu(struct rq *rq)
- -{
- - struct rq *next_rq;
- - int cpu;
- + if (cpu < nr_cpu_ids)
- + return cpu;
- - while (1) {
- - cpu = rto_next_cpu(rq);
- - if (cpu >= nr_cpu_ids)
- - break;
- - next_rq = cpu_rq(cpu);
- + rd->rto_cpu = -1;
- +
- + /*
- + * ACQUIRE ensures we see the @rto_mask changes
- + * made prior to the @next value observed.
- + *
- + * Matches WMB in rt_set_overload().
- + */
- + next = atomic_read_acquire(&rd->rto_loop_next);
- - /* Make sure the next rq can push to this rq */
- - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
- + if (rd->rto_loop == next)
- break;
- +
- + rd->rto_loop = next;
- }
- - return cpu;
- + return -1;
- }
- -#define RT_PUSH_IPI_EXECUTING 1
- -#define RT_PUSH_IPI_RESTART 2
- +static inline bool rto_start_trylock(atomic_t *v)
- +{
- + return !atomic_cmpxchg_acquire(v, 0, 1);
- +}
- -static void tell_cpu_to_push(struct rq *rq)
- +static inline void rto_start_unlock(atomic_t *v)
- {
- - int cpu;
- + atomic_set_release(v, 0);
- +}
- - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- - raw_spin_lock(&rq->rt.push_lock);
- - /* Make sure it's still executing */
- - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- - /*
- - * Tell the IPI to restart the loop as things have
- - * changed since it started.
- - */
- - rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- - raw_spin_unlock(&rq->rt.push_lock);
- - return;
- - }
- - raw_spin_unlock(&rq->rt.push_lock);
- - }
- +static void tell_cpu_to_push(struct rq *rq)
- +{
- + int cpu = -1;
- - /* When here, there's no IPI going around */
- + /* Keep the loop going if the IPI is currently active */
- + atomic_inc(&rq->rd->rto_loop_next);
- - rq->rt.push_cpu = rq->cpu;
- - cpu = find_next_push_cpu(rq);
- - if (cpu >= nr_cpu_ids)
- + /* Only one CPU can initiate a loop at a time */
- + if (!rto_start_trylock(&rq->rd->rto_loop_start))
- return;
- - rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
- + raw_spin_lock(&rq->rd->rto_lock);
- - irq_work_queue_on(&rq->rt.push_work, cpu);
- + /*
- + * The rto_cpu is updated under the lock, if it has a valid cpu
- + * then the IPI is still running and will continue due to the
- + * update to loop_next, and nothing needs to be done here.
- + * Otherwise it is finishing up and an ipi needs to be sent.
- + */
- + if (rq->rd->rto_cpu < 0)
- + cpu = rto_next_cpu(rq->rd);
- +
- + raw_spin_unlock(&rq->rd->rto_lock);
- +
- + rto_start_unlock(&rq->rd->rto_loop_start);
- +
- + if (cpu >= 0) {
- + /* Make sure the rd does not get freed while pushing */
- + sched_get_rd(rq->rd);
- + irq_work_queue_on(&rq->rd->rto_push_work, cpu);
- + }
- }
- /* Called from hardirq context */
- -static void try_to_push_tasks(void *arg)
- +void rto_push_irq_work_func(struct irq_work *work)
- {
- - struct rt_rq *rt_rq = arg;
- - struct rq *rq, *src_rq;
- - int this_cpu;
- + struct root_domain *rd =
- + container_of(work, struct root_domain, rto_push_work);
- + struct rq *rq;
- int cpu;
- - this_cpu = rt_rq->push_cpu;
- -
- - /* Paranoid check */
- - BUG_ON(this_cpu != smp_processor_id());
- + rq = this_rq();
- - rq = cpu_rq(this_cpu);
- - src_rq = rq_of_rt_rq(rt_rq);
- -
- -again:
- + /*
- + * We do not need to grab the lock to check for has_pushable_tasks.
- + * When it gets updated, a check is made if a push is possible.
- + */
- if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
- - push_rt_task(rq);
- + push_rt_tasks(rq);
- raw_spin_unlock(&rq->lock);
- }
- - /* Pass the IPI to the next rt overloaded queue */
- - raw_spin_lock(&rt_rq->push_lock);
- - /*
- - * If the source queue changed since the IPI went out,
- - * we need to restart the search from that CPU again.
- - */
- - if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- - rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- - rt_rq->push_cpu = src_rq->cpu;
- - }
- + raw_spin_lock(&rd->rto_lock);
- - cpu = find_next_push_cpu(src_rq);
- + /* Pass the IPI to the next rt overloaded queue */
- + cpu = rto_next_cpu(rd);
- - if (cpu >= nr_cpu_ids)
- - rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- - raw_spin_unlock(&rt_rq->push_lock);
- + raw_spin_unlock(&rd->rto_lock);
- - if (cpu >= nr_cpu_ids)
- + if (cpu < 0) {
- + sched_put_rd(rd);
- return;
- -
- - /*
- - * It is possible that a restart caused this CPU to be
- - * chosen again. Don't bother with an IPI, just see if we
- - * have more to push.
- - */
- - if (unlikely(cpu == rq->cpu))
- - goto again;
- + }
- /* Try the next RT overloaded CPU */
- - irq_work_queue_on(&rt_rq->push_work, cpu);
- -}
- -
- -static void push_irq_work_func(struct irq_work *work)
- -{
- - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
- -
- - try_to_push_tasks(rt_rq);
- + irq_work_queue_on(&rd->rto_push_work, cpu);
- }
- #endif /* HAVE_RT_PUSH_IPI */
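The new IPI chain is easiest to read as a small state machine: rto_cpu is the cursor (-1 when no chain is in flight), rto_loop_start ensures only one CPU kicks a chain off, and rto_loop/rto_loop_next force one more full pass over rto_mask whenever another CPU lowered its priority mid-scan. The single-threaded sketch below models just that iterator logic; the rto_lock and the acquire/release atomics are deliberately left out, so it is an illustration of the control flow, not of the memory ordering.

#include <stdio.h>

#define NR_CPUS 4

static int rto_mask[NR_CPUS] = { 0, 1, 0, 1 };	/* CPUs 1 and 3 are RT-overloaded */
static int rto_cpu = -1;			/* iterator cursor, -1 = no chain in flight */
static int rto_loop, rto_loop_next;		/* finished generation / requested generation */

static int rto_next_cpu(void)
{
	for (;;) {
		int cpu;

		/* With rto_cpu == -1 this behaves like cpumask_first(). */
		for (cpu = rto_cpu + 1; cpu < NR_CPUS; cpu++) {
			if (rto_mask[cpu]) {
				rto_cpu = cpu;
				return cpu;
			}
		}

		rto_cpu = -1;

		/* Nobody asked for another pass while we were scanning: stop. */
		if (rto_loop == rto_loop_next)
			return -1;

		rto_loop = rto_loop_next;	/* someone did: rescan the mask */
	}
}

static void tell_cpu_to_push(void)
{
	rto_loop_next++;			/* keep any running chain going */

	if (rto_cpu < 0) {			/* no chain in flight: start one */
		int cpu = rto_next_cpu();

		if (cpu >= 0)
			printf("queue rto_push_work on CPU %d\n", cpu);
	}
}

int main(void)
{
	int cpu;

	tell_cpu_to_push();	/* first CPU scheduling a lower-prio task starts the chain */
	tell_cpu_to_push();	/* a second one only bumps rto_loop_next */

	/* Each irq-work run pushes on its own CPU, then passes the baton. */
	while ((cpu = rto_next_cpu()) >= 0)
		printf("rto_push_work passes to CPU %d\n", cpu);

	return 0;
}

The key property this buys over the old per-rt_rq push_cpu scheme: a scan already in flight never has to be restarted from scratch; later requests just extend it by one more generation.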
- @@ -2296,8 +2218,9 @@
- bool resched = false;
- struct task_struct *p;
- struct rq *src_rq;
- + int rt_overload_count = rt_overloaded(this_rq);
- - if (likely(!rt_overloaded(this_rq)))
- + if (likely(!rt_overload_count))
- return;
- /*
- @@ -2306,6 +2229,11 @@
- */
- smp_rmb();
- + /* If we are the only overloaded CPU do nothing */
- + if (rt_overload_count == 1 &&
- + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
- + return;
- +
- #ifdef HAVE_RT_PUSH_IPI
- if (sched_feat(RT_PUSH_IPI)) {
- tell_cpu_to_push(this_rq);
- @@ -2365,7 +2293,9 @@
- resched = true;
- deactivate_task(src_rq, p, 0);
- + p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, this_cpu);
- + p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(this_rq, p, 0);
- /*
- * We continue with the search, just in
- @@ -2390,53 +2320,13 @@
- {
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- - has_pushable_tasks(rq) &&
- - p->nr_cpus_allowed > 1 &&
- + tsk_nr_cpus_allowed(p) > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
- - (rq->curr->nr_cpus_allowed < 2 ||
- + (tsk_nr_cpus_allowed(rq->curr) < 2 ||
- rq->curr->prio <= p->prio))
- push_rt_tasks(rq);
- }
- -static void set_cpus_allowed_rt(struct task_struct *p,
- - const struct cpumask *new_mask)
- -{
- - struct rq *rq;
- - int weight;
- -
- - BUG_ON(!rt_task(p));
- -
- - if (!task_on_rq_queued(p))
- - return;
- -
- - weight = cpumask_weight(new_mask);
- -
- - /*
- - * Only update if the process changes its state from whether it
- - * can migrate or not.
- - */
- - if ((p->nr_cpus_allowed > 1) == (weight > 1))
- - return;
- -
- - rq = task_rq(p);
- -
- - /*
- - * The process used to be able to migrate OR it can now migrate
- - */
- - if (weight <= 1) {
- - if (!task_current(rq, p))
- - dequeue_pushable_task(rq, p);
- - BUG_ON(!rq->rt.rt_nr_migratory);
- - rq->rt.rt_nr_migratory--;
- - } else {
- - if (!task_current(rq, p))
- - enqueue_pushable_task(rq, p);
- - rq->rt.rt_nr_migratory++;
- - }
- -
- - update_rt_migration(&rq->rt);
- -}
- -
- /* Assumes rq->lock is held */
- static void rq_online_rt(struct rq *rq)
- {
- @@ -2466,13 +2356,6 @@
- static void switched_from_rt(struct rq *rq, struct task_struct *p)
- {
- /*
- - * On class switch from rt, always cancel active schedtune timers,
- - * this handles the cases where we switch class for a task that is
- - * already rt-dequeued but has a running timer.
- - */
- - schedtune_dequeue_rt(rq, p);
- -
- - /*
- * If there are other RT tasks then we will reschedule
- * and the scheduling of the other RT tasks will handle
- * the balancing. But if we are the last RT task
- @@ -2512,7 +2395,7 @@
- */
- if (task_on_rq_queued(p) && rq->curr != p) {
- #ifdef CONFIG_SMP
- - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
- + if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
- queue_push_tasks(rq);
- #endif /* CONFIG_SMP */
- if (p->prio < rq->curr->prio)
- @@ -2590,7 +2473,7 @@
- update_curr_rt(rq);
- if (rq->rt.rt_nr_running)
- - sched_rt_update_capacity_req(rq, true);
- + sched_rt_update_capacity_req(rq);
- watchdog(rq, p);
- @@ -2654,7 +2537,7 @@
- #ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_rt,
- - .set_cpus_allowed = set_cpus_allowed_rt,
- + .set_cpus_allowed = set_cpus_allowed_common,
- .rq_online = rq_online_rt,
- .rq_offline = rq_offline_rt,
- .task_woken = task_woken_rt,
- diff -Nur /home/ninez/android/marlin/kernel/sched/sched.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/sched.h
- --- /home/ninez/android/marlin/kernel/sched/sched.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/sched.h 2018-08-26 16:43:11.650539699 -0400
- @@ -1,3 +1,4 @@
- +
- #include <linux/sched.h>
- #include <linux/sched/sysctl.h>
- #include <linux/sched/rt.h>
- @@ -13,6 +14,12 @@
- #include "cpudeadline.h"
- #include "cpuacct.h"
- +#ifdef CONFIG_SCHED_DEBUG
- +#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
- +#else
- +#define SCHED_WARN_ON(x) ((void)(x))
- +#endif
- +
- struct rq;
- struct cpuidle_state;
- @@ -34,6 +41,12 @@
- static inline void update_cpu_load_active(struct rq *this_rq) { }
- #endif
- +#ifdef CONFIG_SCHED_SMT
- +extern void update_idle_core(struct rq *rq);
- +#else
- +static inline void update_idle_core(struct rq *rq) { }
- +#endif
- +
- /*
- * Helpers for converting nanosecond timing to jiffy resolution
- */
- @@ -47,23 +60,30 @@
- * and does not change the user-interface for setting shares/weights.
- *
- * We increase resolution only if we have enough bits to allow this increased
- - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
- - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
- - * increased costs.
- - */
- -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
- -# define SCHED_LOAD_RESOLUTION 10
- -# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
- -# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
- + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
- + * pretty high and the returns do not justify the increased costs.
- + *
- + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
- + * increase coverage and consistency always enable it on 64bit platforms.
- + */
- +#ifdef CONFIG_64BIT
- +# define SCHED_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
- +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
- +# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
- #else
- -# define SCHED_LOAD_RESOLUTION 0
- +# define SCHED_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
- # define scale_load(w) (w)
- # define scale_load_down(w) (w)
- #endif
- -#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
- #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
- +/*
- + * NICE_0's weight (visible to users) and its load (invisible to users) have
- + * independent ranges, but they should be well calibrated. We use scale_load()
- + * and scale_load_down(w) to convert between them, and the following must be true:
- + * scale_load(sched_prio_to_weight[20]) == NICE_0_LOAD
- + */
- #define NICE_0_LOAD SCHED_LOAD_SCALE
- #define NICE_0_SHIFT SCHED_LOAD_SHIFT
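The calibration requirement stated in the new comment can be checked with plain arithmetic. Assuming SCHED_FIXEDPOINT_SHIFT is 10 (its usual mainline value; it is not defined in this hunk), the 64-bit branch gives SCHED_LOAD_SHIFT = 20 and NICE_0_LOAD = 1 << 20, and scale_load(1024) lands exactly on it:

#include <assert.h>
#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10	/* assumed, as in mainline */

/* 64-bit case from the hunk above */
#define SCHED_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
#define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)

static const long nice0_weight = 1024;	/* sched_prio_to_weight[20] */

int main(void)
{
	/* The calibration the comment demands: user weight 1024 <-> NICE_0_LOAD. */
	assert(scale_load(nice0_weight) == NICE_0_LOAD);
	assert(scale_load_down(NICE_0_LOAD) == nice0_weight);

	printf("SCHED_LOAD_SHIFT=%d NICE_0_LOAD=%ld\n",
	       SCHED_LOAD_SHIFT, (long)NICE_0_LOAD);
	return 0;
}

On 32-bit the shift collapses back to 10 and scale_load()/scale_load_down() become the identity, which is exactly the #else branch above.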
- @@ -83,6 +103,10 @@
- */
- #define RUNTIME_INF ((u64)~0ULL)
- +static inline int idle_policy(int policy)
- +{
- + return policy == SCHED_IDLE;
- +}
- static inline int fair_policy(int policy)
- {
- return policy == SCHED_NORMAL || policy == SCHED_BATCH;
- @@ -97,6 +121,11 @@
- {
- return policy == SCHED_DEADLINE;
- }
- +static inline bool valid_policy(int policy)
- +{
- + return idle_policy(policy) || fair_policy(policy) ||
- + rt_policy(policy) || dl_policy(policy);
- +}
- static inline int task_has_rt_policy(struct task_struct *p)
- {
- @@ -108,11 +137,6 @@
- return dl_policy(p->policy);
- }
- -static inline bool dl_time_before(u64 a, u64 b)
- -{
- - return (s64)(a - b) < 0;
- -}
- -
- /*
- * Tells if entity @a should preempt entity @b.
- */
- @@ -183,6 +207,25 @@
- u64 bw, total_bw;
- };
- +static inline
- +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
- +{
- + dl_b->total_bw -= tsk_bw;
- +}
- +
- +static inline
- +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
- +{
- + dl_b->total_bw += tsk_bw;
- +}
- +
- +static inline
- +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
- +{
- + return dl_b->bw != -1 &&
- + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
- +}
- +
- extern struct mutex sched_domains_mutex;
- #ifdef CONFIG_CGROUP_SCHED
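The three __dl_* helpers above are the whole deadline-class admission test: a task of bandwidth new_bw fits a root domain of `cpus` CPUs as long as bw * cpus >= total_bw - old_bw + new_bw, with bw == -1 meaning unlimited. A standalone illustration of that check; the 1<<20 fixed-point scale and the 95% per-CPU cap are assumptions of this sketch mirroring the usual sysctl defaults, not something defined in this hunk.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dl_bw {
	int64_t bw;		/* per-CPU cap, -1 == unlimited */
	int64_t total_bw;	/* sum of admitted task bandwidths */
};

/* Same logic as the helpers in the hunk above. */
static bool __dl_overflow(struct dl_bw *dl_b, int cpus,
			  int64_t old_bw, int64_t new_bw)
{
	return dl_b->bw != -1 &&
	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
}

static void __dl_add(struct dl_bw *dl_b, int64_t tsk_bw)
{
	dl_b->total_bw += tsk_bw;
}

int main(void)
{
	/* 95% of each CPU, expressed in 1<<20 fixed point (assumed scale). */
	struct dl_bw dl_b = { .bw = (95 << 20) / 100, .total_bw = 0 };
	int cpus = 4;
	/* runtime 30ms every 100ms -> utilization 0.30 */
	int64_t tsk_bw = (30 << 20) / 100;
	int i;

	for (i = 0; i < 16; i++) {
		if (__dl_overflow(&dl_b, cpus, 0, tsk_bw)) {
			printf("task %d rejected (would overflow)\n", i);
			break;
		}
		__dl_add(&dl_b, tsk_bw);
		printf("task %d admitted, total_bw=%lld\n", i, (long long)dl_b.total_bw);
	}
	return 0;
}

With these numbers, twelve 30%-utilization tasks fit on four CPUs (3.6 of the allowed 3.8) and the thirteenth is refused.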
- @@ -365,6 +408,7 @@
- unsigned long runnable_load_avg;
- #ifdef CONFIG_FAIR_GROUP_SCHED
- unsigned long tg_load_avg_contrib;
- + unsigned long propagate_avg;
- #endif
- atomic_long_t removed_load_avg, removed_util_avg;
- #ifndef CONFIG_64BIT
- @@ -422,7 +466,7 @@
- }
- /* RT IPI pull logic requires IRQ_WORK */
- -#ifdef CONFIG_IRQ_WORK
- +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
- # define HAVE_RT_PUSH_IPI
- #endif
- @@ -430,6 +474,7 @@
- struct rt_rq {
- struct rt_prio_array active;
- unsigned int rt_nr_running;
- + unsigned int rr_nr_running;
- #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- struct {
- int curr; /* highest queued rt task prio */
- @@ -443,12 +488,6 @@
- unsigned long rt_nr_total;
- int overloaded;
- struct plist_head pushable_tasks;
- -#ifdef HAVE_RT_PUSH_IPI
- - int push_flags;
- - int push_cpu;
- - struct irq_work push_work;
- - raw_spinlock_t push_lock;
- -#endif
- #endif /* CONFIG_SMP */
- int rt_queued;
- @@ -466,6 +505,8 @@
- #endif
- };
- +int try_to_unthrottle_rt_rq(struct rt_rq *rt_rq);
- +
- /* Deadline class' related fields in a runqueue */
- struct dl_rq {
- /* runqueue is an rbtree, ordered by deadline */
- @@ -541,6 +582,19 @@
- struct dl_bw dl_bw;
- struct cpudl cpudl;
- +#ifdef HAVE_RT_PUSH_IPI
- + /*
- + * For IPI pull requests, loop across the rto_mask.
- + */
- + struct irq_work rto_push_work;
- + raw_spinlock_t rto_lock;
- + /* These are only updated and read within rto_lock */
- + int rto_loop;
- + int rto_cpu;
- + /* These atomics are updated outside of a lock */
- + atomic_t rto_loop_next;
- + atomic_t rto_loop_start;
- +#endif
- /*
- * The "RT overload" flag: it gets set if a CPU has more than
- * one runnable RT task.
- @@ -550,10 +604,18 @@
- /* Maximum cpu capacity in the system. */
- struct max_cpu_capacity max_cpu_capacity;
- +
- + /* First cpu with maximum and minimum original capacity */
- + int max_cap_orig_cpu, min_cap_orig_cpu;
- };
- extern struct root_domain def_root_domain;
- +extern void sched_get_rd(struct root_domain *rd);
- +extern void sched_put_rd(struct root_domain *rd);
- +#ifdef HAVE_RT_PUSH_IPI
- +extern void rto_push_irq_work_func(struct irq_work *work);
- +#endif
- #endif /* CONFIG_SMP */
- /*
- @@ -587,7 +649,13 @@
- #ifdef CONFIG_NO_HZ_FULL
- unsigned long last_sched_tick;
- #endif
- - int skip_clock_update;
- +
- +#ifdef CONFIG_CPU_QUIET
- + /* time-based average load */
- + u64 nr_last_stamp;
- + u64 nr_running_integral;
- + seqcount_t ave_seqcnt;
- +#endif
- /* capture load from *all* tasks on this cpu: */
- struct load_weight load;
- @@ -601,6 +669,7 @@
- #ifdef CONFIG_FAIR_GROUP_SCHED
- /* list of leaf cfs_rq on this cpu: */
- struct list_head leaf_cfs_rq_list;
- + struct list_head *tmp_alone_branch;
- #endif /* CONFIG_FAIR_GROUP_SCHED */
- /*
- @@ -615,6 +684,7 @@
- unsigned long next_balance;
- struct mm_struct *prev_mm;
- + unsigned int clock_skip_update;
- u64 clock;
- u64 clock_task;
- @@ -633,6 +703,7 @@
- /* For active balancing */
- int active_balance;
- int push_cpu;
- + struct task_struct *push_task;
- struct cpu_stop_work active_balance_work;
- /* cpu of this runqueue: */
- int cpu;
- @@ -651,24 +722,14 @@
- #endif
- #ifdef CONFIG_SCHED_WALT
- - /*
- - * max_freq = user or thermal defined maximum
- - * max_possible_freq = maximum supported by hardware
- - */
- - unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
- - struct cpumask freq_domain_cpumask;
- -
- u64 cumulative_runnable_avg;
- - int efficiency; /* Differentiate cpus with different IPC capability */
- - int load_scale_factor;
- - int capacity;
- - int max_possible_capacity;
- u64 window_start;
- u64 curr_runnable_sum;
- u64 prev_runnable_sum;
- u64 cur_irqload;
- u64 avg_irqload;
- u64 irqload_ts;
- + u64 cum_window_demand;
- #endif /* CONFIG_SCHED_WALT */
- @@ -710,6 +771,8 @@
- /* try_to_wake_up() stats */
- unsigned int ttwu_count;
- unsigned int ttwu_local;
- +
- + struct eas_stats eas_stats;
- #endif
- #ifdef CONFIG_SMP
- @@ -742,7 +805,7 @@
- static inline u64 __rq_clock_broken(struct rq *rq)
- {
- - return ACCESS_ONCE(rq->clock);
- + return READ_ONCE(rq->clock);
- }
- static inline u64 rq_clock(struct rq *rq)
- @@ -757,6 +820,18 @@
- return rq->clock_task;
- }
- +#define RQCF_REQ_SKIP 0x01
- +#define RQCF_ACT_SKIP 0x02
- +
- +static inline void rq_clock_skip_update(struct rq *rq, bool skip)
- +{
- + lockdep_assert_held(&rq->lock);
- + if (skip)
- + rq->clock_skip_update |= RQCF_REQ_SKIP;
- + else
- + rq->clock_skip_update &= ~RQCF_REQ_SKIP;
- +}
- +
- #ifdef CONFIG_NUMA_BALANCING
- extern void sched_setnuma(struct task_struct *p, int node);
- extern int migrate_task_to(struct task_struct *p, int cpu);
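rq_clock_skip_update() above only records or clears the request bit (RQCF_REQ_SKIP); the consumer side is not part of this hunk. The sketch below assumes the usual upstream behaviour, where __schedule() promotes the request to RQCF_ACT_SKIP and update_rq_clock() then skips re-reading the clock once, so treat the promotion/clearing details as an approximation rather than a quote of this kernel.

#include <stdio.h>

#define RQCF_REQ_SKIP 0x01
#define RQCF_ACT_SKIP 0x02

struct rq {
	unsigned int clock_skip_update;
	unsigned long long clock;
};

static void rq_clock_skip_update(struct rq *rq, int skip)
{
	if (skip)
		rq->clock_skip_update |= RQCF_REQ_SKIP;
	else
		rq->clock_skip_update &= ~RQCF_REQ_SKIP;
}

/* Assumed consumer side. */
static void update_rq_clock(struct rq *rq, unsigned long long now)
{
	if (rq->clock_skip_update & RQCF_ACT_SKIP)
		return;			/* a caller asked us not to re-read the clock */
	rq->clock = now;
}

static void schedule_model(struct rq *rq, unsigned long long now)
{
	rq->clock_skip_update <<= 1;	/* promote REQ_SKIP to ACT_SKIP */
	update_rq_clock(rq, now);
	rq->clock_skip_update = 0;	/* the request is one-shot */
}

int main(void)
{
	struct rq rq = { 0, 100 };

	rq_clock_skip_update(&rq, 1);	/* a path that knows the clock is already fresh */
	schedule_model(&rq, 200);
	printf("clock after skipped update: %llu\n", rq.clock);	/* still 100 */

	schedule_model(&rq, 300);
	printf("clock after normal update:  %llu\n", rq.clock);	/* now 300 */
	return 0;
}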
- @@ -836,8 +911,8 @@
- DECLARE_PER_CPU(struct sched_domain *, sd_llc);
- DECLARE_PER_CPU(int, sd_llc_size);
- DECLARE_PER_CPU(int, sd_llc_id);
- +DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
- DECLARE_PER_CPU(struct sched_domain *, sd_numa);
- -DECLARE_PER_CPU(struct sched_domain *, sd_busy);
- DECLARE_PER_CPU(struct sched_domain *, sd_asym);
- DECLARE_PER_CPU(struct sched_domain *, sd_ea);
- DECLARE_PER_CPU(struct sched_domain *, sd_scs);
- @@ -850,12 +925,9 @@
- */
- unsigned long capacity;
- unsigned long max_capacity; /* Max per-cpu capacity in group */
- + unsigned long min_capacity; /* Min per-CPU capacity in group */
- unsigned long next_update;
- int imbalance; /* XXX unrelated to capacity but shared group state */
- - /*
- - * Number of busy cpus in this group.
- - */
- - atomic_t nr_busy_cpus;
- unsigned long cpumask[0]; /* iteration mask */
- };
- @@ -866,7 +938,7 @@
- unsigned int group_weight;
- struct sched_group_capacity *sgc;
- - const struct sched_group_energy const *sge;
- + const struct sched_group_energy *sge;
- /*
- * The CPUs this group covers.
- @@ -878,9 +950,6 @@
- unsigned long cpumask[0];
- };
- -void set_energy_aware(void);
- -void clear_energy_aware(void);
- -
- static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
- {
- return to_cpumask(sg->cpumask);
- @@ -961,7 +1030,6 @@
- {
- return NULL;
- }
- -
- #endif /* CONFIG_CGROUP_SCHED */
- static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
- @@ -1022,17 +1090,8 @@
- #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
- #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
- -#ifdef CONFIG_NUMA_BALANCING
- -#define sched_feat_numa(x) sched_feat(x)
- -#ifdef CONFIG_SCHED_DEBUG
- -#define numabalancing_enabled sched_feat_numa(NUMA)
- -#else
- -extern bool numabalancing_enabled;
- -#endif /* CONFIG_SCHED_DEBUG */
- -#else
- -#define sched_feat_numa(x) (0)
- -#define numabalancing_enabled (0)
- -#endif /* CONFIG_NUMA_BALANCING */
- +extern struct static_key_false sched_numa_balancing;
- +extern struct static_key_false sched_schedstats;
- static inline u64 global_rt_period(void)
- {
- @@ -1074,9 +1133,6 @@
- #ifndef prepare_arch_switch
- # define prepare_arch_switch(next) do { } while (0)
- #endif
- -#ifndef finish_arch_switch
- -# define finish_arch_switch(prev) do { } while (0)
- -#endif
- #ifndef finish_arch_post_lock_switch
- # define finish_arch_post_lock_switch() do { } while (0)
- #endif
- @@ -1101,7 +1157,7 @@
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- *
- - * Pairs with the control dependency and rmb in try_to_wake_up().
- + * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
- */
- smp_store_release(&prev->on_cpu, 0);
- #endif
- @@ -1139,59 +1195,45 @@
- #define WEIGHT_IDLEPRIO 3
- #define WMULT_IDLEPRIO 1431655765
- -/*
- - * Nice levels are multiplicative, with a gentle 10% change for every
- - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- - * nice 1, it will get ~10% less CPU time than another CPU-bound task
- - * that remained on nice 0.
- - *
- - * The "10% effect" is relative and cumulative: from _any_ nice level,
- - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- - * If a task goes up by ~10% and another task goes down by ~10% then
- - * the relative distance between them is ~25%.)
- - */
- -static const int prio_to_weight[40] = {
- - /* -20 */ 88761, 71755, 56483, 46273, 36291,
- - /* -15 */ 29154, 23254, 18705, 14949, 11916,
- - /* -10 */ 9548, 7620, 6100, 4904, 3906,
- - /* -5 */ 3121, 2501, 1991, 1586, 1277,
- - /* 0 */ 1024, 820, 655, 526, 423,
- - /* 5 */ 335, 272, 215, 172, 137,
- - /* 10 */ 110, 87, 70, 56, 45,
- - /* 15 */ 36, 29, 23, 18, 15,
- -};
- +extern const int sched_prio_to_weight[40];
- +extern const u32 sched_prio_to_wmult[40];
- /*
- - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
- + * {de,en}queue flags:
- *
- - * In cases where the weight does not change often, we can use the
- - * precalculated inverse to speed up arithmetics by turning divisions
- - * into multiplications:
- - */
- -static const u32 prio_to_wmult[40] = {
- - /* -20 */ 48388, 59856, 76040, 92818, 118348,
- - /* -15 */ 147320, 184698, 229616, 287308, 360437,
- - /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
- -};
- + * DEQUEUE_SLEEP - task is no longer runnable
- + * ENQUEUE_WAKEUP - task just became runnable
- + *
- + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
- + * are in a known state which allows modification. Such pairs
- + * should preserve as much state as possible.
- + *
- + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- + * in the runqueue.
- + *
- + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
- + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
- + * ENQUEUE_WAKING - sched_class::task_waking was called
- + *
- + */
- -#define ENQUEUE_WAKEUP 1
- -#define ENQUEUE_HEAD 2
- +#define DEQUEUE_SLEEP 0x01
- +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
- +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
- +#define DEQUEUE_IDLE 0x80 /* The last dequeue before IDLE */
- +
- +#define ENQUEUE_WAKEUP 0x01
- +#define ENQUEUE_RESTORE 0x02
- +#define ENQUEUE_MOVE 0x04
- +
- +#define ENQUEUE_HEAD 0x08
- +#define ENQUEUE_REPLENISH 0x10
- #ifdef CONFIG_SMP
- -#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
- +#define ENQUEUE_WAKING 0x20
- #else
- -#define ENQUEUE_WAKING 0
- +#define ENQUEUE_WAKING 0x00
- #endif
- -#define ENQUEUE_REPLENISH 0x08
- -#define ENQUEUE_RESTORE 0x10
- -#define ENQUEUE_WAKEUP_NEW 0x20
- -
- -#define DEQUEUE_SLEEP 1
- +#define ENQUEUE_WAKEUP_NEW 0x40
- #define RETRY_TASK ((void *)-1UL)
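- The table removed above (now shared as sched_prio_to_weight[]) encodes the ~10% per-nice-level rule with a ~1.25 multiplier between adjacent levels. A worked example using the weights shown above for nice 0 (1024) and nice 1 (820); the share calculation is a sketch of how CFS proportions CPU time between two always-runnable tasks:
- /* Worked example: relative CPU shares of two CPU-bound tasks. */
- #include <stdio.h>
-
- int main(void)
- {
-         const int w_nice0 = 1024, w_nice1 = 820;  /* from the weight table above */
-         double total = w_nice0 + w_nice1;
-
-         /* prints ~55.5% vs ~44.5%; the weight ratio is ~1.25 */
-         printf("nice 0: %.1f%%  nice 1: %.1f%%  ratio %.2f\n",
-                100.0 * w_nice0 / total, 100.0 * w_nice1 / total,
-                (double)w_nice0 / w_nice1);
-         return 0;
- }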
- @@ -1214,12 +1256,14 @@
- * tasks.
- */
- struct task_struct * (*pick_next_task) (struct rq *rq,
- - struct task_struct *prev);
- + struct task_struct *prev,
- + struct pin_cookie cookie);
- void (*put_prev_task) (struct rq *rq, struct task_struct *p);
- #ifdef CONFIG_SMP
- - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- - void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
- + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
- + int sibling_count_hint);
- + void (*migrate_task_rq)(struct task_struct *p);
- void (*task_waking) (struct task_struct *task);
- void (*task_woken) (struct rq *this_rq, struct task_struct *task);
- @@ -1251,8 +1295,11 @@
- void (*update_curr) (struct rq *rq);
- +#define TASK_SET_GROUP 0
- +#define TASK_MOVE_GROUP 1
- +
- #ifdef CONFIG_FAIR_GROUP_SCHED
- - void (*task_move_group) (struct task_struct *p);
- + void (*task_change_group)(struct task_struct *p, int type);
- #endif
- };
- @@ -1261,6 +1308,11 @@
- prev->sched_class->put_prev_task(rq, prev);
- }
- +static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
- +{
- + curr->sched_class->set_curr_task(rq);
- +}
- +
- #define sched_class_highest (&stop_sched_class)
- #define for_each_class(class) \
- for (class = sched_class_highest; class; class = class->next)
- @@ -1279,13 +1331,7 @@
- extern void trigger_load_balance(struct rq *rq);
- -extern void idle_enter_fair(struct rq *this_rq);
- -extern void idle_exit_fair(struct rq *this_rq);
- -
- -#else
- -
- -static inline void idle_enter_fair(struct rq *rq) { }
- -static inline void idle_exit_fair(struct rq *rq) { }
- +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
- #endif
- @@ -1298,7 +1344,7 @@
- static inline struct cpuidle_state *idle_get_state(struct rq *rq)
- {
- - WARN_ON(!rcu_read_lock_held());
- + SCHED_WARN_ON(!rcu_read_lock_held());
- return rq->idle_state;
- }
- @@ -1340,7 +1386,6 @@
- extern void init_sched_dl_class(void);
- extern void init_sched_rt_class(void);
- extern void init_sched_fair_class(void);
- -extern void init_sched_dl_class(void);
- extern void resched_curr(struct rq *rq);
- extern void resched_cpu(int cpu);
- @@ -1350,14 +1395,14 @@
- extern struct dl_bandwidth def_dl_bandwidth;
- extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
- -extern void init_rt_schedtune_timer(struct sched_rt_entity *rt_se);
- extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
- unsigned long to_ratio(u64 period, u64 runtime);
- extern void init_entity_runnable_average(struct sched_entity *se);
- +extern void post_init_entity_util_avg(struct sched_entity *se);
- -static inline void add_nr_running(struct rq *rq, unsigned count)
- +static inline void __add_nr_running(struct rq *rq, unsigned count)
- {
- unsigned prev_nr = rq->nr_running;
- @@ -1385,11 +1430,48 @@
- }
- }
- -static inline void sub_nr_running(struct rq *rq, unsigned count)
- +static inline void __sub_nr_running(struct rq *rq, unsigned count)
- {
- rq->nr_running -= count;
- }
- +#ifdef CONFIG_CPU_QUIET
- +#define NR_AVE_SCALE(x) ((x) << FSHIFT)
- +static inline u64 do_nr_running_integral(struct rq *rq)
- +{
- + s64 nr, deltax;
- + u64 nr_running_integral = rq->nr_running_integral;
- +
- + deltax = rq->clock_task - rq->nr_last_stamp;
- + nr = NR_AVE_SCALE(rq->nr_running);
- +
- + nr_running_integral += nr * deltax;
- +
- + return nr_running_integral;
- +}
- +
- +static inline void add_nr_running(struct rq *rq, unsigned count)
- +{
- + write_seqcount_begin(&rq->ave_seqcnt);
- + rq->nr_running_integral = do_nr_running_integral(rq);
- + rq->nr_last_stamp = rq->clock_task;
- + __add_nr_running(rq, count);
- + write_seqcount_end(&rq->ave_seqcnt);
- +}
- +
- +static inline void sub_nr_running(struct rq *rq, unsigned count)
- +{
- + write_seqcount_begin(&rq->ave_seqcnt);
- + rq->nr_running_integral = do_nr_running_integral(rq);
- + rq->nr_last_stamp = rq->clock_task;
- + __sub_nr_running(rq, count);
- + write_seqcount_end(&rq->ave_seqcnt);
- +}
- +#else
- +#define add_nr_running __add_nr_running
- +#define sub_nr_running __sub_nr_running
- +#endif
- +
- static inline void rq_last_tick_reset(struct rq *rq)
- {
- #ifdef CONFIG_NO_HZ_FULL
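- The CONFIG_CPU_QUIET fields above maintain a fixed-point integral of nr_running over time; a consumer recovers the time-based average by differencing two samples of the integral and dividing by the elapsed clock_task time. A minimal sketch, with a hypothetical sampler (only do_nr_running_integral() and the seqcount come from this hunk; FSHIFT is the fixed-point shift used by NR_AVE_SCALE()):
- /* Hypothetical sampler: average nr_running since the last sample. */
- static unsigned int example_avg_nr_running(struct rq *rq,
-                                            u64 *last_integral, u64 *last_time)
- {
-         u64 integral, now, avg;
-         unsigned int seq;
-
-         do {
-                 seq = read_seqcount_begin(&rq->ave_seqcnt);
-                 integral = do_nr_running_integral(rq);
-                 now = rq->clock_task;
-         } while (read_seqcount_retry(&rq->ave_seqcnt, seq));
-
-         if (now == *last_time)
-                 return 0;
-
-         /* d(integral)/d(time) is the average, still scaled by 1 << FSHIFT */
-         avg = div64_u64(integral - *last_integral, now - *last_time);
-         *last_integral = integral;
-         *last_time = now;
-         return (unsigned int)(avg >> FSHIFT);
- }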
- @@ -1451,6 +1533,26 @@
- }
- #endif
- +#ifndef arch_scale_max_freq_capacity
- +static __always_inline
- +unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
- +{
- + return SCHED_CAPACITY_SCALE;
- +}
- +#endif
- +
- +#ifndef arch_scale_min_freq_capacity
- +static __always_inline
- +unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
- +{
- + /*
- + * Multiplied with any capacity value, this scale factor will return
- + * 0, which represents an un-capped state
- + */
- + return 0;
- +}
- +#endif
- +
- #ifndef arch_scale_cpu_capacity
- static __always_inline
- unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
- @@ -1473,33 +1575,9 @@
- return cpu_rq(cpu)->cpu_capacity_orig;
- }
- -/* Force usage of PELT signal, i.e. util_avg */
- -#define UTIL_AVG true
- -/* Use estimated utilization when possible, i.e. UTIL_EST feature enabled */
- -#define UTIL_EST false
- -static inline bool use_util_est(void)
- -{
- - return sched_feat(UTIL_EST);
- -}
- -
- extern unsigned int sysctl_sched_use_walt_cpu_util;
- extern unsigned int walt_ravg_window;
- -extern unsigned int walt_disabled;
- -
- -static inline unsigned long task_util(struct task_struct *p, bool use_pelt)
- -{
- -
- -#ifdef CONFIG_SCHED_WALT
- - if (!walt_disabled && sysctl_sched_use_walt_task_util) {
- - unsigned long demand = p->ravg.demand;
- - return (demand << 10) / walt_ravg_window;
- - }
- -#endif
- - if (use_util_est() && !use_pelt)
- - return p->se.avg.util_est;
- - return p->se.avg.util_avg;
- -}
- -
- +extern bool walt_disabled;
- /*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- @@ -1527,18 +1605,15 @@
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
- -static inline unsigned long __cpu_util(int cpu, int delta, bool use_pelt)
- +static inline unsigned long __cpu_util(int cpu, int delta)
- {
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
- - if (use_util_est() && !use_pelt)
- - util = max(util, cpu_rq(cpu)->cfs.avg.util_est);
- -
- #ifdef CONFIG_SCHED_WALT
- if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
- - util = (cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT) /
- - walt_ravg_window;
- + util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
- + walt_ravg_window >> SCHED_LOAD_SHIFT);
- #endif
- delta += util;
- if (delta < 0)
- @@ -1547,9 +1622,22 @@
- return (delta >= capacity) ? capacity : delta;
- }
- -static inline unsigned long cpu_util(int cpu, bool use_pelt)
- +static inline unsigned long cpu_util(int cpu)
- {
- - return __cpu_util(cpu, 0, use_pelt);
- + return __cpu_util(cpu, 0);
- +}
- +
- +static inline unsigned long cpu_util_freq(int cpu)
- +{
- + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- + unsigned long capacity = capacity_orig_of(cpu);
- +
- +#ifdef CONFIG_SCHED_WALT
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
- + util = div64_u64(cpu_rq(cpu)->prev_runnable_sum,
- + walt_ravg_window >> SCHED_LOAD_SHIFT);
- +#endif
- + return (util >= capacity) ? capacity : util;
- }
- #endif
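- A worked example of the WALT branch above, assuming SCHED_LOAD_SHIFT is 10 and the default ~20 ms window: the divisor walt_ravg_window >> 10 is about 19531 ns, so 10 ms of tracked demand in the window reports a utilization of roughly 512, i.e. half of SCHED_CAPACITY_SCALE:
- /* Worked example (userspace arithmetic only). */
- #include <stdio.h>
- #include <stdint.h>
-
- int main(void)
- {
-         uint64_t window = 20000000;   /* default walt_ravg_window, ns */
-         uint64_t demand = 10000000;   /* 10 ms of tracked demand */
-
-         /* prints 512: half of SCHED_CAPACITY_SCALE (1024) */
-         printf("util = %llu\n",
-                (unsigned long long)(demand / (window >> 10)));
-         return 0;
- }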
- @@ -1564,6 +1652,10 @@
- return static_key_false(&__sched_freq);
- }
- +/*
- + * sched_capacity_reqs expects capacity requests to be normalised.
- + * All capacities should sum to the range of 0-1024.
- + */
- DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
- void update_cpu_capacity_request(int cpu, bool request);
- @@ -1572,32 +1664,45 @@
- {
- struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- - if (scr->cfs == capacity)
- - return;
- - scr->cfs = capacity;
- - update_cpu_capacity_request(cpu, request);
- +#ifdef CONFIG_SCHED_WALT
- + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
- + int rtdl = scr->rt + scr->dl;
- + /*
- + * WALT tracks the utilization of a CPU considering the load
- + * generated by all the scheduling classes.
- + * Since the following call to:
- + * update_cpu_capacity
- + * is already adding the RT and DL utilizations let's remove
- + * these contributions from the WALT signal.
- + */
- + if (capacity > rtdl)
- + capacity -= rtdl;
- + else
- + capacity = 0;
- + }
- +#endif
- + if (scr->cfs != capacity) {
- + scr->cfs = capacity;
- + update_cpu_capacity_request(cpu, request);
- + }
- }
- static inline void set_rt_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
- {
- - struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- -
- - if (scr->rt == capacity)
- - return;
- - scr->rt = capacity;
- - update_cpu_capacity_request(cpu, request);
- + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
- + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
- + update_cpu_capacity_request(cpu, request);
- + }
- }
- static inline void set_dl_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
- {
- - struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- -
- - if (scr->dl == capacity)
- - return;
- - scr->dl = capacity;
- - update_cpu_capacity_request(cpu, request);
- + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) {
- + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity;
- + update_cpu_capacity_request(cpu, request);
- + }
- }
- #else
- static inline bool sched_freq(void) { return false; }
- @@ -1621,8 +1726,33 @@
- static inline void sched_avg_update(struct rq *rq) { }
- #endif
- -extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
- -extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
- +struct rq_flags {
- + unsigned long flags;
- + struct pin_cookie cookie;
- +};
- +
- +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- + __acquires(rq->lock);
- +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
- + __acquires(p->pi_lock)
- + __acquires(rq->lock);
- +
- +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
- + __releases(rq->lock)
- +{
- + lockdep_unpin_lock(&rq->lock, rf->cookie);
- + raw_spin_unlock(&rq->lock);
- +}
- +
- +static inline void
- +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
- + __releases(rq->lock)
- + __releases(p->pi_lock)
- +{
- + lockdep_unpin_lock(&rq->lock, rf->cookie);
- + raw_spin_unlock(&rq->lock);
- + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
- +}
- #ifdef CONFIG_SMP
- #ifdef CONFIG_PREEMPT
- @@ -1811,8 +1941,8 @@
- extern void print_rt_stats(struct seq_file *m, int cpu);
- extern void init_cfs_rq(struct cfs_rq *cfs_rq);
- -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
- -extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
- +extern void init_rt_rq(struct rt_rq *rt_rq);
- +extern void init_dl_rq(struct dl_rq *dl_rq);
- extern void cfs_bandwidth_usage_inc(void);
- extern void cfs_bandwidth_usage_dec(void);
- @@ -1878,6 +2008,69 @@
- #endif /* CONFIG_64BIT */
- #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
- +#ifdef CONFIG_CPU_FREQ
- +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
- +
- +/**
- + * cpufreq_update_util - Take a note about CPU utilization changes.
- + * @rq: Runqueue to carry out the update for.
- + * @flags: Update reason flags.
- + *
- + * This function is called by the scheduler on the CPU whose utilization is
- + * being updated.
- + *
- + * It can only be called from RCU-sched read-side critical sections.
- + *
- + * The way cpufreq is currently arranged requires it to evaluate the CPU
- + * performance state (frequency/voltage) on a regular basis to prevent it from
- + * being stuck in a completely inadequate performance level for too long.
- + * That is not guaranteed to happen if the updates are only triggered from CFS,
- + * though, because they may not be coming in if RT or deadline tasks are active
- + * all the time (or there are RT and DL tasks only).
- + *
- + * As a workaround for that issue, this function is called by the RT and DL
- + * sched classes to trigger extra cpufreq updates to prevent it from stalling,
- + * but that really is a band-aid. Going forward it should be replaced with
- + * solutions targeted more specifically at RT and DL tasks.
- + */
- +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
- +{
- + struct update_util_data *data;
- +
- + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
- + if (data)
- + data->func(data, rq_clock(rq), flags);
- +}
- +
- +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
- +{
- + if (cpu_of(rq) == smp_processor_id())
- + cpufreq_update_util(rq, flags);
- +}
- +#else
- +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
- +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
- +#endif /* CONFIG_CPU_FREQ */
- +
- +#ifdef CONFIG_SCHED_WALT
- +
- +static inline bool
- +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
- +{
- + return cpu_of(rq) == task_cpu(p) &&
- + (p->on_rq || p->last_sleep_ts >= rq->window_start);
- +}
- +
- +#endif /* CONFIG_SCHED_WALT */
- +
- +#ifdef arch_scale_freq_capacity
- +#ifndef arch_scale_freq_invariant
- +#define arch_scale_freq_invariant() (true)
- +#endif
- +#else /* arch_scale_freq_capacity */
- +#define arch_scale_freq_invariant() (false)
- +#endif
- +
- /*
- * task_may_not_preempt - check whether a task may not be preemptible soon
- */
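- The cpufreq_update_util() hook above simply dereferences the per-CPU update_util_data pointer and calls its func with the rq clock and the reason flags. A sketch of the governor side, under two assumptions: struct update_util_data exposes the func member with the (data, time, flags) signature implied by the call above, and registration goes through the mainline-style cpufreq_add_update_util_hook() helper, which may be named differently in this tree:
- /* Sketch of a governor-side consumer; names here are illustrative. */
- struct example_gov_cpu {
-         struct update_util_data update_util;
-         int cpu;
- };
-
- static void example_gov_update(struct update_util_data *data, u64 time,
-                                unsigned int flags)
- {
-         struct example_gov_cpu *gc =
-                 container_of(data, struct example_gov_cpu, update_util);
-
-         /* re-evaluate capacity requests for gc->cpu, kick a freq change, ... */
-         (void)gc;
- }
-
- static void example_gov_start(struct example_gov_cpu *gc, int cpu)
- {
-         gc->cpu = cpu;
-         /* assumed mainline-style registration helper */
-         cpufreq_add_update_util_hook(cpu, &gc->update_util, example_gov_update);
- }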
- diff -Nur /home/ninez/android/marlin/kernel/sched/stats.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.c
- --- /home/ninez/android/marlin/kernel/sched/stats.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.c 2018-08-11 23:57:17.131940887 -0400
- @@ -12,6 +12,26 @@
- */
- #define SCHEDSTAT_VERSION 15
- +static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
- +{
- + /* eas-specific runqueue stats */
- + seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
- + stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine,
- + stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count);
- +
- + seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
- + stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt,
- + stats->secb_insuff_cap, stats->secb_no_nrg_sav,
- + stats->secb_nrg_sav, stats->secb_count);
- +
- + seq_printf(seq, "%llu %llu %llu %llu %llu ",
- + stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd,
- + stats->fbt_pref_idle, stats->fbt_count);
- +
- + seq_printf(seq, "%llu %llu\n",
- + stats->cas_attempts, stats->cas_count);
- +}
- +
- static int show_schedstat(struct seq_file *seq, void *v)
- {
- int cpu;
- @@ -44,6 +64,7 @@
- seq_printf(seq, "\n");
- + show_easstat(seq, &rq->eas_stats);
- #ifdef CONFIG_SMP
- /* domain-specific stats */
- rcu_read_lock();
- @@ -72,6 +93,8 @@
- sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
- sd->ttwu_wake_remote, sd->ttwu_move_affine,
- sd->ttwu_move_balance);
- +
- + show_easstat(seq, &sd->eas_stats);
- }
- rcu_read_unlock();
- #endif
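- For reference, show_easstat() above prints twenty counters after the literal "eas" token; the field order, taken directly from the three seq_printf() groups, is captured in the array below (a documentation aid only, not part of the patch):
- static const char *const eas_stat_fields[] = {
-         "sis_attempts", "sis_idle", "sis_cache_affine",
-         "sis_suff_cap", "sis_idle_cpu", "sis_count",
-         "secb_attempts", "secb_sync", "secb_idle_bt",
-         "secb_insuff_cap", "secb_no_nrg_sav", "secb_nrg_sav", "secb_count",
-         "fbt_attempts", "fbt_no_cpu", "fbt_no_sd",
-         "fbt_pref_idle", "fbt_count",
-         "cas_attempts", "cas_count",
- };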
- diff -Nur /home/ninez/android/marlin/kernel/sched/stats.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.h
- --- /home/ninez/android/marlin/kernel/sched/stats.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stats.h 2018-08-26 16:43:11.650539699 -0400
- @@ -29,9 +29,13 @@
- if (rq)
- rq->rq_sched_info.run_delay += delta;
- }
- -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
- -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
- -# define schedstat_set(var, val) do { var = (val); } while (0)
- +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
- +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
- +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
- +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
- +#define schedstat_val(var) (var)
- +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
- +
- #else /* !CONFIG_SCHEDSTATS */
- static inline void
- rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
- @@ -42,10 +46,13 @@
- static inline void
- rq_sched_info_depart(struct rq *rq, unsigned long long delta)
- {}
- -# define schedstat_inc(rq, field) do { } while (0)
- -# define schedstat_add(rq, field, amt) do { } while (0)
- -# define schedstat_set(var, val) do { } while (0)
- -#endif
- +#define schedstat_enabled() 0
- +#define schedstat_inc(var) do { } while (0)
- +#define schedstat_add(var, amt) do { } while (0)
- +#define schedstat_set(var, val) do { } while (0)
- +#define schedstat_val(var) 0
- +#define schedstat_val_or_zero(var) 0
- +#endif /* CONFIG_SCHEDSTATS */
- #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
- static inline void sched_info_reset_dequeued(struct task_struct *t)
- @@ -174,7 +181,8 @@
- {
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- - if (!cputimer->running)
- + /* Check if cputimer isn't running. This is accessed without locking. */
- + if (!READ_ONCE(cputimer->running))
- return false;
- /*
- @@ -215,9 +223,7 @@
- if (!cputimer_running(tsk))
- return;
- - raw_spin_lock(&cputimer->lock);
- - cputimer->cputime.utime += cputime;
- - raw_spin_unlock(&cputimer->lock);
- + atomic64_add(cputime, &cputimer->cputime_atomic.utime);
- }
- /**
- @@ -238,9 +244,7 @@
- if (!cputimer_running(tsk))
- return;
- - raw_spin_lock(&cputimer->lock);
- - cputimer->cputime.stime += cputime;
- - raw_spin_unlock(&cputimer->lock);
- + atomic64_add(cputime, &cputimer->cputime_atomic.stime);
- }
- /**
- @@ -261,7 +265,5 @@
- if (!cputimer_running(tsk))
- return;
- - raw_spin_lock(&cputimer->lock);
- - cputimer->cputime.sum_exec_runtime += ns;
- - raw_spin_unlock(&cputimer->lock);
- + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
- }
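- The schedstat macros above change shape: the old forms took a (rq, field) pair, while the new ones take the full lvalue and gate the update behind the sched_schedstats static branch. A minimal sketch of the call-site conversion this implies, reusing the rq->ttwu_count counter declared earlier in this diff:
- static inline void example_account_ttwu(struct rq *rq)
- {
-         /* old style, removed above:  schedstat_inc(rq, ttwu_count); */
-         schedstat_inc(rq->ttwu_count);  /* new style: pass the lvalue itself */
- }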
- diff -Nur /home/ninez/android/marlin/kernel/sched/stop_task.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stop_task.c
- --- /home/ninez/android/marlin/kernel/sched/stop_task.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/stop_task.c 2018-08-21 23:22:44.643944617 -0400
- @@ -12,7 +12,8 @@
- #ifdef CONFIG_SMP
- static int
- -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
- +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
- + int sibling_count_hint)
- {
- return task_cpu(p); /* stop tasks as never migrate */
- }
- @@ -25,7 +26,7 @@
- }
- static struct task_struct *
- -pick_next_task_stop(struct rq *rq, struct task_struct *prev)
- +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
- {
- struct task_struct *stop = rq->stop;
- @@ -126,6 +127,7 @@
- #ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_stop,
- + .set_cpus_allowed = set_cpus_allowed_common,
- #endif
- .set_curr_task = set_curr_task_stop,
- diff -Nur /home/ninez/android/marlin/kernel/sched/swait.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swait.c
- --- /home/ninez/android/marlin/kernel/sched/swait.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swait.c 2018-08-13 18:40:12.199646700 -0400
- @@ -0,0 +1,134 @@
- +#include <linux/sched.h>
- +#include <linux/swait.h>
- +
- +void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
- + struct lock_class_key *key)
- +{
- + raw_spin_lock_init(&q->lock);
- + lockdep_set_class_and_name(&q->lock, key, name);
- + INIT_LIST_HEAD(&q->task_list);
- +}
- +EXPORT_SYMBOL(__init_swait_queue_head);
- +
- +/*
- + * The thing about the wake_up_state() return value; I think we can ignore it.
- + *
- + * If for some reason it would return 0, that means the previously waiting
- + * task is already running, so it will observe condition true (or has already).
- + */
- +void swake_up_locked(struct swait_queue_head *q)
- +{
- + struct swait_queue *curr;
- +
- + if (list_empty(&q->task_list))
- + return;
- +
- + curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
- + wake_up_process(curr->task);
- + list_del_init(&curr->task_list);
- +}
- +EXPORT_SYMBOL(swake_up_locked);
- +
- +void swake_up_all_locked(struct swait_queue_head *q)
- +{
- + struct swait_queue *curr;
- + int wakes = 0;
- +
- + while (!list_empty(&q->task_list)) {
- +
- + curr = list_first_entry(&q->task_list, typeof(*curr),
- + task_list);
- + wake_up_process(curr->task);
- + list_del_init(&curr->task_list);
- + wakes++;
- + }
- + //WARN_ON(wakes > 2);
- +}
- +EXPORT_SYMBOL(swake_up_all_locked);
- +
- +void swake_up(struct swait_queue_head *q)
- +{
- + unsigned long flags;
- +
- + raw_spin_lock_irqsave(&q->lock, flags);
- + swake_up_locked(q);
- + raw_spin_unlock_irqrestore(&q->lock, flags);
- +}
- +EXPORT_SYMBOL(swake_up);
- +
- +/*
- + * Does not allow usage from IRQ disabled, since we must be able to
- + * release IRQs to guarantee bounded hold time.
- + */
- +void swake_up_all(struct swait_queue_head *q)
- +{
- + struct swait_queue *curr;
- + LIST_HEAD(tmp);
- +
- + raw_spin_lock_irq(&q->lock);
- + list_splice_init(&q->task_list, &tmp);
- + while (!list_empty(&tmp)) {
- + curr = list_first_entry(&tmp, typeof(*curr), task_list);
- +
- + wake_up_state(curr->task, TASK_NORMAL);
- + list_del_init(&curr->task_list);
- +
- + if (list_empty(&tmp))
- + break;
- +
- + raw_spin_unlock_irq(&q->lock);
- + raw_spin_lock_irq(&q->lock);
- + }
- + raw_spin_unlock_irq(&q->lock);
- +}
- +EXPORT_SYMBOL(swake_up_all);
- +
- +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
- +{
- + wait->task = current;
- + if (list_empty(&wait->task_list))
- + list_add(&wait->task_list, &q->task_list);
- +}
- +
- +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
- +{
- + unsigned long flags;
- +
- + raw_spin_lock_irqsave(&q->lock, flags);
- + __prepare_to_swait(q, wait);
- + set_current_state(state);
- + raw_spin_unlock_irqrestore(&q->lock, flags);
- +}
- +EXPORT_SYMBOL(prepare_to_swait);
- +
- +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
- +{
- + if (signal_pending_state(state, current))
- + return -ERESTARTSYS;
- +
- + prepare_to_swait(q, wait, state);
- +
- + return 0;
- +}
- +EXPORT_SYMBOL(prepare_to_swait_event);
- +
- +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
- +{
- + __set_current_state(TASK_RUNNING);
- + if (!list_empty(&wait->task_list))
- + list_del_init(&wait->task_list);
- +}
- +
- +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
- +{
- + unsigned long flags;
- +
- + __set_current_state(TASK_RUNNING);
- +
- + if (!list_empty_careful(&wait->task_list)) {
- + raw_spin_lock_irqsave(&q->lock, flags);
- + list_del_init(&wait->task_list);
- + raw_spin_unlock_irqrestore(&q->lock, flags);
- + }
- +}
- +EXPORT_SYMBOL(finish_swait);
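- A minimal waiter/waker pair for the simple wait queues added above. Everything used below is defined in this file except DECLARE_SWAITQUEUE() and init_swait_queue_head(), which are assumed to come from <linux/swait.h> (swork.c later in this diff relies on the same init wrapper):
- static struct swait_queue_head example_wq;
- static bool example_condition;
-
- static void example_setup(void)
- {
-         init_swait_queue_head(&example_wq);
- }
-
- static void example_waiter(void)
- {
-         DECLARE_SWAITQUEUE(wait);
-
-         for (;;) {
-                 prepare_to_swait(&example_wq, &wait, TASK_INTERRUPTIBLE);
-                 if (READ_ONCE(example_condition))
-                         break;
-                 schedule();
-         }
-         finish_swait(&example_wq, &wait);
- }
-
- static void example_waker(void)
- {
-         WRITE_ONCE(example_condition, true);
-         swake_up(&example_wq);  /* wakes at most one waiter */
- }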
- diff -Nur /home/ninez/android/marlin/kernel/sched/swork.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swork.c
- --- /home/ninez/android/marlin/kernel/sched/swork.c 1969-12-31 19:00:00.000000000 -0500
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/swork.c 2018-08-12 21:14:08.273505429 -0400
- @@ -0,0 +1,172 @@
- +/*
- + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner [email protected]
- + *
- + * Provides a framework for enqueuing callbacks from irq context
- + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
- + */
- +
- +#include <linux/swait.h>
- +#include <linux/swork.h>
- +#include <linux/kthread.h>
- +#include <linux/slab.h>
- +#include <linux/spinlock.h>
- +
- +#define SWORK_EVENT_PENDING (1 << 0)
- +
- +static DEFINE_MUTEX(worker_mutex);
- +static struct sworker *glob_worker;
- +
- +struct sworker {
- + struct list_head events;
- + struct swait_queue_head wq;
- +
- + raw_spinlock_t lock;
- +
- + struct task_struct *task;
- + int refs;
- +};
- +
- +static bool swork_readable(struct sworker *worker)
- +{
- + bool r;
- +
- + if (kthread_should_stop())
- + return true;
- +
- + raw_spin_lock_irq(&worker->lock);
- + r = !list_empty(&worker->events);
- + raw_spin_unlock_irq(&worker->lock);
- +
- + return r;
- +}
- +
- +static int swork_kthread(void *arg)
- +{
- + struct sworker *worker = arg;
- +
- + for (;;) {
- + swait_event_interruptible(worker->wq,
- + swork_readable(worker));
- + if (kthread_should_stop())
- + break;
- +
- + raw_spin_lock_irq(&worker->lock);
- + while (!list_empty(&worker->events)) {
- + struct swork_event *sev;
- +
- + sev = list_first_entry(&worker->events,
- + struct swork_event, item);
- + list_del(&sev->item);
- + raw_spin_unlock_irq(&worker->lock);
- +
- + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
- + &sev->flags));
- + sev->func(sev);
- + raw_spin_lock_irq(&worker->lock);
- + }
- + raw_spin_unlock_irq(&worker->lock);
- + }
- + return 0;
- +}
- +
- +static struct sworker *swork_create(void)
- +{
- + struct sworker *worker;
- +
- + worker = kzalloc(sizeof(*worker), GFP_KERNEL);
- + if (!worker)
- + return ERR_PTR(-ENOMEM);
- +
- + INIT_LIST_HEAD(&worker->events);
- + raw_spin_lock_init(&worker->lock);
- + init_swait_queue_head(&worker->wq);
- +
- + worker->task = kthread_run(swork_kthread, worker, "kswork");
- + if (IS_ERR(worker->task)) {
- + kfree(worker);
- + return ERR_PTR(-ENOMEM);
- + }
- +
- + return worker;
- +}
- +
- +static void swork_destroy(struct sworker *worker)
- +{
- + kthread_stop(worker->task);
- +
- + WARN_ON(!list_empty(&worker->events));
- + kfree(worker);
- +}
- +
- +/**
- + * swork_queue - queue swork
- + *
- + * Returns %false if @sev was already on a queue, %true otherwise.
- + *
- + * The work is queued and processed on a random CPU
- + */
- +bool swork_queue(struct swork_event *sev)
- +{
- + unsigned long flags;
- +
- + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
- + return false;
- +
- + raw_spin_lock_irqsave(&glob_worker->lock, flags);
- + list_add_tail(&sev->item, &glob_worker->events);
- + raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
- +
- + swake_up(&glob_worker->wq);
- + return true;
- +}
- +EXPORT_SYMBOL_GPL(swork_queue);
- +
- +/**
- + * swork_get - get an instance of the sworker
- + *
- + * Returns a negative error code if the initialization of the worker
- + * failed, %0 otherwise.
- + *
- + */
- +int swork_get(void)
- +{
- + struct sworker *worker;
- +
- + mutex_lock(&worker_mutex);
- + if (!glob_worker) {
- + worker = swork_create();
- + if (IS_ERR(worker)) {
- + mutex_unlock(&worker_mutex);
- + return -ENOMEM;
- + }
- +
- + glob_worker = worker;
- + }
- +
- + glob_worker->refs++;
- + mutex_unlock(&worker_mutex);
- +
- + return 0;
- +}
- +EXPORT_SYMBOL_GPL(swork_get);
- +
- +/**
- + * swork_put - puts an instance of the sworker
- + *
- + * Will destroy the sworker thread. This function must not be called until all
- + * queued events have been completed.
- + */
- +void swork_put(void)
- +{
- + mutex_lock(&worker_mutex);
- +
- + glob_worker->refs--;
- + if (glob_worker->refs > 0)
- + goto out;
- +
- + swork_destroy(glob_worker);
- + glob_worker = NULL;
- +out:
- + mutex_unlock(&worker_mutex);
- +}
- +EXPORT_SYMBOL_GPL(swork_put);
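- A usage sketch for the swork framework above: a caller brackets its lifetime with swork_get()/swork_put() and queues a swork_event whose func runs in the "kswork" kthread. INIT_SWORK() is an assumption about <linux/swork.h>; only the event's func/item/flags members are visible here:
- static struct swork_event example_event;
-
- static void example_event_fn(struct swork_event *sev)
- {
-         /* runs in the kswork kthread, with interrupts enabled */
- }
-
- static int example_init(void)
- {
-         int ret = swork_get();  /* create or refcount the global worker */
-
-         if (ret)
-                 return ret;
-         INIT_SWORK(&example_event, example_event_fn);
-         return 0;
- }
-
- static void example_raise(void)
- {
-         /* safe from hard-IRQ context: swork_queue() only takes a raw spinlock */
-         swork_queue(&example_event);
- }
-
- static void example_exit(void)
- {
-         swork_put();  /* destroys the worker once the last user is gone */
- }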
- diff -Nur /home/ninez/android/marlin/kernel/sched/tune.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.c
- --- /home/ninez/android/marlin/kernel/sched/tune.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.c 2018-08-14 15:53:43.604124856 -0400
- @@ -12,13 +12,25 @@
- #include "tune.h"
- #ifdef CONFIG_CGROUP_SCHEDTUNE
- -static bool schedtune_initialized = false;
- +bool schedtune_initialized = false;
- #endif
- -unsigned int sysctl_sched_cfs_boost __read_mostly;
- +extern struct rq *lock_rq_of(struct task_struct *p, struct rq_flags *rf);
- +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
- -static struct reciprocal_value schedtune_spc_rdiv;
- -extern struct target_nrg schedtune_target_nrg;
- +int sysctl_sched_cfs_boost __read_mostly;
- +
- +/* We hold schedtune boost in effect for at least this long */
- +#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL
- +
- +extern struct reciprocal_value schedtune_spc_rdiv;
- +struct target_nrg schedtune_target_nrg;
- +
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- +static DEFINE_MUTEX(stune_boost_mutex);
- +static struct schedtune *getSchedtune(char *st_name);
- +static int dynamic_boost_write(struct schedtune *st, int boost);
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- /* Performance Boost region (B) threshold params */
- static int perf_boost_idx;
- @@ -130,6 +142,14 @@
- /* Hint to bias scheduling of tasks on that SchedTune CGroup
- * towards idle CPUs */
- int prefer_idle;
- +
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- + /*
- + * This tracks the default boost value and is used to restore
- + * the value when Dynamic SchedTune Boost is reset.
- + */
- + int boost_default;
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- };
- static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
- @@ -162,6 +182,9 @@
- .perf_boost_idx = 0,
- .perf_constrain_idx = 0,
- .prefer_idle = 0,
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- + .boost_default = 0,
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- };
- int
- @@ -206,7 +229,8 @@
- * implementation especially for the computation of the per-CPU boost
- * value
- */
- -#define BOOSTGROUPS_COUNT 5
- +
- +#define BOOSTGROUPS_COUNT 7
- /* Array of configured boostgroups */
- static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
- @@ -226,45 +250,68 @@
- /* Maximum boost value for all RUNNABLE tasks on a CPU */
- bool idle;
- int boost_max;
- + u64 boost_ts;
- struct {
- /* The boost for tasks on that boost group */
- int boost;
- /* Count of RUNNABLE tasks on that boost group */
- unsigned tasks;
- + /* Timestamp of boost activation */
- + u64 ts;
- } group[BOOSTGROUPS_COUNT];
- /* CPU's boost group locking */
- raw_spinlock_t lock;
- };
- /* Boost groups affecting each CPU in the system */
- -DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
- +static DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
- +
- +static inline bool schedtune_boost_timeout(u64 now, u64 ts)
- +{
- + return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
- +}
- +
- +static inline bool
- +schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now)
- +{
- + if (bg->group[idx].tasks)
- + return true;
- +
- + return !schedtune_boost_timeout(now, bg->group[idx].ts);
- +}
- static void
- -schedtune_cpu_update(int cpu)
- +schedtune_cpu_update(int cpu, u64 now)
- {
- struct boost_groups *bg;
- - int boost_max;
- + u64 boost_ts = now;
- + int boost_max = INT_MIN;
- int idx;
- bg = &per_cpu(cpu_boost_groups, cpu);
- - /* The root boost group is always active */
- - boost_max = bg->group[0].boost;
- - for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
- + for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) {
- /*
- * A boost group affects a CPU only if it has
- - * RUNNABLE tasks on that CPU
- + * RUNNABLE tasks on that CPU or it has hold
- + * in effect from a previous task.
- */
- - if (bg->group[idx].tasks == 0)
- + if (!schedtune_boost_group_active(idx, bg, now))
- + continue;
- +
- + /* this boost group is active */
- + if (boost_max > bg->group[idx].boost)
- continue;
- - boost_max = max(boost_max, bg->group[idx].boost);
- + boost_max = bg->group[idx].boost;
- + boost_ts = bg->group[idx].ts;
- }
- - /* Ensures boost_max is non-negative when all cgroup boost values
- - * are neagtive. Avoids under-accounting of cpu capacity which may cause
- - * task stacking and frequency spikes.*/
- - boost_max = max(boost_max, 0);
- +
- + /* If there are no active boost groups on the CPU, set no boost */
- + if (boost_max == INT_MIN)
- + boost_max = 0;
- bg->boost_max = boost_max;
- + bg->boost_ts = boost_ts;
- }
- static int
- @@ -274,6 +321,7 @@
- int cur_boost_max;
- int old_boost;
- int cpu;
- + u64 now;
- /* Update per CPU boost groups */
- for_each_possible_cpu(cpu) {
- @@ -290,16 +338,22 @@
- /* Update the boost value of this boost group */
- bg->group[idx].boost = boost;
- - /* Check if this update increase current max */
- - if (boost > cur_boost_max && bg->group[idx].tasks) {
- + now = sched_clock_cpu(cpu);
- + /*
- + * Check if this update increases the current max.
- + */
- + if (boost > cur_boost_max &&
- + schedtune_boost_group_active(idx, bg, now)) {
- bg->boost_max = boost;
- + bg->boost_ts = bg->group[idx].ts;
- +
- trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
- continue;
- }
- /* Check if this update has decreased current max */
- if (cur_boost_max == old_boost && old_boost > boost) {
- - schedtune_cpu_update(cpu);
- + schedtune_cpu_update(cpu, now);
- trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
- continue;
- }
- @@ -313,21 +367,38 @@
- #define ENQUEUE_TASK 1
- #define DEQUEUE_TASK -1
- +static inline bool
- +schedtune_update_timestamp(struct task_struct *p)
- +{
- + if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
- + return true;
- +
- + return task_has_rt_policy(p);
- +}
- +
- static inline void
- schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
- {
- struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
- int tasks = bg->group[idx].tasks + task_count;
- + u64 now;
- /* Update boosted tasks count while avoiding to make it negative */
- bg->group[idx].tasks = max(0, tasks);
- + /* Update timeout on enqueue */
- + if (task_count > 0) {
- + now = sched_clock_cpu(cpu);
- + if (schedtune_update_timestamp(p))
- + bg->group[idx].ts = now;
- +
- + /* Boost group activation or deactivation on that RQ */
- + if (bg->group[idx].tasks == 1)
- + schedtune_cpu_update(cpu, now);
- + }
- trace_sched_tune_tasks_update(p, cpu, tasks, idx,
- - bg->group[idx].boost, bg->boost_max);
- -
- - /* Boost group activation or deactivation on that RQ */
- - if (tasks == 1 || tasks == 0)
- - schedtune_cpu_update(cpu);
- + bg->group[idx].boost, bg->boost_max,
- + bg->group[idx].ts);
- }
- /*
- @@ -381,12 +452,13 @@
- {
- struct task_struct *task;
- struct boost_groups *bg;
- - unsigned long irq_flags;
- + struct rq_flags irq_flags;
- unsigned int cpu;
- struct rq *rq;
- int src_bg; /* Source boost group index */
- int dst_bg; /* Destination boost group index */
- int tasks;
- + u64 now;
- if (!unlikely(schedtune_initialized))
- return 0;
- @@ -431,18 +503,19 @@
- * current boost group.
- */
- + now = sched_clock_cpu(cpu);
- +
- /* Move task from src to dst boost group */
- tasks = bg->group[src_bg].tasks - 1;
- bg->group[src_bg].tasks = max(0, tasks);
- bg->group[dst_bg].tasks += 1;
- + bg->group[dst_bg].ts = now;
- +
- + /* update next time someone asks */
- + bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
- raw_spin_unlock(&bg->lock);
- unlock_rq_of(rq, task, &irq_flags);
- -
- - /* Update CPU boost group */
- - if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
- - schedtune_cpu_update(task_cpu(task));
- -
- }
- return 0;
- @@ -501,7 +574,7 @@
- void schedtune_exit_task(struct task_struct *tsk)
- {
- struct schedtune *st;
- - unsigned long irq_flags;
- + struct rq_flags irq_flags;
- unsigned int cpu;
- struct rq *rq;
- int idx;
- @@ -524,8 +597,15 @@
- int schedtune_cpu_boost(int cpu)
- {
- struct boost_groups *bg;
- + u64 now;
- bg = &per_cpu(cpu_boost_groups, cpu);
- + now = sched_clock_cpu(cpu);
- +
- + /* check to see if we have a hold in effect */
- + if (schedtune_boost_timeout(now, bg->boost_ts))
- + schedtune_cpu_update(cpu, now);
- +
- return bg->boost_max;
- }
- @@ -534,6 +614,9 @@
- struct schedtune *st;
- int task_boost;
- + if (!unlikely(schedtune_initialized))
- + return 0;
- +
- /* Get task boost value */
- rcu_read_lock();
- st = task_schedtune(p);
- @@ -548,6 +631,9 @@
- struct schedtune *st;
- int prefer_idle;
- + if (!unlikely(schedtune_initialized))
- + return 0;
- +
- /* Get prefer_idle value */
- rcu_read_lock();
- st = task_schedtune(p);
- @@ -606,6 +692,9 @@
- st->perf_constrain_idx = threshold_idx;
- st->boost = boost;
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- + st->boost_default = boost;
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- if (css == &root_schedtune.css) {
- sysctl_sched_cfs_boost = boost;
- perf_boost_idx = threshold_idx;
- @@ -615,11 +704,11 @@
- /* Update CPU boost */
- schedtune_boostgroup_update(st->idx, st->boost);
- - trace_sched_tune_config(st->boost,
- - threshold_gains[st->perf_boost_idx].nrg_gain,
- - threshold_gains[st->perf_boost_idx].cap_gain,
- - threshold_gains[st->perf_constrain_idx].nrg_gain,
- - threshold_gains[st->perf_constrain_idx].cap_gain);
- +// trace_sched_tune_config(st->boost,
- +// threshold_gains[st->perf_boost_idx].nrg_gain,
- +// threshold_gains[st->perf_boost_idx].cap_gain,
- +// threshold_gains[st->perf_constrain_idx].nrg_gain,
- +// threshold_gains[st->perf_constrain_idx].cap_gain);
- return 0;
- }
- @@ -652,6 +741,8 @@
- bg = &per_cpu(cpu_boost_groups, cpu);
- bg->group[st->idx].boost = 0;
- bg->group[st->idx].tasks = 0;
- + bg->group[st->idx].ts = 0;
- + raw_spin_lock_init(&bg->lock);
- }
- return 0;
- @@ -747,6 +838,114 @@
- schedtune_initialized = true;
- }
- +#ifdef CONFIG_DYNAMIC_STUNE_BOOST
- +static struct schedtune *getSchedtune(char *st_name)
- +{
- + int idx;
- +
- + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
- + char name_buf[NAME_MAX + 1];
- + struct schedtune *st = allocated_group[idx];
- +
- + if (!st) {
- + pr_warn("SCHEDTUNE: Could not find %s\n", st_name);
- + break;
- + }
- +
- + cgroup_name(st->css.cgroup, name_buf, sizeof(name_buf));
- + if (strncmp(name_buf, st_name, strlen(st_name)) == 0)
- + return st;
- + }
- +
- + return NULL;
- +}
- +
- +static int dynamic_boost_write(struct schedtune *st, int boost)
- +{
- + int ret;
- + /* Backup boost_default */
- + int boost_default_backup = st->boost_default;
- +
- + ret = boost_write(&st->css, NULL, boost);
- +
- + /* Restore boost_default */
- + st->boost_default = boost_default_backup;
- +
- + return ret;
- +}
- +
- +int do_stune_boost(char *st_name, int boost)
- +{
- + int ret = 0;
- + struct schedtune *st = getSchedtune(st_name);
- +
- + if (!st)
- + return -EINVAL;
- +
- + mutex_lock(&stune_boost_mutex);
- +
- + /* Boost if new value is greater than current */
- + if (boost > st->boost)
- + ret = dynamic_boost_write(st, boost);
- +
- + mutex_unlock(&stune_boost_mutex);
- +
- + return ret;
- +}
- +
- +int do_stune_unboost(char *st_name, int boost)
- +{
- + int ret = 0;
- + struct schedtune *st = getSchedtune(st_name);
- +
- + if (!st)
- + return -EINVAL;
- +
- + mutex_lock(&stune_boost_mutex);
- +
- + /* Unboost if new value is less than current */
- + if (boost < st->boost)
- + ret = dynamic_boost_write(st, boost);
- +
- + mutex_unlock(&stune_boost_mutex);
- +
- + return ret;
- +}
- +
- +int set_stune_boost(char *st_name, int boost)
- +{
- + int ret = 0;
- + struct schedtune *st = getSchedtune(st_name);
- +
- + if (!st)
- + return -EINVAL;
- +
- + mutex_lock(&stune_boost_mutex);
- +
- + /* Set boost regardless of whether the new value exceeds the current one */
- + ret = dynamic_boost_write(st, boost);
- +
- + mutex_unlock(&stune_boost_mutex);
- +
- + return ret;
- +}
- +
- +int reset_stune_boost(char *st_name)
- +{
- + int ret = 0;
- + struct schedtune *st = getSchedtune(st_name);
- +
- + if (!st)
- + return -EINVAL;
- +
- + mutex_lock(&stune_boost_mutex);
- + ret = dynamic_boost_write(st, st->boost_default);
- + mutex_unlock(&stune_boost_mutex);
- +
- + return ret;
- +}
- +#endif /* CONFIG_DYNAMIC_STUNE_BOOST */
- +
- #else /* CONFIG_CGROUP_SCHEDTUNE */
- int
- @@ -894,79 +1093,6 @@
- }
- }
- -static long
- -schedtune_margin(unsigned long signal, long boost)
- -{
- - long long margin = 0;
- -
- - /*
- - * Signal proportional compensation (SPC)
- - *
- - * The Boost (B) value is used to compute a Margin (M) which is
- - * proportional to the complement of the original Signal (S):
- - * M = B * (SCHED_CAPACITY_SCALE - S)
- - * The obtained M could be used by the caller to "boost" S.
- - */
- - if (boost >= 0) {
- - margin = SCHED_CAPACITY_SCALE - signal;
- - margin *= boost;
- - } else
- - margin = -signal * boost;
- -
- - margin = reciprocal_divide(margin, schedtune_spc_rdiv);
- -
- - if (boost < 0)
- - margin *= -1;
- - return margin;
- -}
- -
- -static inline int
- -schedtune_cpu_margin(unsigned long util, int cpu)
- -{
- - int boost = schedtune_cpu_boost(cpu);
- -
- - if (boost == 0)
- - return 0;
- -
- - return schedtune_margin(util, boost);
- -}
- -
- -static inline long
- -schedtune_task_margin(struct task_struct *task)
- -{
- - int boost = schedtune_task_boost(task);
- - unsigned long util;
- - long margin;
- -
- - if (boost == 0)
- - return 0;
- -
- - util = task_util(task, UTIL_AVG);
- - margin = schedtune_margin(util, boost);
- -
- - return margin;
- -}
- -
- -unsigned long boosted_cpu_util(int cpu)
- -{
- - unsigned long util = cpu_util(cpu, UTIL_EST);
- - long margin = schedtune_cpu_margin(util, cpu);
- -
- - trace_sched_boost_cpu(cpu, util, margin);
- -
- - return util + margin;
- -}
- -
- -unsigned long boosted_task_util(struct task_struct *task)
- -{
- - unsigned long util = task_util(task, UTIL_EST);
- - long margin = schedtune_task_margin(task);
- -
- - trace_sched_boost_task(task, util, margin);
- -
- - return util + margin;
- -}
- -
- /*
- * Initialize the constants required to compute normalized energy.
- * The values of these constants depends on the EM data for the specific
- @@ -1033,3 +1159,4 @@
- }
- postcore_initcall(schedtune_init);
- +
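- The CONFIG_DYNAMIC_STUNE_BOOST block above exports do_stune_boost(), do_stune_unboost(), set_stune_boost() and reset_stune_boost(), keyed by cgroup name. A sketch of a hypothetical in-kernel consumer; the trigger and the boost value are illustrative only:
- static void example_on_interaction_start(void)
- {
-         /* temporarily raise the "top-app" group's boost if 15 exceeds it */
-         if (do_stune_boost("top-app", 15))
-                 pr_debug("dynamic stune boost request failed\n");
- }
-
- static void example_on_interaction_end(void)
- {
-         /* return to the value written via the cgroup's schedtune.boost */
-         reset_stune_boost("top-app");
- }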
- diff -Nur /home/ninez/android/marlin/kernel/sched/tune.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.h
- --- /home/ninez/android/marlin/kernel/sched/tune.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/tune.h 2018-08-23 19:57:44.817608733 -0400
- @@ -24,6 +24,9 @@
- void schedtune_enqueue_task(struct task_struct *p, int cpu);
- void schedtune_dequeue_task(struct task_struct *p, int cpu);
- +int schedtune_accept_deltas(int nrg_delta, int cap_delta,
- + struct task_struct *task);
- +
- #else /* CONFIG_CGROUP_SCHEDTUNE */
- #define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
- @@ -39,13 +42,6 @@
- int schedtune_accept_deltas(int nrg_delta, int cap_delta,
- struct task_struct *task);
- -#ifdef CONFIG_SMP
- -unsigned long boosted_cpu_util(int cpu);
- -#else
- -#define boosted_cpu_util(cpu) cpu_util(cpu, UTIL_EST);
- -#endif
- -unsigned long boosted_task_util(struct task_struct *task);
- -
- #else /* CONFIG_SCHED_TUNE */
- #define schedtune_cpu_boost(cpu) 0
- @@ -58,7 +54,4 @@
- #define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
- -#define boosted_cpu_util(cpu) cpu_util(cpu, UTIL_EST);
- -#define boosted_task_util(cpu) task_util(cpu, UTIL_EST);
- -
- #endif /* CONFIG_SCHED_TUNE */
- diff -Nur /home/ninez/android/marlin/kernel/sched/wait.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/wait.c
- --- /home/ninez/android/marlin/kernel/sched/wait.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/wait.c 2018-08-11 23:57:17.131940887 -0400
- @@ -9,6 +9,7 @@
- #include <linux/mm.h>
- #include <linux/wait.h>
- #include <linux/hash.h>
- +#include <linux/kthread.h>
- void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
- {
- @@ -297,6 +298,10 @@
- }
- EXPORT_SYMBOL(autoremove_wake_function);
- +static inline bool is_kthread_should_stop(void)
- +{
- + return (current->flags & PF_KTHREAD) && kthread_should_stop();
- +}
- /*
- * DEFINE_WAIT_FUNC(wait, woken_wake_func);
- @@ -326,7 +331,7 @@
- * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
- * also observe all state before the wakeup.
- */
- - if (!(wait->flags & WQ_FLAG_WOKEN))
- + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
- timeout = schedule_timeout(timeout);
- __set_current_state(TASK_RUNNING);
- @@ -336,7 +341,7 @@
- * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
- * an event.
- */
- - set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
- + smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
- return timeout;
- }
- @@ -349,7 +354,7 @@
- * doesn't imply write barrier and the users expects write
- * barrier semantics on wakeup functions. The following
- * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
- - * and is paired with set_mb() in wait_woken().
- + * and is paired with smp_store_mb() in wait_woken().
- */
- smp_wmb(); /* C */
- wait->flags |= WQ_FLAG_WOKEN;
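- The is_kthread_should_stop() check added above keeps a kthread from sleeping in wait_woken() once kthread_should_stop() is pending. A sketch of the usual wait_woken() consumer loop this protects; the work helpers and wait queue are hypothetical, while DEFINE_WAIT_FUNC(), woken_wake_function() and wait_woken() are the existing API patched here:
- static DECLARE_WAIT_QUEUE_HEAD(example_waitq);
-
- static bool example_have_work(void);
- static void example_do_work(void);
-
- static int example_kthread_fn(void *unused)
- {
-         DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-         add_wait_queue(&example_waitq, &wait);
-         while (!kthread_should_stop()) {
-                 if (example_have_work())
-                         example_do_work();
-                 else
-                         wait_woken(&wait, TASK_INTERRUPTIBLE,
-                                    MAX_SCHEDULE_TIMEOUT);
-         }
-         remove_wait_queue(&example_waitq, &wait);
-         return 0;
- }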
- diff -Nur /home/ninez/android/marlin/kernel/sched/walt.c /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.c
- --- /home/ninez/android/marlin/kernel/sched/walt.c 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.c 2018-08-11 23:57:17.131940887 -0400
- @@ -20,7 +20,6 @@
- */
- #include <linux/syscore_ops.h>
- -#include <linux/cpufreq.h>
- #include <trace/events/sched.h>
- #include "sched.h"
- #include "walt.h"
- @@ -42,57 +41,49 @@
- unsigned int sysctl_sched_walt_init_task_load_pct = 15;
- -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
- -unsigned int __read_mostly walt_disabled = 0;
- -
- -static unsigned int max_possible_efficiency = 1024;
- -static unsigned int min_possible_efficiency = 1024;
- +/* true -> use PELT based load stats, false -> use window-based load stats */
- +bool __read_mostly walt_disabled = false;
- /*
- - * Maximum possible frequency across all cpus. Task demand and cpu
- - * capacity (cpu_power) metrics are scaled in reference to it.
- + * Window size (in ns). Adjust for the tick size so that the window
- + * rollover occurs just before the tick boundary.
- */
- -static unsigned int max_possible_freq = 1;
- -
- -/*
- - * Minimum possible max_freq across all cpus. This will be same as
- - * max_possible_freq on homogeneous systems and could be different from
- - * max_possible_freq on heterogenous systems. min_max_freq is used to derive
- - * capacity (cpu_power) of cpus.
- - */
- -static unsigned int min_max_freq = 1;
- -
- -static unsigned int max_capacity = 1024;
- -static unsigned int min_capacity = 1024;
- -static unsigned int max_load_scale_factor = 1024;
- -static unsigned int max_possible_capacity = 1024;
- -
- -/* Mask of all CPUs that have max_possible_capacity */
- -static cpumask_t mpc_mask = CPU_MASK_ALL;
- -
- -/* Window size (in ns) */
- -__read_mostly unsigned int walt_ravg_window = 20000000;
- -
- -/* Min window size (in ns) = 10ms */
- -#define MIN_SCHED_RAVG_WINDOW 10000000
- -
- -/* Max window size (in ns) = 1s */
- -#define MAX_SCHED_RAVG_WINDOW 1000000000
- +__read_mostly unsigned int walt_ravg_window =
- + (20000000 / TICK_NSEC) * TICK_NSEC;
- +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
- +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
- static unsigned int sync_cpu;
- static ktime_t ktime_last;
- -static bool walt_ktime_suspended;
- +static __read_mostly bool walt_ktime_suspended;
- static unsigned int task_load(struct task_struct *p)
- {
- return p->ravg.demand;
- }
- +static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
- +{
- + rq->cum_window_demand += delta;
- + if (unlikely((s64)rq->cum_window_demand < 0))
- + rq->cum_window_demand = 0;
- +}
- +
- void
- walt_inc_cumulative_runnable_avg(struct rq *rq,
- struct task_struct *p)
- {
- rq->cumulative_runnable_avg += p->ravg.demand;
- +
- + /*
- + * Add a task's contribution to the cumulative window demand when
- + *
- + * (1) task is enqueued with on_rq = 1, i.e. migration,
- + * prio/cgroup/class change.
- + * (2) task is waking for the first time in this window.
- + */
- + if (p->on_rq || (p->last_sleep_ts < rq->window_start))
- + fixup_cum_window_demand(rq, p->ravg.demand);
- }
- void
- @@ -101,16 +92,28 @@
- {
- rq->cumulative_runnable_avg -= p->ravg.demand;
- BUG_ON((s64)rq->cumulative_runnable_avg < 0);
- +
- + /*
- + * on_rq will be 1 for sleeping tasks. So check if the task
- + * is migrating or dequeuing in RUNNING state to change the
- + * prio/cgroup/class.
- + */
- + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
- + fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
- }
- static void
- fixup_cumulative_runnable_avg(struct rq *rq,
- - struct task_struct *p, s64 task_load_delta)
- + struct task_struct *p, u64 new_task_load)
- {
- + s64 task_load_delta = (s64)new_task_load - task_load(p);
- +
- rq->cumulative_runnable_avg += task_load_delta;
- if ((s64)rq->cumulative_runnable_avg < 0)
- panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
- task_load_delta, task_load(p));
- +
- + fixup_cum_window_demand(rq, task_load_delta);
- }
- u64 walt_ktime_clock(void)
- @@ -169,16 +172,33 @@
- static int __init set_walt_ravg_window(char *str)
- {
- + unsigned int adj_window;
- + bool no_walt = walt_disabled;
- +
- get_option(&str, &walt_ravg_window);
- - walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
- - walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
- + /* Adjust for CONFIG_HZ */
- + adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
- +
- + /* Warn if we're a bit too far away from the expected window size */
- + WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
- + "tick-adjusted window size %u, original was %u\n", adj_window,
- + walt_ravg_window);
- +
- + walt_ravg_window = adj_window;
- +
- + walt_disabled = walt_disabled ||
- + (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
- + walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
- +
- + WARN(!no_walt && walt_disabled,
- + "invalid window size, disabling WALT\n");
- +
- return 0;
- }
- early_param("walt_ravg_window", set_walt_ravg_window);
- -extern u64 arch_counter_get_cntpct(void);
- static void
- update_window_start(struct rq *rq, u64 wallclock)
- {
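- A worked example of the tick rounding above, with illustrative HZ values (TICK_NSEC is roughly NSEC_PER_SEC / HZ): at HZ=300 the 20 ms default rounds down to 6 ticks, 19,999,998 ns, while at HZ=100 it stays exactly 20,000,000 ns:
- /* Worked example (userspace arithmetic only). */
- #include <stdio.h>
- #include <stdint.h>
-
- int main(void)
- {
-         uint64_t requested = 20000000;          /* 20 ms, in ns */
-         uint64_t tick_hz300 = 3333333;          /* ~TICK_NSEC at HZ=300 */
-         uint64_t tick_hz100 = 10000000;         /* TICK_NSEC at HZ=100 */
-
-         /* 6 ticks = 19999998 ns, just under the requested window */
-         printf("HZ=300: %llu ns\n",
-                (unsigned long long)(requested / tick_hz300 * tick_hz300));
-         /* 2 ticks = 20000000 ns exactly */
-         printf("HZ=100: %llu ns\n",
-                (unsigned long long)(requested / tick_hz100 * tick_hz100));
-         return 0;
- }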
- @@ -188,10 +208,8 @@
- delta = wallclock - rq->window_start;
- /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
- if (delta < 0) {
- - if (arch_counter_get_cntpct() == 0)
- - delta = 0;
- - else
- - BUG_ON(1);
- + delta = 0;
- + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
- }
- if (delta < walt_ravg_window)
- @@ -199,26 +217,20 @@
- nr_windows = div64_u64(delta, walt_ravg_window);
- rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
- +
- + rq->cum_window_demand = rq->cumulative_runnable_avg;
- }
- +/*
- + * Translate absolute delta time accounted on a CPU
- + * to a scale where 1024 is the capacity of the most
- + * capable CPU running at FMAX
- + */
- static u64 scale_exec_time(u64 delta, struct rq *rq)
- {
- - unsigned int cur_freq = rq->cur_freq;
- - int sf;
- -
- - if (unlikely(cur_freq > max_possible_freq))
- - cur_freq = rq->max_possible_freq;
- + unsigned long capcurr = capacity_curr_of(cpu_of(rq));
- - /* round up div64 */
- - delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
- - max_possible_freq);
- -
- - sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
- -
- - delta *= sf;
- - delta >>= 10;
- -
- - return delta;
- + return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
- }
- static int cpu_is_waiting_on_io(struct rq *rq)
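
scale_exec_time() now normalizes a raw runtime delta by the CPU's current capacity from capacity_curr_of(), where 1024 represents the most capable CPU at Fmax. A worked example of the multiply-and-shift, using a hypothetical current capacity of 512:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10      /* capacity scale: 1024 == biggest CPU at Fmax */

/* Same arithmetic as the new scale_exec_time(); capcurr is assumed here. */
static uint64_t scale_exec_time(uint64_t delta_ns, unsigned long capcurr)
{
        return (delta_ns * capcurr) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
        /* 2 ms of wall time on a CPU currently worth half of max capacity... */
        uint64_t scaled = scale_exec_time(2000000ULL, 512);

        /* ...counts as 1 ms of "full speed" execution towards the window. */
        printf("scaled delta = %llu ns\n", (unsigned long long)scaled);
        return 0;
}
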
- @@ -595,10 +607,20 @@
- * A throttled deadline sched class task gets dequeued without
- * changing p->on_rq. Since the dequeue decrements hmp stats
- * avoid decrementing it here again.
- + *
- + * When window is rolled over, the cumulative window demand
- + * is reset to the cumulative runnable average (contribution from
- + * the tasks on the runqueue). If the current task is dequeued
- + * already, its demand is not included in the cumulative runnable
- + * average. So add the task demand separately to cumulative window
- + * demand.
- */
- - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
- - !p->dl.dl_throttled))
- - fixup_cumulative_runnable_avg(rq, p, demand);
- + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
- + if (task_on_rq_queued(p))
- + fixup_cumulative_runnable_avg(rq, p, demand);
- + else if (rq->curr == p)
- + fixup_cum_window_demand(rq, demand);
- + }
- p->ravg.demand = demand;
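
The comment in the hunk above explains why a running-but-dequeued task must add its demand back after a window rollover: the rollover reseeds the window demand from the queued tasks only. A toy model of that bookkeeping, with made-up demand numbers:

#include <stdint.h>
#include <stdio.h>

struct toy_rq {
        int64_t cumulative_runnable_avg;        /* queued tasks only */
        int64_t cum_window_demand;
};

int main(void)
{
        struct toy_rq rq = { .cumulative_runnable_avg = 300 };
        int64_t curr_demand = 120;      /* rq->curr, already dequeued */

        /* window rollover: reseed from the queued tasks' demand... */
        rq.cum_window_demand = rq.cumulative_runnable_avg;

        /* ...so the dequeued-but-running task is accounted separately. */
        rq.cum_window_demand += curr_demand;

        printf("cum_window_demand=%lld\n", (long long)rq.cum_window_demand);
        return 0;
}
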
- @@ -741,33 +763,6 @@
- p->ravg.mark_start = wallclock;
- }
- -unsigned long __weak arch_get_cpu_efficiency(int cpu)
- -{
- - return SCHED_LOAD_SCALE;
- -}
- -
- -void walt_init_cpu_efficiency(void)
- -{
- - int i, efficiency;
- - unsigned int max = 0, min = UINT_MAX;
- -
- - for_each_possible_cpu(i) {
- - efficiency = arch_get_cpu_efficiency(i);
- - cpu_rq(i)->efficiency = efficiency;
- -
- - if (efficiency > max)
- - max = efficiency;
- - if (efficiency < min)
- - min = efficiency;
- - }
- -
- - if (max)
- - max_possible_efficiency = max;
- -
- - if (min)
- - min_possible_efficiency = min;
- -}
- -
- static void reset_task_stats(struct task_struct *p)
- {
- u32 sum = 0;
- @@ -799,11 +794,11 @@
- int cpu = cpu_of(rq);
- struct rq *sync_rq = cpu_rq(sync_cpu);
- - if (rq->window_start)
- + if (likely(rq->window_start))
- return;
- if (cpu == sync_cpu) {
- - rq->window_start = walt_ktime_clock();
- + rq->window_start = 1;
- } else {
- raw_spin_unlock(&rq->lock);
- double_rq_lock(rq, sync_rq);
- @@ -846,6 +841,17 @@
- walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
- + /*
- + * When a task is migrating during the wakeup, adjust
- + * the task's contribution towards cumulative window
- + * demand.
- + */
- + if (p->state == TASK_WAKING &&
- + p->last_sleep_ts >= src_rq->window_start) {
- + fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
- + fixup_cum_window_demand(dest_rq, p->ravg.demand);
- + }
- +
- if (p->ravg.curr_window) {
- src_rq->curr_runnable_sum -= p->ravg.curr_window;
- dest_rq->curr_runnable_sum += p->ravg.curr_window;
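
The wakeup-migration hunk moves a waking task's demand from the source runqueue's window demand to the destination's, but only if the task last slept inside the source's current window. A hedged sketch of that transfer; the structures and helper name are stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_rq {
        int64_t cum_window_demand;
        uint64_t window_start;
};

/* Transfer demand only when the task slept within src's current window. */
static void migrate_window_demand(struct toy_rq *src, struct toy_rq *dst,
                                  uint64_t last_sleep_ts, int64_t demand)
{
        bool slept_this_window = last_sleep_ts >= src->window_start;

        if (!slept_this_window)
                return;

        src->cum_window_demand -= demand;
        if (src->cum_window_demand < 0)
                src->cum_window_demand = 0;
        dst->cum_window_demand += demand;
}

int main(void)
{
        struct toy_rq src = { .cum_window_demand = 500, .window_start = 1000 };
        struct toy_rq dst = { .cum_window_demand = 200, .window_start = 1000 };

        migrate_window_demand(&src, &dst, 1500, 120);   /* slept at t=1500 */
        printf("src=%lld dst=%lld\n",
               (long long)src.cum_window_demand,
               (long long)dst.cum_window_demand);       /* 380 / 320 */
        return 0;
}
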
- @@ -872,283 +878,6 @@
- double_rq_unlock(src_rq, dest_rq);
- }
- -/* Keep track of max/min capacity possible across CPUs "currently" */
- -static void __update_min_max_capacity(void)
- -{
- - int i;
- - int max = 0, min = INT_MAX;
- -
- - for_each_online_cpu(i) {
- - if (cpu_rq(i)->capacity > max)
- - max = cpu_rq(i)->capacity;
- - if (cpu_rq(i)->capacity < min)
- - min = cpu_rq(i)->capacity;
- - }
- -
- - max_capacity = max;
- - min_capacity = min;
- -}
- -
- -static void update_min_max_capacity(void)
- -{
- - unsigned long flags;
- - int i;
- -
- - local_irq_save(flags);
- - for_each_possible_cpu(i)
- - raw_spin_lock(&cpu_rq(i)->lock);
- -
- - __update_min_max_capacity();
- -
- - for_each_possible_cpu(i)
- - raw_spin_unlock(&cpu_rq(i)->lock);
- - local_irq_restore(flags);
- -}
- -
- -/*
- - * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
- - * least efficient cpu gets capacity of 1024
- - */
- -static unsigned long capacity_scale_cpu_efficiency(int cpu)
- -{
- - return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
- -}
- -
- -/*
- - * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
- - * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
- - */
- -static unsigned long capacity_scale_cpu_freq(int cpu)
- -{
- - return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
- -}
- -
- -/*
- - * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
- - * that "most" efficient cpu gets a load_scale_factor of 1
- - */
- -static unsigned long load_scale_cpu_efficiency(int cpu)
- -{
- - return DIV_ROUND_UP(1024 * max_possible_efficiency,
- - cpu_rq(cpu)->efficiency);
- -}
- -
- -/*
- - * Return load_scale_factor of a cpu in reference to cpu with best max_freq
- - * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
- - * of 1.
- - */
- -static unsigned long load_scale_cpu_freq(int cpu)
- -{
- - return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
- -}
- -
- -static int compute_capacity(int cpu)
- -{
- - int capacity = 1024;
- -
- - capacity *= capacity_scale_cpu_efficiency(cpu);
- - capacity >>= 10;
- -
- - capacity *= capacity_scale_cpu_freq(cpu);
- - capacity >>= 10;
- -
- - return capacity;
- -}
- -
- -static int compute_load_scale_factor(int cpu)
- -{
- - int load_scale = 1024;
- -
- - /*
- - * load_scale_factor accounts for the fact that task load
- - * is in reference to "best" performing cpu. Task's load will need to be
- - * scaled (up) by a factor to determine suitability to be placed on a
- - * (little) cpu.
- - */
- - load_scale *= load_scale_cpu_efficiency(cpu);
- - load_scale >>= 10;
- -
- - load_scale *= load_scale_cpu_freq(cpu);
- - load_scale >>= 10;
- -
- - return load_scale;
- -}
- -
- -static int cpufreq_notifier_policy(struct notifier_block *nb,
- - unsigned long val, void *data)
- -{
- - struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
- - int i, update_max = 0;
- - u64 highest_mpc = 0, highest_mplsf = 0;
- - const struct cpumask *cpus = policy->related_cpus;
- - unsigned int orig_min_max_freq = min_max_freq;
- - unsigned int orig_max_possible_freq = max_possible_freq;
- - /* Initialized to policy->max in case policy->related_cpus is empty! */
- - unsigned int orig_max_freq = policy->max;
- -
- - if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
- - val != CPUFREQ_CREATE_POLICY)
- - return 0;
- -
- - if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
- - update_min_max_capacity();
- - return 0;
- - }
- -
- - for_each_cpu(i, policy->related_cpus) {
- - cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
- - policy->related_cpus);
- - orig_max_freq = cpu_rq(i)->max_freq;
- - cpu_rq(i)->min_freq = policy->min;
- - cpu_rq(i)->max_freq = policy->max;
- - cpu_rq(i)->cur_freq = policy->cur;
- - cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
- - }
- -
- - max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
- - if (min_max_freq == 1)
- - min_max_freq = UINT_MAX;
- - min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
- - BUG_ON(!min_max_freq);
- - BUG_ON(!policy->max);
- -
- - /* Changes to policy other than max_freq don't require any updates */
- - if (orig_max_freq == policy->max)
- - return 0;
- -
- - /*
- - * A changed min_max_freq or max_possible_freq (possible during bootup)
- - * needs to trigger re-computation of load_scale_factor and capacity for
- - * all possible cpus (even those offline). It also needs to trigger
- - * re-computation of nr_big_task count on all online cpus.
- - *
- - * A changed rq->max_freq otoh needs to trigger re-computation of
- - * load_scale_factor and capacity for just the cluster of cpus involved.
- - * Since small task definition depends on max_load_scale_factor, a
- - * changed load_scale_factor of one cluster could influence
- - * classification of tasks in another cluster. Hence a changed
- - * rq->max_freq will need to trigger re-computation of nr_big_task
- - * count on all online cpus.
- - *
- - * While it should be sufficient for nr_big_tasks to be
- - * re-computed for only online cpus, we have inadequate context
- - * information here (in policy notifier) with regard to hotplug-safety
- - * context in which notification is issued. As a result, we can't use
- - * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
- - * fixed up to issue notification always in hotplug-safe context,
- - * re-compute nr_big_task for all possible cpus.
- - */
- -
- - if (orig_min_max_freq != min_max_freq ||
- - orig_max_possible_freq != max_possible_freq) {
- - cpus = cpu_possible_mask;
- - update_max = 1;
- - }
- -
- - /*
- - * Changed load_scale_factor can trigger reclassification of tasks as
- - * big or small. Make this change "atomic" so that tasks are accounted
- - * properly due to changed load_scale_factor
- - */
- - for_each_cpu(i, cpus) {
- - struct rq *rq = cpu_rq(i);
- -
- - rq->capacity = compute_capacity(i);
- - rq->load_scale_factor = compute_load_scale_factor(i);
- -
- - if (update_max) {
- - u64 mpc, mplsf;
- -
- - mpc = div_u64(((u64) rq->capacity) *
- - rq->max_possible_freq, rq->max_freq);
- - rq->max_possible_capacity = (int) mpc;
- -
- - mplsf = div_u64(((u64) rq->load_scale_factor) *
- - rq->max_possible_freq, rq->max_freq);
- -
- - if (mpc > highest_mpc) {
- - highest_mpc = mpc;
- - cpumask_clear(&mpc_mask);
- - cpumask_set_cpu(i, &mpc_mask);
- - } else if (mpc == highest_mpc) {
- - cpumask_set_cpu(i, &mpc_mask);
- - }
- -
- - if (mplsf > highest_mplsf)
- - highest_mplsf = mplsf;
- - }
- - }
- -
- - if (update_max) {
- - max_possible_capacity = highest_mpc;
- - max_load_scale_factor = highest_mplsf;
- - }
- -
- - __update_min_max_capacity();
- -
- - return 0;
- -}
- -
- -static int cpufreq_notifier_trans(struct notifier_block *nb,
- - unsigned long val, void *data)
- -{
- - struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
- - unsigned int cpu = freq->cpu, new_freq = freq->new;
- - unsigned long flags;
- - int i;
- -
- - if (val != CPUFREQ_POSTCHANGE)
- - return 0;
- -
- - BUG_ON(!new_freq);
- -
- - if (cpu_rq(cpu)->cur_freq == new_freq)
- - return 0;
- -
- - for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
- - struct rq *rq = cpu_rq(i);
- -
- - raw_spin_lock_irqsave(&rq->lock, flags);
- - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- - walt_ktime_clock(), 0);
- - rq->cur_freq = new_freq;
- - raw_spin_unlock_irqrestore(&rq->lock, flags);
- - }
- -
- - return 0;
- -}
- -
- -static struct notifier_block notifier_policy_block = {
- - .notifier_call = cpufreq_notifier_policy
- -};
- -
- -static struct notifier_block notifier_trans_block = {
- - .notifier_call = cpufreq_notifier_trans
- -};
- -
- -static int register_sched_callback(void)
- -{
- - int ret;
- -
- - ret = cpufreq_register_notifier(&notifier_policy_block,
- - CPUFREQ_POLICY_NOTIFIER);
- -
- - if (!ret)
- - ret = cpufreq_register_notifier(&notifier_trans_block,
- - CPUFREQ_TRANSITION_NOTIFIER);
- -
- - return 0;
- -}
- -
- -/*
- - * cpufreq callbacks can be registered at core_initcall or later time.
- - * Any registration done prior to that is "forgotten" by cpufreq. See
- - * initialization of variable init_cpufreq_transition_notifier_list_called
- - * for further information.
- - */
- -core_initcall(register_sched_callback);
- -
- void walt_init_new_task_load(struct task_struct *p)
- {
- int i;
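
For reference, the large block removed above computed per-CPU capacity by composing two 1024-based fixed-point ratios (efficiency relative to the least efficient CPU, and max frequency relative to the lowest max frequency). A sketch of that retired composition with made-up scale values; the new code relies on capacity_curr_of() instead:

#include <stdio.h>

/*
 * The removed compute_capacity() composed:
 *   capacity ~ 1024 * (eff / min_eff) * (max_freq / min_max_freq)
 * with each ratio expressed on a 1024 scale and folded in via >> 10.
 */
static unsigned long compose(unsigned long a_scale, unsigned long b_scale)
{
        unsigned long capacity = 1024;

        capacity = (capacity * a_scale) >> 10;
        capacity = (capacity * b_scale) >> 10;
        return capacity;
}

int main(void)
{
        /* example big CPU: 1.5x the little CPU's efficiency, 2x its max freq */
        unsigned long eff_scale = (1024 * 3) / 2;       /* 1536 */
        unsigned long freq_scale = 1024 * 2;            /* 2048 */

        printf("legacy capacity = %lu\n", compose(eff_scale, freq_scale));
        return 0;
}
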
- diff -Nur /home/ninez/android/marlin/kernel/sched/walt.h /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.h
- --- /home/ninez/android/marlin/kernel/sched/walt.h 2018-08-10 01:54:08.566728454 -0400
- +++ /home/ninez/android/Marlin_exns-eas_81_369/kernel/sched/walt.h 2018-08-11 23:57:17.131940887 -0400
- @@ -55,8 +55,10 @@
- static inline void walt_init_cpu_efficiency(void) { }
- static inline u64 walt_ktime_clock(void) { return 0; }
- +#define walt_cpu_high_irqload(cpu) false
- +
- #endif /* CONFIG_SCHED_WALT */
- -extern unsigned int walt_disabled;
- +extern bool walt_disabled;
- #endif