Advertisement
Guest User

Untitled

a guest
Feb 22nd, 2019
5,025
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 262.69 KB | None | 0 0
  1. diff -ruNb a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
  2. --- a/arch/powerpc/platforms/cell/spufs/sched.c 2012-10-12 21:48:25.000000000 +0100
  3. +++ b/arch/powerpc/platforms/cell/spufs/sched.c 2012-10-21 16:28:24.284671447 +0100
  4. @@ -63,11 +63,6 @@
  5.  static struct timer_list spuloadavg_timer;
  6.  
  7.  /*
  8. - * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
  9. - */
  10. -#define NORMAL_PRIO        120
  11. -
  12. -/*
  13.   * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  14.   * tick for every 10 CPU scheduler ticks.
  15.   */
  16. diff -ruNb a/arch/x86/Kconfig b/arch/x86/Kconfig
  17. --- a/arch/x86/Kconfig  2012-10-12 21:48:25.000000000 +0100
  18. +++ b/arch/x86/Kconfig  2012-10-21 16:28:24.316665306 +0100
  19. @@ -795,15 +795,7 @@
  20.       increased overhead in some places. If unsure say N here.
  21.  
  22.  config IRQ_TIME_ACCOUNTING
  23. -   bool "Fine granularity task level IRQ time accounting"
  24. -   default n
  25. -   ---help---
  26. -     Select this option to enable fine granularity task irq time
  27. -     accounting. This is done by reading a timestamp on each
  28. -     transitions between softirq and hardirq state, so there can be a
  29. -     small performance impact.
  30. -
  31. -     If in doubt, say N here.
  32. +   def_bool y
  33.  
  34.  source "kernel/Kconfig.preempt"
  35.  
  36. @@ -1101,7 +1093,7 @@
  37.  
  38.  choice
  39.     depends on EXPERIMENTAL
  40. -   prompt "Memory split" if EXPERT
  41. +   prompt "Memory split"
  42.     default VMSPLIT_3G
  43.     depends on X86_32
  44.     ---help---
  45. @@ -1121,17 +1113,17 @@
  46.       option alone!
  47.  
  48.     config VMSPLIT_3G
  49. -       bool "3G/1G user/kernel split"
  50. +       bool "Default 896MB lowmem (3G/1G user/kernel split)"
  51.     config VMSPLIT_3G_OPT
  52.         depends on !X86_PAE
  53. -       bool "3G/1G user/kernel split (for full 1G low memory)"
  54. +       bool "1GB lowmem (3G/1G user/kernel split)"
  55.     config VMSPLIT_2G
  56. -       bool "2G/2G user/kernel split"
  57. +       bool "2GB lowmem (2G/2G user/kernel split)"
  58.     config VMSPLIT_2G_OPT
  59.         depends on !X86_PAE
  60. -       bool "2G/2G user/kernel split (for full 2G low memory)"
  61. +       bool "2GB lowmem (2G/2G user/kernel split)"
  62.     config VMSPLIT_1G
  63. -       bool "1G/3G user/kernel split"
  64. +       bool "3GB lowmem (1G/3G user/kernel split)"
  65.  endchoice
  66.  
  67.  config PAGE_OFFSET
  68. diff -ruNb a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
  69. --- a/arch/x86/kernel/cpu/proc.c    2012-10-12 21:48:25.000000000 +0100
  70. +++ b/arch/x86/kernel/cpu/proc.c    2012-10-21 16:28:24.321664346 +0100
  71. @@ -109,7 +109,7 @@
  72.  
  73.     seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
  74.            c->loops_per_jiffy/(500000/HZ),
  75. -          (c->loops_per_jiffy/(5000/HZ)) % 100);
  76. +          (c->loops_per_jiffy * 10 /(50000/HZ)) % 100);
  77.  
  78.  #ifdef CONFIG_X86_64
  79.     if (c->x86_tlbsize > 0)
  80. diff -ruNb a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
  81. --- a/arch/x86/kernel/smpboot.c 2012-10-12 21:48:25.000000000 +0100
  82. +++ b/arch/x86/kernel/smpboot.c 2012-10-21 16:28:24.322664154 +0100
  83. @@ -440,7 +440,7 @@
  84.         "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
  85.         num_online_cpus(),
  86.         bogosum/(500000/HZ),
  87. -       (bogosum/(5000/HZ))%100);
  88. +       (bogosum * 10/(50000/HZ))%100);
  89.  
  90.     pr_debug("Before bogocount - setting activated=1.\n");
  91.  }
  92. diff -ruNb a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt
  93. --- a/Documentation/scheduler/sched-BFS.txt 1970-01-01 01:00:00.000000000 +0100
  94. +++ b/Documentation/scheduler/sched-BFS.txt 2012-10-21 16:28:24.285671255 +0100
  95. @@ -0,0 +1,347 @@
  96. +BFS - The Brain Fuck Scheduler by Con Kolivas.
  97. +
  98. +Goals.
  99. +
  100. +The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
  101. +completely do away with the complex designs of the past for the cpu process
  102. +scheduler and instead implement one that is very simple in basic design.
  103. +The main focus of BFS is to achieve excellent desktop interactivity and
  104. +responsiveness without heuristics and tuning knobs that are difficult to
  105. +understand, impossible to model and predict the effect of, and when tuned to
  106. +one workload cause massive detriment to another.
  107. +
  108. +
  109. +Design summary.
  110. +
  111. +BFS is best described as a single runqueue, O(n) lookup, earliest effective
  112. +virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
  113. +deadline first) and my previous Staircase Deadline scheduler. Each component
  114. +shall be described in order to understand the significance of, and reasoning for
  115. +it. The codebase when the first stable version was released was approximately
  116. +9000 lines less code than the existing mainline linux kernel scheduler (in
  117. +2.6.31). This does not even take into account the removal of documentation and
  118. +the cgroups code that is not used.
  119. +
  120. +Design reasoning.
  121. +
  122. +The single runqueue refers to the queued but not running processes for the
  123. +entire system, regardless of the number of CPUs. The reason for going back to
  124. +a single runqueue design is that once multiple runqueues are introduced,
  125. +per-CPU or otherwise, there will be complex interactions as each runqueue will
  126. +be responsible for the scheduling latency and fairness of the tasks only on its
  127. +own runqueue, and to achieve fairness and low latency across multiple CPUs, any
  128. +advantage in throughput of having CPU local tasks causes other disadvantages.
  129. +This is due to requiring a very complex balancing system to at best achieve some
  130. +semblance of fairness across CPUs and can only maintain relatively low latency
  131. +for tasks bound to the same CPUs, not across them. To increase said fairness
  132. +and latency across CPUs, the advantage of local runqueue locking, which makes
  133. +for better scalability, is lost due to having to grab multiple locks.
  134. +
  135. +A significant feature of BFS is that all accounting is done purely based on CPU
  136. +used and nowhere is sleep time used in any way to determine entitlement or
  137. +interactivity. Interactivity "estimators" that use some kind of sleep/run
  138. +algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
  139. +tasks that aren't interactive as being so. The reason for this is that it is
  140. +close to impossible to determine, when a task is sleeping, whether it is
  141. +doing it voluntarily, as in a userspace application waiting for input in the
  142. +form of a mouse click or otherwise, or involuntarily, because it is waiting for
  143. +another thread, process, I/O, kernel activity or whatever. Thus, such an
  144. +estimator will introduce corner cases, and more heuristics will be required to
  145. +cope with those corner cases, introducing more corner cases and failed
  146. +interactivity detection and so on. Interactivity in BFS is built into the design
  147. +by virtue of the fact that tasks that are waking up have not used up their quota
  148. +of CPU time, and have earlier effective deadlines, thereby making it very likely
  149. +they will preempt any CPU bound task of equivalent nice level. See below for
  150. +more information on the virtual deadline mechanism. Even if they do not preempt
  151. +a running task, because the rr interval is guaranteed to have a bounded upper
  152. +limit on how long a task will wait for, it will be scheduled within a timeframe
  153. +that will not cause visible interface jitter.
  154. +
  155. +
  156. +Design details.
  157. +
  158. +Task insertion.
  159. +
  160. +BFS inserts tasks into each relevant queue as an O(1) insertion into a double
  161. +linked list. On insertion, *every* running queue is checked to see if the newly
  162. +queued task can run on any idle queue, or preempt the lowest running task on the
  163. +system. This is how the cross-CPU scheduling of BFS achieves significantly lower
  164. +latency per extra CPU the system has. In this case the lookup is, in the worst
  165. +case scenario, O(n) where n is the number of CPUs on the system.
  166. +
  167. +Data protection.
  168. +
  169. +BFS has one single lock protecting the process local data of every task in the
  170. +global queue. Thus every insertion, removal and modification of task data in the
  171. +global runqueue needs to grab the global lock. However, once a task is taken by
  172. +a CPU, the CPU has its own local data copy of the running process' accounting
  173. +information which only that CPU accesses and modifies (such as during a
  174. +timer tick) thus allowing the accounting data to be updated lockless. Once a
  175. +CPU has taken a task to run, it removes it from the global queue. Thus the
  176. +global queue only ever has, at most,
  177. +
  178. +   (number of tasks requesting cpu time) - (number of logical CPUs) + 1
  179. +
  180. +tasks in the global queue. This value is relevant for the time taken to look up
  181. +tasks during scheduling. This will increase if many tasks have CPU affinity set
  182. +in their policy to limit which CPUs they're allowed to run on and they outnumber
  183. +the number of CPUs. The +1 is because when rescheduling a task, the CPU's
  184. +currently running task is put back on the queue. Lookup will be described after
  185. +the virtual deadline mechanism is explained.
  186. +
  187. +Virtual deadline.
  188. +
  189. +The key to achieving low latency, scheduling fairness, and "nice level"
  190. +distribution in BFS is entirely in the virtual deadline mechanism. The one
  191. +tunable in BFS is the rr_interval, or "round robin interval". This is the
  192. +maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
  193. +tasks of the same nice level will be running for, or looking at it the other
  194. +way around, the longest duration two tasks of the same nice level will be
  195. +delayed for. When a task requests cpu time, it is given a quota (time_slice)
  196. +equal to the rr_interval and a virtual deadline. The virtual deadline is
  197. +offset from the current time in jiffies by this equation:
  198. +
  199. +   jiffies + (prio_ratio * rr_interval)
  200. +
  201. +The prio_ratio is determined as a ratio compared to the baseline of nice -20
  202. +and increases by 10% per nice level. The deadline is a virtual one only in that
  203. +no guarantee is placed that a task will actually be scheduled by this time, but
  204. +it is used to compare which task should go next. There are three components to
  205. +how a task is next chosen. First is time_slice expiration. If a task runs out
  206. +of its time_slice, it is descheduled, the time_slice is refilled, and the
  207. +deadline reset to that formula above. Second is sleep, where a task no longer
  208. +is requesting CPU for whatever reason. The time_slice and deadline are _not_
  209. +adjusted in this case and are just carried over for when the task is next
  210. +scheduled. Third is preemption, and that is when a newly waking task is deemed
  211. +higher priority than a currently running task on any cpu by virtue of the fact
  212. +that it has an earlier virtual deadline than the currently running task. The
  213. +earlier deadline is the key to which task is next chosen for the first and
  214. +second cases. Once a task is descheduled, it is put back on the queue, and an
  215. +O(n) lookup of all queued-but-not-running tasks is done to determine which has
  216. +the earliest deadline and that task is chosen to receive CPU next.
  217. +
  218. +The CPU proportion of different nice tasks works out to be approximately the
  219. +
  220. +   (prio_ratio difference)^2
  221. +
  222. +The reason it is squared is that a task's deadline does not change while it is
  223. +running unless it runs out of time_slice. Thus, even if the time actually
  224. +passes the deadline of another task that is queued, it will not get CPU time
  225. +unless the current running task deschedules, and the time "base" (jiffies) is
  226. +constantly moving.
  227. +
  228. +Task lookup.
  229. +
  230. +BFS has 103 priority queues. 100 of these are dedicated to the static priority
  231. +of realtime tasks, and the remaining 3 are, in order of best to worst priority,
  232. +SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
  233. +scheduling). When a task of these priorities is queued, a bitmap of running
  234. +priorities is set showing which of these priorities has tasks waiting for CPU
  235. +time. When a CPU is made to reschedule, the lookup for the next task to get
  236. +CPU time is performed in the following way:
  237. +
  238. +First the bitmap is checked to see what static priority tasks are queued. If
  239. +any realtime priorities are found, the corresponding queue is checked and the
  240. +first task listed there is taken (provided CPU affinity is suitable) and lookup
  241. +is complete. If the priority corresponds to a SCHED_ISO task, they are also
  242. +taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
  243. +to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
  244. +stage, every task in the runlist that corresponds to that priority is checked
  245. +to see which has the earliest set deadline, and (provided it has suitable CPU
  246. +affinity) it is taken off the runqueue and given the CPU. If a task has an
  247. +expired deadline, it is taken and the rest of the lookup aborted (as they are
  248. +chosen in FIFO order).
  249. +
  250. +Thus, the lookup is O(n) in the worst case only, where n is as described
  251. +earlier, as tasks may be chosen before the whole task list is looked over.
  252. +
  253. +
  254. +Scalability.
  255. +
  256. +The major limitations of BFS will be that of scalability, as the separate
  257. +runqueue designs will have less lock contention as the number of CPUs rises.
  258. +However they do not scale linearly even with separate runqueues as multiple
  259. +runqueues will need to be locked concurrently on such designs to be able to
  260. +achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
  261. +across CPUs, and to achieve low enough latency for tasks on a busy CPU when
  262. +other CPUs would be more suited. BFS has the advantage that it requires no
  263. +balancing algorithm whatsoever, as balancing occurs by proxy simply because
  264. +all CPUs draw off the global runqueue, in priority and deadline order. Despite
  265. +the fact that scalability is _not_ the prime concern of BFS, it both shows very
  266. +good scalability to smaller numbers of CPUs and is likely a more scalable design
  267. +at these numbers of CPUs.
  268. +
  269. +It also has some very low overhead scalability features built into the design
  270. +when it has been deemed their overhead is so marginal that they're worth adding.
  271. +The first is the local copy of the running process' data to the CPU it's running
  272. +on to allow that data to be updated lockless where possible. Then there is
  273. +deference paid to the last CPU a task was running on, by trying that CPU first
  274. +when looking for an idle CPU to use the next time it's scheduled. Finally there
  275. +is the notion of "sticky" tasks that are flagged when they are involuntarily
  276. +descheduled, meaning they still want further CPU time. This sticky flag is
  277. +used to bias heavily against those tasks being scheduled on a different CPU
  278. +unless that CPU would be otherwise idle. When a cpu frequency governor is used
  279. +that scales with CPU load, such as ondemand, sticky tasks are not scheduled
  280. +on a different CPU at all, preferring instead to go idle. This means the CPU
  281. +they were bound to is more likely to increase its speed while the other CPU
  282. +will go idle, thus speeding up total task execution time and likely decreasing
  283. +power usage. This is the only scenario where BFS will allow a CPU to go idle
  284. +in preference to scheduling a task on the earliest available spare CPU.
  285. +
  286. +The real cost of migrating a task from one CPU to another is entirely dependent
  287. +on the cache footprint of the task, how cache intensive the task is, how long
  288. +it's been running on that CPU to take up the bulk of its cache, how big the CPU
  289. +cache is, how fast and how layered the CPU cache is, how fast a context switch
  290. +is... and so on. In other words, it's close to random in the real world where we
  291. +do more than just one sole workload. The only thing we can be sure of is that
  292. +it's not free. So BFS uses the principle that an idle CPU is a wasted CPU and
  293. +utilising idle CPUs is more important than cache locality, and cache locality
  294. +only plays a part after that.
  295. +
  296. +When choosing an idle CPU for a waking task, the cache locality is determined
  297. +according to where the task last ran and then idle CPUs are ranked from best
  298. +to worst to choose the most suitable idle CPU based on cache locality, NUMA
  299. +node locality and hyperthread sibling busyness. They are chosen in the
  300. +following preference (if idle):
  301. +
  302. +* Same core, idle or busy cache, idle threads
  303. +* Other core, same cache, idle or busy cache, idle threads.
  304. +* Same node, other CPU, idle cache, idle threads.
  305. +* Same node, other CPU, busy cache, idle threads.
  306. +* Same core, busy threads.
  307. +* Other core, same cache, busy threads.
  308. +* Same node, other CPU, busy threads.
  309. +* Other node, other CPU, idle cache, idle threads.
  310. +* Other node, other CPU, busy cache, idle threads.
  311. +* Other node, other CPU, busy threads.
  312. +
  313. +This shows the SMT or "hyperthread" awareness in the design as well which will
  314. +choose a real idle core first before a logical SMT sibling which already has
  315. +tasks on the physical CPU.
  316. +
  317. +Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
  318. +However this benchmarking was performed on an earlier design that was far less
  319. +scalable than the current one so it's hard to know how scalable it is in terms
  320. +of both CPUs (due to the global runqueue) and heavily loaded machines (due to
  321. +O(n) lookup) at this stage. Note that in terms of scalability, the number of
  322. +_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
  323. +quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
  324. +results are very promising indeed, without needing to tweak any knobs, features
  325. +or options. Benchmark contributions are most welcome.
  326. +
  327. +
  328. +Features
  329. +
  330. +As the initial prime target audience for BFS was the average desktop user, it
  331. +was designed to not need tweaking, tuning or have features set to obtain benefit
  332. +from it. Thus the number of knobs and features has been kept to an absolute
  333. +minimum and should not require extra user input for the vast majority of cases.
  334. +There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
  335. +and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
  336. +to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
  337. +support for CGROUPS. The average user should neither need to know what these
  338. +are, nor should they need to be using them to have good desktop behaviour.
  339. +
  340. +rr_interval
  341. +
  342. +There is only one "scheduler" tunable, the round robin interval. This can be
  343. +accessed in
  344. +
  345. +   /proc/sys/kernel/rr_interval
  346. +
  347. +The value is in milliseconds, and the default value is set to 6ms. Valid values
  348. +are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
  349. +decreasing throughput, while increasing it will improve throughput, but at the
  350. +cost of worsening latencies. The accuracy of the rr interval is limited by HZ
  351. +resolution of the kernel configuration. Thus, the worst case latencies are
  352. +usually slightly higher than this actual value. BFS uses "dithering" to try and
  353. +minimise the effect the Hz limitation has. The default value of 6 is not an
  354. +arbitrary one. It is based on the fact that humans can detect jitter at
  355. +approximately 7ms, so aiming for much lower latencies is pointless under most
  356. +circumstances. It is worth noting this fact when comparing the latency
  357. +performance of BFS to other schedulers. Worst case latencies being higher than
  358. +7ms are far worse than average latencies not being in the microsecond range.
  359. +Experimentation has shown that rr intervals being increased up to 300 can
  360. +improve throughput but beyond that, scheduling noise from elsewhere prevents
  361. +further demonstrable throughput gains.
  362. +
  363. +Isochronous scheduling.
  364. +
  365. +Isochronous scheduling is a unique scheduling policy designed to provide
  366. +near-real-time performance to unprivileged (ie non-root) users without the
  367. +ability to starve the machine indefinitely. Isochronous tasks (which means
  368. +"same time") are set using, for example, the schedtool application like so:
  369. +
  370. +   schedtool -I -e amarok
  371. +
  372. +This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
  373. +is that it has a priority level between true realtime tasks and SCHED_NORMAL
  374. +which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
  375. +if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
  376. +rate). However if ISO tasks run for more than a tunable finite amount of time,
  377. +they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
  378. +time is the percentage of _total CPU_ available across the machine, configurable
  379. +as a percentage in the following "resource handling" tunable (as opposed to a
  380. +scheduler tunable):
  381. +
  382. +   /proc/sys/kernel/iso_cpu
  383. +
  384. +and is set to 70% by default. It is calculated over a rolling 5 second average.
  385. +Because it is the total CPU available, it means that on a multi CPU machine, it
  386. +is possible to have an ISO task running as realtime scheduling indefinitely on
  387. +just one CPU, as the other CPUs will be available. Setting this to 100 is the
  388. +equivalent of giving all users SCHED_RR access and setting it to 0 removes the
  389. +ability to run any pseudo-realtime tasks.
  390. +
  391. +A feature of BFS is that it detects when an application tries to obtain a
  392. +realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
  393. +appropriate privileges to use those policies. When it detects this, it will
  394. +give the task SCHED_ISO policy instead. Thus it is transparent to the user.
  395. +Because some applications constantly set their policy as well as their nice
  396. +level, there is potential for them to undo the override specified by the user
  397. +on the command line of setting the policy to SCHED_ISO. To counter this, once
  398. +a task has been set to SCHED_ISO policy, it needs superuser privileges to set
  399. +it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
  400. +processes and threads will also inherit the ISO policy.
  401. +
  402. +Idleprio scheduling.
  403. +
  404. +Idleprio scheduling is a scheduling policy designed to give out CPU to a task
  405. +_only_ when the CPU would be otherwise idle. The idea behind this is to allow
  406. +ultra low priority tasks to be run in the background that have virtually no
  407. +effect on the foreground tasks. This is ideally suited to distributed computing
  408. +clients (like setiathome, folding, mprime etc) but can also be used to start
  409. +a video encode or so on without any slowdown of other tasks. To prevent this
  410. +policy from grabbing shared resources and holding them indefinitely, if it
  411. +detects a state where the task is waiting on I/O, the machine is about to
  412. +suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
  413. +per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
  414. +it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
  415. +be set to start as SCHED_IDLEPRIO with the schedtool command like so:
  416. +
  417. +   schedtool -D -e ./mprime
  418. +
  419. +Subtick accounting.
  420. +
  421. +It is surprisingly difficult to get accurate CPU accounting, and in many cases,
  422. +the accounting is done by simply determining what is happening at the precise
  423. +moment a timer tick fires off. This becomes increasingly inaccurate as the
  424. +timer tick frequency (HZ) is lowered. It is possible to create an application
  425. +which uses almost 100% CPU, yet by being descheduled at the right time, records
  426. +zero CPU usage. While the main problem with this is that there are possible
  427. +security implications, it is also difficult to determine how much CPU a task
  428. +really does use. BFS tries to use the sub-tick accounting from the TSC clock,
  429. +where possible, to determine real CPU usage. This is not entirely reliable, but
  430. +is far more likely to produce accurate CPU usage data than the existing designs
  431. +and will not show tasks as consuming no CPU usage when they actually are. Thus,
  432. +the amount of CPU reported as being used by BFS will more accurately represent
  433. +how much CPU the task itself is using (as is shown for example by the 'time'
  434. +application), so the reported values may be quite different to other schedulers.
  435. +Values reported as the 'load' are more prone to problems with this design, but
  436. +per process values are closer to real usage. When comparing throughput of BFS
  437. +to other designs, it is important to compare the actual completed work in terms
  438. +of total wall clock time taken and total work done, rather than the reported
  439. +"cpu usage".
  440. +
  441. +
  442. +Con Kolivas <kernel@kolivas.org> Tue, 5 Apr 2011
  443. diff -ruNb a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
  444. --- a/Documentation/sysctl/kernel.txt   2012-10-12 21:48:25.000000000 +0100
  445. +++ b/Documentation/sysctl/kernel.txt   2012-10-21 16:28:24.286671063 +0100
  446. @@ -33,6 +33,7 @@
  447.  - domainname
  448.  - hostname
  449.  - hotplug
  450. +- iso_cpu
  451.  - kptr_restrict
  452.  - kstack_depth_to_print       [ X86 only ]
  453.  - l2cr                        [ PPC only ]
  454. @@ -59,6 +60,7 @@
  455.  - randomize_va_space
  456.  - real-root-dev               ==> Documentation/initrd.txt
  457.  - reboot-cmd                  [ SPARC only ]
  458. +- rr_interval
  459.  - rtsig-max
  460.  - rtsig-nr
  461.  - sem
  462. @@ -301,6 +303,16 @@
  463.  
  464.  ==============================================================
  465.  
  466. +iso_cpu: (BFS CPU scheduler only).
  467. +
  468. +This sets the percentage cpu that the unprivileged SCHED_ISO tasks can
  469. +run effectively at realtime priority, averaged over a rolling five
  470. +seconds over the -whole- system, meaning all cpus.
  471. +
  472. +Set to 70 (percent) by default.
  473. +
  474. +==============================================================
  475. +
  476.  l2cr: (PPC only)
  477.  
  478.  This flag controls the L2 cache of G3 processor boards. If
  479. @@ -517,6 +529,20 @@
  480.  
  481.  ==============================================================
  482.  
  483. +rr_interval: (BFS CPU scheduler only)
  484. +
  485. +This is the smallest duration that any cpu process scheduling unit
  486. +will run for. Increasing this value can increase throughput of cpu
  487. +bound tasks substantially but at the expense of increased latencies
  488. +overall. Conversely decreasing it will decrease average and maximum
  489. +latencies but at the expense of throughput. This value is in
  490. +milliseconds and the default value chosen depends on the number of
  491. +cpus available at scheduler initialisation with a minimum of 6.
  492. +
  493. +Valid values are from 1-1000.
  494. +
  495. +==============================================================
  496. +
  497.  rtsig-max & rtsig-nr:
  498.  
  499.  The file rtsig-max can be used to tune the maximum number
  500. diff -ruNb a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
  501. --- a/drivers/cpufreq/cpufreq.c 2012-10-12 21:48:25.000000000 +0100
  502. +++ b/drivers/cpufreq/cpufreq.c 2012-10-21 16:28:24.286671063 +0100
  503. @@ -28,6 +28,7 @@
  504.  #include <linux/cpu.h>
  505.  #include <linux/completion.h>
  506.  #include <linux/mutex.h>
  507. +#include <linux/sched.h>
  508.  #include <linux/syscore_ops.h>
  509.  
  510.  #include <trace/events/power.h>
  511. @@ -1457,6 +1458,12 @@
  512.         target_freq, relation);
  513.     if (cpu_online(policy->cpu) && cpufreq_driver->target)
  514.         retval = cpufreq_driver->target(policy, target_freq, relation);
  515. +   if (likely(retval != -EINVAL)) {
  516. +       if (target_freq == policy->max)
  517. +           cpu_nonscaling(policy->cpu);
  518. +       else
  519. +           cpu_scaling(policy->cpu);
  520. +   }
  521.  
  522.     return retval;
  523.  }
  524. diff -ruNb a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
  525. --- a/drivers/cpufreq/cpufreq_conservative.c    2012-10-12 21:48:25.000000000 +0100
  526. +++ b/drivers/cpufreq/cpufreq_conservative.c    2012-10-21 16:28:24.287670871 +0100
  527. @@ -29,8 +29,8 @@
  528.   * It helps to keep variable names smaller, simpler
  529.   */
  530.  
  531. -#define DEF_FREQUENCY_UP_THRESHOLD     (80)
  532. -#define DEF_FREQUENCY_DOWN_THRESHOLD       (20)
  533. +#define DEF_FREQUENCY_UP_THRESHOLD     (63)
  534. +#define DEF_FREQUENCY_DOWN_THRESHOLD       (26)
  535.  
  536.  /*
  537.   * The polling frequency of this governor depends on the capability of
  538. diff -ruNb a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
  539. --- a/drivers/cpufreq/cpufreq_ondemand.c    2012-10-12 21:48:25.000000000 +0100
  540. +++ b/drivers/cpufreq/cpufreq_ondemand.c    2012-10-21 16:28:24.287670871 +0100
  541. @@ -28,8 +28,8 @@
  542.   * It helps to keep variable names smaller, simpler
  543.   */
  544.  
  545. -#define DEF_FREQUENCY_DOWN_DIFFERENTIAL        (10)
  546. -#define DEF_FREQUENCY_UP_THRESHOLD     (80)
  547. +#define DEF_FREQUENCY_DOWN_DIFFERENTIAL        (26)
  548. +#define DEF_FREQUENCY_UP_THRESHOLD     (63)
  549.  #define DEF_SAMPLING_DOWN_FACTOR       (1)
  550.  #define MAX_SAMPLING_DOWN_FACTOR       (100000)
  551.  #define MICRO_FREQUENCY_DOWN_DIFFERENTIAL  (3)
  552. @@ -472,10 +472,10 @@
  553.  
  554.     /*
  555.      * Every sampling_rate, we check, if current idle time is less
  556. -    * than 20% (default), then we try to increase frequency
  557. +    * than 37% (default), then we try to increase frequency
  558.      * Every sampling_rate, we look for a the lowest
  559.      * frequency which can sustain the load while keeping idle time over
  560. -    * 30%. If such a frequency exist, we try to decrease to this frequency.
  561. +    * 63%. If such a frequency exist, we try to decrease to this frequency.
  562.      *
  563.      * Any frequency increase takes it to the maximum frequency.
  564.      * Frequency reduction happens at minimum steps of
  565. diff -ruNb a/fs/proc/base.c b/fs/proc/base.c
  566. --- a/fs/proc/base.c    2012-10-12 21:48:25.000000000 +0100
  567. +++ b/fs/proc/base.c    2012-10-21 16:28:24.288670679 +0100
  568. @@ -338,7 +338,7 @@
  569.  static int proc_pid_schedstat(struct task_struct *task, char *buffer)
  570.  {
  571.     return sprintf(buffer, "%llu %llu %lu\n",
  572. -           (unsigned long long)task->se.sum_exec_runtime,
  573. +           (unsigned long long)tsk_seruntime(task),
  574.             (unsigned long long)task->sched_info.run_delay,
  575.             task->sched_info.pcount);
  576.  }
  577. diff -ruNb a/include/linux/init_task.h b/include/linux/init_task.h
  578. --- a/include/linux/init_task.h 2012-10-12 21:48:25.000000000 +0100
  579. +++ b/include/linux/init_task.h 2012-10-21 16:28:24.288670679 +0100
  580. @@ -141,12 +141,70 @@
  581.  # define INIT_PERF_EVENTS(tsk)
  582.  #endif
  583.  
  584. -#define INIT_TASK_COMM "swapper"
  585. -
  586.  /*
  587.   *  INIT_TASK is used to set up the first task table, touch at
  588.   * your own risk!. Base=0, limit=0x1fffff (=2MB)
  589.   */
  590. +#ifdef CONFIG_SCHED_BFS
  591. +#define INIT_TASK_COMM "BFS"
  592. +#define INIT_TASK(tsk) \
  593. +{                                  \
  594. +   .state      = 0,                        \
  595. +   .stack      = &init_thread_info,                \
  596. +   .usage      = ATOMIC_INIT(2),               \
  597. +   .flags      = PF_KTHREAD,                   \
  598. +   .prio       = NORMAL_PRIO,                  \
  599. +   .static_prio    = MAX_PRIO-20,                  \
  600. +   .normal_prio    = NORMAL_PRIO,                  \
  601. +   .deadline   = 0,                        \
  602. +   .policy     = SCHED_NORMAL,                 \
  603. +   .cpus_allowed   = CPU_MASK_ALL,                 \
  604. +   .mm     = NULL,                     \
  605. +   .active_mm  = &init_mm,                 \
  606. +   .run_list   = LIST_HEAD_INIT(tsk.run_list),         \
  607. +   .time_slice = HZ,                   \
  608. +   .tasks      = LIST_HEAD_INIT(tsk.tasks),            \
  609. +   INIT_PUSHABLE_TASKS(tsk)                    \
  610. +   .ptraced    = LIST_HEAD_INIT(tsk.ptraced),          \
  611. +   .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),     \
  612. +   .real_parent    = &tsk,                     \
  613. +   .parent     = &tsk,                     \
  614. +   .children   = LIST_HEAD_INIT(tsk.children),         \
  615. +   .sibling    = LIST_HEAD_INIT(tsk.sibling),          \
  616. +   .group_leader   = &tsk,                     \
  617. +   RCU_INIT_POINTER(.real_cred, &init_cred),           \
  618. +   RCU_INIT_POINTER(.cred, &init_cred),                \
  619. +   .comm       = INIT_TASK_COMM,               \
  620. +   .thread     = INIT_THREAD,                  \
  621. +   .fs     = &init_fs,                 \
  622. +   .files      = &init_files,                  \
  623. +   .signal     = &init_signals,                \
  624. +   .sighand    = &init_sighand,                \
  625. +   .nsproxy    = &init_nsproxy,                \
  626. +   .pending    = {                     \
  627. +       .list = LIST_HEAD_INIT(tsk.pending.list),       \
  628. +       .signal = {{0}}},                   \
  629. +   .blocked    = {{0}},                    \
  630. +   .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),     \
  631. +   .journal_info   = NULL,                     \
  632. +   .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers),      \
  633. +   .pi_lock    = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
  634. +   .timer_slack_ns = 50000, /* 50 usec default slack */        \
  635. +   .pids = {                           \
  636. +       [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),        \
  637. +       [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),       \
  638. +       [PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),        \
  639. +   },                              \
  640. +   INIT_IDS                            \
  641. +   INIT_PERF_EVENTS(tsk)                       \
  642. +   INIT_TRACE_IRQFLAGS                     \
  643. +   INIT_LOCKDEP                            \
  644. +   INIT_FTRACE_GRAPH                       \
  645. +   INIT_TRACE_RECURSION                        \
  646. +   INIT_TASK_RCU_PREEMPT(tsk)                  \
  647. +}
  648. +#else /* CONFIG_SCHED_BFS */
  649. +#define INIT_TASK_COMM "swapper"
  650.  #define INIT_TASK(tsk) \
  651.  {                                  \
  652.     .state      = 0,                        \
  653. @@ -211,7 +269,7 @@
  654.     INIT_TASK_RCU_PREEMPT(tsk)                  \
  655.     INIT_CPUSET_SEQ                         \
  656.  }
  657. -
  658. +#endif /* CONFIG_SCHED_BFS */
  659.  
  660.  #define INIT_CPU_TIMERS(cpu_timers)                    \
  661.  {                                  \
  662. diff -ruNb a/include/linux/ioprio.h b/include/linux/ioprio.h
  663. --- a/include/linux/ioprio.h    2012-10-12 21:48:25.000000000 +0100
  664. +++ b/include/linux/ioprio.h    2012-10-21 16:28:24.288670679 +0100
  665. @@ -52,6 +52,8 @@
  666.   */
  667.  static inline int task_nice_ioprio(struct task_struct *task)
  668.  {
  669. +   if (iso_task(task))
  670. +       return 0;
  671.     return (task_nice(task) + 20) / 5;
  672.  }
  673.  
  674. diff -ruNb a/include/linux/jiffies.h b/include/linux/jiffies.h
  675. --- a/include/linux/jiffies.h   2012-10-12 21:48:25.000000000 +0100
  676. +++ b/include/linux/jiffies.h   2012-10-21 16:28:24.289670487 +0100
  677. @@ -164,7 +164,7 @@
  678.   * Have the 32 bit jiffies value wrap 5 minutes after boot
  679.   * so jiffies wrap bugs show up earlier.
  680.   */
  681. -#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
  682. +#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ))
  683.  
  684.  /*
  685.   * Change timeval to jiffies, trying to avoid the
  686. diff -ruNb a/include/linux/nfsd/stats.h b/include/linux/nfsd/stats.h
  687. --- a/include/linux/nfsd/stats.h    2012-10-12 21:48:25.000000000 +0100
  688. +++ b/include/linux/nfsd/stats.h    2012-10-21 16:28:24.322664154 +0100
  689. @@ -11,8 +11,8 @@
  690.  
  691.  #include <linux/nfs4.h>
  692.  
  693. -/* thread usage wraps very million seconds (approx one fortnight) */
  694. -#define    NFSD_USAGE_WRAP (HZ*1000000)
  695. +/* thread usage wraps every one hundred thousand seconds (approx one day) */
  696. +#define    NFSD_USAGE_WRAP (HZ*100000)
  697.  
  698.  #ifdef __KERNEL__
  699.  
  700. diff -ruNb a/include/linux/sched.h b/include/linux/sched.h
  701. --- a/include/linux/sched.h 2012-10-12 21:48:25.000000000 +0100
  702. +++ b/include/linux/sched.h 2012-10-21 16:28:24.312666074 +0100
  703. @@ -37,8 +37,15 @@
  704.  #define SCHED_FIFO     1
  705.  #define SCHED_RR       2
  706.  #define SCHED_BATCH        3
  707. -/* SCHED_ISO: reserved but not implemented yet */
  708. +/* SCHED_ISO: Implemented on BFS only */
  709.  #define SCHED_IDLE     5
  710. +#define SCHED_IDLEPRIO     SCHED_IDLE
  711. +#ifdef CONFIG_SCHED_BFS
  712. +#define SCHED_ISO      4
  713. +#define SCHED_MAX      (SCHED_IDLEPRIO)
  714. +#define SCHED_RANGE(policy)    ((policy) <= SCHED_MAX)
  715. +#endif
  716. +
  717.  /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
  718.  #define SCHED_RESET_ON_FORK     0x40000000
  719.  
  720. @@ -270,8 +277,6 @@
  721.  extern void init_idle(struct task_struct *idle, int cpu);
  722.  extern void init_idle_bootup_task(struct task_struct *idle);
  723.  
  724. -extern int runqueue_is_locked(int cpu);
  725. -
  726.  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
  727.  extern void select_nohz_load_balancer(int stop_tick);
  728.  extern void set_cpu_sd_state_idle(void);
  729. @@ -1235,15 +1240,30 @@
  730.  
  731.  #ifdef CONFIG_SMP
  732.     struct llist_node wake_entry;
  733. -   int on_cpu;
  734.  #endif
  735. -   int on_rq;
  736. -
  737. +#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BFS)
  738. +   bool on_cpu;
  739. +#endif
  740. +#ifndef CONFIG_SCHED_BFS
  741. +   bool on_rq;
  742. +#endif
  743.     int prio, static_prio, normal_prio;
  744.     unsigned int rt_priority;
  745. +#ifdef CONFIG_SCHED_BFS
  746. +   int time_slice;
  747. +   u64 deadline;
  748. +   struct list_head run_list;
  749. +   u64 last_ran;
  750. +   u64 sched_time; /* sched_clock time spent running */
  751. +#ifdef CONFIG_SMP
  752. +   bool sticky; /* Soft affined flag */
  753. +#endif
  754. +   unsigned long rt_timeout;
  755. +#else /* CONFIG_SCHED_BFS */
  756.     const struct sched_class *sched_class;
  757.     struct sched_entity se;
  758.     struct sched_rt_entity rt;
  759. +#endif
  760.  #ifdef CONFIG_CGROUP_SCHED
  761.     struct task_group *sched_task_group;
  762.  #endif
  763. @@ -1355,6 +1375,9 @@
  764.     int __user *clear_child_tid;        /* CLONE_CHILD_CLEARTID */
  765.  
  766.     cputime_t utime, stime, utimescaled, stimescaled;
  767. +#ifdef CONFIG_SCHED_BFS
  768. +   unsigned long utime_pc, stime_pc;
  769. +#endif
  770.     cputime_t gtime;
  771.  #ifndef CONFIG_VIRT_CPU_ACCOUNTING
  772.     cputime_t prev_utime, prev_stime;
  773. @@ -1588,6 +1611,64 @@
  774.  #endif
  775.  };
  776.  
  777. +#ifdef CONFIG_SCHED_BFS
  778. +bool grunqueue_is_locked(void);
  779. +void grq_unlock_wait(void);
  780. +void cpu_scaling(int cpu);
  781. +void cpu_nonscaling(int cpu);
  782. +bool above_background_load(void);
  783. +#define tsk_seruntime(t)       ((t)->sched_time)
  784. +#define tsk_rttimeout(t)       ((t)->rt_timeout)
  785. +
  786. +static inline void tsk_cpus_current(struct task_struct *p)
  787. +{
  788. +}
  789. +
  790. +static inline int runqueue_is_locked(int cpu)
  791. +{
  792. +   return grunqueue_is_locked();
  793. +}
  794. +
  795. +void print_scheduler_version(void);
  796. +
  797. +static inline bool iso_task(struct task_struct *p)
  798. +{
  799. +   return (p->policy == SCHED_ISO);
  800. +}
  801. +#else /* CFS */
  802. +extern int runqueue_is_locked(int cpu);
  803. +static inline void cpu_scaling(int cpu)
  804. +{
  805. +}
  806. +
  807. +static inline void cpu_nonscaling(int cpu)
  808. +{
  809. +}
  810. +#define tsk_seruntime(t)   ((t)->se.sum_exec_runtime)
  811. +#define tsk_rttimeout(t)   ((t)->rt.timeout)
  812. +
  813. +static inline void tsk_cpus_current(struct task_struct *p)
  814. +{
  815. +   p->nr_cpus_allowed = current->nr_cpus_allowed;
  816. +}
  817. +
  818. +static inline void print_scheduler_version(void)
  819. +{
  820. +   printk(KERN_INFO"CFS CPU scheduler.\n");
  821. +}
  822. +
  823. +static inline bool iso_task(struct task_struct *p)
  824. +{
  825. +   return false;
  826. +}
  827. +
  828. +/* Anyone feel like implementing this? */
  829. +static inline bool above_background_load(void)
  830. +{
  831. +   return false;
  832. +}
  833. +#endif /* CONFIG_SCHED_BFS */
  834. +
  835.  /* Future-safe accessor for struct task_struct's cpus_allowed. */
  836.  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
  837.  
  838. @@ -1605,10 +1686,20 @@
  839.   */
  840.  
  841.  #define MAX_USER_RT_PRIO   100
  842. -#define MAX_RT_PRIO        MAX_USER_RT_PRIO
  843. +#define MAX_RT_PRIO        (MAX_USER_RT_PRIO + 1)
  844. +#define DEFAULT_PRIO       (MAX_RT_PRIO + 20)
  845.  
  846. +#ifdef CONFIG_SCHED_BFS
  847. +#define PRIO_RANGE     (40)
  848. +#define MAX_PRIO       (MAX_RT_PRIO + PRIO_RANGE)
  849. +#define ISO_PRIO       (MAX_RT_PRIO)
  850. +#define NORMAL_PRIO        (MAX_RT_PRIO + 1)
  851. +#define IDLE_PRIO      (MAX_RT_PRIO + 2)
  852. +#define PRIO_LIMIT     ((IDLE_PRIO) + 1)
  853. +#else /* CONFIG_SCHED_BFS */
  854.  #define MAX_PRIO       (MAX_RT_PRIO + 40)
  855. -#define DEFAULT_PRIO       (MAX_RT_PRIO + 20)
  856. +#define NORMAL_PRIO        DEFAULT_PRIO
  857. +#endif /* CONFIG_SCHED_BFS */
  858.  
  859.  static inline int rt_prio(int prio)
  860.  {
  861. @@ -1979,7 +2070,7 @@
  862.  task_sched_runtime(struct task_struct *task);
  863.  
  864.  /* sched_exec is called by processes performing an exec */
  865. -#ifdef CONFIG_SMP
  866. +#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BFS)
  867.  extern void sched_exec(void);
  868.  #else
  869.  #define sched_exec()   {}
  870. @@ -2695,7 +2786,7 @@
  871.     return 0;
  872.  }
  873.  
  874. -static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
  875. +static inline void set_task_cpu(struct task_struct *p, int cpu)
  876.  {
  877.  }
  878.  
  879. diff -ruNb a/include/linux/swap.h b/include/linux/swap.h
  880. --- a/include/linux/swap.h  2012-10-12 21:48:25.000000000 +0100
  881. +++ b/include/linux/swap.h  2012-10-21 16:28:24.304667608 +0100
  882. @@ -208,7 +208,7 @@
  883.     int next;   /* swapfile to be used next */
  884.  };
  885.  
  886. -/* Swap 50% full? Release swapcache more aggressively.. */
  887. +/* Swap 50% full? */
  888.  #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
  889.  
  890.  /* linux/mm/page_alloc.c */
  891. diff -ruNb a/include/linux/urwlock.h b/include/linux/urwlock.h
  892. --- a/include/linux/urwlock.h   1970-01-01 01:00:00.000000000 +0100
  893. +++ b/include/linux/urwlock.h   2012-10-21 16:28:24.328663004 +0100
  894. @@ -0,0 +1,245 @@
  895. +/*
  896. + * include/linux/urwlock.h - Upgradeable read/write locks.
  897. + *
  898. + * Copyright (C) 2012 Con Kolivas <kernel@kolivas.org>
  899. + *
  900. + * These are upgradeable variants of read/write locks.
  901. + *
  902. + * When a lock is taken, one of the read, upgradeable or write variants must be
  903. + * chosen. Much like read/write locks, a read lock cannot be upgraded to a
  904. + * write lock. However the upgradeable version can be either upgraded to a
  905. + * write lock, or downgraded to a read lock. Unlike read/write locks, these
  906. + * locks favour writers over readers. They carry significantly more overhead
  907. + * than either spinlocks or read/write locks as they include one of each,
  908. + * however they are suited to situations where there are clear distinctions
  909. + * between read and write patterns, and where the state may be indeterminate
  910. + * for a period, allowing other readers to continue reading till they need to
  911. + * declare themselves as read or write.
  912. + */
  913. +
  914. +#ifndef __LINUX_URWLOCK_H
  915. +#define __LINUX_URWLOCK_H
  916. +
  917. +#include <linux/spinlock.h>
  918. +
  919. +struct urwlock {
  920. +   raw_spinlock_t lock;
  921. +   rwlock_t rwlock;
  922. +};
  923. +
  924. +typedef struct urwlock urwlock_t;
  925. +
  926. +static inline void urwlock_init(urwlock_t *urw)
  927. +{
  928. +   raw_spin_lock_init(&urw->lock);
  929. +   rwlock_init(&urw->rwlock);
  930. +}
  931. +
  932. +/* Low level write and read lock/unlock of the rw lock. */
  933. +static inline void __urw_write_lock(rwlock_t *rw)
  934. +{
  935. +   rwlock_acquire(&rw->dep_map, 0, 0, _RET_IP_);
  936. +   LOCK_CONTENDED(rw, do_raw_write_trylock, do_raw_write_lock);
  937. +}
  938. +
  939. +static inline void __urw_write_unlock(rwlock_t *rw)
  940. +{
  941. +   rwlock_release(&rw->dep_map, 1, _RET_IP_);
  942. +   do_raw_write_unlock(rw);
  943. +}
  944. +
  945. +static inline void __urw_read_lock(rwlock_t *rw)
  946. +{
  947. +   rwlock_acquire_read(&rw->dep_map, 0, 0, _RET_IP_);
  948. +   LOCK_CONTENDED(rw, do_raw_read_trylock, do_raw_read_lock);
  949. +}
  950. +
  951. +static inline void __urw_read_unlock(rwlock_t *rw)
  952. +{
  953. +   rwlock_release(&rw->dep_map, 1, _RET_IP_);
  954. +   do_raw_read_unlock(rw);
  955. +}
  956. +
  957. +/* Write variant of urw lock. Grabs both spinlock and rwlock. */
  958. +static inline void urw_wlock(urwlock_t *urw)
  959. +   __acquires(urw->lock)
  960. +   __acquires(urw->rwlock)
  961. +{
  962. +   raw_spin_lock(&urw->lock);
  963. +   __urw_write_lock(&urw->rwlock);
  964. +}
  965. +
  966. +/* Write variant of urw unlock. Releases both spinlock and rwlock. */
  967. +static inline void urw_wunlock(urwlock_t *urw)
  968. +   __releases(urw->rwlock)
  969. +   __releases(urw->lock)
  970. +{
  971. +   __urw_write_unlock(&urw->rwlock);
  972. +   raw_spin_unlock(&urw->lock);
  973. +}
  974. +
  975. +/*
  976. + * Read variant of urw lock. Grabs spinlock and rwlock and then releases
  977. + * spinlock.
  978. + */
  979. +static inline void urw_rlock(urwlock_t *urw)
  980. +   __acquires(urw->lock)
  981. +   __acquires(urw->rwlock)
  982. +   __releases(urw->lock)
  983. +{
  984. +   raw_spin_lock(&urw->lock);
  985. +   __urw_read_lock(&urw->rwlock);
  986. +   spin_release(&urw->lock.dep_map, 1, _RET_IP_);
  987. +   do_raw_spin_unlock(&urw->lock);
  988. +}
  989. +
  990. +/* Read variant of urw unlock. Releases only the rwlock. */
  991. +static inline void urw_runlock(urwlock_t *urw)
  992. +   __releases(urw->rwlock)
  993. +{
  994. +   __urw_read_unlock(&urw->rwlock);
  995. +}
  996. +
  997. +/* Upgradeable variant of urw lock. Grabs only the spinlock. */
  998. +static inline void urw_ulock(urwlock_t *urw)
  999. +   __acquires(urw->lock)
  1000. +{
  1001. +   raw_spin_lock(&urw->lock);
  1002. +}
  1003. +
  1004. +/* Upgrade the upgradeable variant of urwlock. Grabs the write lock. */
  1005. +static inline void urw_upgrade(urwlock_t *urw)
  1006. +{
  1007. +   __urw_write_lock(&urw->rwlock);
  1008. +}
  1009. +
  1010. +/*
  1011. + * Downgrade the upgradeable variant of urwlock to a read lock. Grabs the
  1012. + * read rwlock and releases the spinlock.
  1013. + */
  1014. +static inline void urw_udowngrade(urwlock_t *urw)
  1015. +   __acquires(urw->rwlock)
  1016. +   __releases(urw->lock)
  1017. +{
  1018. +   __urw_read_lock(&urw->rwlock);
  1019. +   spin_release(&urw->lock.dep_map, 1, _RET_IP_);
  1020. +   do_raw_spin_unlock(&urw->lock);
  1021. +}
  1022. +
  1023. +/*
  1024. + * Downgrade the write variant of urwlock to a read lock. Drops the write
  1025. + * rwlock, grabs the read rwlock and releases the spinlock.
  1026. + */
  1027. +static inline void urw_wdowngrade(urwlock_t *urw)
  1028. +   __releases(urw->rwlock)
  1029. +   __acquires(urw->rwlock)
  1030. +   __releases(urw->lock)
  1031. +{
  1032. +   __urw_write_unlock(&urw->rwlock);
  1033. +   __urw_read_lock(&urw->rwlock);
  1034. +   spin_release(&urw->lock.dep_map, 1, _RET_IP_);
  1035. +   do_raw_spin_unlock(&urw->lock);
  1036. +}
  1037. +
  1038. +/*
  1039. + * Unlock the upgradeable variant of urwlock where it has not been up or
  1040. + * downgraded.
  1041. + */
  1042. +static inline void urw_uunlock(urwlock_t *urw)
  1043. +   __releases(urw->lock)
  1044. +{
  1045. +   raw_spin_unlock(&urw->lock);
  1046. +}
  1047. +
  1048. +/* IRQ variants of urw locks */
  1049. +static inline void urw_wlock_irq(urwlock_t *urw)
  1050. +   __acquires(urw->lock)
  1051. +   __acquires(urw->rwlock)
  1052. +{
  1053. +   raw_spin_lock_irq(&urw->lock);
  1054. +   __urw_write_lock(&urw->rwlock);
  1055. +}
  1056. +
  1057. +static inline void urw_wunlock_irq(urwlock_t *urw)
  1058. +   __releases(urw->rwlock)
  1059. +   __releases(urw->lock)
  1060. +{
  1061. +   __urw_write_unlock(&urw->rwlock);
  1062. +   raw_spin_unlock_irq(&urw->lock);
  1063. +}
  1064. +
  1065. +static inline void urw_rlock_irq(urwlock_t *urw)
  1066. +   __acquires(urw->lock)
  1067. +   __acquires(urw->rwlock)
  1068. +   __releases(urw->lock)
  1069. +{
  1070. +   raw_spin_lock_irq(&urw->lock);
  1071. +   __urw_read_lock(&urw->rwlock);
  1072. +   spin_release(&urw->lock.dep_map, 1, _RET_IP_);
  1073. +   do_raw_spin_unlock(&urw->lock);
  1074. +}
  1075. +
  1076. +static inline void urw_runlock_irq(urwlock_t *urw)
  1077. +   __releases(urw->rwlock)
  1078. +{
  1079. +   read_unlock_irq(&urw->rwlock);
  1080. +}
  1081. +
  1082. +static inline void urw_ulock_irq(urwlock_t *urw)
  1083. +   __acquires(urw->lock)
  1084. +{
  1085. +   raw_spin_lock_irq(&urw->lock);
  1086. +}
  1087. +
  1088. +static inline void urw_uunlock_irq(urwlock_t *urw)
  1089. +   __releases(urw->lock)
  1090. +{
  1091. +   raw_spin_unlock_irq(&urw->lock);
  1092. +}
  1093. +
  1094. +static inline void urw_wlock_irqsave(urwlock_t *urw, unsigned long *flags)
  1095. +   __acquires(urw->lock)
  1096. +   __acquires(urw->rwlock)
  1097. +{
  1098. +   raw_spin_lock_irqsave(&urw->lock, *flags);
  1099. +   __urw_write_lock(&urw->rwlock);
  1100. +}
  1101. +
  1102. +static inline void urw_wunlock_irqrestore(urwlock_t *urw, unsigned long *flags)
  1103. +   __releases(urw->rwlock)
  1104. +   __releases(urw->lock)
  1105. +{
  1106. +   __urw_write_unlock(&urw->rwlock);
  1107. +   raw_spin_unlock_irqrestore(&urw->lock, *flags);
  1108. +}
  1109. +
  1110. +static inline void urw_ulock_irqsave(urwlock_t *urw, unsigned long *flags)
  1111. +   __acquires(urw->lock)
  1112. +{
  1113. +   raw_spin_lock_irqsave(&urw->lock, *flags);
  1114. +}
  1115. +
  1116. +static inline void urw_uunlock_irqrestore(urwlock_t *urw, unsigned long *flags)
  1117. +   __releases(urw->lock)
  1118. +{
  1119. +   raw_spin_unlock_irqrestore(&urw->lock, *flags);
  1120. +}
  1121. +
  1122. +static inline void urw_rlock_irqsave(urwlock_t *urw, unsigned long *flags)
  1123. +   __acquires(urw->lock)
  1124. +   __acquires(urw->rwlock)
  1125. +   __releases(urw->lock)
  1126. +{
  1127. +   raw_spin_lock_irqsave(&urw->lock, *flags);
  1128. +   __urw_read_lock(&urw->rwlock);
  1129. +   spin_release(&urw->lock.dep_map, 1, _RET_IP_);
  1130. +   do_raw_spin_unlock(&urw->lock);
  1131. +}
  1132. +
  1133. +static inline void urw_runlock_irqrestore(urwlock_t *urw, unsigned long *flags)
  1134. +   __releases(urw->rwlock)
  1135. +{
  1136. +   read_unlock_irqrestore(&urw->rwlock, *flags);
  1137. +}
  1138. +
  1139. +#endif /* __LINUX_URWLOCK_H */
  1140. diff -ruNb a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
  1141. --- a/include/net/inet_timewait_sock.h  2012-10-12 21:48:25.000000000 +0100
  1142. +++ b/include/net/inet_timewait_sock.h  2012-10-21 16:28:24.322664154 +0100
  1143. @@ -38,8 +38,8 @@
  1144.   * If time > 4sec, it is "slow" path, no recycling is required,
  1145.   * so that we select tick to get range about 4 seconds.
  1146.   */
  1147. -#if HZ <= 16 || HZ > 4096
  1148. -# error Unsupported: HZ <= 16 or HZ > 4096
  1149. +#if HZ <= 16 || HZ > 16384
  1150. +# error Unsupported: HZ <= 16 or HZ > 16384
  1151.  #elif HZ <= 32
  1152.  # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
  1153.  #elif HZ <= 64
  1154. @@ -54,8 +54,12 @@
  1155.  # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
  1156.  #elif HZ <= 2048
  1157.  # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
  1158. -#else
  1159. +#elif HZ <= 4096
  1160.  # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
  1161. +#elif HZ <= 8192
  1162. +# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
  1163. +#else
  1164. +# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
  1165.  #endif
  1166.  
  1167.  /* TIME_WAIT reaping mechanism. */
  1168. diff -ruNb a/init/calibrate.c b/init/calibrate.c
  1169. --- a/init/calibrate.c  2012-10-12 21:48:25.000000000 +0100
  1170. +++ b/init/calibrate.c  2012-10-21 16:28:24.322664154 +0100
  1171. @@ -294,7 +294,7 @@
  1172.     if (!printed)
  1173.         pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
  1174.             lpj/(500000/HZ),
  1175. -           (lpj/(5000/HZ)) % 100, lpj);
  1176. +           (lpj * 10 /(50000 / HZ)) % 100, lpj);
  1177.  
  1178.     loops_per_jiffy = lpj;
  1179.     printed = true;
  1180. diff -ruNb a/init/Kconfig b/init/Kconfig
  1181. --- a/init/Kconfig  2012-10-12 21:48:25.000000000 +0100
  1182. +++ b/init/Kconfig  2012-10-21 16:28:24.290670295 +0100
  1183. @@ -32,6 +32,19 @@
  1184.  
  1185.  menu "General setup"
  1186.  
  1187. +config SCHED_BFS
  1188. +   bool "BFS cpu scheduler"
  1189. +   ---help---
  1190. +     The Brain Fuck CPU Scheduler for excellent interactivity and
  1191. +     responsiveness on the desktop and solid scalability on normal
  1192. +          hardware. Not recommended for 4096 CPUs.
  1193. +
  1194. +     Currently incompatible with the Group CPU scheduler, and RCU TORTURE
  1195. +          TEST so these options are disabled.
  1196. +
  1197. +          Say Y here.
  1198. +   default y
  1199. +
  1200.  config EXPERIMENTAL
  1201.     bool "Prompt for development and/or incomplete code/drivers"
  1202.     ---help---
  1203. @@ -676,6 +689,7 @@
  1204.  
  1205.  config CGROUP_CPUACCT
  1206.     bool "Simple CPU accounting cgroup subsystem"
  1207. +   depends on !SCHED_BFS
  1208.     help
  1209.       Provides a simple Resource Controller for monitoring the
  1210.       total CPU consumed by the tasks in a cgroup.
  1211. @@ -763,6 +777,7 @@
  1212.  
  1213.  menuconfig CGROUP_SCHED
  1214.     bool "Group CPU scheduler"
  1215. +   depends on !SCHED_BFS
  1216.     default n
  1217.     help
  1218.       This feature lets CPU scheduler recognize task groups and control CPU
  1219. @@ -1027,6 +1042,7 @@
  1220.  
  1221.  config SCHED_AUTOGROUP
  1222.     bool "Automatic process group scheduling"
  1223. +   depends on !SCHED_BFS
  1224.     select EVENTFD
  1225.     select CGROUPS
  1226.     select CGROUP_SCHED
  1227. @@ -1411,38 +1427,8 @@
  1228.  
  1229.       On non-ancient distros (post-2000 ones) N is usually a safe choice.
  1230.  
  1231. -choice
  1232. -   prompt "Choose SLAB allocator"
  1233. -   default SLUB
  1234. -   help
  1235. -      This option allows to select a slab allocator.
  1236. -
  1237. -config SLAB
  1238. -   bool "SLAB"
  1239. -   help
  1240. -     The regular slab allocator that is established and known to work
  1241. -     well in all environments. It organizes cache hot objects in
  1242. -     per cpu and per node queues.
  1243. -
  1244.  config SLUB
  1245. -   bool "SLUB (Unqueued Allocator)"
  1246. -   help
  1247. -      SLUB is a slab allocator that minimizes cache line usage
  1248. -      instead of managing queues of cached objects (SLAB approach).
  1249. -      Per cpu caching is realized using slabs of objects instead
  1250. -      of queues of objects. SLUB can use memory efficiently
  1251. -      and has enhanced diagnostics. SLUB is the default choice for
  1252. -      a slab allocator.
  1253. -
  1254. -config SLOB
  1255. -   depends on EXPERT
  1256. -   bool "SLOB (Simple Allocator)"
  1257. -   help
  1258. -      SLOB replaces the stock allocator with a drastically simpler
  1259. -      allocator. SLOB is generally more space efficient but
  1260. -      does not perform as well on large systems.
  1261. -
  1262. -endchoice
  1263. +   def_bool y
  1264.  
  1265.  config MMAP_ALLOW_UNINITIALIZED
  1266.     bool "Allow mmapped anonymous memory to be uninitialized"
  1267. diff -ruNb a/init/main.c b/init/main.c
  1268. --- a/init/main.c   2012-10-12 21:48:25.000000000 +0100
  1269. +++ b/init/main.c   2012-10-21 16:28:24.290670295 +0100
  1270. @@ -804,6 +804,7 @@
  1271.     system_state = SYSTEM_RUNNING;
  1272.     numa_default_policy();
  1273.  
  1274. +   print_scheduler_version();
  1275.  
  1276.     current->signal->flags |= SIGNAL_UNKILLABLE;
  1277.  
  1278. diff -ruNb a/kernel/delayacct.c b/kernel/delayacct.c
  1279. --- a/kernel/delayacct.c    2012-10-12 21:48:25.000000000 +0100
  1280. +++ b/kernel/delayacct.c    2012-10-21 16:28:24.291670103 +0100
  1281. @@ -130,7 +130,7 @@
  1282.      */
  1283.     t1 = tsk->sched_info.pcount;
  1284.     t2 = tsk->sched_info.run_delay;
  1285. -   t3 = tsk->se.sum_exec_runtime;
  1286. +   t3 = tsk_seruntime(tsk);
  1287.  
  1288.     d->cpu_count += t1;
  1289.  
  1290. diff -ruNb a/kernel/exit.c b/kernel/exit.c
  1291. --- a/kernel/exit.c 2012-10-12 21:48:25.000000000 +0100
  1292. +++ b/kernel/exit.c 2012-10-21 16:28:24.291670103 +0100
  1293. @@ -145,7 +145,7 @@
  1294.         sig->inblock += task_io_get_inblock(tsk);
  1295.         sig->oublock += task_io_get_oublock(tsk);
  1296.         task_io_accounting_add(&sig->ioac, &tsk->ioac);
  1297. -       sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
  1298. +       sig->sum_sched_runtime += tsk_seruntime(tsk);
  1299.     }
  1300.  
  1301.     sig->nr_threads--;
  1302. diff -ruNb a/kernel/Kconfig.hz b/kernel/Kconfig.hz
  1303. --- a/kernel/Kconfig.hz 2012-10-12 21:48:25.000000000 +0100
  1304. +++ b/kernel/Kconfig.hz 2012-10-21 16:28:24.322664154 +0100
  1305. @@ -4,7 +4,7 @@
  1306.  
  1307.  choice
  1308.     prompt "Timer frequency"
  1309. -   default HZ_250
  1310. +   default HZ_1000
  1311.     help
  1312.      Allows the configuration of the timer frequency. It is customary
  1313.      to have the timer interrupt run at 1000 Hz but 100 Hz may be more
  1314. @@ -23,13 +23,14 @@
  1315.       with lots of processors that may show reduced performance if
  1316.       too many timer interrupts are occurring.
  1317.  
  1318. -   config HZ_250
  1319. +   config HZ_250_NODEFAULT
  1320.         bool "250 HZ"
  1321.     help
  1322. -    250 Hz is a good compromise choice allowing server performance
  1323. -    while also showing good interactive responsiveness even
  1324. -    on SMP and NUMA systems. If you are going to be using NTSC video
  1325. -    or multimedia, selected 300Hz instead.
  1326. +    250 HZ is a lousy compromise choice allowing server interactivity
  1327. +    while also showing desktop throughput and no extra power saving on
  1328. +    laptops. No good for anything.
  1329. +
  1330. +    Recommend 100 or 1000 instead.
  1331.  
  1332.     config HZ_300
  1333.         bool "300 HZ"
  1334. @@ -43,16 +44,82 @@
  1335.         bool "1000 HZ"
  1336.     help
  1337.      1000 Hz is the preferred choice for desktop systems and other
  1338. -    systems requiring fast interactive responses to events.
  1339. +    systems requiring fast interactive responses to events. Laptops
  1340. +    can also benefit from this choice without sacrificing battery life
  1341. +    if dynticks is also enabled.
  1342. +
  1343. +   config HZ_1500
  1344. +       bool "1500 HZ"
  1345. +   help
  1346. +    1500 Hz is an insane value to use to run broken software that is Hz
  1347. +    limited.
  1348. +
  1349. +    Being over 1000, driver breakage is likely.
  1350. +
  1351. +   config HZ_2000
  1352. +       bool "2000 HZ"
  1353. +   help
  1354. +    2000 Hz is an insane value to use to run broken software that is Hz
  1355. +    limited.
  1356. +
  1357. +    Being over 1000, driver breakage is likely.
  1358. +
  1359. +   config HZ_3000
  1360. +       bool "3000 HZ"
  1361. +   help
  1362. +    3000 Hz is an insane value to use to run broken software that is Hz
  1363. +    limited.
  1364. +
  1365. +    Being over 1000, driver breakage is likely.
  1366. +
  1367. +   config HZ_4000
  1368. +       bool "4000 HZ"
  1369. +   help
  1370. +    4000 Hz is an insane value to use to run broken software that is Hz
  1371. +    limited.
  1372. +
  1373. +    Being over 1000, driver breakage is likely.
  1374. +
  1375. +   config HZ_5000
  1376. +       bool "5000 HZ"
  1377. +   help
  1378. +    5000 Hz is an obscene value to use to run broken software that is Hz
  1379. +    limited.
  1380. +
  1381. +    Being over 1000, driver breakage is likely.
  1382. +
  1383. +   config HZ_7500
  1384. +       bool "7500 HZ"
  1385. +   help
  1386. +    7500 Hz is an obscene value to use to run broken software that is Hz
  1387. +    limited.
  1388. +
  1389. +    Being over 1000, driver breakage is likely.
  1390. +
  1391. +   config HZ_10000
  1392. +       bool "10000 HZ"
  1393. +   help
  1394. +    10000 Hz is an obscene value to use to run broken software that is Hz
  1395. +    limited.
  1396. +
  1397. +    Being over 1000, driver breakage is likely.
  1398. +
  1399.  
  1400.  endchoice
  1401.  
  1402.  config HZ
  1403.     int
  1404.     default 100 if HZ_100
  1405. -   default 250 if HZ_250
  1406. +   default 250 if HZ_250_NODEFAULT
  1407.     default 300 if HZ_300
  1408.     default 1000 if HZ_1000
  1409. +   default 1500 if HZ_1500
  1410. +   default 2000 if HZ_2000
  1411. +   default 3000 if HZ_3000
  1412. +   default 4000 if HZ_4000
  1413. +   default 5000 if HZ_5000
  1414. +   default 7500 if HZ_7500
  1415. +   default 10000 if HZ_10000
  1416.  
  1417.  config SCHED_HRTICK
  1418.     def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
  1419. diff -ruNb a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
  1420. --- a/kernel/Kconfig.preempt    2012-10-12 21:48:25.000000000 +0100
  1421. +++ b/kernel/Kconfig.preempt    2012-10-21 16:28:24.324663772 +0100
  1422. @@ -1,7 +1,7 @@
  1423.  
  1424.  choice
  1425.     prompt "Preemption Model"
  1426. -   default PREEMPT_NONE
  1427. +   default PREEMPT
  1428.  
  1429.  config PREEMPT_NONE
  1430.     bool "No Forced Preemption (Server)"
  1431. @@ -17,7 +17,7 @@
  1432.       latencies.
  1433.  
  1434.  config PREEMPT_VOLUNTARY
  1435. -   bool "Voluntary Kernel Preemption (Desktop)"
  1436. +   bool "Voluntary Kernel Preemption (Nothing)"
  1437.     help
  1438.       This option reduces the latency of the kernel by adding more
  1439.       "explicit preemption points" to the kernel code. These new
  1440. @@ -31,7 +31,8 @@
  1441.       applications to run more 'smoothly' even when the system is
  1442.       under load.
  1443.  
  1444. -     Select this if you are building a kernel for a desktop system.
  1445. +     Select this for no system in particular (choose Preemptible
  1446. +     instead on a desktop if you know what's good for you).
  1447.  
  1448.  config PREEMPT
  1449.     bool "Preemptible Kernel (Low-Latency Desktop)"
  1450. diff -ruNb a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
  1451. --- a/kernel/posix-cpu-timers.c 2012-10-12 21:48:25.000000000 +0100
  1452. +++ b/kernel/posix-cpu-timers.c 2012-10-21 16:28:24.292669911 +0100
  1453. @@ -495,7 +495,7 @@
  1454.  void posix_cpu_timers_exit(struct task_struct *tsk)
  1455.  {
  1456.     cleanup_timers(tsk->cpu_timers,
  1457. -              tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
  1458. +              tsk->utime, tsk->stime, tsk_seruntime(tsk));
  1459.  
  1460.  }
  1461.  void posix_cpu_timers_exit_group(struct task_struct *tsk)
  1462. @@ -504,7 +504,7 @@
  1463.  
  1464.     cleanup_timers(tsk->signal->cpu_timers,
  1465.                tsk->utime + sig->utime, tsk->stime + sig->stime,
  1466. -              tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
  1467. +              tsk_seruntime(tsk) + sig->sum_sched_runtime);
  1468.  }
  1469.  
  1470.  static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
  1471. @@ -934,7 +934,7 @@
  1472.         struct cpu_timer_list *t = list_first_entry(timers,
  1473.                               struct cpu_timer_list,
  1474.                               entry);
  1475. -       if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
  1476. +       if (!--maxfire || tsk_seruntime(tsk) < t->expires.sched) {
  1477.             tsk->cputime_expires.sched_exp = t->expires.sched;
  1478.             break;
  1479.         }
  1480. @@ -951,7 +951,7 @@
  1481.             ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
  1482.  
  1483.         if (hard != RLIM_INFINITY &&
  1484. -           tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
  1485. +           tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
  1486.             /*
  1487.              * At the hard limit, we just die.
  1488.              * No need to calculate anything else now.
  1489. @@ -959,7 +959,7 @@
  1490.             __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
  1491.             return;
  1492.         }
  1493. -       if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
  1494. +       if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
  1495.             /*
  1496.              * At the soft limit, send a SIGXCPU every second.
  1497.              */
  1498. @@ -1252,7 +1252,7 @@
  1499.         struct task_cputime task_sample = {
  1500.             .utime = tsk->utime,
  1501.             .stime = tsk->stime,
  1502. -           .sum_exec_runtime = tsk->se.sum_exec_runtime
  1503. +           .sum_exec_runtime = tsk_seruntime(tsk)
  1504.         };
  1505.  
  1506.         if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
  1507. diff -ruNb a/kernel/sched/bfs.c b/kernel/sched/bfs.c
  1508. --- a/kernel/sched/bfs.c    1970-01-01 01:00:00.000000000 +0100
  1509. +++ b/kernel/sched/bfs.c    2012-10-21 16:28:24.333662044 +0100
  1510. @@ -0,0 +1,7570 @@
  1511. +/*
  1512. + *  kernel/sched/bfs.c, was kernel/sched.c
  1513. + *
  1514. + *  Kernel scheduler and related syscalls
  1515. + *
  1516. + *  Copyright (C) 1991-2002  Linus Torvalds
  1517. + *
  1518. + *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  1519. + *     make semaphores SMP safe
  1520. + *  1998-11-19 Implemented schedule_timeout() and related stuff
  1521. + *     by Andrea Arcangeli
  1522. + *  2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
  1523. + *     hybrid priority-list and round-robin design with
  1524. + *     an array-switch method of distributing timeslices
  1525. + *     and per-CPU runqueues.  Cleanups and useful suggestions
  1526. + *     by Davide Libenzi, preemptible kernel bits by Robert Love.
  1527. + *  2003-09-03 Interactivity tuning by Con Kolivas.
  1528. + *  2004-04-02 Scheduler domains code by Nick Piggin
  1529. + *  2007-04-15  Work begun on replacing all interactivity tuning with a
  1530. + *              fair scheduling design by Con Kolivas.
  1531. + *  2007-05-05  Load balancing (smp-nice) and other improvements
  1532. + *              by Peter Williams
  1533. + *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  1534. + *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  1535. + *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  1536. + *              Thomas Gleixner, Mike Kravetz
  1537. + *  now        Brainfuck deadline scheduling policy by Con Kolivas deletes
  1538. + *              a whole lot of those previous things.
  1539. + */
  1540. +
  1541. +#include <linux/mm.h>
  1542. +#include <linux/module.h>
  1543. +#include <linux/nmi.h>
  1544. +#include <linux/init.h>
  1545. +#include <asm/uaccess.h>
  1546. +#include <linux/highmem.h>
  1547. +#include <asm/mmu_context.h>
  1548. +#include <linux/interrupt.h>
  1549. +#include <linux/capability.h>
  1550. +#include <linux/completion.h>
  1551. +#include <linux/kernel_stat.h>
  1552. +#include <linux/debug_locks.h>
  1553. +#include <linux/perf_event.h>
  1554. +#include <linux/security.h>
  1555. +#include <linux/notifier.h>
  1556. +#include <linux/profile.h>
  1557. +#include <linux/freezer.h>
  1558. +#include <linux/vmalloc.h>
  1559. +#include <linux/blkdev.h>
  1560. +#include <linux/delay.h>
  1561. +#include <linux/smp.h>
  1562. +#include <linux/threads.h>
  1563. +#include <linux/timer.h>
  1564. +#include <linux/rcupdate.h>
  1565. +#include <linux/cpu.h>
  1566. +#include <linux/cpuset.h>
  1567. +#include <linux/cpumask.h>
  1568. +#include <linux/percpu.h>
  1569. +#include <linux/proc_fs.h>
  1570. +#include <linux/seq_file.h>
  1571. +#include <linux/syscalls.h>
  1572. +#include <linux/times.h>
  1573. +#include <linux/tsacct_kern.h>
  1574. +#include <linux/kprobes.h>
  1575. +#include <linux/delayacct.h>
  1576. +#include <linux/log2.h>
  1577. +#include <linux/bootmem.h>
  1578. +#include <linux/ftrace.h>
  1579. +#include <linux/slab.h>
  1580. +#include <linux/init_task.h>
  1581. +#include <linux/binfmts.h>
  1582. +#include <linux/urwlock.h>
  1583. +
  1584. +#include <asm/switch_to.h>
  1585. +#include <asm/tlb.h>
  1586. +#include <asm/unistd.h>
  1587. +#include <asm/mutex.h>
  1588. +#ifdef CONFIG_PARAVIRT
  1589. +#include <asm/paravirt.h>
  1590. +#endif
  1591. +
  1592. +#include "cpupri.h"
  1593. +#include "../workqueue_sched.h"
  1594. +#include "../smpboot.h"
  1595. +
  1596. +#define CREATE_TRACE_POINTS
  1597. +#include <trace/events/sched.h>
  1598. +
  1599. +#define rt_prio(prio)      unlikely((prio) < MAX_RT_PRIO)
  1600. +#define rt_task(p)     rt_prio((p)->prio)
  1601. +#define rt_queue(rq)       rt_prio((rq)->rq_prio)
  1602. +#define batch_task(p)      (unlikely((p)->policy == SCHED_BATCH))
  1603. +#define is_rt_policy(policy)   ((policy) == SCHED_FIFO || \
  1604. +                   (policy) == SCHED_RR)
  1605. +#define has_rt_policy(p)   unlikely(is_rt_policy((p)->policy))
  1606. +#define idleprio_task(p)   unlikely((p)->policy == SCHED_IDLEPRIO)
  1607. +#define iso_task(p)        unlikely((p)->policy == SCHED_ISO)
  1608. +#define iso_queue(rq)      unlikely((rq)->rq_policy == SCHED_ISO)
  1609. +#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO)
  1610. +
  1611. +#define ISO_PERIOD     ((5 * HZ * grq.noc) + 1)
  1612. +
  1613. +/*
  1614. + * Convert user-nice values [ -20 ... 0 ... 19 ]
  1615. + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  1616. + * and back.
  1617. + */
  1618. +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
  1619. +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
  1620. +#define TASK_NICE(p)       PRIO_TO_NICE((p)->static_prio)
  1621. +
  1622. +/*
  1623. + * 'User priority' is the nice value converted to something we
  1624. + * can work with better when scaling various scheduler parameters,
  1625. + * it's a [ 0 ... 39 ] range.
  1626. + */
  1627. +#define USER_PRIO(p)       ((p) - MAX_RT_PRIO)
  1628. +#define TASK_USER_PRIO(p)  USER_PRIO((p)->static_prio)
  1629. +#define MAX_USER_PRIO      (USER_PRIO(MAX_PRIO))
  1630. +#define SCHED_PRIO(p)      ((p) + MAX_RT_PRIO)
  1631. +#define STOP_PRIO      (MAX_RT_PRIO - 1)
  1632. +
  1633. +/*
  1634. + * Some helpers for converting to/from various scales. Shifts by 10 and 20
  1635. + * approximate factors of 1000 and 1000000 for less overhead.
  1636. + */
  1637. +#define JIFFIES_TO_NS(TIME)    ((TIME) * (1000000000 / HZ))
  1638. +#define JIFFY_NS       (1000000000 / HZ)
  1639. +#define HALF_JIFFY_NS      (1000000000 / HZ / 2)
  1640. +#define HALF_JIFFY_US      (1000000 / HZ / 2)
  1641. +#define MS_TO_NS(TIME)     ((TIME) << 20)
  1642. +#define MS_TO_US(TIME)     ((TIME) << 10)
  1643. +#define NS_TO_MS(TIME)     ((TIME) >> 20)
  1644. +#define NS_TO_US(TIME)     ((TIME) >> 10)
  1645. +
  1646. +#define RESCHED_US (100) /* Reschedule if less than this many μs left */
  1647. +
  1648. +void print_scheduler_version(void)
  1649. +{
  1650. +   printk(KERN_INFO "BFS CPU scheduler v0.424 by Con Kolivas.\n");
  1651. +}
  1652. +
  1653. +/*
  1654. + * This is the time all tasks within the same priority round robin.
  1655. + * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
  1656. + * Tunable via /proc interface.
  1657. + */
  1658. +int rr_interval __read_mostly = 6;
  1659. +
  1660. +/*
  1661. + * sched_iso_cpu - sysctl which determines the percentage of cpu time that
  1662. + * SCHED_ISO tasks are allowed to run as real time tasks over a five second
  1663. + * period. This is the total over all online cpus.
  1664. + */
  1665. +int sched_iso_cpu __read_mostly = 70;
  1666. +
  1667. +/*
  1668. + * The relative length of deadline for each priority(nice) level.
  1669. + */
  1670. +static int prio_ratios[PRIO_RANGE] __read_mostly;
  1671. +
  1672. +/*
  1673. + * The quota handed out to tasks of all priority levels when refilling their
  1674. + * time_slice.
  1675. + */
  1676. +static inline int timeslice(void)
  1677. +{
  1678. +   return MS_TO_US(rr_interval);
  1679. +}
  1680. +
  1681. +/*
  1682. + * The global runqueue data that all CPUs work off. Data is protected either
  1683. + * by the global grq lock, or the discrete lock that precedes the data in this
  1684. + * struct.
  1685. + */
  1686. +struct global_rq {
  1687. +   urwlock_t urw;
  1688. +   unsigned long nr_running;
  1689. +   unsigned long nr_uninterruptible;
  1690. +   unsigned long long nr_switches;
  1691. +   struct list_head queue[PRIO_LIMIT];
  1692. +   DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
  1693. +#ifdef CONFIG_SMP
  1694. +   unsigned long qnr; /* queued not running */
  1695. +   cpumask_t cpu_idle_map;
  1696. +   bool idle_cpus;
  1697. +#endif
  1698. +   int noc; /* num_online_cpus stored and updated when it changes */
  1699. +   u64 niffies; /* Nanosecond jiffies */
  1700. +   unsigned long last_jiffy; /* Last jiffy we updated niffies */
  1701. +
  1702. +   raw_spinlock_t iso_lock;
  1703. +   int iso_ticks;
  1704. +   bool iso_refractory;
  1705. +};
  1706. +
  1707. +#ifdef CONFIG_SMP
  1708. +
  1709. +/*
  1710. + * We add the notion of a root-domain which will be used to define per-domain
  1711. + * variables. Each exclusive cpuset essentially defines an island domain by
  1712. + * fully partitioning the member cpus from any other cpuset. Whenever a new
  1713. + * exclusive cpuset is created, we also create and attach a new root-domain
  1714. + * object.
  1715. + *
  1716. + */
  1717. +struct root_domain {
  1718. +   atomic_t refcount;
  1719. +   atomic_t rto_count;
  1720. +   struct rcu_head rcu;
  1721. +   cpumask_var_t span;
  1722. +   cpumask_var_t online;
  1723. +
  1724. +   /*
  1725. +    * The "RT overload" flag: it gets set if a CPU has more than
  1726. +    * one runnable RT task.
  1727. +    */
  1728. +   cpumask_var_t rto_mask;
  1729. +   struct cpupri cpupri;
  1730. +};
  1731. +
  1732. +/*
  1733. + * By default the system creates a single root-domain with all cpus as
  1734. + * members (mimicking the global state we have today).
  1735. + */
  1736. +static struct root_domain def_root_domain;
  1737. +
  1738. +#endif /* CONFIG_SMP */
  1739. +
  1740. +/* There can be only one */
  1741. +static struct global_rq grq;
  1742. +
  1743. +/*
  1744. + * This is the main, per-CPU runqueue data structure.
  1745. + * This data should only be modified by the local cpu.
  1746. + */
  1747. +struct rq {
  1748. +#ifdef CONFIG_SMP
  1749. +#ifdef CONFIG_NO_HZ
  1750. +   u64 nohz_stamp;
  1751. +   unsigned char in_nohz_recently;
  1752. +#endif
  1753. +#endif
  1754. +
  1755. +   struct task_struct *curr, *idle, *stop;
  1756. +   struct mm_struct *prev_mm;
  1757. +
  1758. +   /* Stored data about rq->curr to work outside grq lock */
  1759. +   u64 rq_deadline;
  1760. +   unsigned int rq_policy;
  1761. +   int rq_time_slice;
  1762. +   u64 rq_last_ran;
  1763. +   int rq_prio;
  1764. +   bool rq_running; /* There is a task running */
  1765. +
  1766. +   /* Accurate timekeeping data */
  1767. +   u64 timekeep_clock;
  1768. +   unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
  1769. +       iowait_pc, idle_pc;
  1770. +   long account_pc;
  1771. +   atomic_t nr_iowait;
  1772. +
  1773. +#ifdef CONFIG_SMP
  1774. +   int cpu;        /* cpu of this runqueue */
  1775. +   bool online;
  1776. +   bool scaling; /* This CPU is managed by a scaling CPU freq governor */
  1777. +   struct task_struct *sticky_task;
  1778. +
  1779. +   struct root_domain *rd;
  1780. +   struct sched_domain *sd;
  1781. +   int *cpu_locality; /* CPU relative cache distance */
  1782. +#ifdef CONFIG_SCHED_SMT
  1783. +   bool (*siblings_idle)(int cpu);
  1784. +   /* See if all smt siblings are idle */
  1785. +   cpumask_t smt_siblings;
  1786. +#endif
  1787. +#ifdef CONFIG_SCHED_MC
  1788. +   bool (*cache_idle)(int cpu);
  1789. +   /* See if all cache siblings are idle */
  1790. +   cpumask_t cache_siblings;
  1791. +#endif
  1792. +   u64 last_niffy; /* Last time this RQ updated grq.niffies */
  1793. +#endif
  1794. +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
  1795. +   u64 prev_irq_time;
  1796. +#endif
  1797. +#ifdef CONFIG_PARAVIRT
  1798. +   u64 prev_steal_time;
  1799. +#endif
  1800. +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
  1801. +   u64 prev_steal_time_rq;
  1802. +#endif
  1803. +
  1804. +   u64 clock, old_clock, last_tick;
  1805. +   u64 clock_task;
  1806. +   bool dither;
  1807. +
  1808. +#ifdef CONFIG_SCHEDSTATS
  1809. +
  1810. +   /* latency stats */
  1811. +   struct sched_info rq_sched_info;
  1812. +   unsigned long long rq_cpu_time;
  1813. +   /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  1814. +
  1815. +   /* sys_sched_yield() stats */
  1816. +   unsigned int yld_count;
  1817. +
  1818. +   /* schedule() stats */
  1819. +   unsigned int sched_switch;
  1820. +   unsigned int sched_count;
  1821. +   unsigned int sched_goidle;
  1822. +
  1823. +   /* try_to_wake_up() stats */
  1824. +   unsigned int ttwu_count;
  1825. +   unsigned int ttwu_local;
  1826. +#endif
  1827. +};
  1828. +
  1829. +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  1830. +static DEFINE_MUTEX(sched_hotcpu_mutex);
  1831. +
  1832. +#ifdef CONFIG_SMP
  1833. +/*
  1834. + * sched_domains_mutex serialises calls to init_sched_domains,
  1835. + * detach_destroy_domains and partition_sched_domains.
  1836. + */
  1837. +static DEFINE_MUTEX(sched_domains_mutex);
  1838. +
  1839. +/*
  1840. + * By default the system creates a single root-domain with all cpus as
  1841. + * members (mimicking the global state we have today).
  1842. + */
  1843. +static struct root_domain def_root_domain;
  1844. +
  1845. +int __weak arch_sd_sibling_asym_packing(void)
  1846. +{
  1847. +       return 0*SD_ASYM_PACKING;
  1848. +}
  1849. +#endif
  1850. +
  1851. +#define rcu_dereference_check_sched_domain(p) \
  1852. +   rcu_dereference_check((p), \
  1853. +                 lockdep_is_held(&sched_domains_mutex))
  1854. +
  1855. +/*
  1856. + * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  1857. + * See detach_destroy_domains: synchronize_sched for details.
  1858. + *
  1859. + * The domain tree of any CPU may only be accessed from within
  1860. + * preempt-disabled sections.
  1861. + */
  1862. +#define for_each_domain(cpu, __sd) \
  1863. +   for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
  1864. +
  1865. +static inline void update_rq_clock(struct rq *rq);
  1866. +
  1867. +/*
  1868. + * Sanity check should sched_clock return bogus values. We make sure it does
  1869. + * not appear to go backwards, and use jiffies to determine the maximum and
  1870. + * minimum it could possibly have increased, and fall back to that minimum
  1871. + * when it falls outside this range.
  1872. + */
  1873. +static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
  1874. +{
  1875. +   unsigned long min_diff, max_diff;
  1876. +
  1877. +   if (jiff_diff > 1)
  1878. +       min_diff = JIFFIES_TO_NS(jiff_diff - 1);
  1879. +   else
  1880. +       min_diff = 1;
  1881. +   /*  Round up to the nearest tick for maximum */
  1882. +   max_diff = JIFFIES_TO_NS(jiff_diff + 1);
  1883. +
  1884. +   if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
  1885. +       *niff_diff = min_diff;
  1886. +}
  1887. +
  1888. +#ifdef CONFIG_SMP
  1889. +#define cpu_rq(cpu)        (&per_cpu(runqueues, (cpu)))
  1890. +#define this_rq()      (&__get_cpu_var(runqueues))
  1891. +#define task_rq(p)     cpu_rq(task_cpu(p))
  1892. +#define cpu_curr(cpu)      (cpu_rq(cpu)->curr)
  1893. +static inline int cpu_of(struct rq *rq)
  1894. +{
  1895. +   return rq->cpu;
  1896. +}
  1897. +
  1898. +/*
  1899. + * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
  1900. + * clock is updated with the grq lock held, it is an opportunity to update the
  1901. + * niffies value. Any CPU can update it by adding how much its clock has
  1902. + * increased since it last updated niffies, minus any added niffies by other
  1903. + * CPUs.
  1904. + */
  1905. +static inline void update_clocks(struct rq *rq)
  1906. +{
  1907. +   s64 ndiff;
  1908. +   long jdiff;
  1909. +
  1910. +   update_rq_clock(rq);
  1911. +   ndiff = rq->clock - rq->old_clock;
  1912. +   /* old_clock is only updated when we are updating niffies */
  1913. +   rq->old_clock = rq->clock;
  1914. +   ndiff -= grq.niffies - rq->last_niffy;
  1915. +   jdiff = jiffies - grq.last_jiffy;
  1916. +   niffy_diff(&ndiff, jdiff);
  1917. +   grq.last_jiffy += jdiff;
  1918. +   grq.niffies += ndiff;
  1919. +   rq->last_niffy = grq.niffies;
  1920. +}
  1921. +#else /* CONFIG_SMP */
  1922. +static struct rq *uprq;
  1923. +#define cpu_rq(cpu)    (uprq)
  1924. +#define this_rq()  (uprq)
  1925. +#define task_rq(p) (uprq)
  1926. +#define cpu_curr(cpu)  ((uprq)->curr)
  1927. +static inline int cpu_of(struct rq *rq)
  1928. +{
  1929. +   return 0;
  1930. +}
  1931. +
  1932. +static inline void update_clocks(struct rq *rq)
  1933. +{
  1934. +   s64 ndiff;
  1935. +   long jdiff;
  1936. +
  1937. +   update_rq_clock(rq);
  1938. +   ndiff = rq->clock - rq->old_clock;
  1939. +   rq->old_clock = rq->clock;
  1940. +   jdiff = jiffies - grq.last_jiffy;
  1941. +   niffy_diff(&ndiff, jdiff);
  1942. +   grq.last_jiffy += jdiff;
  1943. +   grq.niffies += ndiff;
  1944. +}
  1945. +#endif
  1946. +#define raw_rq()   (&__raw_get_cpu_var(runqueues))
  1947. +
  1948. +#include "stats.h"
  1949. +
  1950. +#ifndef prepare_arch_switch
  1951. +# define prepare_arch_switch(next) do { } while (0)
  1952. +#endif
  1953. +#ifndef finish_arch_switch
  1954. +# define finish_arch_switch(prev)  do { } while (0)
  1955. +#endif
  1956. +#ifndef finish_arch_post_lock_switch
  1957. +# define finish_arch_post_lock_switch()    do { } while (0)
  1958. +#endif
  1959. +
  1960. +/*
  1961. + * All common locking functions performed on grq lock. rq->clock is local to
  1962. + * the CPU accessing it so it can be modified just with interrupts disabled
  1963. + * when we're not updating niffies. Some variables are redundant to the
  1964. + * behaviour and purely there as a prompt to know why the lock was taken.
  1965. + * Looking up task_rq must be done under rlock to be safe.
  1966. + */
  1967. +static void update_rq_clock_task(struct rq *rq, s64 delta);
  1968. +
  1969. +static inline void update_rq_clock(struct rq *rq)
  1970. +{
  1971. +   s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
  1972. +
  1973. +   rq->clock += delta;
  1974. +   update_rq_clock_task(rq, delta);
  1975. +}
  1976. +
  1977. +static inline bool task_running(struct task_struct *p)
  1978. +{
  1979. +   return p->on_cpu;
  1980. +}
  1981. +
  1982. +static inline void grq_wlock(void)
  1983. +{
  1984. +   urw_wlock(&grq.urw);
  1985. +}
  1986. +
  1987. +static inline void grq_wunlock(void)
  1988. +{
  1989. +   urw_wunlock(&grq.urw);
  1990. +}
  1991. +
  1992. +static inline void grq_rlock(void)
  1993. +{
  1994. +   urw_rlock(&grq.urw);
  1995. +}
  1996. +
  1997. +static inline void grq_runlock(void)
  1998. +{
  1999. +   urw_runlock(&grq.urw);
  2000. +}
  2001. +
  2002. +static inline void grq_ulock(void)
  2003. +{
  2004. +   urw_ulock(&grq.urw);
  2005. +}
  2006. +
  2007. +static inline void grq_uunlock(void)
  2008. +{
  2009. +   urw_uunlock(&grq.urw);
  2010. +}
  2011. +
  2012. +static inline void grq_upgrade(void)
  2013. +{
  2014. +   urw_upgrade(&grq.urw);
  2015. +}
  2016. +
  2017. +static inline void grq_udowngrade(void)
  2018. +{
  2019. +   urw_udowngrade(&grq.urw);
  2020. +}
  2021. +
  2022. +static inline void grq_wdowngrade(void)
  2023. +{
  2024. +   urw_wdowngrade(&grq.urw);
  2025. +}
  2026. +
  2027. +static inline void grq_wlock_irq(void)
  2028. +{
  2029. +   urw_wlock_irq(&grq.urw);
  2030. +}
  2031. +
  2032. +static inline void grq_ulock_irq(void)
  2033. +{
  2034. +   urw_ulock_irq(&grq.urw);
  2035. +}
  2036. +
  2037. +static inline void time_wlock_grq(struct rq *rq)
  2038. +{
  2039. +   grq_wlock();
  2040. +   update_clocks(rq);
  2041. +}
  2042. +
  2043. +static inline void grq_wunlock_irq(void)
  2044. +{
  2045. +   urw_wunlock_irq(&grq.urw);
  2046. +}
  2047. +
  2048. +static inline void grq_runlock_irq(void)
  2049. +{
  2050. +   urw_runlock_irq(&grq.urw);
  2051. +}
  2052. +
  2053. +static inline void grq_wlock_irqsave(unsigned long *flags)
  2054. +{
  2055. +   urw_wlock_irqsave(&grq.urw, flags);
  2056. +}
  2057. +
  2058. +static inline void grq_ulock_irqsave(unsigned long *flags)
  2059. +{
  2060. +   urw_ulock_irqsave(&grq.urw, flags);
  2061. +}
  2062. +
  2063. +static inline void grq_rlock_irqsave(unsigned long *flags)
  2064. +{
  2065. +   urw_rlock_irqsave(&grq.urw, flags);
  2066. +}
  2067. +
  2068. +static inline void grq_wunlock_irqrestore(unsigned long *flags)
  2069. +{
  2070. +   urw_wunlock_irqrestore(&grq.urw, flags);
  2071. +}
  2072. +
  2073. +static inline void grq_uunlock_irqrestore(unsigned long *flags)
  2074. +{
  2075. +   urw_uunlock_irqrestore(&grq.urw, flags);
  2076. +}
  2077. +
  2078. +static inline void grq_runlock_irqrestore(unsigned long *flags)
  2079. +{
  2080. +   urw_runlock_irqrestore(&grq.urw, flags);
  2081. +}
  2082. +
  2083. +static inline struct rq
  2084. +*task_grq_wlock(struct task_struct *p, unsigned long *flags)
  2085. +{
  2086. +   grq_wlock_irqsave(flags);
  2087. +   return task_rq(p);
  2088. +}
  2089. +
  2090. +static inline struct rq
  2091. +*task_grq_ulock(struct task_struct *p, unsigned long *flags)
  2092. +{
  2093. +   grq_ulock_irqsave(flags);
  2094. +   return task_rq(p);
  2095. +}
  2096. +
  2097. +static inline struct rq
  2098. +*task_grq_rlock(struct task_struct *p, unsigned long *flags)
  2099. +{
  2100. +   grq_rlock_irqsave(flags);
  2101. +   return task_rq(p);
  2102. +}
  2103. +
  2104. +static inline struct rq
  2105. +*time_task_grq_wlock(struct task_struct *p, unsigned long *flags)
  2106. +{
  2107. +   struct rq *rq = task_grq_wlock(p, flags);
  2108. +   update_clocks(rq);
  2109. +   return rq;
  2110. +}
  2111. +
  2112. +static inline struct rq *task_grq_wlock_irq(struct task_struct *p)
  2113. +{
  2114. +   grq_wlock_irq();
  2115. +   return task_rq(p);
  2116. +}
  2117. +
  2118. +static inline struct rq *task_grq_ulock_irq(struct task_struct *p)
  2119. +{
  2120. +   grq_ulock_irq();
  2121. +   return task_rq(p);
  2122. +}
  2123. +
  2124. +static inline void time_task_grq_wlock_irq(struct task_struct *p)
  2125. +{
  2126. +   struct rq *rq = task_grq_wlock_irq(p);
  2127. +   update_clocks(rq);
  2128. +}
  2129. +
  2130. +static inline void task_grq_wunlock_irq(void)
  2131. +{
  2132. +   grq_wunlock_irq();
  2133. +}
  2134. +
  2135. +static inline void task_grq_runlock_irq(void)
  2136. +{
  2137. +   grq_runlock_irq();
  2138. +}
  2139. +
  2140. +static inline void task_grq_wunlock(unsigned long *flags)
  2141. +{
  2142. +   grq_wunlock_irqrestore(flags);
  2143. +}
  2144. +
  2145. +static inline void task_grq_uunlock(unsigned long *flags)
  2146. +{
  2147. +   grq_uunlock_irqrestore(flags);
  2148. +}
  2149. +
  2150. +static inline void task_grq_runlock(unsigned long *flags)
  2151. +{
  2152. +   grq_runlock_irqrestore(flags);
  2153. +}
  2154. +
  2155. +/**
  2156. + * grunqueue_is_locked
  2157. + *
  2158. + * Returns true if the global runqueue is locked.
  2159. + * This interface allows printk to be called with the runqueue lock
  2160. + * held and know whether or not it is OK to wake up the klogd.
  2161. + */
  2162. +bool grunqueue_is_locked(void)
  2163. +{
  2164. +   return raw_spin_is_locked(&grq.urw.lock);
  2165. +}
  2166. +
  2167. +void grq_unlock_wait(void)
  2168. +{
  2169. +   smp_mb(); /* spin-unlock-wait is not a full memory barrier */
  2170. +   raw_spin_unlock_wait(&grq.urw.lock);
  2171. +}
  2172. +
  2173. +static inline void time_grq_wlock(struct rq *rq, unsigned long *flags)
  2174. +{
  2175. +   local_irq_save(*flags);
  2176. +   time_wlock_grq(rq);
  2177. +}
  2178. +
  2179. +static inline struct rq *__task_grq_wlock(struct task_struct *p)
  2180. +{
  2181. +   grq_wlock();
  2182. +   return task_rq(p);
  2183. +}
  2184. +
  2185. +static inline struct rq *__task_grq_ulock(struct task_struct *p)
  2186. +{
  2187. +   grq_ulock();
  2188. +   return task_rq(p);
  2189. +}
  2190. +
  2191. +static inline void __task_grq_wunlock(void)
  2192. +{
  2193. +   grq_wunlock();
  2194. +}
  2195. +
  2196. +static inline void __task_grq_uunlock(void)
  2197. +{
  2198. +   grq_uunlock();
  2199. +}
  2200. +
  2201. +/*
  2202. + * Look for any tasks *anywhere* that are running nice 0 or better. We do
  2203. + * this lockless for overhead reasons since the occasional wrong result
  2204. + * is harmless.
  2205. + */
  2206. +bool above_background_load(void)
  2207. +{
  2208. +   int cpu;
  2209. +
  2210. +   for_each_online_cpu(cpu) {
  2211. +       struct task_struct *cpu_curr = cpu_rq(cpu)->curr;
  2212. +
  2213. +       if (unlikely(!cpu_curr))
  2214. +           continue;
  2215. +       if (PRIO_TO_NICE(cpu_curr->static_prio) < 1) {
  2216. +           return true;
  2217. +       }
  2218. +   }
  2219. +   return false;
  2220. +}
  2221. +
  2222. +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
  2223. +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
  2224. +{
  2225. +}
  2226. +
  2227. +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
  2228. +{
  2229. +#ifdef CONFIG_DEBUG_SPINLOCK
  2230. +   /* this is a valid case when another task releases the spinlock */
  2231. +   grq.urw.lock.owner = current;
  2232. +   grq.urw.rwlock.owner = current;
  2233. +#endif
  2234. +   /*
  2235. +    * If we are tracking spinlock dependencies then we have to
  2236. +    * fix up the runqueue lock - which gets 'carried over' from
  2237. +    * prev into current:
  2238. +    */
  2239. +   spin_acquire(&grq.urw.lock.dep_map, 0, 0, _THIS_IP_);
  2240. +   rwlock_acquire(&grq.urw.rwlock.dep_map, 0, 0, _THIS_IP_);
  2241. +
  2242. +   grq_wunlock_irq();
  2243. +}
  2244. +
  2245. +#else /* __ARCH_WANT_UNLOCKED_CTXSW */
  2246. +
  2247. +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
  2248. +{
  2249. +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
  2250. +   grq_wunlock_irq();
  2251. +#else
  2252. +   grq_wunlock();
  2253. +#endif
  2254. +}
  2255. +
  2256. +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
  2257. +{
  2258. +   smp_wmb();
  2259. +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
  2260. +   local_irq_enable();
  2261. +#endif
  2262. +}
  2263. +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
  2264. +
  2265. +static inline bool deadline_before(u64 deadline, u64 time)
  2266. +{
  2267. +   return (deadline < time);
  2268. +}
  2269. +
  2270. +static inline bool deadline_after(u64 deadline, u64 time)
  2271. +{
  2272. +   return (deadline > time);
  2273. +}
  2274. +
  2275. +/*
  2276. + * A task that is queued but not running will be on the grq run list.
  2277. + * A task that is not running or queued will not be on the grq run list.
  2278. + * A task that is currently running will have ->on_cpu set but will not be
  2279. + * on the grq run list.
  2280. + */
  2281. +static inline bool task_queued(struct task_struct *p)
  2282. +{
  2283. +   return (!list_empty(&p->run_list));
  2284. +}
  2285. +
  2286. +/*
  2287. + * Take p off the global runqueue, updating prio_bitmap. Enter with grq locked.
  2288. + */
  2289. +static void dequeue_task(struct task_struct *p)
  2290. +{
  2291. +   list_del_init(&p->run_list);
  2292. +   if (list_empty(&grq.queue[p->prio]))
  2293. +       __clear_bit(p->prio, grq.prio_bitmap);
  2294. +}
  2295. +
  2296. +/*
  2297. + * A SCHED_IDLEPRIO task may only run as an idle task while it is not freezing,
  2298. + * has no signal pending, does not contribute to load and is not exiting.
  2299. + */
  2300. +static bool idleprio_suitable(struct task_struct *p)
  2301. +{
  2302. +   return !(freezing(p) || signal_pending(p) ||
  2303. +       task_contributes_to_load(p) || (p->flags & PF_EXITING));
  2304. +}
  2305. +
  2306. +/*
  2307. + * A SCHED_ISO task may run in pseudo-realtime only while the global
  2308. + * iso_refractory flag is clear.
  2309. + */
  2310. +static bool isoprio_suitable(void)
  2311. +{
  2312. +   return grq.iso_refractory ? false : true;
  2313. +}
  2314. +
  2315. +/*
  2316. + * Queue p at the tail of its priority list on the grq. Enter with grq locked.
  2317. + */
  2318. +static void enqueue_task(struct task_struct *p)
  2319. +{
  2320. +   if (!rt_task(p)) {
  2321. +       bool policy_prio;
  2322. +
  2323. +       /* Not rt, not even through PI: choose policy or normal prio */
  2324. +       policy_prio = (idleprio_task(p) && idleprio_suitable(p)) ||
  2325. +                 (iso_task(p) && isoprio_suitable());
  2326. +       p->prio = policy_prio ? p->normal_prio : NORMAL_PRIO;
  2327. +   }
  2328. +   __set_bit(p->prio, grq.prio_bitmap);
  2329. +   list_add_tail(&p->run_list, &grq.queue[p->prio]);
  2330. +   sched_info_queued(p);
  2331. +}
  2332. +
  2333. +/* Head-of-list insertion; only the idle task does this, as a real time task */
  2334. +static inline void enqueue_task_head(struct task_struct *p)
  2335. +{
  2336. +   __set_bit(p->prio, grq.prio_bitmap);
  2337. +   list_add(&p->run_list, &grq.queue[p->prio]);
  2338. +   sched_info_queued(p);
  2339. +}
  2340. +
  2341. +static inline void requeue_task(struct task_struct *p)
  2342. +{
  2343. +   sched_info_queued(p);
  2344. +}
  2345. +
  2346. +/*
  2347. + * Returns the length of p's deadline relative to the shortest deadline,
  2348. + * which is that of nice -20, as looked up in prio_ratios[].
  2349. + */
  2350. +static inline int task_prio_ratio(struct task_struct *p)
  2351. +{
  2352. +   return prio_ratios[TASK_USER_PRIO(p)];
  2353. +}
  2355. +/*
  2356. + * task_timeslice - rr_interval scaled by the task's priority ratio. The
  2357. + * ratios use 128 as their base value so that the division can be done
  2358. + * with a fast shift.
  2359. + */
  2360. +static inline int task_timeslice(struct task_struct *p)
  2361. +{
  2362. +   return (rr_interval * task_prio_ratio(p) / 128);
  2363. +}
  2364. +
  2365. +#ifdef CONFIG_SMP
  2366. +/*
  2367. + * qnr counts the tasks that are "queued but not running": those sitting on
  2368. + * the global runqueue list waiting for cpu time, as opposed to the ones
  2369. + * currently running on a cpu.
  2370. + */
  2371. +static inline void inc_qnr(void)
  2372. +{
  2373. +   ++grq.qnr;
  2374. +}
  2375. +
  2376. +static inline void dec_qnr(void)
  2377. +{
  2378. +   --grq.qnr;
  2379. +}
  2380. +
  2381. +static inline int queued_notrunning(void)
  2382. +{
  2383. +   return grq.qnr;
  2384. +}
  2385. +
  2386. +/*
  2387. + * grq.cpu_idle_map is a bitmap of all the CPUs that are currently idle, for
  2388. + * easy lookup of suitable idle CPUs. Alongside it, grq.idle_cpus keeps a
  2389. + * plain yes/no answer to "is any CPU idle?", which is cheaper to test
  2390. + * than doing a full bitmask check when we are busy.
  2391. + */
  2392. +static inline void set_cpuidle_map(int cpu)
  2393. +{
  2394. +   if (unlikely(!cpu_online(cpu)))
  2395. +       return;
  2396. +   cpu_set(cpu, grq.cpu_idle_map);
  2397. +   grq.idle_cpus = true;
  2398. +}
  2399. +
  2400. +static inline void clear_cpuidle_map(int cpu)
  2401. +{
  2402. +   cpu_clear(cpu, grq.cpu_idle_map);
  2403. +   if (cpus_empty(grq.cpu_idle_map))
  2404. +       grq.idle_cpus = false;
  2405. +}
  2406. +
  2407. +static bool suitable_idle_cpus(struct task_struct *p)
  2408. +{
  2409. +   /* Cheap flag test first, full mask intersection only if needed */
  2410. +   return grq.idle_cpus &&
  2411. +       cpus_intersects(p->cpus_allowed, grq.cpu_idle_map);
  2412. +}
  2413. +
  2414. +#define CPUIDLE_DIFF_THREAD    (1)
  2415. +#define CPUIDLE_DIFF_CORE  (2)
  2416. +#define CPUIDLE_CACHE_BUSY (4)
  2417. +#define CPUIDLE_DIFF_CPU   (8)
  2418. +#define CPUIDLE_THREAD_BUSY    (16)
  2419. +#define CPUIDLE_DIFF_NODE  (32)
  2420. +
  2421. +static void resched_task(struct task_struct *p);
  2422. +
  2423. +/*
  2424. + * Reschedule the best CPU in *tmpmask: best_cpu itself if it is in the mask,
  2425. + * otherwise the CPU with the lowest CPUIDLE ranking relative to rq. From most
  2426. + * to least suitable, the order works out to be the following:
  2427. + *
  2428. + * Same core, idle or busy cache, idle or busy threads
  2429. + * Other core, same cache, idle or busy cache, idle threads.
  2430. + * Same node, other CPU, idle cache, idle threads.
  2431. + * Same node, other CPU, busy cache, idle threads.
  2432. + * Other core, same cache, busy threads.
  2433. + * Same node, other CPU, busy threads.
  2434. + * Other node, other CPU, idle cache, idle threads.
  2435. + * Other node, other CPU, busy cache, idle threads.
  2436. + * Other node, other CPU, busy threads.
  2437. + */
  2438. +static void
  2439. +resched_best_mask(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
  2440. +{
  2441. +   unsigned int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
  2442. +       CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
  2443. +       CPUIDLE_DIFF_THREAD;
  2444. +   int cpu_tmp;
  2445. +
  2446. +   if (cpu_isset(best_cpu, *tmpmask))
  2447. +       goto out;
  2448. +
  2449. +   for_each_cpu_mask(cpu_tmp, *tmpmask) {
  2450. +       unsigned int ranking;
  2451. +       struct rq *tmp_rq;
  2452. +
  2453. +       ranking = 0;
  2454. +       tmp_rq = cpu_rq(cpu_tmp);
  2455. +
  2456. +#ifdef CONFIG_NUMA
  2457. +       if (rq->cpu_locality[cpu_tmp] > 3)
  2458. +           ranking |= CPUIDLE_DIFF_NODE;
  2459. +       else
  2460. +#endif
  2461. +       if (rq->cpu_locality[cpu_tmp] > 2)
  2462. +           ranking |= CPUIDLE_DIFF_CPU;
  2463. +#ifdef CONFIG_SCHED_MC
  2464. +       if (rq->cpu_locality[cpu_tmp] == 2)
  2465. +           ranking |= CPUIDLE_DIFF_CORE;
  2466. +       if (!(tmp_rq->cache_idle(cpu_tmp)))
  2467. +           ranking |= CPUIDLE_CACHE_BUSY;
  2468. +#endif
  2469. +#ifdef CONFIG_SCHED_SMT
  2470. +       if (rq->cpu_locality[cpu_tmp] == 1)
  2471. +           ranking |= CPUIDLE_DIFF_THREAD;
  2472. +       if (!(tmp_rq->siblings_idle(cpu_tmp)))
  2473. +           ranking |= CPUIDLE_THREAD_BUSY;
  2474. +#endif
  2475. +       if (ranking < best_ranking) {
  2476. +           best_cpu = cpu_tmp;
  2477. +           best_ranking = ranking;
  2478. +       }
  2479. +   }
  2480. +out:
  2481. +   resched_task(cpu_rq(best_cpu)->curr);
  2482. +}
  2483. +
  2484. +bool cpus_share_cache(int this_cpu, int that_cpu)
  2485. +{
  2486. +   struct rq *rq = cpu_rq(this_cpu);
  2487. +
  2488. +   return rq->cpu_locality[that_cpu] <= 1;
  2489. +}
  2490. +
  2491. +static void resched_best_idle(struct task_struct *p)
  2492. +{
  2493. +   cpumask_t idle_allowed;
  2494. +
  2495. +   cpus_and(idle_allowed, p->cpus_allowed, grq.cpu_idle_map);
  2496. +   resched_best_mask(task_cpu(p), task_rq(p), &idle_allowed);
  2497. +}
  2498. +
  2499. +static inline void resched_suitable_idle(struct task_struct *p)
  2500. +{
  2501. +   if (suitable_idle_cpus(p))
  2502. +       resched_best_idle(p);
  2503. +}
  2504. +/*
  2505. + * Per-CPU flag recording whether this CPU is running a CPU frequency governor
  2506. + * that has slowed its speed (cpu_scaling) or not (cpu_nonscaling). No locking
  2507. + * is required as the very rare wrongly read value would be harmless.
  2508. + */
  2509. +void cpu_scaling(int cpu)
  2510. +{
  2511. +   cpu_rq(cpu)->scaling = true;
  2512. +}
  2513. +
  2514. +void cpu_nonscaling(int cpu)
  2515. +{
  2516. +   cpu_rq(cpu)->scaling = false;
  2517. +}
  2518. +
  2519. +static inline bool scaling_rq(struct rq *rq)
  2520. +{
  2521. +   return rq->scaling;
  2522. +}
  2523. +
  2524. +static inline int locality_diff(struct task_struct *p, struct rq *rq)
  2525. +{
  2526. +   return rq->cpu_locality[task_cpu(p)];
  2527. +}
  2528. +#else /* CONFIG_SMP */
  2529. +static inline void inc_qnr(void)
  2530. +{
  2531. +}
  2532. +
  2533. +static inline void dec_qnr(void)
  2534. +{
  2535. +}
  2536. +
  2537. +static inline int queued_notrunning(void)
  2538. +{
  2539. +   return grq.nr_running;
  2540. +}
  2541. +
  2542. +static inline void set_cpuidle_map(int cpu)
  2543. +{
  2544. +}
  2545. +
  2546. +static inline void clear_cpuidle_map(int cpu)
  2547. +{
  2548. +}
  2549. +
  2550. +static inline bool suitable_idle_cpus(struct task_struct *p)
  2551. +{
  2552. +   return uprq->curr == uprq->idle;
  2553. +}
  2554. +
  2555. +static inline void resched_suitable_idle(struct task_struct *p)
  2556. +{
  2557. +}
  2558. +
  2559. +void cpu_scaling(int __unused)
  2560. +{
  2561. +}
  2562. +
  2563. +void cpu_nonscaling(int __unused)
  2564. +{
  2565. +}
  2566. +
  2567. +/*
  2568. + * Although CPUs can scale in UP, there is nowhere else for tasks to go so this
  2569. + * always returns false.
  2570. + */
  2571. +static inline bool scaling_rq(struct rq *rq)
  2572. +{
  2573. +   return false;
  2574. +}
  2575. +
  2576. +static inline int locality_diff(struct task_struct *p, struct rq *rq)
  2577. +{
  2578. +   return 0;
  2579. +}
  2580. +#endif /* CONFIG_SMP */
  2581. +EXPORT_SYMBOL_GPL(cpu_scaling);
  2582. +EXPORT_SYMBOL_GPL(cpu_nonscaling);
  2583. +
  2584. +/*
  2585. + * activate_idle_task - queue p at the _front_ of its list and count it.
  2586. + */
  2587. +static inline void activate_idle_task(struct task_struct *p)
  2588. +{
  2589. +   enqueue_task_head(p);
  2590. +   grq.nr_running++;
  2591. +   inc_qnr();
  2592. +}
  2593. +
  2594. +static inline int normal_prio(struct task_struct *p)
  2595. +{
  2596. +   if (has_rt_policy(p))
  2597. +       return MAX_RT_PRIO - 1 - p->rt_priority;
  2598. +
  2599. +   /* Non-rt policies each map onto one fixed priority level */
  2600. +   return idleprio_task(p) ? IDLE_PRIO :
  2601. +          iso_task(p) ? ISO_PRIO :
  2602. +          NORMAL_PRIO;
  2603. +}
  2604. +
  2605. +/*
  2606. + * Work out the priority the scheduler should currently use for p. As a
  2607. + * side effect p->normal_prio is refreshed from normal_prio(). The result
  2608. + * is p->prio itself while that lies in the RT range (an RT task or one
  2609. + * boosted to RT), and p->normal_prio otherwise.
  2610. + */
  2611. +static int effective_prio(struct task_struct *p)
  2612. +{
  2613. +   p->normal_prio = normal_prio(p);
  2614. +   /*
  2615. +    * A priority in the RT range, whether from policy or from
  2616. +    * boosting, is kept unchanged. Anything else is updated to
  2617. +    * the normal priority that was just recomputed:
  2618. +    */
  2619. +   if (rt_prio(p->prio))
  2620. +       return p->prio;
  2621. +   return p->normal_prio;
  2622. +}
  2623. +
  2624. +/*
  2625. + * activate_task - put p on the runqueue and count it. Enter with grq locked.
  2626. + */
  2627. +static void activate_task(struct task_struct *p, struct rq *rq)
  2628. +{
  2629. +   update_clocks(rq);
  2630. +
  2631. +   /*
  2632. +    * For sleep profiling, charge the time since the task last ran.
  2633. +    * It is in units of nanosecs, so shift by 20 to get a
  2634. +    * milliseconds-range estimation of the time spent sleeping:
  2635. +    */
  2636. +   if (unlikely(prof_on == SLEEP_PROFILING)) {
  2637. +       if (p->state == TASK_UNINTERRUPTIBLE)
  2638. +           profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
  2639. +                    (rq->clock - p->last_ran) >> 20);
  2640. +   }
  2641. +
  2642. +   p->prio = effective_prio(p);
  2643. +   if (task_contributes_to_load(p))
  2644. +       grq.nr_uninterruptible--;
  2645. +   enqueue_task(p);
  2646. +   grq.nr_running++;
  2647. +   inc_qnr();
  2648. +}
  2649. +
  2650. +static inline void clear_sticky(struct task_struct *p);
  2651. +
  2652. +/*
  2653. + * deactivate_task - a running task is not on the grq, so deactivating it
  2654. + * only needs the counters adjusting. Enter with grq locked.
  2655. + */
  2656. +static inline void deactivate_task(struct task_struct *p)
  2657. +{
  2658. +   grq.nr_running--;
  2659. +   if (task_contributes_to_load(p))
  2660. +       grq.nr_uninterruptible++;
  2661. +   clear_sticky(p);
  2662. +}
  2663. +
  2664. +#ifdef CONFIG_SMP
  2665. +void set_task_cpu(struct task_struct *p, unsigned int cpu)
  2666. +{
  2667. +#ifdef CONFIG_LOCKDEP
  2668. +   /*
  2669. +    * The caller should hold grq lock; warn once if lockdep disagrees.
  2670. +    */
  2671. +   WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.urw.lock));
  2672. +#endif
  2673. +   trace_sched_migrate_task(p, cpu);
  2674. +   if (task_cpu(p) != cpu)
  2675. +       perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
  2676. +
  2677. +   /*
  2678. +    * After ->cpu is set up to a new value, task_grq_wlock(p, ...) can be
  2679. +    * successfully executed on another CPU. We must ensure that updates of
  2680. +    * per-task data have been completed by this moment, hence the barrier.
  2681. +    */
  2682. +   smp_wmb();
  2683. +   task_thread_info(p)->cpu = cpu;
  2684. +}
  2685. +
  2686. +static inline void clear_sticky(struct task_struct *p)
  2687. +{
  2688. +   p->sticky = false;
  2689. +}
  2690. +
  2691. +static inline bool task_sticky(struct task_struct *p)
  2692. +{
  2693. +   return p->sticky;
  2694. +}
  2695. +
  2696. +/* Reschedule the best idle CPU allowed for p, other than cpu itself. */
  2697. +static void
  2698. +resched_closest_idle(struct rq *rq, int cpu, struct task_struct *p)
  2699. +{
  2700. +   cpumask_t candidates;
  2701. +
  2702. +   cpus_and(candidates, p->cpus_allowed, grq.cpu_idle_map);
  2703. +   cpu_clear(cpu, candidates);
  2704. +   /* Only kick a CPU if some idle one is left after excluding cpu */
  2705. +   if (!cpus_empty(candidates))
  2706. +       resched_best_mask(cpu, rq, &candidates);
  2707. +}
  2708. +
  2709. +/*
  2710. + * The sticky flag marks a task that was descheduled involuntarily and is
  2711. + * still awaiting CPU time. If rq's previous sticky task is not p but is still
  2712. + * sticky, it is unstuck and the closest idle CPU is rescheduled for it.
  2713. + * Realtime tasks never stick, to minimise their latency at all times: an
  2714. + * idle CPU is kicked for them instead and rq is left without a sticky task.
  2715. + */
  2716. +static inline void
  2717. +swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
  2718. +{
  2719. +   if (rq->sticky_task) {
  2720. +       if (rq->sticky_task == p) {
  2721. +           p->sticky = true;
  2722. +           return;
  2723. +       }
  2724. +       if (task_sticky(rq->sticky_task)) {
  2725. +           clear_sticky(rq->sticky_task);
  2726. +           resched_closest_idle(rq, cpu, rq->sticky_task);
  2727. +       }
  2728. +   }
  2729. +   if (!rt_task(p)) {
  2730. +       p->sticky = true;
  2731. +       rq->sticky_task = p;
  2732. +   } else {
  2733. +       resched_closest_idle(rq, cpu, p);
  2734. +       rq->sticky_task = NULL;
  2735. +   }
  2736. +}
  2737. +
  2738. +static inline void unstick_task(struct rq *rq, struct task_struct *p)
  2739. +{
  2740. +   rq->sticky_task = NULL;
  2741. +   clear_sticky(p);
  2742. +}
  2743. +#else
  2744. +static inline void clear_sticky(struct task_struct *p)
  2745. +{
  2746. +}
  2747. +
  2748. +static inline bool task_sticky(struct task_struct *p)
  2749. +{
  2750. +   return false;
  2751. +}
  2752. +
  2753. +static inline void
  2754. +swap_sticky(struct rq *rq, int cpu, struct task_struct *p)
  2755. +{
  2756. +}
  2757. +
  2758. +static inline void unstick_task(struct rq *rq, struct task_struct *p)
  2759. +{
  2760. +}
  2761. +#endif
  2762. +
  2763. +/*
  2764. + * Take p off the global runqueue and assign it to cpu, where it is about to
  2765. + * become the running task.
  2766. + */
  2767. +static inline void take_task(int cpu, struct task_struct *p)
  2768. +{
  2769. +   set_task_cpu(p, cpu);
  2770. +   dequeue_task(p);
  2771. +   clear_sticky(p);
  2772. +   dec_qnr();
  2773. +}
  2774. +
  2775. +/*
  2776. + * Hand a descheduling task back: either deactivate it, or put it back on
  2777. + * the grq runqueue and count it as queued-but-not-running again.
  2778. + */
  2779. +static inline void return_task(struct task_struct *p, bool deactivate)
  2780. +{
  2781. +   if (!deactivate) {
  2782. +       inc_qnr();
  2783. +       enqueue_task(p);
  2784. +       return;
  2785. +   }
  2786. +   deactivate_task(p);
  2787. +}
  2788. +
  2789. +/*
  2790. + * resched_task - mark a task 'to be rescheduled now'.
  2791. + *
  2792. + * On UP this means the setting of the need_resched flag, on SMP it
  2793. + * might also involve a cross-CPU call to trigger the scheduler on
  2794. + * the target CPU.
  2795. + */
  2796. +#ifdef CONFIG_SMP
  2797. +
  2798. +#ifndef tsk_is_polling
  2799. +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
  2800. +#endif
  2801. +
  2802. +static void resched_task(struct task_struct *p)
  2803. +{
  2804. +   int cpu;
  2805. +
  2806. +   assert_raw_spin_locked(&grq.urw.lock);
  2807. +
  2808. +   if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
  2809. +       return;
  2810. +
  2811. +   set_tsk_thread_flag(p, TIF_NEED_RESCHED);
  2812. +
  2813. +   cpu = task_cpu(p);
  2814. +   if (cpu == smp_processor_id())
  2815. +       return;
  2816. +
  2817. +   /* NEED_RESCHED must be visible before we test polling; pollers get no IPI */
  2818. +   smp_mb();
  2819. +   if (!tsk_is_polling(p))
  2820. +       smp_send_reschedule(cpu);
  2821. +}
  2822. +
  2823. +#else
  2824. +static inline void resched_task(struct task_struct *p)
  2825. +{
  2826. +   assert_raw_spin_locked(&grq.urw.lock);
  2827. +   set_tsk_need_resched(p);
  2828. +}
  2829. +#endif
  2830. +
  2831. +/**
  2832. + * task_curr - is @p the task currently executing on the CPU it is assigned to?
  2833. + * @p: the task in question.
  2834. + */
  2835. +inline int task_curr(const struct task_struct *p)
  2836. +{
  2837. +   return p == cpu_curr(task_cpu(p));
  2838. +}
  2839. +
  2840. +#ifdef CONFIG_SMP
  2841. +struct migration_req {
  2842. +   struct task_struct *task;
  2843. +   int dest_cpu;
  2844. +};
  2845. +
  2846. +/*
  2847. + * wait_task_inactive - wait for a thread to unschedule.
  2848. + *
  2849. + * If @match_state is nonzero, it's the @p->state value just checked and
  2850. + * not expected to change.  If it changes, i.e. @p might have woken up,
  2851. + * then return zero.  When we succeed in waiting for @p to be off its CPU,
  2852. + * we return a positive number (its total switch count).  If a second call
  2853. + * a short while later returns the same number, the caller can be sure that
  2854. + * @p has remained unscheduled the whole time.
  2855. + *
  2856. + * The caller must ensure that the task *will* unschedule sometime soon,
  2857. + * else this function might spin for a *long* time. This function can't
  2858. + * be called with interrupts off, or it may introduce deadlock with
  2859. + * smp_call_function() if an IPI is sent by the same process we are
  2860. + * waiting to become inactive.
  2861. + */
  2862. +unsigned long wait_task_inactive(struct task_struct *p, long match_state)
  2863. +{
  2864. +   unsigned long flags;
  2865. +   bool running, on_rq;
  2866. +   unsigned long ncsw;
  2867. +   struct rq *rq;
  2868. +
  2869. +   for (;;) {
  2870. +       /*
  2871. +        * We do the initial early heuristics without holding
  2872. +        * any task-queue locks at all. We'll only try to get
  2873. +        * the runqueue lock when things look like they will
  2874. +        * work out! As we're lockless, in the unlikely event
  2875. +        * that the rq lookup yields NULL, look it up again.
  2876. +        */
  2877. +#ifdef CONFIG_SMP
  2878. +retry_rq:
  2879. +       rq = task_rq(p);
  2880. +       if (unlikely(!rq))
  2881. +           goto retry_rq;
  2882. +#else /* CONFIG_SMP */
  2883. +       rq = task_rq(p);
  2884. +#endif
  2885. +       /*
  2886. +        * If the task is actively running on another CPU
  2887. +        * still, just relax and busy-wait without holding
  2888. +        * any locks.
  2889. +        *
  2890. +        * NOTE! Since we don't hold any locks, it's not
  2891. +        * even sure that "rq" stays as the right runqueue!
  2892. +        * But we don't care, since this will return false
  2893. +        * if the runqueue has changed and p is actually now
  2894. +        * running somewhere else!
  2895. +        */
  2896. +       while (task_running(p) && p == rq->curr) {
  2897. +           if (match_state && unlikely(p->state != match_state))
  2898. +               return 0;
  2899. +           cpu_relax();
  2900. +       }
  2901. +
  2902. +       /*
  2903. +        * Ok, time to look more closely! We need the grq
  2904. +        * lock now, to be *sure*. If we're wrong, we'll
  2905. +        * just go back and repeat.
  2906. +        */
  2907. +       rq = task_grq_rlock(p, &flags);
  2908. +       trace_sched_wait_task(p);
  2909. +       running = task_running(p);
  2910. +       on_rq = task_queued(p);
  2911. +       ncsw = 0;
  2912. +       if (!match_state || p->state == match_state)
  2913. +           ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
  2914. +       task_grq_runlock(&flags);
  2915. +
  2916. +       /*
  2917. +        * If it changed from the expected state, bail out now.
  2918. +        */
  2919. +       if (unlikely(!ncsw))
  2920. +           break;
  2921. +
  2922. +       /*
  2923. +        * Was it really running after all now that we
  2924. +        * checked with the proper locks actually held?
  2925. +        *
  2926. +        * Oops. Go back and try again..
  2927. +        */
  2928. +       if (unlikely(running)) {
  2929. +           cpu_relax();
  2930. +           continue;
  2931. +       }
  2932. +
  2933. +       /*
  2934. +        * It's not enough that it's not actively running,
  2935. +        * it must be off the runqueue _entirely_, and not
  2936. +        * preempted!
  2937. +        *
  2938. +        * So if it was still runnable (but just not actively
  2939. +        * running right now), it's preempted, and we should
  2940. +        * sleep for one tick before retrying - it could be a while.
  2941. +        */
  2942. +       if (unlikely(on_rq)) {
  2943. +           ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
  2944. +
  2945. +           set_current_state(TASK_UNINTERRUPTIBLE);
  2946. +           schedule_hrtimeout(&to, HRTIMER_MODE_REL);
  2947. +           continue;
  2948. +       }
  2949. +
  2950. +       /*
  2951. +        * Ahh, all good. It wasn't running, and it wasn't
  2952. +        * runnable, which means that it will never become
  2953. +        * running in the future either. We're all done!
  2954. +        */
  2955. +       break;
  2956. +   }
  2957. +
  2958. +   return ncsw;
  2959. +}
  2960. +
  2961. +/**
  2962. + * kick_process - kick a running thread to enter/exit the kernel
  2963. + * @p: the to-be-kicked thread
  2964. + *
  2965. + * Cause a process which is running on another CPU to enter
  2966. + * kernel-mode, without any delay. (to get signals handled.)
  2967. + *
  2968. + * NOTE: this function doesn't have to take the runqueue lock,
  2969. + * because all it wants to ensure is that the remote task enters
  2970. + * the kernel. If the IPI races and the task has been migrated
  2971. + * to another CPU then no harm is done and the purpose has been
  2972. + * achieved as well.
  2973. + */
  2974. +void kick_process(struct task_struct *p)
  2975. +{
  2976. +   int cpu;
  2977. +
  2978. +   preempt_disable();
  2979. +   cpu = task_cpu(p);
  2980. +   if ((cpu != smp_processor_id()) && task_curr(p))
  2981. +       smp_send_reschedule(cpu);
  2982. +   preempt_enable();
  2983. +}
  2984. +EXPORT_SYMBOL_GPL(kick_process);
  2985. +#endif
  2986. +
  2987. +#define rq_idle(rq)    ((rq)->rq_prio == PRIO_LIMIT)
  2988. +
  2989. +/*
  2990. + * Decide whether p may preempt a CPU currently running at (prio, deadline).
  2991. + * A numerically lower prio always wins and a higher one always loses; at
  2992. + * equal prio p must have the earlier deadline. An idle rq scores as prio
  2993. + * PRIO_LIMIT so it is always preempted.
  2994. + */
  2995. +static inline bool
  2996. +can_preempt(struct task_struct *p, int prio, u64 deadline)
  2997. +{
  2998. +   /* Better static priority RT task or better policy preemption */
  2999. +   if (p->prio != prio)
  3000. +       return p->prio < prio;
  3001. +
  3002. +   /*
  3003. +    * Equal priority: SCHED_NORMAL, BATCH and ISO will preempt only if
  3004. +    * p has the earlier deadline.
  3005. +    */
  3006. +   return deadline_before(p->deadline, deadline);
  3007. +}
  3008. +
  3009. +#ifdef CONFIG_SMP
  3010. +#define cpu_online_map     (*(cpumask_t *)cpu_online_mask)
  3011. +#ifdef CONFIG_HOTPLUG_CPU
  3012. +/*
  3013. + * Returns false for a task that is affined only to offline CPUs but still
  3014. + * wants runtime. This happens to kernel threads during suspend/halt and
  3015. + * disabling of CPUs.
  3016. + */
  3017. +static inline bool online_cpus(struct task_struct *p)
  3018. +{
  3019. +   return likely(cpus_intersects(p->cpus_allowed, cpu_online_map));
  3020. +}
  3021. +#else /* CONFIG_HOTPLUG_CPU */
  3022. +/* All available CPUs are always online without hotplug. */
  3023. +static inline bool online_cpus(struct task_struct *p)
  3024. +{
  3025. +   return true;
  3026. +}
  3027. +#endif
  3028. +
  3029. +/*
  3030. + * Check whether p's affinity mask rules out cpu, meaning p has to run on
  3031. + * some other CPU. Whether any such CPU is online is not checked here.
  3032. + */
  3033. +static inline bool needs_other_cpu(struct task_struct *p, int cpu)
  3034. +{
  3035. +   const bool allowed = cpu_isset(cpu, p->cpus_allowed);
  3036. +
  3037. +   return unlikely(!allowed);
  3038. +}
  3039. +
  3040. +/*
  3041. + * Reschedule the CPU that p should run on next, if any. this_rq is unused here.
  3042. + */
  3043. +static void try_preempt(struct task_struct *p, struct rq *this_rq)
  3044. +{
  3045. +   struct rq *highest_prio_rq = NULL;
  3046. +   int cpu, highest_prio;
  3047. +   u64 latest_deadline;
  3048. +   cpumask_t tmp;
  3049. +
  3050. +   /*
  3051. +    * We clear the sticky flag here because for a task to have called
  3052. +    * try_preempt with the sticky flag enabled means some complicated
  3053. +    * re-scheduling has occurred and we should ignore the sticky flag.
  3054. +    */
  3055. +   clear_sticky(p);
  3056. +
  3057. +   if (suitable_idle_cpus(p)) {
  3058. +       resched_best_idle(p);
  3059. +       return;
  3060. +   }
  3061. +
  3062. +   /* IDLEPRIO tasks never preempt anything but idle */
  3063. +   if (p->policy == SCHED_IDLEPRIO)
  3064. +       return;
  3065. +
  3066. +   if (likely(online_cpus(p)))
  3067. +       cpus_and(tmp, cpu_online_map, p->cpus_allowed);
  3068. +   else
  3069. +       return;
  3070. +
  3071. +   highest_prio = latest_deadline = 0;
  3072. +
  3073. +   for_each_cpu_mask(cpu, tmp) {
  3074. +       struct rq *rq;
  3075. +       int rq_prio;
  3076. +
  3077. +       rq = cpu_rq(cpu);
  3078. +       rq_prio = rq->rq_prio;
  3079. +       if (rq_prio < highest_prio)
  3080. +           continue;
  3081. +
  3082. +       if (rq_prio > highest_prio ||
  3083. +           deadline_after(rq->rq_deadline, latest_deadline)) {
  3084. +           latest_deadline = rq->rq_deadline;
  3085. +           highest_prio = rq_prio;
  3086. +           highest_prio_rq = rq;
  3087. +       }
  3088. +   }
  3089. +
  3090. +   if (likely(highest_prio_rq)) {
  3091. +       if (can_preempt(p, highest_prio, highest_prio_rq->rq_deadline))
  3092. +           resched_task(highest_prio_rq->curr);
  3093. +   }
  3094. +}
  3095. +#else /* CONFIG_SMP */
  3096. +static inline bool needs_other_cpu(struct task_struct *p, int cpu)
  3097. +{
  3098. +   return false;
  3099. +}
  3100. +
  3101. +static void try_preempt(struct task_struct *p, struct rq *this_rq)
  3102. +{
  3103. +   /* IDLEPRIO tasks never ask for a reschedule here */
  3104. +   if (p->policy != SCHED_IDLEPRIO &&
  3105. +       can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
  3106. +       resched_task(uprq->curr);
  3107. +}
  3108. +#endif /* CONFIG_SMP */
  3109. +
  3110. +static void
  3111. +ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
  3112. +{
  3113. +#ifdef CONFIG_SCHEDSTATS
  3114. +   struct rq *rq = this_rq();
  3115. +
  3116. +#ifdef CONFIG_SMP
  3117. +   int this_cpu = smp_processor_id();
  3118. +
  3119. +   if (cpu != this_cpu) {
  3120. +       struct sched_domain *sd;
  3121. +
  3122. +       /* Remote wakeup: credit the first domain whose span has cpu */
  3123. +       rcu_read_lock();
  3124. +       for_each_domain(this_cpu, sd) {
  3125. +           if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
  3126. +               continue;
  3127. +           schedstat_inc(sd, ttwu_wake_remote);
  3128. +           break;
  3129. +       }
  3130. +       rcu_read_unlock();
  3131. +   } else
  3132. +       schedstat_inc(rq, ttwu_local);
  3133. +
  3134. +#endif /* CONFIG_SMP */
  3135. +
  3136. +   schedstat_inc(rq, ttwu_count);
  3137. +#endif /* CONFIG_SCHEDSTATS */
  3138. +}
  3139. +
  3140. +static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
  3141. +                bool is_sync)
  3142. +{
  3143. +   activate_task(p, rq);
  3144. +
  3145. +   /*
  3146. +    * Sync wakeups (where the waker has indicated that it will leave
  3147. +    * the CPU in short order) don't trigger a preemption when there
  3148. +    * are no idle cpus; the wakee waits for current to deschedule.
  3149. +    */
  3150. +   if (is_sync && !suitable_idle_cpus(p))
  3151. +       return;
  3152. +   try_preempt(p, rq);
  3153. +}
  3154. +
  3155. +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
  3156. +                   bool success)
  3157. +{
  3158. +   trace_sched_wakeup(p, success);
  3159. +   p->state = TASK_RUNNING;
  3160. +
  3161. +   /*
  3162. +    * Tell the workqueue code when one of its workers really was woken.
  3163. +    * BFS does not know which cpu the worker will end up on, so the cpu
  3164. +    * of rq is reported to wq_worker_waking_up as a stand-in.
  3165. +    */
  3166. +   if (success && (p->flags & PF_WQ_WORKER))
  3167. +       wq_worker_waking_up(p, cpu_of(rq));
  3168. +}
  3169. +
  3170. +#ifdef CONFIG_SMP
  3171. +void scheduler_ipi(void)
  3172. +{
  3173. +}
  3174. +#endif /* CONFIG_SMP */
  3175. +
  3176. +/**
  3177. + * try_to_wake_up - wake up a thread
  3178. + * @p: the thread to be awakened
  3179. + * @state: the mask of task states that can be woken
  3180. + * @wake_flags: wake modifier flags (WF_*)
  3181. + *
  3182. + * Put it on the run-queue if it's not already there. The "current"
  3183. + * thread is always on the run-queue (except when the actual
  3184. + * re-schedule is in progress), and as such you're allowed to do
  3185. + * the simpler "current->state = TASK_RUNNING" to mark yourself
  3186. + * runnable without the overhead of this.
  3187. + *
  3188. + * Returns %true if @p was woken up, %false if it was already running
  3189. + * or @state didn't match @p's state.
  3190. + */
  3191. +static bool try_to_wake_up(struct task_struct *p, unsigned int state,
  3192. +             int wake_flags)
  3193. +{
  3194. +   bool success = false;
  3195. +   unsigned long flags;
  3196. +   struct rq *rq;
  3197. +   int cpu;
  3198. +
  3199. +   get_cpu();
  3200. +
  3201. +   /* This barrier is undocumented, probably for p->state? (unverified) */
  3202. +   smp_wmb();
  3203. +
  3204. +   /*
  3205. +    * No need to do time_lock_grq as we only need to update the rq clock
  3206. +    * if we activate the task
  3207. +    */
  3208. +   rq = task_grq_ulock(p, &flags);
  3209. +   cpu = task_cpu(p);
  3210. +
  3211. +   /* state is a volatile long; the reason for that is not understood */
  3212. +   if (!((unsigned int)p->state & state))
  3213. +       goto out_unlock;
  3214. +
  3215. +   if (task_queued(p) || task_running(p))
  3216. +       goto out_running;
  3217. +
  3218. +   grq_upgrade();
  3219. +   ttwu_activate(p, rq, wake_flags & WF_SYNC);
  3220. +   success = true;
  3221. +
  3222. +out_running:
  3223. +   ttwu_post_activation(p, rq, success);
  3224. +out_unlock:
  3225. +   if (success)
  3226. +       task_grq_wunlock(&flags);
  3227. +   else
  3228. +       task_grq_uunlock(&flags);
  3229. +
  3230. +   ttwu_stat(p, cpu, wake_flags);
  3231. +
  3232. +   put_cpu();
  3233. +
  3234. +   return success;
  3235. +}
  3236. +
  3237. +/**
  3238. + * try_to_wake_up_local - try to wake up a local task with grq lock held
  3239. + * @p: the thread to be awakened
  3240. + *
  3241. + * Put @p on the run-queue if it's not already there. The caller must
  3242. + * ensure that grq is locked, and that @p is not the current task.
  3243. + * grq stays locked over invocation.
  3244. + */
  3245. +static void try_to_wake_up_local(struct task_struct *p)
  3246. +{
  3247. +   struct rq *rq = task_rq(p);
  3248. +   bool success = false;
  3249. +
  3250. +   lockdep_assert_held(&grq.urw.lock);
  3251. +
  3252. +   if (!(p->state & TASK_NORMAL)) /* only TASK_NORMAL sleepers are woken */
  3253. +       return;
  3254. +
  3255. +   if (!task_queued(p)) {
  3256. +       if (likely(!task_running(p))) {
  3257. +           schedstat_inc(rq, ttwu_count);
  3258. +           schedstat_inc(rq, ttwu_local);
  3259. +       }
  3260. +       ttwu_activate(p, rq, false); /* never treated as a sync wakeup */
  3261. +       ttwu_stat(p, smp_processor_id(), 0);
  3262. +       success = true;
  3263. +   }
  3264. +   ttwu_post_activation(p, rq, success);
  3265. +}
  3266. +
  3267. +/**
  3268. + * wake_up_process - Wake up a specific process
  3269. + * @p: The process to be woken up.
  3270. + *
  3271. + * Attempt to wake up the nominated process and move it to the set of runnable
  3272. + * processes.  Returns 1 if the process was woken up, 0 if it was already
  3273. + * running.
  3274. + *
  3275. + * It may be assumed that this function implies a write memory barrier before
  3276. + * changing the task state if and only if any tasks are woken up.
  3277. + */
  3278. +int wake_up_process(struct task_struct *p)
  3279. +{
  3280. +   return try_to_wake_up(p, TASK_ALL, 0); /* any state in TASK_ALL, no flags */
  3281. +}
  3282. +EXPORT_SYMBOL(wake_up_process);
  3283. +
  3284. +int wake_up_state(struct task_struct *p, unsigned int state)
  3285. +{
  3286. +   return try_to_wake_up(p, state, 0); /* wake only if @p's state is in @state */
  3287. +}
  3288. +
  3289. +static void time_slice_expired(struct task_struct *p); /* used by sched_fork() */
  3290. +
  3291. +/*
  3292. + * Perform scheduler related setup for a newly forked process p.
  3293. + * p is forked by current.
  3294. + */
  3295. +void sched_fork(struct task_struct *p)
  3296. +{
  3297. +   struct task_struct *curr;
  3298. +   int cpu = get_cpu();
  3299. +   struct rq *rq;
  3300. +
  3301. +#ifdef CONFIG_PREEMPT_NOTIFIERS
  3302. +   INIT_HLIST_HEAD(&p->preempt_notifiers);
  3303. +#endif
  3304. +   /*
  3305. +    * Mark the process as running without queueing it here. This means
  3306. +    * nobody will actually run it yet, and a signal or other external
  3307. +    * event cannot wake it up and insert it on the runqueue either.
  3308. +    */
  3309. +   p->state = TASK_RUNNING;
  3310. +   set_task_cpu(p, cpu);
  3311. +
  3312. +   /* Should be reset in fork.c but done here for ease of bfs patching */
  3313. +   p->utime =
  3314. +   p->stime =
  3315. +   p->utimescaled =
  3316. +   p->stimescaled =
  3317. +   p->sched_time =
  3318. +   p->stime_pc =
  3319. +   p->utime_pc = 0;
  3320. +
  3321. +   /*
  3322. +    * Revert to default priority/policy on fork if requested.
  3323. +    */
  3324. +   if (unlikely(p->sched_reset_on_fork)) {
  3325. +       if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
  3326. +           p->policy = SCHED_NORMAL;
  3327. +           p->normal_prio = normal_prio(p);
  3328. +       }
  3329. +
  3330. +       if (PRIO_TO_NICE(p->static_prio) < 0) { /* negative nice resets to 0 */
  3331. +           p->static_prio = NICE_TO_PRIO(0);
  3332. +           p->normal_prio = p->static_prio;
  3333. +       }
  3334. +
  3335. +       /*
  3336. +        * We don't need the reset flag anymore after the fork. It has
  3337. +        * fulfilled its duty:
  3338. +        */
  3339. +       p->sched_reset_on_fork = 0;
  3340. +   }
  3341. +
  3342. +   curr = current;
  3343. +   /*
  3344. +    * Make sure we do not leak PI boosting priority to the child.
  3345. +    */
  3346. +   p->prio = curr->normal_prio;
  3347. +
  3348. +   INIT_LIST_HEAD(&p->run_list);
  3349. +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
  3350. +   if (unlikely(sched_info_on()))
  3351. +       memset(&p->sched_info, 0, sizeof(p->sched_info));
  3352. +#endif
  3353. +
  3354. +   p->on_cpu = false;
  3355. +   clear_sticky(p);
  3356. +
  3357. +#ifdef CONFIG_PREEMPT_COUNT
  3358. +   /* Want to start with kernel preemption disabled. */
  3359. +   task_thread_info(p)->preempt_count = 1;
  3360. +#endif
  3361. +   if (unlikely(p->policy == SCHED_FIFO))
  3362. +       goto out; /* SCHED_FIFO children skip the timeslice sharing below */
  3363. +   /*
  3364. +    * Share the timeslice between parent and child, thus the
  3365. +    * total amount of pending timeslices in the system doesn't change,
  3366. +    * resulting in more scheduling fairness. If it's negative, it won't
  3367. +    * matter since that's the same as being 0. current's time_slice is
  3368. +    * actually in rq_time_slice when it's running, as is its last_ran
  3369. +    * value. rq->rq_deadline is only modified within schedule() so it
  3370. +    * is always equal to current->deadline.
  3371. +    */
  3372. +   rq = task_grq_ulock_irq(curr);
  3373. +   if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
  3374. +       grq_udowngrade();
  3375. +       rq->rq_time_slice /= 2; /* parent keeps half, child gets the same */
  3376. +       p->time_slice = rq->rq_time_slice;
  3377. +       p->last_ran = rq->rq_last_ran;
  3378. +       task_grq_runlock_irq();
  3379. +   } else {
  3380. +       /*
  3381. +        * Forking task has run out of timeslice. Reschedule it and
  3382. +        * start its child with a new time slice and deadline. The
  3383. +        * child will end up running first because its deadline will
  3384. +        * be slightly earlier.
  3385. +        */
  3386. +       rq->rq_time_slice = 0;
  3387. +       grq_upgrade();
  3388. +       set_tsk_need_resched(curr);
  3389. +       time_slice_expired(p);
  3390. +       p->last_ran = rq->rq_last_ran;
  3391. +       task_grq_wunlock_irq();
  3392. +   }
  3393. +out:
  3394. +   put_cpu(); /* pairs with get_cpu() in the declarations above */
  3395. +}
  3396. +
  3397. +/*
  3398. + * wake_up_new_task - wake up a newly created task for the first time.
  3399. + *
  3400. + * Marks @p runnable, places it on its parent's cpu and activates it via
  3401. + * task_grq_wlock(), then either reschedules the parent or tries to
  3402. + * preempt on behalf of the new task.
  3403. + */
  3404. +void wake_up_new_task(struct task_struct *p)
  3405. +{
  3406. +   struct task_struct *parent;
  3407. +   unsigned long flags;
  3408. +   struct rq *rq;
  3409. +
  3410. +   p->state = TASK_RUNNING;
  3411. +   parent = p->parent;
  3412. +   /* Unnecessary but small chance that the parent changed CPU */
  3413. +   set_task_cpu(p, task_cpu(parent));
  3414. +   rq = task_grq_wlock(p, &flags);
  3415. +   activate_task(p, rq);
  3416. +   trace_sched_wakeup_new(p, 1);
  3417. +   if (rq->curr == parent && !suitable_idle_cpus(p)) {
  3418. +       /*
  3419. +        * The parent is running on this rq and no idle cpu suits the
  3420. +        * child: do child-runs-first in anticipation of an exec. This
  3421. +        * usually avoids a lot of COW overhead.
  3422. +        */
  3423. +       resched_task(parent);
  3424. +   } else
  3425. +       try_preempt(p, rq);
  3426. +   task_grq_wunlock(&flags);
  3427. +}
  3428. +
  3429. +#ifdef CONFIG_PREEMPT_NOTIFIERS
  3430. +
  3431. +/**
  3432. + * preempt_notifier_register - tell me when current is being preempted & rescheduled
  3433. + * @notifier: notifier struct to register
  3434. + */
  3435. +void preempt_notifier_register(struct preempt_notifier *notifier)
  3436. +{
  3437. +   hlist_add_head(&notifier->link, &current->preempt_notifiers); /* on current's list */
  3438. +}
  3439. +EXPORT_SYMBOL_GPL(preempt_notifier_register);
  3440. +
  3441. +/**
  3442. + * preempt_notifier_unregister - no longer interested in preemption notifications
  3443. + * @notifier: notifier struct to unregister
  3444. + *
  3445. + * This is safe to call from within a preemption notifier.
  3446. + */
  3447. +void preempt_notifier_unregister(struct preempt_notifier *notifier)
  3448. +{
  3449. +   hlist_del(&notifier->link);
  3450. +}
  3451. +EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
  3452. +
  3453. +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  3454. +{
  3455. +   struct preempt_notifier *notifier;
  3456. +   struct hlist_node *node;
  3457. +
  3458. +   hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
  3459. +       notifier->ops->sched_in(notifier, raw_smp_processor_id()); /* pass this cpu */
  3460. +}
  3461. +
  3462. +static void
  3463. +fire_sched_out_preempt_notifiers(struct task_struct *curr,
  3464. +                struct task_struct *next)
  3465. +{
  3466. +   struct preempt_notifier *notifier;
  3467. +   struct hlist_node *node;
  3468. +
  3469. +   hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
  3470. +       notifier->ops->sched_out(notifier, next); /* pass the incoming task */
  3471. +}
  3472. +
  3473. +#else /* !CONFIG_PREEMPT_NOTIFIERS */
  3474. +
  3475. +static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  3476. +{ /* no-op stub when preempt notifiers are not configured */
  3477. +}
  3478. +
  3479. +static void
  3480. +fire_sched_out_preempt_notifiers(struct task_struct *curr,
  3481. +                struct task_struct *next)
  3482. +{ /* no-op stub when preempt notifiers are not configured */
  3483. +}
  3484. +
  3485. +#endif /* CONFIG_PREEMPT_NOTIFIERS */
  3486. +
  3487. +/**
  3488. + * prepare_task_switch - prepare to switch tasks
  3489. + * @rq: the runqueue preparing to switch
  3490. + * @prev: the task we are switching away from.
  3491. + * @next: the task we are going to switch to.
  3492. + *
  3493. + * This is called with the rq lock held and interrupts off. It must be
  3494. + * paired with a subsequent finish_task_switch after the context switch.
  3495. + *
  3496. + * prepare_task_switch sets up locking and calls architecture specific
  3497. + * hooks.
  3498. + */
  3499. +static inline void
  3500. +prepare_task_switch(struct rq *rq, struct task_struct *prev,
  3501. +           struct task_struct *next)
  3502. +{
  3503. +   sched_info_switch(prev, next);
  3504. +   perf_event_task_sched_out(prev, next);
  3505. +   fire_sched_out_preempt_notifiers(prev, next);
  3506. +   prepare_lock_switch(rq, next);
  3507. +   prepare_arch_switch(next);
  3508. +   trace_sched_switch(prev, next);
  3509. +}
  3510. +
  3511. +/**
  3512. + * finish_task_switch - clean up after a task-switch
  3513. + * @rq: runqueue associated with task-switch
  3514. + * @prev: the thread we just switched away from.
  3515. + *
  3516. + * finish_task_switch must be called after the context switch, paired
  3517. + * with a prepare_task_switch call before the context switch.
  3518. + * finish_task_switch will reconcile locking set up by prepare_task_switch,
  3519. + * and do any other architecture-specific cleanup actions.
  3520. + *
  3521. + * Note that we may have delayed dropping an mm in context_switch(). If
  3522. + * so, we finish that here outside of the runqueue lock.  (Doing it
  3523. + * with the lock held can cause deadlocks; see schedule() for
  3524. + * details.)
  3525. + */
  3526. +static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
  3527. +{
  3528. +   struct mm_struct *mm = rq->prev_mm; /* mm left behind by context_switch() */
  3529. +   long prev_state;
  3530. +
  3531. +   rq->prev_mm = NULL;
  3532. +
  3533. +   /*
  3534. +    * A task struct has one reference for the use as "current".
  3535. +    * If a task dies, then it sets TASK_DEAD in tsk->state and calls
  3536. +    * schedule one last time. The schedule call will never return, and
  3537. +    * the scheduled task must drop that reference.
  3538. +    * The test for TASK_DEAD must occur while the runqueue locks are
  3539. +    * still held, otherwise prev could be scheduled on another cpu, die
  3540. +    * there before we look at prev->state, and then the reference would
  3541. +    * be dropped twice.
  3542. +    *      Manfred Spraul <manfred@colorfullife.com>
  3543. +    */
  3544. +   prev_state = prev->state; /* sampled before finish_lock_switch() below */
  3545. +   finish_arch_switch(prev);
  3546. +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
  3547. +   local_irq_disable();
  3548. +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
  3549. +   perf_event_task_sched_in(prev, current);
  3550. +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
  3551. +   local_irq_enable();
  3552. +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
  3553. +   finish_lock_switch(rq, prev);
  3554. +   finish_arch_post_lock_switch();
  3555. +
  3556. +   fire_sched_in_preempt_notifiers(current);
  3557. +   if (mm)
  3558. +       mmdrop(mm);
  3559. +   if (unlikely(prev_state == TASK_DEAD)) {
  3560. +       /*
  3561. +        * Remove function-return probe instances associated with this
  3562. +        * task and put them back on the free list.
  3563. +        */
  3564. +       kprobe_flush_task(prev);
  3565. +       put_task_struct(prev); /* drop the reference described above */
  3566. +   }
  3567. +}
  3568. +
  3569. +/**
  3570. + * schedule_tail - first thing a freshly forked thread must call.
  3571. + * @prev: the thread we just switched away from.
  3572. + */
  3573. +asmlinkage void schedule_tail(struct task_struct *prev)
  3574. +{
  3575. +   struct rq *rq = this_rq();
  3576. +
  3577. +   finish_task_switch(rq, prev);
  3578. +#ifdef __ARCH_WANT_UNLOCKED_CTXSW
  3579. +   /* In this case, finish_task_switch does not reenable preemption */
  3580. +   preempt_enable();
  3581. +#endif
  3582. +   if (current->set_child_tid) /* only when a set_child_tid pointer is set */
  3583. +       put_user(current->pid, current->set_child_tid);
  3584. +}
  3585. +
  3586. +/*
  3587. + * context_switch - switch to the new MM and the new
  3588. + * thread's register state.
  3589. + */
  3590. +static inline void
  3591. +context_switch(struct rq *rq, struct task_struct *prev,
  3592. +          struct task_struct *next)
  3593. +{
  3594. +   struct mm_struct *mm, *oldmm;
  3595. +
  3596. +   prepare_task_switch(rq, prev, next);
  3597. +
  3598. +   mm = next->mm;
  3599. +   oldmm = prev->active_mm;
  3600. +   /*
  3601. +    * For paravirt, this is coupled with an exit in switch_to to
  3602. +    * combine the page table reload and the switch backend into
  3603. +    * one hypercall.
  3604. +    */
  3605. +   arch_start_context_switch(prev);
  3606. +
  3607. +   if (!mm) { /* next has no mm of its own: borrow prev's active_mm */
  3608. +       next->active_mm = oldmm;
  3609. +       atomic_inc(&oldmm->mm_count);
  3610. +       enter_lazy_tlb(oldmm, next);
  3611. +   } else
  3612. +       switch_mm(oldmm, mm, next);
  3613. +
  3614. +   if (!prev->mm) { /* prev was borrowing: hand oldmm to finish_task_switch() */
  3615. +       prev->active_mm = NULL;
  3616. +       rq->prev_mm = oldmm;
  3617. +   }
  3618. +   /*
  3619. +    * The runqueue lock will be released by the next task (which
  3620. +    * is an invalid locking op but in the case of the scheduler
  3621. +    * it's an obvious special-case), so we do an early lockdep
  3622. +    * release here:
  3623. +    */
  3624. +#ifndef __ARCH_WANT_UNLOCKED_CTXSW
  3625. +   spin_release(&grq.urw.lock.dep_map, 1, _THIS_IP_);
  3626. +#endif
  3627. +
  3628. +   /* Here we just switch the register state and the stack. */
  3629. +   switch_to(prev, next, prev);
  3630. +
  3631. +   barrier();
  3632. +   /*
  3633. +    * this_rq must be evaluated again because prev may have moved
  3634. +    * CPUs since it called schedule(), thus the 'rq' on its stack
  3635. +    * frame will be invalid.
  3636. +    */
  3637. +   finish_task_switch(this_rq(), prev);
  3638. +}
  3639. +
  3640. +/*
  3641. + * nr_running, nr_uninterruptible and nr_context_switches:
  3642. + *
  3643. + * externally visible scheduler statistics: current number of runnable
  3644. + * threads, current number of uninterruptible-sleeping threads, total
  3645. + * number of context switches performed since bootup. All are read
  3646. + * without grabbing the grq lock; an occasional inaccurate result doesn't
  3647. + * matter so long as it's not negative, so negative reads are clamped.
  3648. + */
  3649. +unsigned long nr_running(void)
  3650. +{
  3651. +   long nr = grq.nr_running;
  3652. +
  3653. +   if (unlikely(nr < 0))
  3654. +       nr = 0; /* clamp a negative read to zero */
  3655. +   return (unsigned long)nr;
  3656. +}
  3657. +
  3658. +unsigned long nr_uninterruptible(void)
  3659. +{
  3660. +   long nu = grq.nr_uninterruptible;
  3661. +
  3662. +   if (unlikely(nu < 0))
  3663. +       nu = 0; /* clamp a negative read to zero */
  3664. +   return nu;
  3665. +}
  3666. +
  3667. +unsigned long long nr_context_switches(void)
  3668. +{
  3669. +   long long ns = grq.nr_switches;
  3670. +
  3671. +   /* This is of course impossible, but report 1 if it ever happens */
  3672. +   if (unlikely(ns < 0))
  3673. +       ns = 1;
  3674. +   return (unsigned long long)ns;
  3675. +}
  3676. +
  3677. +unsigned long nr_iowait(void)
  3678. +{
  3679. +   unsigned long i, sum = 0;
  3680. +
  3681. +   for_each_possible_cpu(i) /* add up nr_iowait of every possible cpu's rq */
  3682. +       sum += atomic_read(&cpu_rq(i)->nr_iowait);
  3683. +
  3684. +   return sum;
  3685. +}
  3686. +
  3687. +unsigned long nr_iowait_cpu(int cpu)
  3688. +{
  3689. +   struct rq *this = cpu_rq(cpu); /* rq of @cpu, not necessarily the local one */
  3690. +   return atomic_read(&this->nr_iowait);
  3691. +}
  3692. +
  3693. +unsigned long nr_active(void)
  3694. +{
  3695. +   return nr_running() + nr_uninterruptible(); /* runnable + uninterruptible */
  3696. +}
  3697. +
  3698. +/* Beyond a task running on this CPU, load is equal everywhere on BFS */
  3699. +unsigned long this_cpu_load(void)
  3700. +{
  3701. +   return this_rq()->rq_running + /* NOTE(review): grq.noc presumably = nr cpus */
  3702. +       ((queued_notrunning() + nr_uninterruptible()) / grq.noc);
  3703. +}
  3704. +
  3705. +/* Variables and functions for calc_load */
  3706. +static unsigned long calc_load_update; /* jiffies value of the next update */
  3707. +unsigned long avenrun[3]; /* updated with EXP_1, EXP_5 and EXP_15 respectively */
  3708. +EXPORT_SYMBOL(avenrun);
  3709. +
  3710. +/**
  3711. + * get_avenrun - get the load average array
  3712. + * @loads: pointer to dest load array (three entries are written)
  3713. + * @offset:    offset to add to each entry before shifting
  3714. + * @shift: shift count to shift the result left
  3715. + *
  3716. + * These values are estimates at best, so no need for locking.
  3717. + */
  3718. +void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
  3719. +{
  3720. +   loads[0] = (avenrun[0] + offset) << shift;
  3721. +   loads[1] = (avenrun[1] + offset) << shift;
  3722. +   loads[2] = (avenrun[2] + offset) << shift;
  3723. +}
  3724. +
  3725. +static unsigned long
  3726. +calc_load(unsigned long load, unsigned long exp, unsigned long active)
  3727. +{
  3728. +   load *= exp; /* weight the previous value by exp */
  3729. +   load += active * (FIXED_1 - exp); /* weight the new sample by the rest */
  3730. +   return load >> FSHIFT; /* NOTE(review): presumably FIXED_1 == 1 << FSHIFT */
  3731. +}
  3732. +
  3733. +/*
  3734. + * calc_global_load - update the avenrun load estimates every LOAD_FREQ jiffies.
  3735. + */
  3736. +void calc_global_load(unsigned long ticks)
  3737. +{
  3738. +   long active; /* note: @ticks is not used by this implementation */
  3739. +
  3740. +   if (time_before(jiffies, calc_load_update))
  3741. +       return; /* not yet time for the next update */
  3742. +   active = nr_active() * FIXED_1;
  3743. +
  3744. +   avenrun[0] = calc_load(avenrun[0], EXP_1, active);
  3745. +   avenrun[1] = calc_load(avenrun[1], EXP_5, active);
  3746. +   avenrun[2] = calc_load(avenrun[2], EXP_15, active);
  3747. +
  3748. +   calc_load_update = jiffies + LOAD_FREQ;
  3749. +}
  3750. +
  3751. +DEFINE_PER_CPU(struct kernel_stat, kstat);
  3752. +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
  3753. +
  3754. +EXPORT_PER_CPU_SYMBOL(kstat); /* both per-cpu variables are exported */
  3755. +EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
  3756. +
  3757. +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
  3758. +
  3759. +/*
  3760. + * There are no locks covering percpu hardirq/softirq time.
  3761. + * They are only modified in account_system_vtime, on the corresponding
  3762. + * CPU with interrupts disabled. So, writes are safe.
  3763. + * They are read and saved off onto struct rq in update_rq_clock().
  3764. + * This may result in another CPU reading this CPU's irq time and can
  3765. + * race with irq/account_system_vtime on this CPU. We would either get old
  3766. + * or new value with a side effect of accounting a slice of irq time to wrong
  3767. + * task when irq is in progress while we read rq->clock. That is a worthy
  3768. + * compromise in place of having locks on each irq in account_system_vtime.
  3769. + */
  3770. +static DEFINE_PER_CPU(u64, cpu_hardirq_time);
  3771. +static DEFINE_PER_CPU(u64, cpu_softirq_time);
  3772. +
  3773. +static DEFINE_PER_CPU(u64, irq_start_time);
  3774. +static int sched_clock_irqtime; /* non-zero enables account_system_vtime() */
  3775. +
  3776. +void enable_sched_clock_irqtime(void)
  3777. +{
  3778. +   sched_clock_irqtime = 1;
  3779. +}
  3780. +
  3781. +void disable_sched_clock_irqtime(void)
  3782. +{
  3783. +   sched_clock_irqtime = 0;
  3784. +}
  3785. +
  3786. +#ifndef CONFIG_64BIT
  3787. +static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
  3788. +
  3789. +static inline void irq_time_write_begin(void)
  3790. +{
  3791. +   __this_cpu_inc(irq_time_seq.sequence); /* open the write section */
  3792. +   smp_wmb();
  3793. +}
  3794. +
  3795. +static inline void irq_time_write_end(void)
  3796. +{
  3797. +   smp_wmb();
  3798. +   __this_cpu_inc(irq_time_seq.sequence); /* close the write section */
  3799. +}
  3800. +
  3801. +static inline u64 irq_time_read(int cpu)
  3802. +{
  3803. +   u64 irq_time;
  3804. +   unsigned seq;
  3805. +
  3806. +   do { /* re-read until the sequence count is stable across the read */
  3807. +       seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
  3808. +       irq_time = per_cpu(cpu_softirq_time, cpu) +
  3809. +              per_cpu(cpu_hardirq_time, cpu);
  3810. +   } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
  3811. +
  3812. +   return irq_time;
  3813. +}
  3814. +#else /* CONFIG_64BIT */
  3815. +static inline void irq_time_write_begin(void)
  3816. +{ /* NOTE(review): no seqcount on 64-bit, presumably u64 access is atomic */
  3817. +}
  3818. +
  3819. +static inline void irq_time_write_end(void)
  3820. +{ /* no-op counterpart of irq_time_write_begin() */
  3821. +}
  3822. +
  3823. +static inline u64 irq_time_read(int cpu)
  3824. +{
  3825. +   return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
  3826. +}
  3827. +#endif /* CONFIG_64BIT */
  3828. +
  3829. +/*
  3830. + * Called before incrementing preempt_count on {soft,}irq_enter
  3831. + * and before decrementing preempt_count on {soft,}irq_exit.
  3832. + */
  3833. +void account_system_vtime(struct task_struct *curr)
  3834. +{
  3835. +   unsigned long flags;
  3836. +   s64 delta;
  3837. +   int cpu;
  3838. +
  3839. +   if (!sched_clock_irqtime)
  3840. +       return;
  3841. +
  3842. +   local_irq_save(flags);
  3843. +
  3844. +   cpu = smp_processor_id();
  3845. +   delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
  3846. +   __this_cpu_add(irq_start_time, delta); /* now equals the clock value just read */
  3847. +
  3848. +   irq_time_write_begin();
  3849. +   /*
  3850. +    * We do not account for softirq time from ksoftirqd here.
  3851. +    * We want to continue accounting softirq time to ksoftirqd thread
  3852. +    * in that case, so as not to confuse scheduler with a special task
  3853. +    * that does not consume any time, but still wants to run.
  3854. +    */
  3855. +   if (hardirq_count())
  3856. +       __this_cpu_add(cpu_hardirq_time, delta);
  3857. +   else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
  3858. +       __this_cpu_add(cpu_softirq_time, delta);
  3859. +
  3860. +   irq_time_write_end();
  3861. +   local_irq_restore(flags);
  3862. +}
  3863. +EXPORT_SYMBOL_GPL(account_system_vtime);
  3864. +
  3865. +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
  3866. +
  3867. +#ifdef CONFIG_PARAVIRT
  3868. +static inline u64 steal_ticks(u64 steal)
  3869. +{
  3870. +   if (unlikely(steal > NSEC_PER_SEC)) /* more than a second: real division */
  3871. +       return div_u64(steal, TICK_NSEC);
  3872. +
  3873. +   return __iter_div_u64_rem(steal, TICK_NSEC, &steal); /* remainder unused */
  3874. +}
  3875. +#endif
  3876. +
  3877. +static void update_rq_clock_task(struct rq *rq, s64 delta)
  3878. +{ /* Advance rq->clock_task by @delta minus irq time and paravirt steal time. */
  3879. +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
  3880. +   s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
  3881. +
  3882. +   /*
  3883. +    * Since irq_time is only updated on {soft,}irq_exit, we might run into
  3884. +    * this case when a previous update_rq_clock() happened inside a
  3885. +    * {soft,}irq region.
  3886. +    *
  3887. +    * When this happens, we stop ->clock_task and only update the
  3888. +    * prev_irq_time stamp to account for the part that fit, so that a next
  3889. +    * update will consume the rest. This ensures ->clock_task is
  3890. +    * monotonic.
  3891. +    *
  3892. +    * It does however cause some slight misattribution of {soft,}irq
  3893. +    * time, a more accurate solution would be to update the irq_time using
  3894. +    * the current rq->clock timestamp, except that would require using
  3895. +    * atomic ops.
  3896. +    */
  3897. +   if (irq_delta > delta)
  3898. +       irq_delta = delta;
  3899. +
  3900. +   rq->prev_irq_time += irq_delta;
  3901. +   delta -= irq_delta;
  3902. +#endif
  3903. +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
  3904. +   if (static_key_false((&paravirt_steal_rq_enabled))) {
  3905. +       u64 st, steal = paravirt_steal_clock(cpu_of(rq));
  3906. +
  3907. +       steal -= rq->prev_steal_time_rq;
  3908. +
  3909. +       if (unlikely(steal > delta))
  3910. +           steal = delta;
  3911. +
  3912. +       st = steal_ticks(steal);
  3913. +       steal = st * TICK_NSEC; /* only whole ticks of steal are consumed */
  3914. +
  3915. +       rq->prev_steal_time_rq += steal;
  3916. +
  3917. +       delta -= steal;
  3918. +   }
  3919. +#endif
  3920. +
  3921. +   rq->clock_task += delta; /* what remains after irq and steal time */
  3922. +}
  3923. +
  3924. +#ifndef nsecs_to_cputime /* fallback used only when not already defined */
  3925. +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
  3926. +#endif
  3927. +
  3928. +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
  3929. +static void irqtime_account_hi_si(void)
  3930. +{
  3931. +   u64 *cpustat = kcpustat_this_cpu->cpustat;
  3932. +   u64 latest_ns; /* despite the name, holds a nsecs_to_cputime64() result */
  3933. +
  3934. +   latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
  3935. +   if (latest_ns > cpustat[CPUTIME_IRQ]) /* add at most one jiffy per call */
  3936. +       cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
  3937. +
  3938. +   latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
  3939. +   if (latest_ns > cpustat[CPUTIME_SOFTIRQ]) /* add at most one jiffy per call */
  3940. +       cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
  3941. +}
  3942. +#else /* CONFIG_IRQ_TIME_ACCOUNTING */
  3943. +
  3944. +#define sched_clock_irqtime    (0)
  3945. +
  3946. +static inline void irqtime_account_hi_si(void)
  3947. +{ /* no-op stub when irq time accounting is not configured */
  3948. +}
  3949. +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
  3950. +
  3951. +static __always_inline bool steal_account_process_tick(void)
  3952. +{
  3953. +#ifdef CONFIG_PARAVIRT
  3954. +   if (static_key_false(&paravirt_steal_enabled)) {
  3955. +       u64 steal, st = 0;
  3956. +
  3957. +       steal = paravirt_steal_clock(smp_processor_id());
  3958. +       steal -= this_rq()->prev_steal_time; /* steal time not yet accounted */
  3959. +
  3960. +       st = steal_ticks(steal);
  3961. +       this_rq()->prev_steal_time += st * TICK_NSEC; /* whole ticks only */
  3962. +
  3963. +       account_steal_time(st);
  3964. +       return st; /* true when at least one whole tick was stolen */
  3965. +   }
  3966. +#endif
  3967. +   return false;
  3968. +}
  3969. +
  3970. +/*
  3971. + * On each tick, see what percentage of that tick was attributed to each
  3972. + * component and add the percentage to the _pc values. Once a _pc value has
  3973. + * accumulated one tick's worth, account for that. This means the total
  3974. + * percentage of load components will always be 128 (pseudo 100) per tick.
  3975. + */
  3976. +static void pc_idle_time(struct rq *rq, unsigned long pc)
  3977. +{
  3978. +   u64 *cpustat = kcpustat_this_cpu->cpustat;
  3979. +
  3980. +   if (atomic_read(&rq->nr_iowait) > 0) { /* idle with io waiters: iowait */
  3981. +       rq->iowait_pc += pc;
  3982. +       if (rq->iowait_pc >= 128) {
  3983. +           rq->iowait_pc %= 128; /* keep the remainder for later calls */
  3984. +           cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy;
  3985. +       }
  3986. +   } else {
  3987. +       rq->idle_pc += pc;
  3988. +       if (rq->idle_pc >= 128) {
  3989. +           rq->idle_pc %= 128; /* keep the remainder for later calls */
  3990. +           cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy;
  3991. +       }
  3992. +   }
  3993. +}
  3994. +
  3995. +static void
  3996. +pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
  3997. +          unsigned long pc, unsigned long ns)
  3998. +{
  3999. +   u64 *cpustat = kcpustat_this_cpu->cpustat;
  4000. +   cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
  4001. +
  4002. +   p->stime_pc += pc;
  4003. +   if (p->stime_pc >= 128) { /* a full tick of system time has accumulated */
  4004. +       p->stime_pc %= 128;
  4005. +       p->stime += (__force u64)cputime_one_jiffy;
  4006. +       p->stimescaled += one_jiffy_scaled;
  4007. +       account_group_system_time(p, cputime_one_jiffy);
  4008. +       acct_update_integrals(p);
  4009. +   }
  4010. +   p->sched_time += ns;
  4011. +
  4012. +   if (hardirq_count() - hardirq_offset) { /* per-cpu split: irq/softirq/system */
  4013. +       rq->irq_pc += pc;
  4014. +       if (rq->irq_pc >= 128) {
  4015. +           rq->irq_pc %= 128;
  4016. +           cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
  4017. +       }
  4018. +   } else if (in_serving_softirq()) {
  4019. +       rq->softirq_pc += pc;
  4020. +       if (rq->softirq_pc >= 128) {
  4021. +           rq->softirq_pc %= 128;
  4022. +           cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
  4023. +       }
  4024. +   } else {
  4025. +       rq->system_pc += pc;
  4026. +       if (rq->system_pc >= 128) {
  4027. +           rq->system_pc %= 128;
  4028. +           cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy;
  4029. +       }
  4030. +   }
  4031. +}
  4032. +
  4033. +static void pc_user_time(struct rq *rq, struct task_struct *p,
  4034. +            unsigned long pc, unsigned long ns)
  4035. +{
  4036. +   u64 *cpustat = kcpustat_this_cpu->cpustat;
  4037. +   cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
  4038. +
  4039. +   p->utime_pc += pc;
  4040. +   if (p->utime_pc >= 128) { /* a full tick of user time has accumulated */
  4041. +       p->utime_pc %= 128;
  4042. +       p->utime += (__force u64)cputime_one_jiffy;
  4043. +       p->utimescaled += one_jiffy_scaled;
  4044. +       account_group_user_time(p, cputime_one_jiffy);
  4045. +       acct_update_integrals(p);
  4046. +   }
  4047. +   p->sched_time += ns;
  4048. +
  4049. +   if (this_cpu_ksoftirqd() == p) {
  4050. +       /*
  4051. +        * ksoftirqd time does not get accounted in cpu_softirq_time.
  4052. +        * So, we have to handle it separately here.
  4053. +        */
  4054. +       rq->softirq_pc += pc;
  4055. +       if (rq->softirq_pc >= 128) {
  4056. +           rq->softirq_pc %= 128;
  4057. +           cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
  4058. +       }
  4059. +   }
  4060. +
  4061. +   if (TASK_NICE(p) > 0 || idleprio_task(p)) { /* positive nice or idleprio */
  4062. +       rq->nice_pc += pc;
  4063. +       if (rq->nice_pc >= 128) {
  4064. +           rq->nice_pc %= 128;
  4065. +           cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy;
  4066. +       }
  4067. +   } else {
  4068. +       rq->user_pc += pc;
  4069. +       if (rq->user_pc >= 128) {
  4070. +           rq->user_pc %= 128;
  4071. +           cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy;
  4072. +       }
  4073. +   }
  4074. +}
  4075. +
  4076. +/*
  4077. + * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
  4078. + * shifts instead of 100. The argument is parenthesized for expression safety.
  4079. + */
  4080. +#define NS_TO_PC(NS)   ((NS) * 128 / JIFFY_NS)
  4081. +
  4082. +/*
  4083. + * This is called on clock ticks and on context switches.
  4084. + * Bank in p->sched_time the ns elapsed since the last tick or switch.
  4085. + * CPU scheduler quota accounting is also performed here in microseconds.
  4086. + */
  4087. +static void
  4088. +update_cpu_clock(struct rq *rq, struct task_struct *p, bool tick)
  4089. +{
  4090. +   long account_ns = rq->clock - rq->timekeep_clock;
  4091. +   struct task_struct *idle = rq->idle;
  4092. +   unsigned long account_pc;
  4093. +
  4094. +   if (unlikely(account_ns < 0))
  4095. +       account_ns = 0;
  4096. +
  4097. +   account_pc = NS_TO_PC(account_ns);
  4098. +
  4099. +   if (tick) {
  4100. +       int user_tick;
  4101. +
  4102. +       /* Accurate tick timekeeping */
  4103. +       rq->account_pc += account_pc - 128;
  4104. +       if (rq->account_pc < 0) {
  4105. +           /*
  4106. +            * Small errors in micro accounting may not make the
  4107. +            * accounting add up to 128 each tick so we keep track
  4108. +            * of the percentage and round it up when less than 128
  4109. +            */
  4110. +           account_pc += -rq->account_pc;
  4111. +           rq->account_pc = 0;
  4112. +       }
  4113. +       if (steal_account_process_tick())
  4114. +           goto ts_account;
  4115. +
  4116. +       user_tick = user_mode(get_irq_regs());
  4117. +
  4118. +       if (user_tick)
  4119. +           pc_user_time(rq, p, account_pc, account_ns);
  4120. +       else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
  4121. +           pc_system_time(rq, p, HARDIRQ_OFFSET,
  4122. +                      account_pc, account_ns);
  4123. +       else
  4124. +           pc_idle_time(rq, account_pc);
  4125. +
  4126. +       if (sched_clock_irqtime)
  4127. +           irqtime_account_hi_si();
  4128. +   } else {
  4129. +       /* Accurate subtick timekeeping */
  4130. +       rq->account_pc += account_pc;
  4131. +       if (p == idle)
  4132. +           pc_idle_time(rq, account_pc);
  4133. +       else
  4134. +           pc_user_time(rq, p, account_pc, account_ns);
  4135. +   }
  4136. +
  4137. +ts_account:
  4138. +   /* time_slice accounting is done in usecs to avoid overflow on 32bit */
  4139. +   if (rq->rq_policy != SCHED_FIFO && p != idle) {
  4140. +       s64 time_diff = rq->clock - rq->rq_last_ran;
  4141. +
  4142. +       niffy_diff(&time_diff, 1);
  4143. +       rq->rq_time_slice -= NS_TO_US(time_diff);
  4144. +   }
  4145. +   rq->rq_last_ran = rq->timekeep_clock = rq->clock;
  4146. +}
  4147. +
  4148. +/*
  4149. + * Return any ns on the sched_clock that have not yet been accounted in
  4150. + * @p in case that task is currently running.
  4151. + *
  4152. + * Called with task_grq_ulock() held. Returns with a downgraded rlock.
  4153. + */
  4154. +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
  4155. +{
  4156. +   u64 ns = 0;
  4157. +
  4158. +   if (p == rq->curr) {
  4159. +       grq_upgrade();
  4160. +       update_clocks(rq);
  4161. +       grq_wdowngrade();
  4162. +       ns = rq->clock_task - rq->rq_last_ran;
  4163. +       if (unlikely((s64)ns < 0))
  4164. +           ns = 0;
  4165. +   } else
  4166. +       grq_udowngrade();
  4167. +
  4168. +   return ns;
  4169. +}
  4170. +
  4171. +/* Note the intentional unbalanced locking, lock ulock and unlock rlock. */
  4172. +unsigned long long task_delta_exec(struct task_struct *p)
  4173. +{
  4174. +   unsigned long flags;
  4175. +   struct rq *rq;
  4176. +   u64 ns;
  4177. +
  4178. +   rq = task_grq_ulock(p, &flags);
  4179. +   ns = do_task_delta_exec(p, rq);
  4180. +   task_grq_runlock(&flags);
  4181. +
  4182. +   return ns;
  4183. +}
  4184. +
  4185. +/*
  4186. + * Return accounted runtime for the task.
  4187. + * In case the task is currently running, return the runtime plus current's
  4188. + * pending runtime that has not been accounted yet.
  4189. + *
  4190. + * Note the intentional unbalanced locking, lock ulock and unlock rlock.
  4191. + */
  4192. +unsigned long long task_sched_runtime(struct task_struct *p)
  4193. +{
  4194. +   unsigned long flags;
  4195. +   struct rq *rq;
  4196. +   u64 ns;
  4197. +
  4198. +   rq = task_grq_ulock(p, &flags);
  4199. +   ns = p->sched_time + do_task_delta_exec(p, rq);
  4200. +   task_grq_runlock(&flags);
  4201. +
  4202. +   return ns;
  4203. +}
  4204. +
  4205. +/* Compatibility crap */
  4206. +void account_user_time(struct task_struct *p, cputime_t cputime,
  4207. +              cputime_t cputime_scaled)
  4208. +{
  4209. +}
  4210. +
  4211. +void account_idle_time(cputime_t cputime)
  4212. +{
  4213. +}
  4214. +
  4215. +void update_cpu_load_nohz(void)
  4216. +{
  4217. +}
  4218. +
  4219. +#ifdef CONFIG_NO_HZ
  4220. +void calc_load_enter_idle(void)
  4221. +{
  4222. +}
  4223. +
  4224. +void calc_load_exit_idle(void)
  4225. +{
  4226. +}
  4227. +#endif /* CONFIG_NO_HZ */
  4228. +
  4229. +/*
  4230. + * Account guest cpu time to a process.
  4231. + * @p: the process that the cpu time gets accounted to
  4232. + * @cputime: the cpu time spent in virtual machine since the last update
  4233. + * @cputime_scaled: cputime scaled by cpu frequency
  4234. + */
  4235. +static void account_guest_time(struct task_struct *p, cputime_t cputime,
  4236. +                  cputime_t cputime_scaled)
  4237. +{
  4238. +   u64 *cpustat = kcpustat_this_cpu->cpustat;
  4239. +
  4240. +   /* Add guest time to process. */
  4241. +   p->utime += (__force u64)cputime;
  4242. +   p->utimescaled += (__force u64)cputime_scaled;
  4243. +   account_group_user_time(p, cputime);
  4244. +   p->gtime += (__force u64)cputime;
  4245. +
  4246. +   /* Add guest time to cpustat. */
  4247. +   if (TASK_NICE(p) > 0) {
  4248. +       cpustat[CPUTIME_NICE] += (__force u64)cputime;
  4249. +       cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
  4250. +   } else {
  4251. +       cpustat[CPUTIME_USER] += (__force u64)cputime;
  4252. +       cpustat[CPUTIME_GUEST] += (__force u64)cputime;
  4253. +   }
  4254. +}
  4255. +
  4256. +/*
  4257. + * Account system cpu time to a process and desired cpustat field
  4258. + * @p: the process that the cpu time gets accounted to
  4259. + * @cputime: the cpu time spent in kernel space since the last update
  4260. + * @cputime_scaled: cputime scaled by cpu frequency
  4261. + * @target_cputime64: pointer to cpustat field that has to be updated
  4262. + */
  4263. +static inline
  4264. +void __account_system_time(struct task_struct *p, cputime_t cputime,
  4265. +           cputime_t cputime_scaled, cputime64_t *target_cputime64)
  4266. +{
  4267. +   /* Add system time to process. */
  4268. +   p->stime += (__force u64)cputime;
  4269. +   p->stimescaled += (__force u64)cputime_scaled;
  4270. +   account_group_system_time(p, cputime);
  4271. +
  4272. +   /* Add system time to cpustat. */
  4273. +   *target_cputime64 += (__force u64)cputime;
  4274. +
  4275. +   /* Account for system time used */
  4276. +   acct_update_integrals(p);
  4277. +}
  4278. +
  4279. +/*
  4280. + * Account system cpu time to a process.
  4281. + * @p: the process that the cpu time gets accounted to
  4282. + * @hardirq_offset: the offset to subtract from hardirq_count()
  4283. + * @cputime: the cpu time spent in kernel space since the last update
  4284. + * @cputime_scaled: cputime scaled by cpu frequency
  4285. + * This is for guest only now.
  4286. + */
  4287. +void account_system_time(struct task_struct *p, int hardirq_offset,
  4288. +            cputime_t cputime, cputime_t cputime_scaled)
  4289. +{
  4290. +
  4291. +   if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
  4292. +       account_guest_time(p, cputime, cputime_scaled);
  4293. +}
  4294. +
  4295. +/*
  4296. + * Account for involuntary wait time.
  4297. + * @cputime: the cpu time spent in involuntary wait
  4298. + */
  4299. +void account_steal_time(cputime_t cputime)
  4300. +{
  4301. +   u64 *cpustat = kcpustat_this_cpu->cpustat;
  4302. +
  4303. +   cpustat[CPUTIME_STEAL] += (__force u64)cputime;
  4304. +}
  4305. +
  4306. +/*
  4307. + * Account for idle time.
  4308. + * @cputime: the cpu time spent in idle wait
  4309. + */
  4310. +static void account_idle_times(cputime_t cputime)
  4311. +{
  4312. +   u64 *cpustat = kcpustat_this_cpu->cpustat;
  4313. +   struct rq *rq = this_rq();
  4314. +
  4315. +   if (atomic_read(&rq->nr_iowait) > 0)
  4316. +       cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
  4317. +   else
  4318. +       cpustat[CPUTIME_IDLE] += (__force u64)cputime;
  4319. +}
  4320. +
  4321. +#ifndef CONFIG_VIRT_CPU_ACCOUNTING
  4322. +
  4323. +void account_process_tick(struct task_struct *p, int user_tick)
  4324. +{
  4325. +}
  4326. +
  4327. +/*
  4328. + * Account multiple ticks of steal time.
  4329. + * @p: the process from which the cpu time has been stolen
  4330. + * @ticks: number of stolen ticks
  4331. + */
  4332. +void account_steal_ticks(unsigned long ticks)
  4333. +{
  4334. +   account_steal_time(jiffies_to_cputime(ticks));
  4335. +}
  4336. +
  4337. +/*
  4338. + * Account multiple ticks of idle time.
  4339. + * @ticks: number of idle ticks
  4340. + */
  4341. +void account_idle_ticks(unsigned long ticks)
  4342. +{
  4343. +   account_idle_times(jiffies_to_cputime(ticks));
  4344. +}
  4345. +#endif
  4346. +
  4347. +static inline void grq_iso_lock(void)
  4348. +   __acquires(grq.iso_lock)
  4349. +{
  4350. +   raw_spin_lock(&grq.iso_lock);
  4351. +}
  4352. +
  4353. +static inline void grq_iso_unlock(void)
  4354. +   __releases(grq.iso_lock)
  4355. +{
  4356. +   raw_spin_unlock(&grq.iso_lock);
  4357. +}
  4358. +
  4359. +/*
  4360. + * Functions to test for when SCHED_ISO tasks have used their allocated
  4361. + * quota as real time scheduling and convert them back to SCHED_NORMAL.
  4362. + * Where possible, the data is tested lockless, to avoid grabbing iso_lock
  4363. + * because the occasional inaccurate result won't matter. However the
  4364. + * tick data is only ever modified under lock. iso_refractory is only simply
  4365. + * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
  4366. + */
  4367. +static bool set_iso_refractory(void)
  4368. +{
  4369. +   grq.iso_refractory = true;
  4370. +   return grq.iso_refractory;
  4371. +}
  4372. +
  4373. +static bool clear_iso_refractory(void)
  4374. +{
  4375. +   grq.iso_refractory = false;
  4376. +   return grq.iso_refractory;
  4377. +}
  4378. +
  4379. +/*
  4380. + * Test if SCHED_ISO tasks have run longer than their allotted period as RT
  4381. + * tasks and set the refractory flag if necessary. There is 10% hysteresis
  4382. + * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
  4383. + * slow division.
  4384. + */
  4385. +static bool test_ret_isorefractory(struct rq *rq)
  4386. +{
  4387. +   if (likely(!grq.iso_refractory)) {
  4388. +       if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
  4389. +           return set_iso_refractory();
  4390. +   } else {
  4391. +       if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
  4392. +           return clear_iso_refractory();
  4393. +   }
  4394. +   return grq.iso_refractory;
  4395. +}
  4396. +
  4397. +static void iso_tick(void)
  4398. +{
  4399. +   grq_iso_lock();
  4400. +   grq.iso_ticks += 100;
  4401. +   grq_iso_unlock();
  4402. +}
  4403. +
  4404. +/* No SCHED_ISO task was running so decrease grq.iso_ticks */
  4405. +static inline void no_iso_tick(void)
  4406. +{
  4407. +   if (grq.iso_ticks) {
  4408. +       grq_iso_lock();
  4409. +       grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
  4410. +       if (unlikely(grq.iso_refractory && grq.iso_ticks <
  4411. +           ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
  4412. +           clear_iso_refractory();
  4413. +       grq_iso_unlock();
  4414. +   }
  4415. +}
  4416. +
  4417. +/* This manages tasks that have run out of timeslice during a scheduler_tick */
  4418. +static void task_running_tick(struct rq *rq)
  4419. +{
  4420. +   struct task_struct *p;
  4421. +
  4422. +   /*
  4423. +    * If a SCHED_ISO task is running we increment the iso_ticks. In
  4424. +    * order to prevent SCHED_ISO tasks from causing starvation in the
  4425. +    * presence of true RT tasks we account those as iso_ticks as well.
  4426. +    */
  4427. +   if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
  4428. +       if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
  4429. +           iso_tick();
  4430. +   } else
  4431. +       no_iso_tick();
  4432. +
  4433. +   if (iso_queue(rq)) {
  4434. +       if (unlikely(test_ret_isorefractory(rq))) {
  4435. +           if (rq_running_iso(rq)) {
  4436. +               /*
  4437. +                * SCHED_ISO task is running as RT and limit
  4438. +                * has been hit. Force it to reschedule as
  4439. +                * SCHED_NORMAL by zeroing its time_slice
  4440. +                */
  4441. +               rq->rq_time_slice = 0;
  4442. +           }
  4443. +       }
  4444. +   }
  4445. +
  4446. +   /* SCHED_FIFO tasks never run out of timeslice. */
  4447. +   if (rq->rq_policy == SCHED_FIFO)
  4448. +       return;
  4449. +   /*
  4450. +    * Tasks that were scheduled in the first half of a tick are not
  4451. +    * allowed to run into the 2nd half of the next tick if they will
  4452. +    * run out of time slice in the interim. Otherwise, if they have
  4453. +    * less than RESCHED_US μs of time slice left they will be rescheduled.
  4454. +    */
  4455. +   if (rq->dither) {
  4456. +       if (rq->rq_time_slice > HALF_JIFFY_US)
  4457. +           return;
  4458. +       else
  4459. +           rq->rq_time_slice = 0;
  4460. +   } else if (rq->rq_time_slice >= RESCHED_US)
  4461. +           return;
  4462. +
  4463. +   /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
  4464. +   p = rq->curr;
  4465. +   grq_wlock();
  4466. +   requeue_task(p);
  4467. +   set_tsk_need_resched(p);
  4468. +   grq_wunlock();
  4469. +}
  4470. +
  4471. +void wake_up_idle_cpu(int cpu);
  4472. +
  4473. +/*
  4474. + * This function gets called by the timer code, with HZ frequency.
  4475. + * We call it with interrupts disabled. The data modified is all
  4476. + * local to struct rq so we don't need to grab grq lock.
  4477. + */
  4478. +void scheduler_tick(void)
  4479. +{
  4480. +   int cpu __maybe_unused = smp_processor_id();
  4481. +   struct rq *rq = cpu_rq(cpu);
  4482. +
  4483. +   sched_clock_tick();
  4484. +   /* grq lock not grabbed, so only update rq clock */
  4485. +   update_rq_clock(rq);
  4486. +   update_cpu_clock(rq, rq->curr, true);
  4487. +   if (!rq_idle(rq))
  4488. +       task_running_tick(rq);
  4489. +   else
  4490. +       no_iso_tick();
  4491. +   rq->last_tick = rq->clock;
  4492. +   perf_event_task_tick();
  4493. +}
  4494. +
  4495. +notrace unsigned long get_parent_ip(unsigned long addr)
  4496. +{
  4497. +   if (in_lock_functions(addr)) {
  4498. +       addr = CALLER_ADDR2;
  4499. +       if (in_lock_functions(addr))
  4500. +           addr = CALLER_ADDR3;
  4501. +   }
  4502. +   return addr;
  4503. +}
  4504. +
  4505. +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
  4506. +               defined(CONFIG_PREEMPT_TRACER))
  4507. +void __kprobes add_preempt_count(int val)
  4508. +{
  4509. +#ifdef CONFIG_DEBUG_PREEMPT
  4510. +   /*
  4511. +    * Underflow?
  4512. +    */
  4513. +   if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
  4514. +       return;
  4515. +#endif
  4516. +   preempt_count() += val;
  4517. +#ifdef CONFIG_DEBUG_PREEMPT
  4518. +   /*
  4519. +    * Spinlock count overflowing soon?
  4520. +    */
  4521. +   DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
  4522. +               PREEMPT_MASK - 10);
  4523. +#endif
  4524. +   if (preempt_count() == val)
  4525. +       trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
  4526. +}
  4527. +EXPORT_SYMBOL(add_preempt_count);
  4528. +
  4529. +void __kprobes sub_preempt_count(int val)
  4530. +{
  4531. +#ifdef CONFIG_DEBUG_PREEMPT
  4532. +   /*
  4533. +    * Underflow?
  4534. +    */
  4535. +   if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
  4536. +       return;
  4537. +   /*
  4538. +    * Is the spinlock portion underflowing?
  4539. +    */
  4540. +   if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
  4541. +           !(preempt_count() & PREEMPT_MASK)))
  4542. +       return;
  4543. +#endif
  4544. +
  4545. +   if (preempt_count() == val)
  4546. +       trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
  4547. +   preempt_count() -= val;
  4548. +}
  4549. +EXPORT_SYMBOL(sub_preempt_count);
  4550. +#endif
  4551. +
  4552. +/*
  4553. + * Deadline is "now" in niffies + (offset by priority). Setting the deadline
  4554. + * is the key to everything. It distributes cpu fairly amongst tasks of the
  4555. + * same nice value, it proportions cpu according to nice level, it means the
  4556. + * task that last woke up the longest ago has the earliest deadline, thus
  4557. + * ensuring that interactive tasks get low latency on wake up. The CPU
  4558. + * proportion works out to the square of the virtual deadline difference, so
  4559. + * this equation will give nice 19 3% CPU compared to nice 0.
  4560. + */
  4561. +static inline u64 prio_deadline_diff(int user_prio)
  4562. +{
  4563. +   return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
  4564. +}
  4565. +
  4566. +static inline u64 task_deadline_diff(struct task_struct *p)
  4567. +{
  4568. +   return prio_deadline_diff(TASK_USER_PRIO(p));
  4569. +}
  4570. +
  4571. +static inline u64 static_deadline_diff(int static_prio)
  4572. +{
  4573. +   return prio_deadline_diff(USER_PRIO(static_prio));
  4574. +}
  4575. +
  4576. +static inline int longest_deadline_diff(void)
  4577. +{
  4578. +   return prio_deadline_diff(39);
  4579. +}
  4580. +
  4581. +static inline int ms_longest_deadline_diff(void)
  4582. +{
  4583. +   return NS_TO_MS(longest_deadline_diff());
  4584. +}
  4585. +
  4586. +/*
  4587. + * The time_slice is only refilled when it is empty and that is when we set a
  4588. + * new deadline.
  4589. + */
  4590. +static void time_slice_expired(struct task_struct *p)
  4591. +{
  4592. +   p->time_slice = timeslice();
  4593. +   p->deadline = grq.niffies + task_deadline_diff(p);
  4594. +}
  4595. +
  4596. +/*
  4597. + * Timeslices below RESCHED_US are considered as good as expired as there's no
  4598. + * point rescheduling when there's so little time left. SCHED_BATCH tasks
  4599. + * have been flagged as not latency sensitive and likely to be fully CPU
  4600. + * bound so every time they're rescheduled they have their time_slice
  4601. + * refilled, but get a new later deadline to have little effect on
  4602. + * SCHED_NORMAL tasks.
  4603. +
  4604. + */
  4605. +static inline void check_deadline(struct task_struct *p)
  4606. +{
  4607. +   if (p->time_slice < RESCHED_US || batch_task(p))
  4608. +       time_slice_expired(p);
  4609. +}
  4610. +
  4611. +#define BITOP_WORD(nr)     ((nr) / BITS_PER_LONG)
  4612. +
  4613. +/*
  4614. + * Scheduler queue bitmap specific find next bit.
  4615. + */
  4616. +static inline unsigned long
  4617. +next_sched_bit(const unsigned long *addr, unsigned long offset)
  4618. +{
  4619. +   const unsigned long *p;
  4620. +   unsigned long result;
  4621. +   unsigned long size;
  4622. +   unsigned long tmp;
  4623. +
  4624. +   size = PRIO_LIMIT;
  4625. +   if (offset >= size)
  4626. +       return size;
  4627. +
  4628. +   p = addr + BITOP_WORD(offset);
  4629. +   result = offset & ~(BITS_PER_LONG-1);
  4630. +   size -= result;
  4631. +   offset %= BITS_PER_LONG;
  4632. +   if (offset) {
  4633. +       tmp = *(p++);
  4634. +       tmp &= (~0UL << offset);
  4635. +       if (size < BITS_PER_LONG)
  4636. +           goto found_first;
  4637. +       if (tmp)
  4638. +           goto found_middle;
  4639. +       size -= BITS_PER_LONG;
  4640. +       result += BITS_PER_LONG;
  4641. +   }
  4642. +   while (size & ~(BITS_PER_LONG-1)) {
  4643. +       if ((tmp = *(p++)))
  4644. +           goto found_middle;
  4645. +       result += BITS_PER_LONG;
  4646. +       size -= BITS_PER_LONG;
  4647. +   }
  4648. +   if (!size)
  4649. +       return result;
  4650. +   tmp = *p;
  4651. +
  4652. +found_first:
  4653. +   tmp &= (~0UL >> (BITS_PER_LONG - size));
  4654. +   if (tmp == 0UL)     /* Are any bits set? */
  4655. +       return result + size;   /* Nope. */
  4656. +found_middle:
  4657. +   return result + __ffs(tmp);
  4658. +}
  4659. +
  4660. +/*
  4661. + * O(n) lookup of all tasks in the global runqueue. The real brainfuck
  4662. + * of lock contention and O(n). It's not really O(n) as only the queued,
  4663. + * but not running tasks are scanned, and is O(n) queued in the worst case
  4664. + * scenario only because the right task can be found before scanning all of
  4665. + * them.
  4666. + * Tasks are selected in this order:
  4667. + * Real time tasks are selected purely by their static priority and in the
  4668. + * order they were queued, so the lowest value idx, and the first queued task
  4669. + * of that priority value is chosen.
  4670. + * If no real time tasks are found, the SCHED_ISO priority is checked, and
  4671. + * all SCHED_ISO tasks have the same priority value, so they're selected by
  4672. + * the earliest deadline value.
  4673. + * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
  4674. + * earliest deadline.
  4675. + * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
  4676. + * selected by the earliest deadline.
  4677. + */
  4678. +static inline struct
  4679. +task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
  4680. +{
  4681. +   struct task_struct *edt = NULL;
  4682. +   unsigned long idx = -1;
  4683. +
  4684. +   do {
  4685. +       struct list_head *queue;
  4686. +       struct task_struct *p;
  4687. +       u64 earliest_deadline;
  4688. +
  4689. +       idx = next_sched_bit(grq.prio_bitmap, ++idx);
  4690. +       if (idx >= PRIO_LIMIT)
  4691. +           return idle;
  4692. +       queue = grq.queue + idx;
  4693. +
  4694. +       if (idx < MAX_RT_PRIO) {
  4695. +           /* We found an rt task */
  4696. +           list_for_each_entry(p, queue, run_list) {
  4697. +               /* Make sure cpu affinity is ok */
  4698. +               if (needs_other_cpu(p, cpu))
  4699. +                   continue;
  4700. +               edt = p;
  4701. +               goto out_take;
  4702. +           }
  4703. +           /*
  4704. +            * None of the RT tasks at this priority can run on
  4705. +            * this cpu
  4706. +            */
  4707. +           continue;
  4708. +       }
  4709. +
  4710. +       /*
  4711. +        * No rt tasks. Find the earliest deadline task. Now we're in
  4712. +        * O(n) territory.
  4713. +        */
  4714. +       earliest_deadline = ~0ULL;
  4715. +       list_for_each_entry(p, queue, run_list) {
  4716. +           u64 dl;
  4717. +
  4718. +           /* Make sure cpu affinity is ok */
  4719. +           if (needs_other_cpu(p, cpu))
  4720. +               continue;
  4721. +
  4722. +           /*
  4723. +            * Soft affinity happens here by not scheduling a task
  4724. +            * with its sticky flag set that ran on a different CPU
  4725. +            * last when the CPU is scaling, or by greatly biasing
  4726. +            * against its deadline when not, based on cpu cache
  4727. +            * locality.
  4728. +            */
  4729. +           if (task_sticky(p) && task_rq(p) != rq) {
  4730. +               if (scaling_rq(rq))
  4731. +                   continue;
  4732. +               dl = p->deadline << locality_diff(p, rq);
  4733. +           } else
  4734. +               dl = p->deadline;
  4735. +
  4736. +           if (deadline_before(dl, earliest_deadline)) {
  4737. +               earliest_deadline = dl;
  4738. +               edt = p;
  4739. +           }
  4740. +       }
  4741. +   } while (!edt);
  4742. +
  4743. +out_take:
  4744. +   take_task(cpu, edt);
  4745. +   return edt;
  4746. +}
  4747. +
  4748. +
  4749. +/*
  4750. + * Print scheduling while atomic bug:
  4751. + */
  4752. +static noinline void __schedule_bug(struct task_struct *prev)
  4753. +{
  4754. +   if (oops_in_progress)
  4755. +       return;
  4756. +
  4757. +   printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
  4758. +       prev->comm, prev->pid, preempt_count());
  4759. +
  4760. +   debug_show_held_locks(prev);
  4761. +   print_modules();
  4762. +   if (irqs_disabled())
  4763. +       print_irqtrace_events(prev);
  4764. +   dump_stack();
  4765. +}
  4766. +
  4767. +/*
  4768. + * Various schedule()-time debugging checks and statistics:
  4769. + */
  4770. +static inline void schedule_debug(struct task_struct *prev)
  4771. +{
  4772. +   /*
  4773. +    * Test if we are atomic. Since do_exit() needs to call into
  4774. +    * schedule() atomically, we ignore that path for now.
  4775. +    * Otherwise, whine if we are scheduling when we should not be.
  4776. +    */
  4777. +   if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
  4778. +       __schedule_bug(prev);
  4779. +   rcu_sleep_check();
  4780. +
  4781. +   profile_hit(SCHED_PROFILING, __builtin_return_address(0));
  4782. +
  4783. +   schedstat_inc(this_rq(), sched_count);
  4784. +}
  4785. +
  4786. +/*
  4787. + * The currently running task's information is all stored in rq local data
  4788. + * which is only modified by the local CPU, thereby allowing the data to be
  4789. + * changed without grabbing the grq lock.
  4790. + */
  4791. +static inline void set_rq_task(struct rq *rq, struct task_struct *p)
  4792. +{
  4793. +   rq->rq_time_slice = p->time_slice;
  4794. +   rq->rq_deadline = p->deadline;
  4795. +   rq->rq_last_ran = p->last_ran = rq->clock;
  4796. +   rq->rq_policy = p->policy;
  4797. +   rq->rq_prio = p->prio;
  4798. +   if (p != rq->idle)
  4799. +       rq->rq_running = true;
  4800. +   else
  4801. +       rq->rq_running = false;
  4802. +}
  4803. +
  4804. +static void reset_rq_task(struct rq *rq, struct task_struct *p)
  4805. +{
  4806. +   rq->rq_policy = p->policy;
  4807. +   rq->rq_prio = p->prio;
  4808. +}
  4809. +
  4810. +/*
  4811. + * schedule() is the main scheduler function.
  4812. + */
  4813. +asmlinkage void __sched schedule(void)
  4814. +{
  4815. +   struct task_struct *prev, *next, *idle;
  4816. +   unsigned long *switch_count;
  4817. +   bool deactivate;
  4818. +   struct rq *rq;
  4819. +   int cpu;
  4820. +
  4821. +need_resched:
  4822. +   preempt_disable();
  4823. +   cpu = smp_processor_id();
  4824. +   rq = cpu_rq(cpu);
  4825. +   rcu_note_context_switch(cpu);
  4826. +   prev = rq->curr;
  4827. +
  4828. +   deactivate = false;
  4829. +   schedule_debug(prev);
  4830. +
  4831. +   grq_wlock_irq();
  4832. +
  4833. +   switch_count = &prev->nivcsw;
  4834. +   if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
  4835. +       if (unlikely(signal_pending_state(prev->state, prev))) {
  4836. +           prev->state = TASK_RUNNING;
  4837. +       } else {
  4838. +           deactivate = true;
  4839. +           /*
  4840. +            * If a worker is going to sleep, notify and
  4841. +            * ask workqueue whether it wants to wake up a
  4842. +            * task to maintain concurrency.  If so, wake
  4843. +            * up the task.
  4844. +            */
  4845. +           if (prev->flags & PF_WQ_WORKER) {
  4846. +               struct task_struct *to_wakeup;
  4847. +
  4848. +               to_wakeup = wq_worker_sleeping(prev, cpu);
  4849. +               if (to_wakeup) {
  4850. +                   /* This shouldn't happen, but does */
  4851. +                   if (unlikely(to_wakeup == prev))
  4852. +                       deactivate = false;
  4853. +                   else
  4854. +                       try_to_wake_up_local(to_wakeup);
  4855. +               }
  4856. +           }
  4857. +       }
  4858. +       switch_count = &prev->nvcsw;
  4859. +   }
  4860. +
  4861. +   /*
  4862. +    * If we are going to sleep and we have plugged IO queued, make
  4863. +    * sure to submit it to avoid deadlocks.
  4864. +    */
  4865. +   if (unlikely(deactivate && blk_needs_flush_plug(prev))) {
  4866. +       grq_wunlock_irq();
  4867. +       preempt_enable_no_resched();
  4868. +       blk_schedule_flush_plug(prev);
  4869. +       goto need_resched;
  4870. +   }
  4871. +
  4872. +   update_clocks(rq);
  4873. +   update_cpu_clock(rq, prev, false);
  4874. +   if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
  4875. +       rq->dither = false;
  4876. +   else
  4877. +       rq->dither = true;
  4878. +
  4879. +   clear_tsk_need_resched(prev);
  4880. +
  4881. +   idle = rq->idle;
  4882. +   if (idle != prev) {
  4883. +       /* Update all the information stored on struct rq */
  4884. +       prev->time_slice = rq->rq_time_slice;
  4885. +       prev->deadline = rq->rq_deadline;
  4886. +       check_deadline(prev);
  4887. +       prev->last_ran = rq->clock;
  4888. +
  4889. +       /* Task changed affinity off this CPU */
  4890. +       if (needs_other_cpu(prev, cpu))
  4891. +           resched_suitable_idle(prev);
  4892. +       else if (!deactivate) {
  4893. +           if (!queued_notrunning()) {
  4894. +               /*
  4895. +               * We now know prev is the only thing that is
  4896. +               * awaiting CPU so we can bypass rechecking for
  4897. +               * the earliest deadline task and just run it
  4898. +               * again.
  4899. +               */
  4900. +               set_rq_task(rq, prev);
  4901. +               grq_wunlock_irq();
  4902. +               goto rerun_prev_unlocked;
  4903. +           } else
  4904. +               swap_sticky(rq, cpu, prev);
  4905. +       }
  4906. +       return_task(prev, deactivate);
  4907. +   }
  4908. +
  4909. +   if (unlikely(!queued_notrunning())) {
  4910. +       /*
  4911. +        * This CPU is now truly idle as opposed to when idle is
  4912. +        * scheduled as a high priority task in its own right.
  4913. +        */
  4914. +       next = idle;
  4915. +       schedstat_inc(rq, sched_goidle);
  4916. +       set_cpuidle_map(cpu);
  4917. +   } else {
  4918. +       next = earliest_deadline_task(rq, cpu, idle);
  4919. +       if (likely(next->prio != PRIO_LIMIT))
  4920. +           clear_cpuidle_map(cpu);
  4921. +       else
  4922. +           set_cpuidle_map(cpu);
  4923. +   }
  4924. +
  4925. +   if (likely(prev != next)) {
  4926. +       /*
  4927. +        * Don't stick tasks when a real time task is going to run as
  4928. +        * they may literally get stuck.
  4929. +        */
  4930. +       if (rt_task(next))
  4931. +           unstick_task(rq, prev);
  4932. +       set_rq_task(rq, next);
  4933. +       grq.nr_switches++;
  4934. +       prev->on_cpu = false;
  4935. +       next->on_cpu = true;
  4936. +       rq->curr = next;
  4937. +       ++*switch_count;
  4938. +
  4939. +       context_switch(rq, prev, next); /* unlocks the grq */
  4940. +       /*
  4941. +        * The context switch has flipped the stack from under us
  4942. +        * and restored the local variables which were saved when
  4943. +        * this task called schedule() in the past. prev == current
  4944. +        * is still correct, but it can be moved to another cpu/rq.
  4945. +        */
  4946. +       cpu = smp_processor_id();
  4947. +       rq = cpu_rq(cpu);
  4948. +       idle = rq->idle;
  4949. +   } else
  4950. +       grq_wunlock_irq();
  4951. +
  4952. +rerun_prev_unlocked:
  4953. +   sched_preempt_enable_no_resched();
  4954. +   if (unlikely(need_resched()))
  4955. +       goto need_resched;
  4956. +}
  4957. +EXPORT_SYMBOL(schedule);
  4958. +
  4959. +/**
  4960. + * schedule_preempt_disabled - called with preemption disabled
  4961. + *
  4962. + * Returns with preemption disabled. Note: preempt_count must be 1
  4963. + */
  4964. +void __sched schedule_preempt_disabled(void)
  4965. +{
  4966. +   sched_preempt_enable_no_resched();
  4967. +   schedule();
  4968. +   preempt_disable();
  4969. +}
  4970. +
  4971. +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
  4972. +
  4973. +static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
  4974. +{
  4975. +   if (lock->owner != owner)
  4976. +       return false;
  4977. +
  4978. +   /*
  4979. +    * Ensure we emit the owner->on_cpu dereference _after_ checking
  4980. +    * lock->owner still matches owner. If that fails, owner might
  4981. +    * point to free()d memory; if it still matches, the rcu_read_lock()
  4982. +    * ensures the memory stays valid.
  4983. +    */
  4984. +   barrier();
  4985. +
  4986. +   return owner->on_cpu;
  4987. +}
  4988. +
  4989. +/*
  4990. + * Look out! "owner" is an entirely speculative pointer
  4991. + * access and not reliable.
  4992. + */
  4993. +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
  4994. +{
  4995. +   rcu_read_lock();
  4996. +   while (owner_running(lock, owner)) {
  4997. +       if (need_resched())
  4998. +           break;
  4999. +
  5000. +       arch_mutex_cpu_relax();
  5001. +   }
  5002. +   rcu_read_unlock();
  5003. +
  5004. +   /*
  5005. +    * We break out the loop above on need_resched() and when the
  5006. +    * owner changed, which is a sign for heavy contention. Return
  5007. +    * success only when lock->owner is NULL.
  5008. +    */
  5009. +   return lock->owner == NULL;
  5010. +}
  5011. +#endif
  5012. +
  5013. +#ifdef CONFIG_PREEMPT
  5014. +/*
  5015. + * this is the entry point to schedule() from in-kernel preemption
  5016. + * off of preempt_enable. Kernel preemptions off return from interrupt
  5017. + * occur there and call schedule directly.
  5018. + */
  5019. +asmlinkage void __sched notrace preempt_schedule(void)
  5020. +{
  5021. +   struct thread_info *ti = current_thread_info();
  5022. +
  5023. +   /*
  5024. +    * If there is a non-zero preempt_count or interrupts are disabled,
  5025. +    * we do not want to preempt the current task. Just return..
  5026. +    */
  5027. +   if (likely(ti->preempt_count || irqs_disabled()))
  5028. +       return;
  5029. +
  5030. +   do {
  5031. +       add_preempt_count_notrace(PREEMPT_ACTIVE);
  5032. +       schedule();
  5033. +       sub_preempt_count_notrace(PREEMPT_ACTIVE);
  5034. +
  5035. +       /*
  5036. +        * Check again in case we missed a preemption opportunity
  5037. +        * between schedule and now.
  5038. +        */
  5039. +       barrier();
  5040. +   } while (need_resched());
  5041. +}
  5042. +EXPORT_SYMBOL(preempt_schedule);
  5043. +
  5044. +/*
  5045. + * this is the entry point to schedule() from kernel preemption
  5046. + * off of irq context.
  5047. + * Note that this is called and returns with irqs disabled. This will
  5048. + * protect us against recursive calling from irq.
  5049. + */
  5050. +asmlinkage void __sched preempt_schedule_irq(void)
  5051. +{
  5052. +   struct thread_info *ti = current_thread_info();
  5053. +
  5054. +   /* Catch callers which need to be fixed */
  5055. +   BUG_ON(ti->preempt_count || !irqs_disabled());
  5056. +
  5057. +   do {
  5058. +       add_preempt_count(PREEMPT_ACTIVE);
  5059. +       local_irq_enable();
  5060. +       schedule();
  5061. +       local_irq_disable();
  5062. +       sub_preempt_count(PREEMPT_ACTIVE);
  5063. +
  5064. +       /*
  5065. +        * Check again in case we missed a preemption opportunity
  5066. +        * between schedule and now.
  5067. +        */
  5068. +       barrier();
  5069. +   } while (need_resched());
  5070. +}
  5071. +
  5072. +#endif /* CONFIG_PREEMPT */
  5073. +
  5074. +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
  5075. +             void *key)
  5076. +{
  5077. +   return try_to_wake_up(curr->private, mode, wake_flags);
  5078. +}
  5079. +EXPORT_SYMBOL(default_wake_function);
  5080. +
  5081. +/*
  5082. + * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
  5083. + * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
  5084. + * number) then we wake all the non-exclusive tasks and one exclusive task.
  5085. + *
  5086. + * There are circumstances in which we can try to wake a task which has already
  5087. + * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
  5088. + * zero in this (rare) case, and we handle it by continuing to scan the queue.
  5089. + */
  5090. +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  5091. +           int nr_exclusive, int wake_flags, void *key)
  5092. +{
  5093. +   struct list_head *tmp, *next;
  5094. +
  5095. +   list_for_each_safe(tmp, next, &q->task_list) {
  5096. +       wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
  5097. +       unsigned int flags = curr->flags;
  5098. +
  5099. +       if (curr->func(curr, mode, wake_flags, key) &&
  5100. +               (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
  5101. +           break;
  5102. +   }
  5103. +}
  5104. +
  5105. +/**
  5106. + * __wake_up - wake up threads blocked on a waitqueue.
  5107. + * @q: the waitqueue
  5108. + * @mode: which threads
  5109. + * @nr_exclusive: how many wake-one or wake-many threads to wake up
  5110. + * @key: is directly passed to the wakeup function
  5111. + *
  5112. + * It may be assumed that this function implies a write memory barrier before
  5113. + * changing the task state if and only if any tasks are woken up.
  5114. + */
  5115. +void __wake_up(wait_queue_head_t *q, unsigned int mode,
  5116. +           int nr_exclusive, void *key)
  5117. +{
  5118. +   unsigned long flags;
  5119. +
  5120. +   spin_lock_irqsave(&q->lock, flags);
  5121. +   __wake_up_common(q, mode, nr_exclusive, 0, key);
  5122. +   spin_unlock_irqrestore(&q->lock, flags);
  5123. +}
  5124. +EXPORT_SYMBOL(__wake_up);
  5125. +
  5126. +/*
  5127. + * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
  5128. + */
  5129. +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
  5130. +{
  5131. +   __wake_up_common(q, mode, nr, 0, NULL);
  5132. +}
  5133. +EXPORT_SYMBOL_GPL(__wake_up_locked);
  5134. +
  5135. +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  5136. +{
  5137. +   __wake_up_common(q, mode, 1, 0, key);
  5138. +}
  5139. +EXPORT_SYMBOL_GPL(__wake_up_locked_key);
  5140. +
  5141. +/**
  5142. + * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  5143. + * @q: the waitqueue
  5144. + * @mode: which threads
  5145. + * @nr_exclusive: how many wake-one or wake-many threads to wake up
  5146. + * @key: opaque value to be passed to wakeup targets
  5147. + *
  5148. + * The sync wakeup differs in that the waker knows that it will schedule
  5149. + * away soon, so while the target thread will be woken up, it will not
  5150. + * be migrated to another CPU - ie. the two threads are 'synchronised'
  5151. + * with each other. This can prevent needless bouncing between CPUs.
  5152. + *
  5153. + * On UP it can prevent extra preemption.
  5154. + *
  5155. + * It may be assumed that this function implies a write memory barrier before
  5156. + * changing the task state if and only if any tasks are woken up.
  5157. + */
  5158. +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
  5159. +           int nr_exclusive, void *key)
  5160. +{
  5161. +   unsigned long flags;
  5162. +   int wake_flags = WF_SYNC;
  5163. +
  5164. +   if (unlikely(!q))
  5165. +       return;
  5166. +
  5167. +   if (unlikely(!nr_exclusive))
  5168. +       wake_flags = 0;
  5169. +
  5170. +   spin_lock_irqsave(&q->lock, flags);
  5171. +   __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
  5172. +   spin_unlock_irqrestore(&q->lock, flags);
  5173. +}
  5174. +EXPORT_SYMBOL_GPL(__wake_up_sync_key);
  5175. +
  5176. +/**
  5177. + * __wake_up_sync - wake up threads blocked on a waitqueue.
  5178. + * @q: the waitqueue
  5179. + * @mode: which threads
  5180. + * @nr_exclusive: how many wake-one or wake-many threads to wake up
  5181. + *
  5182. + * The sync wakeup differs in that the waker knows that it will schedule
  5183. + * away soon, so while the target thread will be woken up, it will not
  5184. + * be migrated to another CPU - ie. the two threads are 'synchronised'
  5185. + * with each other. This can prevent needless bouncing between CPUs.
  5186. + *
  5187. + * On UP it can prevent extra preemption.
  5188. + */
  5189. +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
  5190. +{
  5191. +   unsigned long flags;
  5192. +   int sync = 1;
  5193. +
  5194. +   if (unlikely(!q))
  5195. +       return;
  5196. +
  5197. +   if (unlikely(!nr_exclusive))
  5198. +       sync = 0;
  5199. +
  5200. +   spin_lock_irqsave(&q->lock, flags);
  5201. +   __wake_up_common(q, mode, nr_exclusive, sync, NULL);
  5202. +   spin_unlock_irqrestore(&q->lock, flags);
  5203. +}
  5204. +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
  5205. +
  5206. +/**
  5207. + * complete: - signals a single thread waiting on this completion
  5208. + * @x:  holds the state of this particular completion
  5209. + *
  5210. + * This will wake up a single thread waiting on this completion. Threads will be
  5211. + * awakened in the same order in which they were queued.
  5212. + *
  5213. + * See also complete_all(), wait_for_completion() and related routines.
  5214. + *
  5215. + * It may be assumed that this function implies a write memory barrier before
  5216. + * changing the task state if and only if any tasks are woken up.
  5217. + */
  5218. +void complete(struct completion *x)
  5219. +{
  5220. +   unsigned long flags;
  5221. +
  5222. +   spin_lock_irqsave(&x->wait.lock, flags);
  5223. +   x->done++;
  5224. +   __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
  5225. +   spin_unlock_irqrestore(&x->wait.lock, flags);
  5226. +}
  5227. +EXPORT_SYMBOL(complete);
  5228. +
  5229. +/**
  5230. + * complete_all: - signals all threads waiting on this completion
  5231. + * @x:  holds the state of this particular completion
  5232. + *
  5233. + * This will wake up all threads waiting on this particular completion event.
  5234. + *
  5235. + * It may be assumed that this function implies a write memory barrier before
  5236. + * changing the task state if and only if any tasks are woken up.
  5237. + */
  5238. +void complete_all(struct completion *x)
  5239. +{
  5240. +   unsigned long flags;
  5241. +
  5242. +   spin_lock_irqsave(&x->wait.lock, flags);
  5243. +   x->done += UINT_MAX/2;
  5244. +   __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
  5245. +   spin_unlock_irqrestore(&x->wait.lock, flags);
  5246. +}
  5247. +EXPORT_SYMBOL(complete_all);
  5248. +
  5249. +static inline long __sched
  5250. +do_wait_for_common(struct completion *x, long timeout, int state)
  5251. +{
  5252. +   if (!x->done) {
  5253. +       DECLARE_WAITQUEUE(wait, current);
  5254. +
  5255. +       __add_wait_queue_tail_exclusive(&x->wait, &wait);
  5256. +       do {
  5257. +           if (signal_pending_state(state, current)) {
  5258. +               timeout = -ERESTARTSYS;
  5259. +               break;
  5260. +           }
  5261. +           __set_current_state(state);
  5262. +           spin_unlock_irq(&x->wait.lock);
  5263. +           timeout = schedule_timeout(timeout);
  5264. +           spin_lock_irq(&x->wait.lock);
  5265. +       } while (!x->done && timeout);
  5266. +       __remove_wait_queue(&x->wait, &wait);
  5267. +       if (!x->done)
  5268. +           return timeout;
  5269. +   }
  5270. +   x->done--;
  5271. +   return timeout ?: 1;
  5272. +}
  5273. +
  5274. +static long __sched
  5275. +wait_for_common(struct completion *x, long timeout, int state)
  5276. +{
  5277. +   might_sleep();
  5278. +
  5279. +   spin_lock_irq(&x->wait.lock);
  5280. +   timeout = do_wait_for_common(x, timeout, state);
  5281. +   spin_unlock_irq(&x->wait.lock);
  5282. +   return timeout;
  5283. +}
  5284. +
  5285. +/**
  5286. + * wait_for_completion: - waits for completion of a task
  5287. + * @x:  holds the state of this particular completion
  5288. + *
  5289. + * This waits to be signaled for completion of a specific task. It is NOT
  5290. + * interruptible and there is no timeout.
  5291. + *
  5292. + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
  5293. + * and interrupt capability. Also see complete().
  5294. + */
  5295. +void __sched wait_for_completion(struct completion *x)
  5296. +{
  5297. +   wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
  5298. +}
  5299. +EXPORT_SYMBOL(wait_for_completion);
  5300. +
  5301. +/**
  5302. + * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
  5303. + * @x:  holds the state of this particular completion
  5304. + * @timeout:  timeout value in jiffies
  5305. + *
  5306. + * This waits for either a completion of a specific task to be signaled or for a
  5307. + * specified timeout to expire. The timeout is in jiffies. It is not
  5308. + * interruptible.
  5309. + *
  5310. + * The return value is 0 if timed out, and positive (at least 1, or number of
  5311. + * jiffies left till timeout) if completed.
  5312. + */
  5313. +unsigned long __sched
  5314. +wait_for_completion_timeout(struct completion *x, unsigned long timeout)
  5315. +{
  5316. +   return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
  5317. +}
  5318. +EXPORT_SYMBOL(wait_for_completion_timeout);
  5319. +
  5320. +/**
  5321. + * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
  5322. + * @x:  holds the state of this particular completion
  5323. + *
  5324. + * This waits for completion of a specific task to be signaled. It is
  5325. + * interruptible.
  5326. + *
  5327. + * The return value is -ERESTARTSYS if interrupted, 0 if completed.
  5328. + */
  5329. +int __sched wait_for_completion_interruptible(struct completion *x)
  5330. +{
  5331. +   long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
  5332. +   if (t == -ERESTARTSYS)
  5333. +       return t;
  5334. +   return 0;
  5335. +}
  5336. +EXPORT_SYMBOL(wait_for_completion_interruptible);
  5337. +
  5338. +/**
  5339. + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
  5340. + * @x:  holds the state of this particular completion
  5341. + * @timeout:  timeout value in jiffies
  5342. + *
  5343. + * This waits for either a completion of a specific task to be signaled or for a
  5344. + * specified timeout to expire. It is interruptible. The timeout is in jiffies.
  5345. + *
  5346. + * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
  5347. + * positive (at least 1, or number of jiffies left till timeout) if completed.
  5348. + */
  5349. +long __sched
  5350. +wait_for_completion_interruptible_timeout(struct completion *x,
  5351. +                     unsigned long timeout)
  5352. +{
  5353. +   return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
  5354. +}
  5355. +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
  5356. +
  5357. +/**
  5358. + * wait_for_completion_killable: - waits for completion of a task (killable)
  5359. + * @x:  holds the state of this particular completion
  5360. + *
  5361. + * This waits to be signaled for completion of a specific task. It can be
  5362. + * interrupted by a kill signal.
  5363. + *
  5364. + * The return value is -ERESTARTSYS if interrupted, 0 if completed.
  5365. + * No timeout applies, so no remaining-jiffies count is ever returned.
  5366. + */
  5367. +int __sched wait_for_completion_killable(struct completion *x)
  5368. +{
  5369. +   long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
  5370. +   if (t == -ERESTARTSYS)
  5371. +       return t;
  5372. +   return 0;
  5373. +}
  5374. +EXPORT_SYMBOL(wait_for_completion_killable);
  5375. +
  5376. +/**
  5377. + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
  5378. + * @x:  holds the state of this particular completion
  5379. + * @timeout:  timeout value in jiffies
  5380. + *
  5381. + * This waits for either a completion of a specific task to be
  5382. + * signaled or for a specified timeout to expire. It can be
  5383. + * interrupted by a kill signal. The timeout is in jiffies.
  5384. + */
  5385. +long __sched
  5386. +wait_for_completion_killable_timeout(struct completion *x,
  5387. +                    unsigned long timeout)
  5388. +{
  5389. +   return wait_for_common(x, timeout, TASK_KILLABLE);
  5390. +}
  5391. +EXPORT_SYMBOL(wait_for_completion_killable_timeout);
  5392. +
  5393. +/**
  5394. + * try_wait_for_completion - try to decrement a completion without blocking
  5395. + * @x: completion structure
  5396. + *
  5397. + * Returns: 0 if a decrement cannot be done without blocking
  5398. + *      1 if a decrement succeeded.
  5399. + *
  5400. + * If a completion is being used as a counting completion,
  5401. + * attempt to decrement the counter without blocking. This
  5402. + * enables us to avoid waiting if the resource the completion
  5403. + * is protecting is not available.
  5404. + */
  5405. +bool try_wait_for_completion(struct completion *x)
  5406. +{
  5407. +   unsigned long flags;
  5408. +   int ret = 1;
  5409. +
  5410. +   spin_lock_irqsave(&x->wait.lock, flags);
  5411. +   if (!x->done)
  5412. +       ret = 0;
  5413. +   else
  5414. +       x->done--;
  5415. +   spin_unlock_irqrestore(&x->wait.lock, flags);
  5416. +   return ret;
  5417. +}
  5418. +EXPORT_SYMBOL(try_wait_for_completion);
  5419. +
  5420. +/**
  5421. + * completion_done - Test to see if a completion has any waiters
  5422. + * @x: completion structure
  5423. + *
  5424. + * Returns: 0 if there are waiters (wait_for_completion() in progress)
  5425. + *      1 if there are no waiters.
  5426. + *
  5427. + */
  5428. +bool completion_done(struct completion *x)
  5429. +{
  5430. +   unsigned long flags;
  5431. +   int ret = 1;
  5432. +
  5433. +   spin_lock_irqsave(&x->wait.lock, flags);
  5434. +   if (!x->done)
  5435. +       ret = 0;
  5436. +   spin_unlock_irqrestore(&x->wait.lock, flags);
  5437. +   return ret;
  5438. +}
  5439. +EXPORT_SYMBOL(completion_done);
  5440. +
  5441. +static long __sched
  5442. +sleep_on_common(wait_queue_head_t *q, int state, long timeout)
  5443. +{
  5444. +   unsigned long flags;
  5445. +   wait_queue_t wait;
  5446. +
  5447. +   init_waitqueue_entry(&wait, current);
  5448. +
  5449. +   __set_current_state(state);
  5450. +
  5451. +   spin_lock_irqsave(&q->lock, flags);
  5452. +   __add_wait_queue(q, &wait);
  5453. +   spin_unlock(&q->lock);
  5454. +   timeout = schedule_timeout(timeout);
  5455. +   spin_lock_irq(&q->lock);
  5456. +   __remove_wait_queue(q, &wait);
  5457. +   spin_unlock_irqrestore(&q->lock, flags);
  5458. +
  5459. +   return timeout;
  5460. +}
  5461. +
  5462. +void __sched interruptible_sleep_on(wait_queue_head_t *q)
  5463. +{
  5464. +   sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
  5465. +}
  5466. +EXPORT_SYMBOL(interruptible_sleep_on);
  5467. +
  5468. +long __sched
  5469. +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
  5470. +{
  5471. +   return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
  5472. +}
  5473. +EXPORT_SYMBOL(interruptible_sleep_on_timeout);
  5474. +
  5475. +void __sched sleep_on(wait_queue_head_t *q)
  5476. +{
  5477. +   sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
  5478. +}
  5479. +EXPORT_SYMBOL(sleep_on);
  5480. +
  5481. +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
  5482. +{
  5483. +   return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
  5484. +}
  5485. +EXPORT_SYMBOL(sleep_on_timeout);
  5486. +
  5487. +#ifdef CONFIG_RT_MUTEXES
  5488. +
  5489. +/*
  5490. + * rt_mutex_setprio - set the current priority of a task
  5491. + * @p: task
  5492. + * @prio: prio value (kernel-internal form)
  5493. + *
  5494. + * This function changes the 'effective' priority of a task. It does
  5495. + * not touch ->normal_prio like __setscheduler().
  5496. + *
  5497. + * Used by the rt_mutex code to implement priority inheritance logic.
  5498. + */
  5499. +void rt_mutex_setprio(struct task_struct *p, int prio)
  5500. +{
  5501. +   unsigned long flags;
  5502. +   int queued, oldprio;
  5503. +   struct rq *rq;
  5504. +
  5505. +   BUG_ON(prio < 0 || prio > MAX_PRIO);
  5506. +
  5507. +   rq = task_grq_ulock(p, &flags);
  5508. +
  5509. +   /*
  5510. +    * Idle task boosting is a no-no in general. There is one
  5511. +    * exception, when PREEMPT_RT and NOHZ is active:
  5512. +    *
  5513. +    * The idle task calls get_next_timer_interrupt() and holds
  5514. +    * the timer wheel base->lock on the CPU and another CPU wants
  5515. +    * to access the timer (probably to cancel it). We can safely
  5516. +    * ignore the boosting request, as the idle CPU runs this code
  5517. +    * with interrupts disabled and will complete the lock
  5518. +    * protected section without being interrupted. So there is no
  5519. +    * real need to boost.
  5520. +    */
  5521. +   if (unlikely(p == rq->idle)) {
  5522. +       WARN_ON(p != rq->curr);
  5523. +       WARN_ON(p->pi_blocked_on);
  5524. +       task_grq_uunlock(&flags);
  5525. +       return;
  5526. +   }
  5527. +
  5528. +   trace_sched_pi_setprio(p, prio);
  5529. +   oldprio = p->prio;
  5530. +   queued = task_queued(p);
  5531. +   grq_upgrade();
  5532. +   if (queued)
  5533. +       dequeue_task(p);
  5534. +   p->prio = prio;
  5535. +   if (task_running(p) && prio > oldprio)
  5536. +       resched_task(p);
  5537. +   if (queued) {
  5538. +       enqueue_task(p);
  5539. +       try_preempt(p, rq);
  5540. +   }
  5541. +
  5542. +   task_grq_wunlock(&flags);
  5543. +}
  5544. +
  5545. +#endif
  5546. +
  5547. +/*
  5548. + * Adjust the deadline for when the priority is to change, before it's
  5549. + * changed.
  5550. + */
  5551. +static inline void adjust_deadline(struct task_struct *p, int new_prio)
  5552. +{
  5553. +   p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
  5554. +}
  5555. +
  5556. +void set_user_nice(struct task_struct *p, long nice)
  5557. +{
  5558. +   int queued, new_static, old_static;
  5559. +   unsigned long flags;
  5560. +   struct rq *rq;
  5561. +
  5562. +   if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
  5563. +       return;
  5564. +   new_static = NICE_TO_PRIO(nice);
  5565. +   /*
  5566. +    * We have to be careful, if called from sys_setpriority(),
  5567. +    * the task might be in the middle of scheduling on another CPU.
  5568. +    */
  5569. +   rq = time_task_grq_wlock(p, &flags);
  5570. +   /*
  5571. +    * The RT priorities are set via sched_setscheduler(), but we still
  5572. +    * allow the 'normal' nice value to be set - but as expected
  5573. +    * it won't have any effect on scheduling while the task has
  5574. +    * an RT policy:
  5575. +    */
  5576. +   if (has_rt_policy(p)) {
  5577. +       p->static_prio = new_static;
  5578. +       goto out_unlock;
  5579. +   }
  5580. +   queued = task_queued(p);
  5581. +   if (queued)
  5582. +       dequeue_task(p);
  5583. +
  5584. +   adjust_deadline(p, new_static);
  5585. +   old_static = p->static_prio;
  5586. +   p->static_prio = new_static;
  5587. +   p->prio = effective_prio(p);
  5588. +
  5589. +   if (queued) {
  5590. +       enqueue_task(p);
  5591. +       if (new_static < old_static)
  5592. +           try_preempt(p, rq);
  5593. +   } else if (task_running(p)) {
  5594. +       reset_rq_task(rq, p);
  5595. +       if (old_static < new_static)
  5596. +           resched_task(p);
  5597. +   }
  5598. +out_unlock:
  5599. +   task_grq_wunlock(&flags);
  5600. +}
  5601. +EXPORT_SYMBOL(set_user_nice);
  5602. +
  5603. +/*
  5604. + * can_nice - check if a task can reduce its nice value
  5605. + * @p: task
  5606. + * @nice: nice value
  5607. + */
  5608. +int can_nice(const struct task_struct *p, const int nice)
  5609. +{
  5610. +   /* convert nice value [19,-20] to rlimit style value [1,40] */
  5611. +   int nice_rlim = 20 - nice;
  5612. +
  5613. +   return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
  5614. +       capable(CAP_SYS_NICE));
  5615. +}
  5616. +
  5617. +#ifdef __ARCH_WANT_SYS_NICE
  5618. +
  5619. +/*
  5620. + * sys_nice - change the priority of the current process.
  5621. + * @increment: priority increment
  5622. + *
  5623. + * sys_setpriority is a more generic, but much slower function that
  5624. + * does similar things.
  5625. + */
  5626. +SYSCALL_DEFINE1(nice, int, increment)
  5627. +{
  5628. +   long nice, retval;
  5629. +
  5630. +   /*
  5631. +    * Setpriority might change our priority at the same moment.
  5632. +    * We don't have to worry. Conceptually one call occurs first
  5633. +    * and we have a single winner.
  5634. +    */
  5635. +   if (increment < -40)
  5636. +       increment = -40;
  5637. +   if (increment > 40)
  5638. +       increment = 40;
  5639. +
  5640. +   nice = TASK_NICE(current) + increment;
  5641. +   if (nice < -20)
  5642. +       nice = -20;
  5643. +   if (nice > 19)
  5644. +       nice = 19;
  5645. +
  5646. +   if (increment < 0 && !can_nice(current, nice))
  5647. +       return -EPERM;
  5648. +
  5649. +   retval = security_task_setnice(current, nice);
  5650. +   if (retval)
  5651. +       return retval;
  5652. +
  5653. +   set_user_nice(current, nice);
  5654. +   return 0;
  5655. +}
  5656. +
  5657. +#endif
  5658. +
  5659. +/**
  5660. + * task_prio - return the priority value of a given task.
  5661. + * @p: the task in question.
  5662. + *
  5663. + * This is the priority value as seen by users in /proc.
  5664. + * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
  5665. + * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
  5666. + */
  5667. +int task_prio(const struct task_struct *p)
  5668. +{
  5669. +   int delta, prio = p->prio - MAX_RT_PRIO;
  5670. +
  5671. +   /* rt tasks and iso tasks */
  5672. +   if (prio <= 0)
  5673. +       goto out;
  5674. +
  5675. +   /* Convert to ms to avoid overflows */
  5676. +   delta = NS_TO_MS(p->deadline - grq.niffies);
  5677. +   delta = delta * 40 / ms_longest_deadline_diff();
  5678. +   if (delta > 0 && delta <= 80)
  5679. +       prio += delta;
  5680. +   if (idleprio_task(p))
  5681. +       prio += 40;
  5682. +out:
  5683. +   return prio;
  5684. +}
  5685. +
  5686. +/**
  5687. + * task_nice - return the nice value of a given task.
  5688. + * @p: the task in question.
  5689. + */
  5690. +int task_nice(const struct task_struct *p)
  5691. +{
  5692. +   return TASK_NICE(p);
  5693. +}
  5694. +EXPORT_SYMBOL_GPL(task_nice);
  5695. +
  5696. +/**
  5697. + * idle_cpu - is a given cpu idle currently?
  5698. + * @cpu: the processor in question.
  5699. + */
  5700. +int idle_cpu(int cpu)
  5701. +{
  5702. +   return cpu_curr(cpu) == cpu_rq(cpu)->idle;
  5703. +}
  5704. +
  5705. +/**
  5706. + * idle_task - return the idle task for a given cpu.
  5707. + * @cpu: the processor in question.
  5708. + */
  5709. +struct task_struct *idle_task(int cpu)
  5710. +{
  5711. +   return cpu_rq(cpu)->idle;
  5712. +}
  5713. +
  5714. +/**
  5715. + * find_process_by_pid - find a process with a matching PID value.
  5716. + * @pid: the pid in question.
  5717. + */
  5718. +static inline struct task_struct *find_process_by_pid(pid_t pid)
  5719. +{
  5720. +   return pid ? find_task_by_vpid(pid) : current;
  5721. +}
  5722. +
  5723. +/* Actually do priority change: must hold grq lock. */
  5724. +static void
  5725. +__setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio)
  5726. +{
  5727. +   int oldrtprio, oldprio;
  5728. +
  5729. +   p->policy = policy;
  5730. +   oldrtprio = p->rt_priority;
  5731. +   p->rt_priority = prio;
  5732. +   p->normal_prio = normal_prio(p);
  5733. +   oldprio = p->prio;
  5734. +   /* we are holding p->pi_lock already */
  5735. +   p->prio = rt_mutex_getprio(p);
  5736. +   if (task_running(p)) {
  5737. +       reset_rq_task(rq, p);
  5738. +       /* Resched only if we might now be preempted */
  5739. +       if (p->prio > oldprio || p->rt_priority > oldrtprio)
  5740. +           resched_task(p);
  5741. +   }
  5742. +}
  5743. +
  5744. +/*
  5745. + * check that the current process's EUID matches the target's EUID or UID
  5746. + */
  5747. +static bool check_same_owner(struct task_struct *p)
  5748. +{
  5749. +   const struct cred *cred = current_cred(), *pcred;
  5750. +   bool match;
  5751. +
  5752. +   rcu_read_lock();
  5753. +   pcred = __task_cred(p);
  5754. +   match = (uid_eq(cred->euid, pcred->euid) ||
  5755. +        uid_eq(cred->euid, pcred->uid));
  5756. +   rcu_read_unlock();
  5757. +   return match;
  5758. +}
  5759. +
  5760. +static int __sched_setscheduler(struct task_struct *p, int policy,
  5761. +               const struct sched_param *param, bool user)
  5762. +{
  5763. +   struct sched_param zero_param = { .sched_priority = 0 };
  5764. +   int queued, retval, oldpolicy = -1;
  5765. +   unsigned long flags, rlim_rtprio = 0;
  5766. +   int reset_on_fork;
  5767. +   struct rq *rq;
  5768. +
  5769. +   /* may grab non-irq protected spin_locks */
  5770. +   BUG_ON(in_interrupt());
  5771. +
  5772. +   if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
  5773. +       unsigned long lflags;
  5774. +
  5775. +       if (!lock_task_sighand(p, &lflags))
  5776. +           return -ESRCH;
  5777. +       rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
  5778. +       unlock_task_sighand(p, &lflags);
  5779. +       if (rlim_rtprio)
  5780. +           goto recheck;
  5781. +       /*
  5782. +        * If the caller requested an RT policy without having the
  5783. +        * necessary rights, we downgrade the policy to SCHED_ISO.
  5784. +        * We also set the parameter to zero to pass the checks.
  5785. +        */
  5786. +       policy = SCHED_ISO;
  5787. +       param = &zero_param;
  5788. +   }
  5789. +recheck:
  5790. +   /* double check policy once rq lock held */
  5791. +   if (policy < 0) {
  5792. +       reset_on_fork = p->sched_reset_on_fork;
  5793. +       policy = oldpolicy = p->policy;
  5794. +   } else {
  5795. +       reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
  5796. +       policy &= ~SCHED_RESET_ON_FORK;
  5797. +
  5798. +       if (!SCHED_RANGE(policy))
  5799. +           return -EINVAL;
  5800. +   }
  5801. +
  5802. +   /*
  5803. +    * Valid priorities for SCHED_FIFO and SCHED_RR are
  5804. +    * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
  5805. +    * SCHED_BATCH is 0.
  5806. +    */
  5807. +   if (param->sched_priority < 0 ||
  5808. +       (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
  5809. +       (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
  5810. +       return -EINVAL;
  5811. +   if (is_rt_policy(policy) != (param->sched_priority != 0))
  5812. +       return -EINVAL;
  5813. +
  5814. +   /*
  5815. +    * Allow unprivileged RT tasks to decrease priority:
  5816. +    */
  5817. +   if (user && !capable(CAP_SYS_NICE)) {
  5818. +       if (is_rt_policy(policy)) {
  5819. +           unsigned long rlim_rtprio =
  5820. +                   task_rlimit(p, RLIMIT_RTPRIO);
  5821. +
  5822. +           /* can't set/change the rt policy */
  5823. +           if (policy != p->policy && !rlim_rtprio)
  5824. +               return -EPERM;
  5825. +
  5826. +           /* can't increase priority */
  5827. +           if (param->sched_priority > p->rt_priority &&
  5828. +               param->sched_priority > rlim_rtprio)
  5829. +               return -EPERM;
  5830. +       } else {
  5831. +           switch (p->policy) {
  5832. +               /*
  5833. +                * Can only downgrade policies but not back to
  5834. +                * SCHED_NORMAL
  5835. +                */
  5836. +               case SCHED_ISO:
  5837. +                   if (policy == SCHED_ISO)
  5838. +                       goto out;
  5839. +                   if (policy == SCHED_NORMAL)
  5840. +                       return -EPERM;
  5841. +                   break;
  5842. +               case SCHED_BATCH:
  5843. +                   if (policy == SCHED_BATCH)
  5844. +                       goto out;
  5845. +                   if (policy != SCHED_IDLEPRIO)
  5846. +                       return -EPERM;
  5847. +                   break;
  5848. +               case SCHED_IDLEPRIO:
  5849. +                   if (policy == SCHED_IDLEPRIO)
  5850. +                       goto out;
  5851. +                   return -EPERM;
  5852. +               default:
  5853. +                   break;
  5854. +           }
  5855. +       }
  5856. +
  5857. +       /* can't change other user's priorities */
  5858. +       if (!check_same_owner(p))
  5859. +           return -EPERM;
  5860. +
  5861. +       /* Normal users shall not reset the sched_reset_on_fork flag */
  5862. +       if (p->sched_reset_on_fork && !reset_on_fork)
  5863. +           return -EPERM;
  5864. +   }
  5865. +
  5866. +   if (user) {
  5867. +       retval = security_task_setscheduler(p);
  5868. +       if (retval)
  5869. +           return retval;
  5870. +   }
  5871. +
  5872. +   /*
  5873. +    * make sure no PI-waiters arrive (or leave) while we are
  5874. +    * changing the priority of the task:
  5875. +    */
  5876. +   raw_spin_lock_irqsave(&p->pi_lock, flags);
  5877. +   /*
  5878. +    * To be able to change p->policy safely, the grunqueue lock must be
  5879. +    * held.
  5880. +    */
  5881. +   rq = __task_grq_ulock(p);
  5882. +
  5883. +   /*
  5884. +    * Changing the policy of the stop threads is a very bad idea
  5885. +    */
  5886. +   if (p == rq->stop) {
  5887. +       __task_grq_uunlock();
  5888. +       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  5889. +       return -EINVAL;
  5890. +   }
  5891. +
  5892. +   /*
  5893. +    * If not changing anything there's no need to proceed further:
  5894. +    */
  5895. +   if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
  5896. +           param->sched_priority == p->rt_priority))) {
  5897. +
  5898. +       __task_grq_uunlock();
  5899. +       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  5900. +       return 0;
  5901. +   }
  5902. +
  5903. +   /* recheck policy now with rq lock held */
  5904. +   if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
  5905. +       policy = oldpolicy = -1;
  5906. +       __task_grq_uunlock();
  5907. +       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  5908. +       goto recheck;
  5909. +   }
  5910. +   grq_upgrade();
  5911. +   update_clocks(rq);
  5912. +   p->sched_reset_on_fork = reset_on_fork;
  5913. +
  5914. +   queued = task_queued(p);
  5915. +   if (queued)
  5916. +       dequeue_task(p);
  5917. +   __setscheduler(p, rq, policy, param->sched_priority);
  5918. +   if (queued) {
  5919. +       enqueue_task(p);
  5920. +       try_preempt(p, rq);
  5921. +   }
  5922. +   __task_grq_wunlock();
  5923. +   raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  5924. +
  5925. +   rt_mutex_adjust_pi(p);
  5926. +out:
  5927. +   return 0;
  5928. +}
  5929. +
  5930. +/**
  5931. + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
  5932. + * @p: the task in question.
  5933. + * @policy: new policy.
  5934. + * @param: structure containing the new RT priority.
  5935. + *
  5936. + * NOTE that the task may be already dead.
  5937. + */
  5938. +int sched_setscheduler(struct task_struct *p, int policy,
  5939. +              const struct sched_param *param)
  5940. +{
  5941. +   return __sched_setscheduler(p, policy, param, true);
  5942. +}
  5943. +
  5944. +EXPORT_SYMBOL_GPL(sched_setscheduler);
  5945. +
  5946. +/**
  5947. + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  5948. + * @p: the task in question.
  5949. + * @policy: new policy.
  5950. + * @param: structure containing the new RT priority.
  5951. + *
  5952. + * Just like sched_setscheduler, only don't bother checking if the
  5953. + * current context has permission.  For example, this is needed in
  5954. + * stop_machine(): we create temporary high priority worker threads,
  5955. + * but our caller might not have that capability.
  5956. + */
  5957. +int sched_setscheduler_nocheck(struct task_struct *p, int policy,
  5958. +                  const struct sched_param *param)
  5959. +{
  5960. +   return __sched_setscheduler(p, policy, param, false);
  5961. +}
  5962. +
  5963. +static int
  5964. +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  5965. +{
  5966. +   struct sched_param lparam;
  5967. +   struct task_struct *p;
  5968. +   int retval;
  5969. +
  5970. +   if (!param || pid < 0)
  5971. +       return -EINVAL;
  5972. +   if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
  5973. +       return -EFAULT;
  5974. +
  5975. +   rcu_read_lock();
  5976. +   retval = -ESRCH;
  5977. +   p = find_process_by_pid(pid);
  5978. +   if (p != NULL)
  5979. +       retval = sched_setscheduler(p, policy, &lparam);
  5980. +   rcu_read_unlock();
  5981. +
  5982. +   return retval;
  5983. +}
  5984. +
  5985. +/**
  5986. + * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  5987. + * @pid: the pid in question.
  5988. + * @policy: new policy.
  5989. + * @param: structure containing the new RT priority.
  5990. + */
  5991. +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
  5992. +       struct sched_param __user *, param)
  5993. +{
  5994. +   /* negative policies are reserved: sched_setparam passes -1 internally */
  5995. +   if (policy < 0)
  5996. +       return -EINVAL;
  5997. +
  5998. +   return do_sched_setscheduler(pid, policy, param);
  5999. +}
  6000. +
  6001. +/**
  6002. + * sys_sched_setparam - set/change the RT priority of a thread
  6003. + * @pid: the pid in question.
  6004. + * @param: structure containing the new RT priority.
  6005. + */
  6006. +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
  6007. +{
  6008. +   return do_sched_setscheduler(pid, -1, param);
  6009. +}
  6010. +
  6011. +/**
  6012. + * sys_sched_getscheduler - report the scheduling policy of a thread
  6013. + * @pid: pid of the thread to query.
  6014. + */
  6015. +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
  6016. +{
  6017. +   struct task_struct *tsk;
  6018. +   int ret;
  6019. +
  6020. +   if (pid < 0)
  6021. +       return -EINVAL;
  6022. +
  6023. +   rcu_read_lock();
  6024. +   tsk = find_process_by_pid(pid);
  6025. +   if (!tsk) {
  6026. +       ret = -ESRCH;
  6027. +   } else {
  6028. +       ret = security_task_getscheduler(tsk);
  6029. +       if (ret == 0)
  6030. +           ret = tsk->policy;
  6031. +   }
  6032. +   rcu_read_unlock();
  6033. +
  6034. +   return ret;
  6035. +}
  6036. +
  6037. +/**
  6038. + * sys_sched_getparam - get the RT priority of a thread
  6039. + * @pid: the pid in question.
  6040. + * @param: structure containing the RT priority.
  6041. + */
  6042. +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
  6043. +{
  6044. +   struct sched_param lp;
  6045. +   struct task_struct *p;
  6046. +   int retval = -EINVAL;
  6047. +
  6048. +   if (!param || pid < 0)
  6049. +       goto out_nounlock;
  6050. +
  6051. +   rcu_read_lock();
  6052. +   p = find_process_by_pid(pid);
  6053. +   retval = -ESRCH;
  6054. +   if (!p)
  6055. +       goto out_unlock;
  6056. +
  6057. +   retval = security_task_getscheduler(p);
  6058. +   if (retval)
  6059. +       goto out_unlock;
  6060. +
  6061. +   lp.sched_priority = p->rt_priority;
  6062. +   rcu_read_unlock();
  6063. +
  6064. +   /*
  6065. +    * This one might sleep, so do it only after the RCU read lock is dropped.
  6066. +    */
  6067. +   retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
  6068. +
  6069. +out_nounlock:
  6070. +   return retval;
  6071. +
  6072. +out_unlock:
  6073. +   rcu_read_unlock();
  6074. +   return retval;
  6075. +}
  6076. +
  6077. +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  6078. +{
  6079. +   cpumask_var_t cpus_allowed, new_mask;
  6080. +   struct task_struct *p;
  6081. +   int retval;
  6082. +
  6083. +   get_online_cpus();
  6084. +   rcu_read_lock();
  6085. +
  6086. +   p = find_process_by_pid(pid);
  6087. +   if (!p) {
  6088. +       rcu_read_unlock();
  6089. +       put_online_cpus();
  6090. +       return -ESRCH;
  6091. +   }
  6092. +
  6093. +   /* Prevent p going away */
  6094. +   get_task_struct(p);
  6095. +   rcu_read_unlock();
  6096. +
  6097. +   if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
  6098. +       retval = -ENOMEM;
  6099. +       goto out_put_task;
  6100. +   }
  6101. +   if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
  6102. +       retval = -ENOMEM;
  6103. +       goto out_free_cpus_allowed;
  6104. +   }
  6105. +   retval = -EPERM;
  6106. +   if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
  6107. +       goto out_unlock;
  6108. +
  6109. +   retval = security_task_setscheduler(p);
  6110. +   if (retval)
  6111. +       goto out_unlock;
  6112. +
  6113. +   cpuset_cpus_allowed(p, cpus_allowed);
  6114. +   cpumask_and(new_mask, in_mask, cpus_allowed);
  6115. +again:
  6116. +   retval = set_cpus_allowed_ptr(p, new_mask);
  6117. +
  6118. +   if (!retval) {
  6119. +       cpuset_cpus_allowed(p, cpus_allowed);
  6120. +       if (!cpumask_subset(new_mask, cpus_allowed)) {
  6121. +           /*
  6122. +            * We must have raced with a concurrent cpuset
  6123. +            * update. Just reset the cpus_allowed to the
  6124. +            * cpuset's cpus_allowed
  6125. +            */
  6126. +           cpumask_copy(new_mask, cpus_allowed);
  6127. +           goto again;
  6128. +       }
  6129. +   }
  6130. +out_unlock:
  6131. +   free_cpumask_var(new_mask);
  6132. +out_free_cpus_allowed:
  6133. +   free_cpumask_var(cpus_allowed);
  6134. +out_put_task:
  6135. +   put_task_struct(p);
  6136. +   put_online_cpus();
  6137. +   return retval;
  6138. +}
  6139. +
  6140. +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
  6141. +                cpumask_t *new_mask)
  6142. +{
  6143. +   if (len < sizeof(cpumask_t)) {
  6144. +       memset(new_mask, 0, sizeof(cpumask_t));
  6145. +   } else if (len > sizeof(cpumask_t)) {
  6146. +       len = sizeof(cpumask_t);
  6147. +   }
  6148. +   return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
  6149. +}
  6150. +
  6151. +
  6152. +/**
  6153. + * sys_sched_setaffinity - set the cpu affinity of a process
  6154. + * @pid: pid of the process
  6155. + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  6156. + * @user_mask_ptr: user-space pointer to the new cpu mask
  6157. + */
  6158. +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
  6159. +       unsigned long __user *, user_mask_ptr)
  6160. +{
  6161. +   cpumask_var_t new_mask;
  6162. +   int retval;
  6163. +
  6164. +   if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
  6165. +       return -ENOMEM;
  6166. +
  6167. +   retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
  6168. +   if (retval == 0)
  6169. +       retval = sched_setaffinity(pid, new_mask);
  6170. +   free_cpumask_var(new_mask);
  6171. +   return retval;
  6172. +}
  6173. +
  6174. +long sched_getaffinity(pid_t pid, cpumask_t *mask)
  6175. +{
  6176. +   struct task_struct *p;
  6177. +   unsigned long flags;
  6178. +   int retval;
  6179. +
  6180. +   get_online_cpus();
  6181. +   rcu_read_lock();
  6182. +
  6183. +   retval = -ESRCH;
  6184. +   p = find_process_by_pid(pid);
  6185. +   if (!p)
  6186. +       goto out_unlock;
  6187. +
  6188. +   retval = security_task_getscheduler(p);
  6189. +   if (retval)
  6190. +       goto out_unlock;
  6191. +
  6192. +   grq_rlock_irqsave(&flags);
  6193. +   cpumask_and(mask, tsk_cpus_allowed(p), cpu_online_mask);
  6194. +   grq_runlock_irqrestore(&flags);
  6195. +
  6196. +out_unlock:
  6197. +   rcu_read_unlock();
  6198. +   put_online_cpus();
  6199. +
  6200. +   return retval;
  6201. +}
  6202. +
  6203. +/**
  6204. + * sys_sched_getaffinity - get the cpu affinity of a process
  6205. + * @pid: pid of the process
  6206. + * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  6207. + * @user_mask_ptr: user-space pointer to hold the current cpu mask
  6208. + */
  6209. +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
  6210. +       unsigned long __user *, user_mask_ptr)
  6211. +{
  6212. +   int ret;
  6213. +   cpumask_var_t mask;
  6214. +
  6215. +   if ((len * BITS_PER_BYTE) < nr_cpu_ids)
  6216. +       return -EINVAL;
  6217. +   if (len & (sizeof(unsigned long)-1))
  6218. +       return -EINVAL;
  6219. +
  6220. +   if (!alloc_cpumask_var(&mask, GFP_KERNEL))
  6221. +       return -ENOMEM;
  6222. +
  6223. +   ret = sched_getaffinity(pid, mask);
  6224. +   if (ret == 0) {
  6225. +       size_t retlen = min_t(size_t, len, cpumask_size());
  6226. +
  6227. +       if (copy_to_user(user_mask_ptr, mask, retlen))
  6228. +           ret = -EFAULT;
  6229. +       else
  6230. +           ret = retlen;
  6231. +   }
  6232. +   free_cpumask_var(mask);
  6233. +
  6234. +   return ret;
  6235. +}
  6236. +
  6237. +/**
  6238. + * sys_sched_yield - yield the current processor to other threads.
  6239. + *
  6240. + * This function yields the current CPU to other tasks. It does this by
  6241. + * scheduling away the current task. If it still has the earliest deadline
  6242. + * it will be scheduled again as the next task.
  6243. + */
  6244. +SYSCALL_DEFINE0(sched_yield)
  6245. +{
  6246. +   struct task_struct *p;
  6247. +
  6248. +   p = current;
  6249. +   grq_wlock_irq();
  6250. +   schedstat_inc(task_rq(p), yld_count);
  6251. +   requeue_task(p);
  6252. +
  6253. +   /*
  6254. +    * Since we are going to call schedule() anyway, there's
  6255. +    * no need to preempt or enable interrupts:
  6256. +    */
  6257. +   __urw_write_unlock(&grq.urw.rwlock);
  6258. +   __release(grq.urw.lock);
  6259. +   spin_release(&grq.urw.lock.dep_map, 1, _THIS_IP_);
  6260. +   do_raw_spin_unlock(&grq.urw.lock);
  6261. +   sched_preempt_enable_no_resched();
  6262. +
  6263. +   schedule();
  6264. +
  6265. +   return 0;
  6266. +}
  6267. +
  6268. +static inline bool should_resched(void)
  6269. +{
  6270. +   return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
  6271. +}
  6272. +
  6273. +static void __cond_resched(void)
  6274. +{
  6275. +   /* NOT a real fix but will make voluntary preempt work. A stupid thing. */
  6276. +   if (unlikely(system_state != SYSTEM_RUNNING))
  6277. +       return;
  6278. +
  6279. +   add_preempt_count(PREEMPT_ACTIVE);
  6280. +   schedule();
  6281. +   sub_preempt_count(PREEMPT_ACTIVE);
  6282. +}
  6283. +
  6284. +int __sched _cond_resched(void)
  6285. +{
  6286. +   if (!should_resched())
  6287. +       return 0;
  6288. +   /* a reschedule is pending: hand over to __cond_resched() */
  6289. +   __cond_resched();
  6290. +   return 1;
  6291. +}
  6292. +EXPORT_SYMBOL(_cond_resched);
  6293. +
  6294. +/*
  6295. + * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  6296. + * call schedule, and on return reacquire the lock.
  6297. + *
  6298. + * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
  6299. + * operations here to prevent schedule() from being called twice (once via
  6300. + * spin_unlock(), once by hand).
  6301. + */
  6302. +int __cond_resched_lock(spinlock_t *lock)
  6303. +{
  6304. +   int resched = should_resched();
  6305. +   int ret = 0;
  6306. +
  6307. +   lockdep_assert_held(lock);
  6308. +
  6309. +   if (spin_needbreak(lock) || resched) {
  6310. +       spin_unlock(lock);
  6311. +       if (resched)
  6312. +           __cond_resched();
  6313. +       else
  6314. +           cpu_relax();
  6315. +       ret = 1;
  6316. +       spin_lock(lock);
  6317. +   }
  6318. +   return ret;
  6319. +}
  6320. +EXPORT_SYMBOL(__cond_resched_lock);
  6321. +
  6322. +int __sched __cond_resched_softirq(void)
  6323. +{
  6324. +   BUG_ON(!in_softirq());
  6325. +
  6326. +   if (!should_resched())
  6327. +       return 0;
  6328. +   /* bottom halves are re-enabled only around the reschedule itself */
  6329. +   local_bh_enable();
  6330. +   __cond_resched();
  6331. +   local_bh_disable();
  6332. +   return 1;
  6333. +}
  6334. +EXPORT_SYMBOL(__cond_resched_softirq);
  6335. +
  6336. +/**
  6337. + * yield - yield the current processor to other threads.
  6338. + *
  6339. + * Do not ever use this function, there's a 99% chance you're doing it wrong.
  6340. + *
  6341. + * The scheduler is at all times free to pick the calling task as the most
  6342. + * eligible task to run, if removing the yield() call from your code breaks
  6343. + * it, it's already broken.
  6344. + *
  6345. + * Typical broken usage is:
  6346. + *
  6347. + * while (!event)
  6348. + *     yield();
  6349. + *
  6350. + * where one assumes that yield() will let 'the other' process run that will
  6351. + * make event true. If the current task is a SCHED_FIFO task that will never
  6352. + * happen. Never use yield() as a progress guarantee!!
  6353. + *
  6354. + * If you want to use yield() to wait for something, use wait_event().
  6355. + * If you want to use yield() to be 'nice' for others, use cond_resched().
  6356. + * If you still want to use yield(), do not!
  6357. + */
  6358. +void __sched yield(void)
  6359. +{
  6360. +   set_current_state(TASK_RUNNING);
  6361. +   sys_sched_yield();
  6362. +}
  6363. +EXPORT_SYMBOL(yield);
  6364. +
  6365. +/**
  6366. + * yield_to - yield the current processor to another thread in
  6367. + * your thread group, or accelerate that thread toward the
  6368. + * processor it's on.
  6369. + * @p: target task
  6370. + * @preempt: whether task preemption is allowed or not
  6371. + *
  6372. + * It's the caller's job to ensure that the target task struct
  6373. + * can't go away on us before we can do any checks.
  6374. + *
  6375. + * Returns true if we indeed boosted the target task.
  6376. + */
  6377. +bool __sched yield_to(struct task_struct *p, bool preempt)
  6378. +{
  6379. +   bool yielded = false;
  6380. +   unsigned long flags;
  6381. +   struct rq *rq;
  6382. +
  6383. +   rq = this_rq();
  6384. +   grq_ulock_irqsave(&flags);
  6385. +   if (task_running(p) || p->state)
  6386. +       goto out_unlock;
  6387. +   yielded = true;
  6388. +   grq_upgrade();
  6389. +   if (p->deadline > rq->rq_deadline)
  6390. +       p->deadline = rq->rq_deadline;
  6391. +   p->time_slice += rq->rq_time_slice;
  6392. +   rq->rq_time_slice = 0;
  6393. +   if (p->time_slice > timeslice())
  6394. +       p->time_slice = timeslice();
  6395. +   set_tsk_need_resched(rq->curr);
  6396. +out_unlock:
  6397. +   if (yielded) {
  6398. +       grq_wunlock_irqrestore(&flags);
  6399. +       schedule();
  6400. +   } else
  6401. +       grq_uunlock_irqrestore(&flags);
  6402. +
  6403. +   return yielded;
  6404. +}
  6405. +EXPORT_SYMBOL_GPL(yield_to);
  6406. +
  6407. +/*
  6408. + * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
  6409. + * that process accounting knows that this is a task in IO wait state.
  6410. + *
  6411. + * But don't do that if it is a deliberate, throttling IO wait (this task
  6412. + * has set its backing_dev_info: the queue against which it should throttle)
  6413. + */
  6414. +void __sched io_schedule(void)
  6415. +{
  6416. +   struct rq *rq = raw_rq();
  6417. +
  6418. +   delayacct_blkio_start();
  6419. +   atomic_inc(&rq->nr_iowait);
  6420. +   blk_flush_plug(current);
  6421. +   current->in_iowait = 1;
  6422. +   schedule();
  6423. +   current->in_iowait = 0;
  6424. +   atomic_dec(&rq->nr_iowait);
  6425. +   delayacct_blkio_end();
  6426. +}
  6427. +EXPORT_SYMBOL(io_schedule);
  6428. +
  6429. +long __sched io_schedule_timeout(long timeout)
  6430. +{
  6431. +   struct rq *rq = raw_rq();
  6432. +   long ret;
  6433. +
  6434. +   delayacct_blkio_start();
  6435. +   atomic_inc(&rq->nr_iowait);
  6436. +   blk_flush_plug(current);
  6437. +   current->in_iowait = 1;
  6438. +   ret = schedule_timeout(timeout);
  6439. +   current->in_iowait = 0;
  6440. +   atomic_dec(&rq->nr_iowait);
  6441. +   delayacct_blkio_end();
  6442. +   return ret;
  6443. +}
  6444. +
  6445. +/**
  6446. + * sys_sched_get_priority_max - return maximum RT priority.
  6447. + * @policy: scheduling class.
  6448. + *
  6449. + * Returns the highest rt_priority value accepted for @policy, or
  6450. + * -EINVAL when @policy is not one of the classes handled here.
  6451. + */
  6452. +SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
  6453. +{
  6454. +   switch (policy) {
  6455. +   case SCHED_FIFO:
  6456. +   case SCHED_RR:
  6457. +       /* realtime classes */
  6458. +       return MAX_USER_RT_PRIO-1;
  6459. +   case SCHED_NORMAL:
  6460. +   case SCHED_BATCH:
  6461. +   case SCHED_ISO:
  6462. +   case SCHED_IDLEPRIO:
  6463. +       /* these classes only take priority 0 */
  6464. +       return 0;
  6465. +   default:
  6466. +       /* unrecognised policy */
  6467. +       return -EINVAL;
  6468. +   }
  6469. +}
  6470. +
  6471. +/**
  6472. + * sys_sched_get_priority_min - return minimum RT priority.
  6473. + * @policy: scheduling class.
  6474. + *
  6475. + * Returns the lowest rt_priority value accepted for @policy, or
  6476. + * -EINVAL when @policy is not one of the classes handled here.
  6477. + */
  6478. +SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  6479. +{
  6480. +   switch (policy) {
  6481. +   case SCHED_FIFO:
  6482. +   case SCHED_RR:
  6483. +       /* realtime classes start at priority 1 */
  6484. +       return 1;
  6485. +   case SCHED_NORMAL:
  6486. +   case SCHED_BATCH:
  6487. +   case SCHED_ISO:
  6488. +   case SCHED_IDLEPRIO:
  6489. +       /* these classes only take priority 0 */
  6490. +       return 0;
  6491. +   default:
  6492. +       /* unrecognised policy */
  6493. +       return -EINVAL;
  6494. +   }
  6495. +}
  6496. +
  6497. +/**
  6498. + * sys_sched_rr_get_interval - return the default timeslice of a process.
  6499. + * @pid: pid of the process.
  6500. + * @interval: userspace pointer to the timeslice value.
  6501. + *
  6502. + * this syscall writes the default timeslice value of a given process
  6503. + * into the user-space timespec buffer. A value of '0' means infinity.
  6504. + */
  6505. +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
  6506. +       struct timespec __user *, interval)
  6507. +{
  6508. +   struct task_struct *p;
  6509. +   unsigned int time_slice;
  6510. +   unsigned long flags;
  6511. +   int retval;
  6512. +   struct timespec t;
  6513. +
  6514. +   if (pid < 0)
  6515. +       return -EINVAL;
  6516. +
  6517. +   retval = -ESRCH;
  6518. +   rcu_read_lock();
  6519. +   p = find_process_by_pid(pid);
  6520. +   if (!p)
  6521. +       goto out_unlock;
  6522. +
  6523. +   retval = security_task_getscheduler(p);
  6524. +   if (retval)
  6525. +       goto out_unlock;
  6526. +
  6527. +   grq_rlock_irqsave(&flags);
  6528. +   time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
  6529. +   grq_runlock_irqrestore(&flags);
  6530. +
  6531. +   rcu_read_unlock();
  6532. +   t = ns_to_timespec(time_slice);
  6533. +   retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
  6534. +   return retval;
  6535. +
  6536. +out_unlock:
  6537. +   rcu_read_unlock();
  6538. +   return retval;
  6539. +}
  6540. +
  6541. +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
  6542. +
  6543. +void sched_show_task(struct task_struct *p)
  6544. +{
  6545. +   unsigned long free = 0;
  6546. +   unsigned state;
  6547. +
  6548. +   state = p->state ? __ffs(p->state) + 1 : 0;
  6549. +   printk(KERN_INFO "%-15.15s %c", p->comm,
  6550. +       state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
  6551. +#if BITS_PER_LONG == 32
  6552. +   if (state == TASK_RUNNING)
  6553. +       printk(KERN_CONT " running  ");
  6554. +   else
  6555. +       printk(KERN_CONT " %08lx ", thread_saved_pc(p));
  6556. +#else
  6557. +   if (state == TASK_RUNNING)
  6558. +       printk(KERN_CONT "  running task    ");
  6559. +   else
  6560. +       printk(KERN_CONT " %016lx ", thread_saved_pc(p));
  6561. +#endif
  6562. +#ifdef CONFIG_DEBUG_STACK_USAGE
  6563. +   free = stack_not_used(p);
  6564. +#endif
  6565. +   printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
  6566. +       task_pid_nr(p), task_pid_nr(p->real_parent),
  6567. +       (unsigned long)task_thread_info(p)->flags);
  6568. +
  6569. +   show_stack(p, NULL);
  6570. +}
  6571. +
  6572. +void show_state_filter(unsigned long state_filter)
  6573. +{
  6574. +   struct task_struct *g, *p;
  6575. +
  6576. +#if BITS_PER_LONG == 32
  6577. +   printk(KERN_INFO
  6578. +       "  task                PC stack   pid father\n");
  6579. +#else
  6580. +   printk(KERN_INFO
  6581. +       "  task                        PC stack   pid father\n");
  6582. +#endif
  6583. +   rcu_read_lock();
  6584. +   do_each_thread(g, p) {
  6585. +       /*
  6586. +        * reset the NMI-timeout, listing all files on a slow
  6587. +        * console might take a lot of time:
  6588. +        */
  6589. +       touch_nmi_watchdog();
  6590. +       if (!state_filter || (p->state & state_filter))
  6591. +           sched_show_task(p);
  6592. +   } while_each_thread(g, p);
  6593. +
  6594. +   touch_all_softlockup_watchdogs();
  6595. +
  6596. +   rcu_read_unlock();
  6597. +   /*
  6598. +    * Only show locks if all tasks are dumped:
  6599. +    */
  6600. +   if (!state_filter)
  6601. +       debug_show_all_locks();
  6602. +}
  6603. +
  6604. +#ifdef CONFIG_SMP
  6605. +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  6606. +{
  6607. +   cpumask_copy(tsk_cpus_allowed(p), new_mask);
  6608. +}
  6609. +#endif
  6610. +
  6611. +/**
  6612. + * init_idle - set up an idle thread for a given CPU
  6613. + * @idle: task in question
  6614. + * @cpu: cpu the idle task belongs to
  6615. + *
  6616. + * NOTE: this function does not set the idle thread's NEED_RESCHED
  6617. + * flag, to make booting more robust.
  6618. + */
  6619. +void init_idle(struct task_struct *idle, int cpu)
  6620. +{
  6621. +   struct rq *rq = cpu_rq(cpu);
  6622. +   unsigned long flags;
  6623. +
  6624. +   time_grq_wlock(rq, &flags);
  6625. +   idle->last_ran = rq->clock;
  6626. +   idle->state = TASK_RUNNING;
  6627. +   /* Setting prio to illegal value shouldn't matter when never queued */
  6628. +   idle->prio = PRIO_LIMIT;
  6629. +   set_rq_task(rq, idle);
  6630. +   do_set_cpus_allowed(idle, &cpumask_of_cpu(cpu));
  6631. +   /* Silence PROVE_RCU */
  6632. +   rcu_read_lock();
  6633. +   set_task_cpu(idle, cpu);
  6634. +   rcu_read_unlock();
  6635. +   rq->curr = rq->idle = idle;
  6636. +   idle->on_cpu = 1;
  6637. +   grq_wunlock_irqrestore(&flags);
  6638. +
  6639. +   /* Set the preempt count _outside_ the spinlocks! */
  6640. +   task_thread_info(idle)->preempt_count = 0;
  6641. +
  6642. +   ftrace_graph_init_idle_task(idle, cpu);
  6643. +#if defined(CONFIG_SMP)
  6644. +   sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
  6645. +#endif
  6646. +}
  6647. +
  6648. +#ifdef CONFIG_SMP
  6649. +#ifdef CONFIG_NO_HZ
  6650. +void select_nohz_load_balancer(int stop_tick)
  6651. +{
  6652. +}
  6653. +
  6654. +void set_cpu_sd_state_idle(void) {}
  6655. +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  6656. +/**
  6657. + * lowest_flag_domain - Return lowest sched_domain containing flag.
  6658. + * @cpu:   The cpu whose lowest level of sched domain is to
  6659. + *     be returned.
  6660. + * @flag:  The flag to check for the lowest sched_domain
  6661. + *     for the given cpu.
  6662. + *
  6663. + * Returns the lowest sched_domain of a cpu which contains the given flag.
  6664. + */
  6665. +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  6666. +{
  6667. +   struct sched_domain *sd;
  6668. +
  6669. +   for_each_domain(cpu, sd)
  6670. +       if (sd && (sd->flags & flag))
  6671. +           break;
  6672. +
  6673. +   return sd;
  6674. +}
  6675. +
  6676. +/**
  6677. + * for_each_flag_domain - Iterates over sched_domains containing the flag.
  6678. + * @cpu:   The cpu whose domains we're iterating over.
  6679. + * @sd:        variable holding the value of the power_savings_sd
  6680. + *     for cpu.
  6681. + * @flag:  The flag to filter the sched_domains to be iterated.
  6682. + *
  6683. + * Iterates over all the scheduler domains for a given cpu that has the 'flag'
  6684. + * set, starting from the lowest sched_domain to the highest.
  6685. + */
  6686. +#define for_each_flag_domain(cpu, sd, flag) \
  6687. +   for (sd = lowest_flag_domain(cpu, flag); \
  6688. +       (sd && (sd->flags & flag)); sd = sd->parent)
  6689. +
  6690. +#endif /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
  6691. +
  6692. +static inline void resched_cpu(int cpu)
  6693. +{
  6694. +   unsigned long flags;
  6695. +
  6696. +   grq_wlock_irqsave(&flags);
  6697. +   resched_task(cpu_curr(cpu));
  6698. +   grq_wunlock_irqrestore(&flags);
  6699. +}
  6700. +
  6701. +/*
  6702. + * In the semi idle case, use the nearest busy cpu for migrating timers
  6703. + * from an idle cpu.  This is good for power-savings.  The sched domains
  6704. + * of this cpu are walked and the first non-idle cpu found in their spans
  6705. + * is returned.  If every cpu is idle this cpu is returned instead, as
  6706. + * selecting an idle cpu will add more delays to the timers than intended
  6707. + * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  6708. + */
  6709. +int get_nohz_timer_target(void)
  6710. +{
  6711. +   int i, cpu = smp_processor_id();
  6712. +   struct sched_domain *sd;
  6713. +
  6714. +   rcu_read_lock();
  6715. +   for_each_domain(cpu, sd) {
  6716. +       for_each_cpu(i, sched_domain_span(sd)) {
  6717. +           if (!idle_cpu(i)) {
  6718. +               cpu = i;
  6719. +               goto unlock;
  6720. +           }
  6721. +       }
  6722. +   }
  6723. +unlock:
  6724. +   rcu_read_unlock();
  6725. +   return cpu;
  6726. +}
  6727. +
  6728. +/*
  6729. + * When add_timer_on() enqueues a timer into the timer wheel of an
  6730. + * idle CPU then this timer might expire before the next timer event
  6731. + * which is scheduled to wake up that CPU. In case of a completely
  6732. + * idle system the next event might even be infinite time into the
  6733. + * future. wake_up_idle_cpu() ensures that the CPU is woken up and
  6734. + * leaves the inner idle loop so the newly added timer is taken into
  6735. + * account when the CPU goes back to idle and evaluates the timer
  6736. + * wheel for the next timer event. Does nothing when @cpu is this cpu.
  6737. + */
  6738. +void wake_up_idle_cpu(int cpu)
  6739. +{
  6740. +   struct task_struct *idle;
  6741. +   struct rq *rq;
  6742. +
  6743. +   if (cpu == smp_processor_id())   /* the caller's own cpu needs no wakeup */
  6744. +       return;
  6745. +
  6746. +   rq = cpu_rq(cpu);
  6747. +   idle = rq->idle;
  6748. +
  6749. +   /*
  6750. +    * This is safe, as this function is called with the timer
  6751. +    * wheel base lock of (cpu) held. When the CPU is on the way
  6752. +    * to idle and has not yet set rq->curr to idle then it will
  6753. +    * be serialised on the timer wheel base lock and take the new
  6754. +    * timer into account automatically.
  6755. +    */
  6756. +   if (unlikely(rq->curr != idle))
  6757. +       return;
  6758. +
  6759. +   /*
  6760. +    * We can set TIF_RESCHED on the idle task of the other CPU
  6761. +    * lockless. The worst case is that the other CPU runs the
  6762. +    * idle task through an additional NOOP schedule().
  6763. +    */
  6764. +   set_tsk_need_resched(idle);
  6765. +
  6766. +   /* NEED_RESCHED must be visible before we test polling */
  6767. +   smp_mb();
  6768. +   if (!tsk_is_polling(idle))   /* the reschedule IPI is only sent when the idle task is not polling */
  6769. +       smp_send_reschedule(cpu);
  6770. +}
  6771. +
  6772. +#endif /* CONFIG_NO_HZ */
  6773. +
  6774. +/*
  6775. + * Change a given task's CPU affinity. Migrate the thread to a
  6776. + * proper CPU and schedule it away if the CPU it's executing on
  6777. + * is removed from the allowed bitmask.
  6778. + * Returns 0, or -EINVAL (no active cpu in @new_mask, or @p is PF_THREAD_BOUND and not current).
  6779. + * NOTE: the caller must have a valid reference to the task, the
  6780. + * task must not exit() & deallocate itself prematurely. The
  6781. + * call is not atomic; no spinlocks may be held.
  6782. + */
  6783. +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  6784. +{
  6785. +   bool running_wrong = false;   /* p runs on this rq but off-mask: _cond_resched() after unlocking */
  6786. +   bool upgrade = false;         /* set once grq_upgrade() ran; selects the unlock variant at out: */
  6787. +   bool queued = false;
  6788. +   unsigned long flags;
  6789. +   struct rq *rq;
  6790. +   int ret = 0;
  6791. +
  6792. +   rq = task_grq_ulock(p, &flags);
  6793. +
  6794. +   if (cpumask_equal(tsk_cpus_allowed(p), new_mask))   /* nothing to change, ret stays 0 */
  6795. +       goto out;
  6796. +
  6797. +   if (!cpumask_intersects(new_mask, cpu_active_mask)) {
  6798. +       ret = -EINVAL;
  6799. +       goto out;
  6800. +   }
  6801. +
  6802. +   if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
  6803. +       ret = -EINVAL;
  6804. +       goto out;
  6805. +   }
  6806. +
  6807. +   queued = task_queued(p);
  6808. +   /* All checks passed: upgrade the lock and install the new mask. */
  6809. +   upgrade = true;
  6810. +   grq_upgrade();
  6811. +   do_set_cpus_allowed(p, new_mask);
  6812. +
  6813. +   /* Can the task run on the task's current CPU? If so, we're done */
  6814. +   if (cpumask_test_cpu(task_cpu(p), new_mask))
  6815. +       goto out;
  6816. +
  6817. +   if (task_running(p)) {
  6818. +       /* Task is running on the wrong cpu now, reschedule it. */
  6819. +       if (rq == this_rq()) {
  6820. +           set_tsk_need_resched(p);
  6821. +           running_wrong = true;
  6822. +       } else
  6823. +           resched_task(p);
  6824. +   } else   /* not running: just point it at any active cpu of the new mask */
  6825. +       set_task_cpu(p, cpumask_any_and(cpu_active_mask, new_mask));
  6826. +
  6827. +out:
  6828. +   if (upgrade) {
  6829. +       if (queued)
  6830. +           try_preempt(p, rq);
  6831. +       task_grq_wunlock(&flags);
  6832. +   } else
  6833. +       task_grq_uunlock(&flags);
  6834. +
  6835. +   if (running_wrong)
  6836. +       _cond_resched();
  6837. +
  6838. +   return ret;
  6839. +}
  6840. +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  6841. +
  6842. +#ifdef CONFIG_HOTPLUG_CPU
  6843. +/* Run through task list and find tasks (other than @idle) for which online_cpus()
  6844. + * is false, then reset their affinity to cpu_possible_mask; clear_sticky() every thread. */
  6845. +static void break_sole_affinity(int src_cpu, struct task_struct *idle)
  6846. +{
  6847. +   struct task_struct *p, *t;
  6848. +
  6849. +   do_each_thread(t, p) {
  6850. +       if (p != idle && !online_cpus(p)) {   /* presumably: no online cpu left in p's mask - confirm in online_cpus() */
  6851. +           cpumask_copy(tsk_cpus_allowed(p), cpu_possible_mask);
  6852. +           /*
  6853. +            * Don't tell them about moving exiting tasks or
  6854. +            * kernel threads (both mm NULL), since they never
  6855. +            * leave kernel.
  6856. +            */
  6857. +           if (p->mm && printk_ratelimit()) {
  6858. +               printk(KERN_INFO "process %d (%s) no "
  6859. +                      "longer affine to cpu %d\n",
  6860. +                      task_pid_nr(p), p->comm, src_cpu);
  6861. +           }
  6862. +       }
  6863. +       clear_sticky(p);   /* done for every thread, not only the re-affined ones */
  6864. +   } while_each_thread(t, p);
  6865. +}
  6866. +
  6867. +/*
  6868. + * Schedules idle task to be the next runnable task on current CPU.
  6869. + * It does so by boosting its priority to highest possible
  6870. + * (SCHED_FIFO at STOP_PRIO). Used by CPU offline code; BUGs if @this_cpu is online.
  6871. + */
  6872. +void sched_idle_next(struct rq *rq, int this_cpu, struct task_struct *idle)
  6873. +{
  6874. +   /* cpu has to be offline */
  6875. +   BUG_ON(cpu_online(this_cpu));
  6876. +
  6877. +   __setscheduler(idle, rq, SCHED_FIFO, STOP_PRIO);
  6878. +
  6879. +   activate_idle_task(idle);
  6880. +   set_tsk_need_resched(rq->curr);   /* flag whatever currently runs on rq for rescheduling */
  6881. +}
  6882. +
  6883. +/*
  6884. + * Ensures that the idle task is using init_mm right before its cpu goes
  6885. + * offline. BUGs if the calling cpu is still online.
  6886. + */
  6887. +void idle_task_exit(void)
  6888. +{
  6889. +   struct mm_struct *mm = current->active_mm;
  6890. +
  6891. +   BUG_ON(cpu_online(smp_processor_id()));
  6892. +
  6893. +   if (mm != &init_mm)   /* switch only if we are not on init_mm already */
  6894. +       switch_mm(mm, &init_mm, current);
  6895. +   mmdrop(mm);   /* called unconditionally, also when mm == &init_mm */
  6896. +}
  6897. +#endif /* CONFIG_HOTPLUG_CPU */
  6898. +void sched_set_stop_task(int cpu, struct task_struct *stop)
  6899. +{   /* Install @stop (may be NULL) as cpu_rq(@cpu)->stop and demote the previous stop task, if any. */
  6900. +   struct sched_param stop_param = { .sched_priority = STOP_PRIO };
  6901. +   struct sched_param start_param = { .sched_priority = MAX_USER_RT_PRIO - 1 };
  6902. +   struct task_struct *old_stop = cpu_rq(cpu)->stop;
  6903. +
  6904. +   if (stop) {
  6905. +       /*
  6906. +        * Make it appear like a SCHED_FIFO task, its something
  6907. +        * userspace knows about and won't get confused about.
  6908. +        *
  6909. +        * Also, it will make PI more or less work without too
  6910. +        * much confusion -- but then, stop work should not
  6911. +        * rely on PI working anyway.
  6912. +        */
  6913. +       sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
  6914. +   }
  6915. +
  6916. +   cpu_rq(cpu)->stop = stop;
  6917. +
  6918. +   if (old_stop) {
  6919. +       /*
  6920. +        * Reset it back to a normal rt scheduling prio so that
  6921. +        * it can die in pieces.
  6922. +        */
  6923. +       sched_setscheduler_nocheck(old_stop, SCHED_FIFO, &start_param);
  6924. +   }
  6925. +}
  6926. +
  6927. +
  6928. +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
  6929. +/* "sched_domain" directory entry; its .child is filled in by register_sched_domain_sysctl(). */
  6930. +static struct ctl_table sd_ctl_dir[] = {
  6931. +   {
  6932. +       .procname   = "sched_domain",
  6933. +       .mode       = 0555,
  6934. +   },
  6935. +   {}
  6936. +};
  6937. +/* Root table handed to register_sysctl_table(): a "kernel" directory containing sd_ctl_dir. */
  6938. +static struct ctl_table sd_ctl_root[] = {
  6939. +   {
  6940. +       .procname   = "kernel",
  6941. +       .mode       = 0555,
  6942. +       .child      = sd_ctl_dir,
  6943. +   },
  6944. +   {}
  6945. +};
  6946. +/* Allocate a zeroed array of @n ctl_table entries with kcalloc(GFP_KERNEL); NULL on failure. */
  6947. +static struct ctl_table *sd_alloc_ctl_entry(int n)
  6948. +{
  6949. +   struct ctl_table *entry =
  6950. +       kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
  6951. +
  6952. +   return entry;
  6953. +}
  6954. +/* Recursively free the table at *@tablep (children and dynamic names first), then set *@tablep to NULL. */
  6955. +static void sd_free_ctl_entry(struct ctl_table **tablep)
  6956. +{
  6957. +   struct ctl_table *entry;
  6958. +
  6959. +   /*
  6960. +    * In the intermediate directories, both the child directory and
  6961. +    * procname are dynamically allocated and could fail but the mode
  6962. +    * will always be set. In the lowest directory the names are
  6963. +    * static strings and all have proc handlers.
  6964. +    */
  6965. +   for (entry = *tablep; entry->mode; entry++) {   /* a zero mode marks the terminator entry */
  6966. +       if (entry->child)
  6967. +           sd_free_ctl_entry(&entry->child);
  6968. +       if (entry->proc_handler == NULL)   /* no handler => directory entry => procname was kstrdup()ed */
  6969. +           kfree(entry->procname);
  6970. +   }
  6971. +
  6972. +   kfree(*tablep);
  6973. +   *tablep = NULL;
  6974. +}
  6975. +/* Fill the procname, data, maxlen, mode and proc_handler fields of @entry from the arguments. */
  6976. +static void
  6977. +set_table_entry(struct ctl_table *entry,
  6978. +       const char *procname, void *data, int maxlen,
  6979. +       mode_t mode, proc_handler *proc_handler)
  6980. +{
  6981. +   entry->procname = procname;
  6982. +   entry->data = data;
  6983. +   entry->maxlen = maxlen;
  6984. +   entry->mode = mode;
  6985. +   entry->proc_handler = proc_handler;
  6986. +}
  6987. +/* Build the sysctl table (12 entries plus a zeroed terminator) exposing fields of @sd; NULL if allocation fails. */
  6988. +static struct ctl_table *
  6989. +sd_alloc_ctl_domain_table(struct sched_domain *sd)
  6990. +{
  6991. +   struct ctl_table *table = sd_alloc_ctl_entry(13);
  6992. +
  6993. +   if (table == NULL)
  6994. +       return NULL;
  6995. +
  6996. +   set_table_entry(&table[0], "min_interval", &sd->min_interval,
  6997. +       sizeof(long), 0644, proc_doulongvec_minmax);
  6998. +   set_table_entry(&table[1], "max_interval", &sd->max_interval,
  6999. +       sizeof(long), 0644, proc_doulongvec_minmax);
  7000. +   set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
  7001. +       sizeof(int), 0644, proc_dointvec_minmax);
  7002. +   set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
  7003. +       sizeof(int), 0644, proc_dointvec_minmax);
  7004. +   set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
  7005. +       sizeof(int), 0644, proc_dointvec_minmax);
  7006. +   set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
  7007. +       sizeof(int), 0644, proc_dointvec_minmax);
  7008. +   set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
  7009. +       sizeof(int), 0644, proc_dointvec_minmax);
  7010. +   set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
  7011. +       sizeof(int), 0644, proc_dointvec_minmax);
  7012. +   set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
  7013. +       sizeof(int), 0644, proc_dointvec_minmax);
  7014. +   set_table_entry(&table[9], "cache_nice_tries",
  7015. +       &sd->cache_nice_tries,
  7016. +       sizeof(int), 0644, proc_dointvec_minmax);
  7017. +   set_table_entry(&table[10], "flags", &sd->flags,
  7018. +       sizeof(int), 0644, proc_dointvec_minmax);
  7019. +   set_table_entry(&table[11], "name", sd->name,
  7020. +       CORENAME_MAX_SIZE, 0444, proc_dostring);   /* the only read-only (0444) entry */
  7021. +   /* &table[12] is terminator */
  7022. +
  7023. +   return table;
  7024. +}
  7025. +/* Build the per-cpu directory table: one "domainN" entry per sched_domain of @cpu; NULL if the table allocation fails. */
  7026. +static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
  7027. +{
  7028. +   struct ctl_table *entry, *table;
  7029. +   struct sched_domain *sd;
  7030. +   int domain_num = 0, i;
  7031. +   char buf[32];
  7032. +
  7033. +   for_each_domain(cpu, sd)   /* first pass: count the domains */
  7034. +       domain_num++;
  7035. +   entry = table = sd_alloc_ctl_entry(domain_num + 1);   /* +1 for the terminator */
  7036. +   if (table == NULL)
  7037. +       return NULL;
  7038. +
  7039. +   i = 0;
  7040. +   for_each_domain(cpu, sd) {
  7041. +       snprintf(buf, 32, "domain%d", i);
  7042. +       entry->procname = kstrdup(buf, GFP_KERNEL);   /* result is not checked here */
  7043. +       entry->mode = 0555;
  7044. +       entry->child = sd_alloc_ctl_domain_table(sd);   /* may be NULL; not checked here */
  7045. +       entry++;
  7046. +       i++;
  7047. +   }
  7048. +   return table;
  7049. +}
  7050. +/* Build one "cpuN" directory per possible cpu under sd_ctl_dir and register sd_ctl_root; header kept in sd_sysctl_header. */
  7051. +static struct ctl_table_header *sd_sysctl_header;
  7052. +static void register_sched_domain_sysctl(void)
  7053. +{
  7054. +   int i, cpu_num = num_possible_cpus();
  7055. +   struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);   /* +1 for the terminator */
  7056. +   char buf[32];
  7057. +
  7058. +   WARN_ON(sd_ctl_dir[0].child);   /* warn if a previous table is still attached */
  7059. +   sd_ctl_dir[0].child = entry;
  7060. +
  7061. +   if (entry == NULL)   /* allocation failed: nothing gets registered */
  7062. +       return;
  7063. +
  7064. +   for_each_possible_cpu(i) {
  7065. +       snprintf(buf, 32, "cpu%d", i);
  7066. +       entry->procname = kstrdup(buf, GFP_KERNEL);   /* result is not checked here */
  7067. +       entry->mode = 0555;
  7068. +       entry->child = sd_alloc_ctl_cpu_table(i);
  7069. +       entry++;
  7070. +   }
  7071. +
  7072. +   WARN_ON(sd_sysctl_header);
  7073. +   sd_sysctl_header = register_sysctl_table(sd_ctl_root);
  7074. +}
  7075. +
  7076. +/* may be called multiple times per register: both steps are skipped when already undone */
  7077. +static void unregister_sched_domain_sysctl(void)
  7078. +{
  7079. +   if (sd_sysctl_header)
  7080. +       unregister_sysctl_table(sd_sysctl_header);
  7081. +   sd_sysctl_header = NULL;
  7082. +   if (sd_ctl_dir[0].child)
  7083. +       sd_free_ctl_entry(&sd_ctl_dir[0].child);   /* also resets .child to NULL */
  7084. +}
  7085. +#else
  7086. +static void register_sched_domain_sysctl(void)
  7087. +{   /* no-op unless both CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are defined */
  7088. +}
  7089. +static void unregister_sched_domain_sysctl(void)
  7090. +{   /* no-op unless both CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are defined */
  7091. +}
  7092. +#endif
  7093. +/* Mark @rq online and set its cpu in rq->rd->online; does nothing if it is already online. */
  7094. +static void set_rq_online(struct rq *rq)
  7095. +{
  7096. +   if (!rq->online) {
  7097. +       cpumask_set_cpu(cpu_of(rq), rq->rd->online);
  7098. +       rq->online = true;
  7099. +   }
  7100. +}
  7101. +/* Mark @rq offline and clear its cpu in rq->rd->online; does nothing if it is already offline. */
  7102. +static void set_rq_offline(struct rq *rq)
  7103. +{
  7104. +   if (rq->online) {
  7105. +       cpumask_clear_cpu(cpu_of(rq), rq->rd->online);
  7106. +       rq->online = false;
  7107. +   }
  7108. +}
  7109. +
  7110. +/*
  7111. + * migration_call - cpu notifier: handles CPU_ONLINE and, with CONFIG_HOTPLUG_CPU, CPU_DEAD and CPU_DYING; always returns NOTIFY_OK.
  7112. + */
  7113. +static int __cpuinit
  7114. +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  7115. +{
  7116. +   int cpu = (long)hcpu;   /* the cpu number is passed encoded in the pointer */
  7117. +   unsigned long flags;
  7118. +   struct rq *rq = cpu_rq(cpu);
  7119. +#ifdef CONFIG_HOTPLUG_CPU
  7120. +   struct task_struct *idle = rq->idle;
  7121. +#endif
  7122. +
  7123. +   switch (action & ~CPU_TASKS_FROZEN) {   /* frozen and normal variants are treated alike */
  7124. +
  7125. +   case CPU_UP_PREPARE:
  7126. +       break;
  7127. +
  7128. +   case CPU_ONLINE:
  7129. +       /* Update our root-domain */
  7130. +       grq_wlock_irqsave(&flags);
  7131. +       if (rq->rd) {
  7132. +           BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  7133. +
  7134. +           set_rq_online(rq);
  7135. +       }
  7136. +       grq.noc = num_online_cpus();   /* refresh the cached online-cpu count */
  7137. +       grq_wunlock_irqrestore(&flags);
  7138. +       break;
  7139. +
  7140. +#ifdef CONFIG_HOTPLUG_CPU
  7141. +   case CPU_DEAD:
  7142. +       /* Idle task back to normal (off runqueue, low prio) */
  7143. +       grq_wlock_irq();
  7144. +       return_task(idle, true);
  7145. +       idle->static_prio = MAX_PRIO;
  7146. +       __setscheduler(idle, rq, SCHED_NORMAL, 0);
  7147. +       idle->prio = PRIO_LIMIT;
  7148. +       set_rq_task(rq, idle);
  7149. +       update_clocks(rq);
  7150. +       grq_wunlock_irq();
  7151. +       break;
  7152. +
  7153. +   case CPU_DYING:
  7154. +       /* Update our root-domain */
  7155. +       grq_wlock_irqsave(&flags);
  7156. +       sched_idle_next(rq, cpu, idle);   /* boost the idle task so it runs next on the dying cpu */
  7157. +       if (rq->rd) {
  7158. +           BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
  7159. +           set_rq_offline(rq);
  7160. +       }
  7161. +       break_sole_affinity(cpu, idle);
  7162. +       grq.noc = num_online_cpus();
  7163. +       grq_wunlock_irqrestore(&flags);
  7164. +       break;
  7165. +#endif
  7166. +   }
  7167. +   return NOTIFY_OK;
  7168. +}
  7169. +
  7170. +/*
  7171. + * Register at high priority (CPU_PRI_MIGRATION) so that task migration
  7172. + * happens before everything else.  This has to be lower priority than
  7173. + * the notifier in the perf_counter subsystem, though.
  7174. + */
  7175. +static struct notifier_block __cpuinitdata migration_notifier = {
  7176. +   .notifier_call = migration_call,
  7177. +   .priority = CPU_PRI_MIGRATION,
  7178. +};
  7179. +/* cpu notifier: on CPU_STARTING or CPU_DOWN_FAILED mark the cpu active (NOTIFY_OK); otherwise NOTIFY_DONE. */
  7180. +static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
  7181. +                     unsigned long action, void *hcpu)
  7182. +{
  7183. +   switch (action & ~CPU_TASKS_FROZEN) {
  7184. +   case CPU_STARTING:
  7185. +   case CPU_DOWN_FAILED:
  7186. +       set_cpu_active((long)hcpu, true);
  7187. +       return NOTIFY_OK;
  7188. +   default:
  7189. +       return NOTIFY_DONE;
  7190. +   }
  7191. +}
  7192. +/* cpu notifier: on CPU_DOWN_PREPARE mark the cpu inactive (NOTIFY_OK); otherwise NOTIFY_DONE. */
  7193. +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
  7194. +                   unsigned long action, void *hcpu)
  7195. +{
  7196. +   switch (action & ~CPU_TASKS_FROZEN) {
  7197. +   case CPU_DOWN_PREPARE:
  7198. +       set_cpu_active((long)hcpu, false);
  7199. +       return NOTIFY_OK;
  7200. +   default:
  7201. +       return NOTIFY_DONE;
  7202. +   }
  7203. +}
  7204. +/* early_initcall: run migration_call() by hand for the boot cpu, then register the hotplug notifiers. Returns 0. */
  7205. +int __init migration_init(void)
  7206. +{
  7207. +   void *cpu = (void *)(long)smp_processor_id();
  7208. +   int err;
  7209. +
  7210. +   /* Initialise migration for the boot CPU */
  7211. +   err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
  7212. +   BUG_ON(err == NOTIFY_BAD);
  7213. +   migration_call(&migration_notifier, CPU_ONLINE, cpu);
  7214. +   register_cpu_notifier(&migration_notifier);
  7215. +
  7216. +   /* Register cpu active notifiers */
  7217. +   cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
  7218. +   cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
  7219. +
  7220. +   return 0;
  7221. +}
  7222. +early_initcall(migration_init);
  7223. +#endif
  7224. +
  7225. +#ifdef CONFIG_SMP
  7226. +
  7227. +static cpumask_var_t sched_domains_tmpmask; /* scratch mask (e.g. for sched_domain_debug()); sched_domains_mutex */
  7228. +
  7229. +#ifdef CONFIG_SCHED_DEBUG
  7230. +/* "sched_debug" early boot parameter: its presence sets sched_debug_enabled; the value string is ignored. */
  7231. +static __read_mostly int sched_debug_enabled;
  7232. +
  7233. +static int __init sched_debug_setup(char *str)
  7234. +{
  7235. +   sched_debug_enabled = 1;
  7236. +
  7237. +   return 0;
  7238. +}
  7239. +early_param("sched_debug", sched_debug_setup);
  7240. +/* True when sched_debug_enabled is non-zero. */
  7241. +static inline bool sched_debug(void)
  7242. +{
  7243. +   return sched_debug_enabled;
  7244. +}
  7245. +/* Print and sanity-check one domain level @sd of @cpu, using @groupmask as scratch; returns -1 if @sd does not load-balance, else 0. */
  7246. +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
  7247. +                 struct cpumask *groupmask)
  7248. +{
  7249. +   struct sched_group *group = sd->groups;
  7250. +   char str[256];
  7251. +
  7252. +   cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
  7253. +   cpumask_clear(groupmask);
  7254. +
  7255. +   printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
  7256. +
  7257. +   if (!(sd->flags & SD_LOAD_BALANCE)) {
  7258. +       printk(KERN_CONT "does not load-balance\n");
  7259. +       if (sd->parent)
  7260. +           printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
  7261. +                   " has parent\n");
  7262. +       return -1;
  7263. +   }
  7264. +
  7265. +   printk(KERN_CONT "span %s level %s\n", str, sd->name);
  7266. +
  7267. +   if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
  7268. +       printk(KERN_ERR "ERROR: domain->span does not contain "
  7269. +               "CPU%d\n", cpu);
  7270. +   }
  7271. +   if (group && !cpumask_test_cpu(cpu, sched_group_cpus(group))) {   /* a NULL group is reported by the loop below */
  7272. +       printk(KERN_ERR "ERROR: domain->groups does not contain"
  7273. +               " CPU%d\n", cpu);
  7274. +   }
  7275. +
  7276. +   printk(KERN_DEBUG "%*s groups:", level + 1, "");
  7277. +   do {
  7278. +       if (!group) {
  7279. +           printk(KERN_CONT "\n");
  7280. +           printk(KERN_ERR "ERROR: group is NULL\n");
  7281. +           break;
  7282. +       }
  7283. +
  7284. +       /*
  7285. +        * Even though we initialise ->power to something semi-sane,
  7286. +        * we leave power_orig unset. This allows us to detect if
  7287. +        * domain iteration is still funny without causing /0 traps.
  7288. +        */
  7289. +       if (!group->sgp->power_orig) {
  7290. +           printk(KERN_CONT "\n");
  7291. +           printk(KERN_ERR "ERROR: domain->cpu_power not "
  7292. +                   "set\n");
  7293. +           break;
  7294. +       }
  7295. +
  7296. +       if (!cpumask_weight(sched_group_cpus(group))) {
  7297. +           printk(KERN_CONT "\n");
  7298. +           printk(KERN_ERR "ERROR: empty group\n");
  7299. +           break;
  7300. +       }
  7301. +
  7302. +       if (!(sd->flags & SD_OVERLAP) &&
  7303. +           cpumask_intersects(groupmask, sched_group_cpus(group))) {
  7304. +           printk(KERN_CONT "\n");
  7305. +           printk(KERN_ERR "ERROR: repeated CPUs\n");
  7306. +           break;
  7307. +       }
  7308. +       /* Accumulate the cpus seen so far; compared with the domain span after the loop. */
  7309. +       cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  7310. +
  7311. +       cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
  7312. +
  7313. +       printk(KERN_CONT " %s", str);
  7314. +       if (group->sgp->power != SCHED_POWER_SCALE) {
  7315. +           printk(KERN_CONT " (cpu_power = %d)",
  7316. +               group->sgp->power);
  7317. +       }
  7318. +
  7319. +       group = group->next;
  7320. +   } while (group != sd->groups);   /* the group list is circular */
  7321. +   printk(KERN_CONT "\n");
  7322. +
  7323. +   if (!cpumask_equal(sched_domain_span(sd), groupmask))
  7324. +       printk(KERN_ERR "ERROR: groups don't span domain->span\n");
  7325. +
  7326. +   if (sd->parent &&
  7327. +       !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
  7328. +       printk(KERN_ERR "ERROR: parent span is not a superset "
  7329. +           "of domain->span\n");
  7330. +   return 0;
  7331. +}
  7332. +/* If sched_debug_enabled, dump @cpu's domain chain from @sd upwards, one level per sched_domain_debug_one() call. */
  7333. +static void sched_domain_debug(struct sched_domain *sd, int cpu)
  7334. +{
  7335. +   int level = 0;
  7336. +
  7337. +   if (!sched_debug_enabled)
  7338. +       return;
  7339. +
  7340. +   if (!sd) {
  7341. +       printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
  7342. +       return;
  7343. +   }
  7344. +
  7345. +   printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
  7346. +
  7347. +   for (;;) {
  7348. +       if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))   /* non-zero: stop the walk */
  7349. +           break;
  7350. +       level++;
  7351. +       sd = sd->parent;
  7352. +       if (!sd)
  7353. +           break;
  7354. +   }
  7355. +}
  7356. +#else /* !CONFIG_SCHED_DEBUG */
  7357. +# define sched_domain_debug(sd, cpu) do { } while (0)
  7358. +static inline bool sched_debug(void)
  7359. +{   /* !CONFIG_SCHED_DEBUG: debugging is never enabled */
  7360. +   return false;
  7361. +}
  7362. +#endif /* CONFIG_SCHED_DEBUG */
  7363. +/* Return 1 if @sd contributes nothing (one cpu, or no flag that is usable with its group count), else 0. */
  7364. +static int sd_degenerate(struct sched_domain *sd)
  7365. +{
  7366. +   if (cpumask_weight(sched_domain_span(sd)) == 1)
  7367. +       return 1;
  7368. +
  7369. +   /* Following flags need at least 2 groups */
  7370. +   if (sd->flags & (SD_LOAD_BALANCE |
  7371. +            SD_BALANCE_NEWIDLE |
  7372. +            SD_BALANCE_FORK |
  7373. +            SD_BALANCE_EXEC |
  7374. +            SD_SHARE_CPUPOWER |
  7375. +            SD_SHARE_PKG_RESOURCES)) {
  7376. +       if (sd->groups != sd->groups->next)   /* circular list with more than one group */
  7377. +           return 0;
  7378. +   }
  7379. +
  7380. +   /* Following flags don't use groups */
  7381. +   if (sd->flags & (SD_WAKE_AFFINE))
  7382. +       return 0;
  7383. +
  7384. +   return 1;
  7385. +}
  7386. +/* Return 1 if @parent adds nothing over @sd: it is degenerate itself, or has the same span and no flag that @sd lacks. */
  7387. +static int
  7388. +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
  7389. +{
  7390. +   unsigned long cflags = sd->flags, pflags = parent->flags;
  7391. +
  7392. +   if (sd_degenerate(parent))
  7393. +       return 1;
  7394. +
  7395. +   if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
  7396. +       return 0;
  7397. +
  7398. +   /* Flags needing groups don't count if only 1 group in parent */
  7399. +   if (parent->groups == parent->groups->next) {
  7400. +       pflags &= ~(SD_LOAD_BALANCE |
  7401. +               SD_BALANCE_NEWIDLE |
  7402. +               SD_BALANCE_FORK |
  7403. +               SD_BALANCE_EXEC |
  7404. +               SD_SHARE_CPUPOWER |
  7405. +               SD_SHARE_PKG_RESOURCES);
  7406. +       if (nr_node_ids == 1)
  7407. +           pflags &= ~SD_SERIALIZE;
  7408. +   }
  7409. +   if (~cflags & pflags)   /* parent still has a flag the child lacks: keep it */
  7410. +       return 0;
  7411. +
  7412. +   return 1;
  7413. +}
  7414. +/* RCU callback: release the cpupri state, the three cpumasks and the root_domain embedding @rcu. */
  7415. +static void free_rootdomain(struct rcu_head *rcu)
  7416. +{
  7417. +   struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
  7418. +
  7419. +   cpupri_cleanup(&rd->cpupri);
  7420. +   free_cpumask_var(rd->rto_mask);
  7421. +   free_cpumask_var(rd->online);
  7422. +   free_cpumask_var(rd->span);
  7423. +   kfree(rd);
  7424. +}
  7425. +/* Move @rq from its current root domain (if any) to @rd; the old one is freed via RCU once its refcount drops to zero. */
  7426. +static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  7427. +{
  7428. +   struct root_domain *old_rd = NULL;
  7429. +   unsigned long flags;
  7430. +
  7431. +   grq_wlock_irqsave(&flags);
  7432. +
  7433. +   if (rq->rd) {
  7434. +       old_rd = rq->rd;
  7435. +
  7436. +       if (cpumask_test_cpu(rq->cpu, old_rd->online))
  7437. +           set_rq_offline(rq);
  7438. +
  7439. +       cpumask_clear_cpu(rq->cpu, old_rd->span);
  7440. +
  7441. +       /*
  7442. +        * If we don't want to free the old_rd yet then
  7443. +        * set old_rd to NULL to skip the freeing later
  7444. +        * in this function:
  7445. +        */
  7446. +       if (!atomic_dec_and_test(&old_rd->refcount))
  7447. +           old_rd = NULL;
  7448. +   }
  7449. +
  7450. +   atomic_inc(&rd->refcount);
  7451. +   rq->rd = rd;
  7452. +
  7453. +   cpumask_set_cpu(rq->cpu, rd->span);
  7454. +   if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
  7455. +       set_rq_online(rq);
  7456. +
  7457. +   grq_wunlock_irqrestore(&flags);
  7458. +   /* The deferred free is queued only after the lock has been dropped. */
  7459. +   if (old_rd)
  7460. +       call_rcu_sched(&old_rd->rcu, free_rootdomain);
  7461. +}
  7462. +/* Zero @rd, allocate its three cpumasks and init cpupri; returns 0, or -ENOMEM after undoing the partial setup. */
  7463. +static int init_rootdomain(struct root_domain *rd)
  7464. +{
  7465. +   memset(rd, 0, sizeof(*rd));
  7466. +
  7467. +   if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
  7468. +       goto out;
  7469. +   if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
  7470. +       goto free_span;
  7471. +   if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
  7472. +       goto free_online;
  7473. +
  7474. +   if (cpupri_init(&rd->cpupri) != 0)
  7475. +       goto free_rto_mask;
  7476. +   return 0;
  7477. +   /* Error unwind: labels release in reverse order of allocation. */
  7478. +free_rto_mask:
  7479. +   free_cpumask_var(rd->rto_mask);
  7480. +free_online:
  7481. +   free_cpumask_var(rd->online);
  7482. +free_span:
  7483. +   free_cpumask_var(rd->span);
  7484. +out:
  7485. +   return -ENOMEM;
  7486. +}
  7487. +/* Initialise def_root_domain and set its refcount to 1; the init_rootdomain() result is not checked. */
  7488. +static void init_defrootdomain(void)
  7489. +{
  7490. +   init_rootdomain(&def_root_domain);
  7491. +
  7492. +   atomic_set(&def_root_domain.refcount, 1);
  7493. +}
  7494. +/* kmalloc() and initialise a new root_domain; returns NULL if either step fails. */
  7495. +static struct root_domain *alloc_rootdomain(void)
  7496. +{
  7497. +   struct root_domain *rd;
  7498. +
  7499. +   rd = kmalloc(sizeof(*rd), GFP_KERNEL);
  7500. +   if (!rd)
  7501. +       return NULL;
  7502. +
  7503. +   if (init_rootdomain(rd) != 0) {   /* init failed: it has already undone its own allocations */
  7504. +       kfree(rd);
  7505. +       return NULL;
  7506. +   }
  7507. +
  7508. +   return rd;
  7509. +}
  7510. +/* Free every group on the circular list starting at @sg; with @free_sgp, also drop each ->sgp ref and free it at zero. */
  7511. +static void free_sched_groups(struct sched_group *sg, int free_sgp)
  7512. +{
  7513. +   struct sched_group *tmp, *first;
  7514. +
  7515. +   if (!sg)
  7516. +       return;
  7517. +
  7518. +   first = sg;
  7519. +   do {
  7520. +       tmp = sg->next;   /* fetch the successor before sg is freed */
  7521. +
  7522. +       if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
  7523. +           kfree(sg->sgp);
  7524. +
  7525. +       kfree(sg);
  7526. +       sg = tmp;
  7527. +   } while (sg != first);   /* only the pointer value of first is compared here */
  7528. +}
  7529. +/* RCU callback: free the sched_domain embedding @rcu and release its groups as described below. */
  7530. +static void free_sched_domain(struct rcu_head *rcu)
  7531. +{
  7532. +   struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
  7533. +
  7534. +   /*
  7535. +    * If its an overlapping domain it has private groups, iterate and
  7536. +    * nuke them all. Otherwise drop one ref on sd->groups and free it at zero.
  7537. +    */
  7538. +   if (sd->flags & SD_OVERLAP) {
  7539. +       free_sched_groups(sd->groups, 1);
  7540. +   } else if (atomic_dec_and_test(&sd->groups->ref)) {
  7541. +       kfree(sd->groups->sgp);
  7542. +       kfree(sd->groups);
  7543. +   }
  7544. +   kfree(sd);
  7545. +}
  7546. +/* Queue @sd for freeing by free_sched_domain() through call_rcu(); @cpu is unused. */
  7547. +static void destroy_sched_domain(struct sched_domain *sd, int cpu)
  7548. +{
  7549. +   call_rcu(&sd->rcu, free_sched_domain);
  7550. +}
  7551. +/* Queue @sd and each domain reached through ->parent for RCU freeing; a NULL @sd is a no-op. */
  7552. +static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  7553. +{
  7554. +   for (; sd; sd = sd->parent)
  7555. +       destroy_sched_domain(sd, cpu);
  7556. +}
  7557. +
  7558. +/*
  7559. + * Attach the domain 'sd' to 'cpu' as its base domain (after pruning
  7560. + * degenerate levels) and its runqueue to 'rd'. Callers must hold the hotplug lock.
  7561. + */
  7562. +static void
  7563. +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
  7564. +{
  7565. +   struct rq *rq = cpu_rq(cpu);
  7566. +   struct sched_domain *tmp;
  7567. +
  7568. +   /* Remove the sched domains which do not contribute to scheduling. */
  7569. +   for (tmp = sd; tmp; ) {
  7570. +       struct sched_domain *parent = tmp->parent;
  7571. +       if (!parent)
  7572. +           break;
  7573. +
  7574. +       if (sd_parent_degenerate(tmp, parent)) {
  7575. +           tmp->parent = parent->parent;   /* unlink parent; tmp is re-examined against its new parent */
  7576. +           if (parent->parent)
  7577. +               parent->parent->child = tmp;
  7578. +           destroy_sched_domain(parent, cpu);
  7579. +       } else
  7580. +           tmp = tmp->parent;
  7581. +   }
  7582. +   /* The base domain itself may be degenerate too: drop it and start at its parent. */
  7583. +   if (sd && sd_degenerate(sd)) {
  7584. +       tmp = sd;
  7585. +       sd = sd->parent;
  7586. +       destroy_sched_domain(tmp, cpu);
  7587. +       if (sd)
  7588. +           sd->child = NULL;
  7589. +   }
  7590. +
  7591. +   sched_domain_debug(sd, cpu);
  7592. +
  7593. +   rq_attach_root(rq, rd);
  7594. +   tmp = rq->sd;   /* remember the old chain so it can be destroyed after the switch */
  7595. +   rcu_assign_pointer(rq->sd, sd);
  7596. +   destroy_sched_domains(tmp, cpu);
  7597. +}
  7598. +
  7599. +/* cpus with isolated domains (filled from the "isolcpus=" boot parameter) */
  7600. +static cpumask_var_t cpu_isolated_map;
  7601. +
  7602. +/* Setup the mask of cpus configured for isolated domains: parse @str as a cpu list; always returns 1 */
  7603. +static int __init isolated_cpu_setup(char *str)
  7604. +{
  7605. +   alloc_bootmem_cpumask_var(&cpu_isolated_map);
  7606. +   cpulist_parse(str, cpu_isolated_map);   /* the parse result is not checked */
  7607. +   return 1;
  7608. +}
  7609. +
  7610. +__setup("isolcpus=", isolated_cpu_setup);
  7611. +/* NOTE(review): no user of this constant is visible in this part of the file - confirm it is still needed. */
  7612. +#define SD_NODES_PER_DOMAIN 16
  7613. +/* Return the cpumask of the node that @cpu belongs to. */
  7614. +static const struct cpumask *cpu_cpu_mask(int cpu)
  7615. +{
  7616. +   return cpumask_of_node(cpu_to_node(cpu));
  7617. +}
  7618. +/* Per-cpu pointer slots for a topology level's domain, group and group-power objects (read via per_cpu_ptr()). */
  7619. +struct sd_data {
  7620. +   struct sched_domain **__percpu sd;
  7621. +   struct sched_group **__percpu sg;
  7622. +   struct sched_group_power **__percpu sgp;
  7623. +};
  7624. +/* Per-cpu sched_domain pointer slots plus a root domain; presumably the state of one domain build - confirm against its users. */
  7625. +struct s_data {
  7626. +   struct sched_domain ** __percpu sd;
  7627. +   struct root_domain  *rd;
  7628. +};
  7629. +/* NOTE(review): presumably names how far an s_data allocation got, for unwinding - users are not visible here; confirm. */
  7630. +enum s_alloc {
  7631. +   sa_rootdomain,
  7632. +   sa_sd,
  7633. +   sa_sd_storage,
  7634. +   sa_none,
  7635. +};
  7636. +/* One level of the sched-domain topology: init and mask callbacks, flags (e.g. SDTL_OVERLAP), a numa level and sd_data. */
  7637. +struct sched_domain_topology_level;
  7638. +
  7639. +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
  7640. +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
  7641. +
  7642. +#define SDTL_OVERLAP   0x01
  7643. +
  7644. +struct sched_domain_topology_level {
  7645. +   sched_domain_init_f init;      /* returns the sched_domain for (level, cpu) */
  7646. +   sched_domain_mask_f mask;      /* returns a cpumask for a cpu */
  7647. +   int         flags;
  7648. +   int         numa_level;
  7649. +   struct sd_data      data;
  7650. +};
  7651. +
  7652. +/*
  7653. + * Build an iteration mask that can exclude certain CPUs from the upwards
  7654. + * domain traversal.
  7655. + *
  7656. + * Asymmetric node setups can result in situations where the domain tree is of
  7657. + * unequal depth, make sure to skip domains that already cover the entire
  7658. + * range.
  7659. + *
  7660. + * In that case build_sched_domains() will have terminated the iteration early
  7661. + * and our sibling sd spans will be empty. Domains should always include the
  7662. + * cpu they're built on, so check that.
  7663. + *
  7664. + */
  7665. +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
  7666. +{
  7667. +   const struct cpumask *span = sched_domain_span(sd);
  7668. +   struct sd_data *sdd = sd->private;
  7669. +   struct sched_domain *sibling;
  7670. +   int i;
  7671. +
  7672. +   for_each_cpu(i, span) {
  7673. +       sibling = *per_cpu_ptr(sdd->sd, i);
  7674. +       if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
  7675. +           continue;
  7676. +
  7677. +       cpumask_set_cpu(i, sched_group_mask(sg));
  7678. +   }
  7679. +}
  7680. +
  7681. +/*
  7682. + * Return the canonical balance cpu for this group, this is the first cpu
  7683. + * of this group that's also in the iteration mask.
  7684. + */
  7685. +int group_balance_cpu(struct sched_group *sg)
  7686. +{
  7687. +   return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
  7688. +}
  7689. +
  7690. +static int
  7691. +build_overlap_sched_groups(struct sched_domain *sd, int cpu)
  7692. +{
  7693. +   struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
  7694. +   const struct cpumask *span = sched_domain_span(sd);
  7695. +   struct cpumask *covered = sched_domains_tmpmask;
  7696. +   struct sd_data *sdd = sd->private;
  7697. +   struct sched_domain *child;
  7698. +   int i;
  7699. +
  7700. +   cpumask_clear(covered);
  7701. +
  7702. +   for_each_cpu(i, span) {
  7703. +       struct cpumask *sg_span;
  7704. +
  7705. +       if (cpumask_test_cpu(i, covered))
  7706. +           continue;
  7707. +
  7708. +       child = *per_cpu_ptr(sdd->sd, i);
  7709. +
  7710. +       /* See the comment near build_group_mask(). */
  7711. +       if (!cpumask_test_cpu(i, sched_domain_span(child)))
  7712. +           continue;
  7713. +
  7714. +       sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
  7715. +               GFP_KERNEL, cpu_to_node(i));
  7716. +
  7717. +       if (!sg)
  7718. +           goto fail;
  7719. +
  7720. +       sg_span = sched_group_cpus(sg);
  7721. +       if (child->child) {
  7722. +           child = child->child;
  7723. +           cpumask_copy(sg_span, sched_domain_span(child));
  7724. +       } else
  7725. +           cpumask_set_cpu(i, sg_span);
  7726. +
  7727. +       cpumask_or(covered, covered, sg_span);
  7728. +
  7729. +       sg->sgp = *per_cpu_ptr(sdd->sgp, i);
  7730. +       if (atomic_inc_return(&sg->sgp->ref) == 1)
  7731. +           build_group_mask(sd, sg);
  7732. +
  7733. +       /*
  7734. +        * Initialize sgp->power such that even if we mess up the
  7735. +        * domains and no possible iteration will get us here, we won't
  7736. +        * die on a /0 trap.
  7737. +        */
  7738. +       sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
  7739. +
  7740. +       /*
  7741. +        * Make sure the first group of this domain contains the
  7742. +        * canonical balance cpu. Otherwise the sched_domain iteration
  7743. +        * breaks. See update_sg_lb_stats().
  7744. +        */
  7745. +       if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
  7746. +           group_balance_cpu(sg) == cpu)
  7747. +           groups = sg;
  7748. +
  7749. +       if (!first)
  7750. +           first = sg;
  7751. +       if (last)
  7752. +           last->next = sg;
  7753. +       last = sg;
  7754. +       last->next = first;
  7755. +   }
  7756. +   sd->groups = groups;
  7757. +
  7758. +   return 0;
  7759. +
  7760. +fail:
  7761. +   free_sched_groups(first, 0);
  7762. +
  7763. +   return -ENOMEM;
  7764. +}
  7765. +
  7766. +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
  7767. +{
  7768. +   struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
  7769. +   struct sched_domain *child = sd->child;
  7770. +
  7771. +   if (child)
  7772. +       cpu = cpumask_first(sched_domain_span(child));
  7773. +
  7774. +   if (sg) {
  7775. +       *sg = *per_cpu_ptr(sdd->sg, cpu);
  7776. +       (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
  7777. +       atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
  7778. +   }
  7779. +
  7780. +   return cpu;
  7781. +}
  7782. +
  7783. +/*
  7784. + * build_sched_groups will build a circular linked list of the groups
  7785. + * covered by the given span, and will set each group's ->cpumask correctly,
  7786. + * and ->sgp->power to 0.
  7787. + *
  7788. + * Assumes the sched_domain tree is fully constructed
  7789. + */
  7790. +static int
  7791. +build_sched_groups(struct sched_domain *sd, int cpu)
  7792. +{
  7793. +   struct sched_group *first = NULL, *last = NULL;
  7794. +   struct sd_data *sdd = sd->private;
  7795. +   const struct cpumask *span = sched_domain_span(sd);
  7796. +   struct cpumask *covered;
  7797. +   int i;
  7798. +
  7799. +   get_group(cpu, sdd, &sd->groups);
  7800. +   atomic_inc(&sd->groups->ref);
  7801. +
  7802. +   if (cpu != cpumask_first(sched_domain_span(sd)))
  7803. +       return 0;
  7804. +
  7805. +   lockdep_assert_held(&sched_domains_mutex);
  7806. +   covered = sched_domains_tmpmask;
  7807. +
  7808. +   cpumask_clear(covered);
  7809. +
  7810. +   for_each_cpu(i, span) {
  7811. +       struct sched_group *sg;
  7812. +       int group = get_group(i, sdd, &sg);
  7813. +       int j;
  7814. +
  7815. +       if (cpumask_test_cpu(i, covered))
  7816. +           continue;
  7817. +
  7818. +       cpumask_clear(sched_group_cpus(sg));
  7819. +       sg->sgp->power = 0;
  7820. +       cpumask_setall(sched_group_mask(sg));
  7821. +
  7822. +       for_each_cpu(j, span) {
  7823. +           if (get_group(j, sdd, NULL) != group)
  7824. +               continue;
  7825. +
  7826. +           cpumask_set_cpu(j, covered);
  7827. +           cpumask_set_cpu(j, sched_group_cpus(sg));
  7828. +       }
  7829. +
  7830. +       if (!first)
  7831. +           first = sg;
  7832. +       if (last)
  7833. +           last->next = sg;
  7834. +       last = sg;
  7835. +   }
  7836. +   last->next = first;
  7837. +
  7838. +   return 0;
  7839. +}
  7840. +
  7841. +/*
  7842. + * Initializers for schedule domains
  7843. + * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  7844. + */
  7845. +
  7846. +#ifdef CONFIG_SCHED_DEBUG
  7847. +# define SD_INIT_NAME(sd, type)        sd->name = #type
  7848. +#else
  7849. +# define SD_INIT_NAME(sd, type)        do { } while (0)
  7850. +#endif
  7851. +
  7852. +#define SD_INIT_FUNC(type)                     \
  7853. +static noinline struct sched_domain *                  \
  7854. +sd_init_##type(struct sched_domain_topology_level *tl, int cpu)    \
  7855. +{                                  \
  7856. +   struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);   \
  7857. +   *sd = SD_##type##_INIT;                     \
  7858. +   SD_INIT_NAME(sd, type);                     \
  7859. +   sd->private = &tl->data;                    \
  7860. +   return sd;                          \
  7861. +}
  7862. +
  7863. +SD_INIT_FUNC(CPU)
  7864. +#ifdef CONFIG_SCHED_SMT
  7865. + SD_INIT_FUNC(SIBLING)
  7866. +#endif
  7867. +#ifdef CONFIG_SCHED_MC
  7868. + SD_INIT_FUNC(MC)
  7869. +#endif
  7870. +#ifdef CONFIG_SCHED_BOOK
  7871. + SD_INIT_FUNC(BOOK)
  7872. +#endif
  7873. +
  7874. +static int default_relax_domain_level = -1;
  7875. +int sched_domain_level_max;
  7876. +
  7877. +static int __init setup_relax_domain_level(char *str)
  7878. +{
  7879. +   if (kstrtoint(str, 0, &default_relax_domain_level))
  7880. +       pr_warn("Unable to set relax_domain_level\n");
  7881. +
  7882. +   return 1;
  7883. +}
  7884. +__setup("relax_domain_level=", setup_relax_domain_level);
  7885. +
  7886. +static void set_domain_attribute(struct sched_domain *sd,
  7887. +                struct sched_domain_attr *attr)
  7888. +{
  7889. +   int request;
  7890. +
  7891. +   if (!attr || attr->relax_domain_level < 0) {
  7892. +       if (default_relax_domain_level < 0)
  7893. +           return;
  7894. +       else
  7895. +           request = default_relax_domain_level;
  7896. +   } else
  7897. +       request = attr->relax_domain_level;
  7898. +   if (request < sd->level) {
  7899. +       /* turn off idle balance on this domain */
  7900. +       sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
  7901. +   } else {
  7902. +       /* turn on idle balance on this domain */
  7903. +       sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
  7904. +   }
  7905. +}
  7906. +
  7907. +static void __sdt_free(const struct cpumask *cpu_map);
  7908. +static int __sdt_alloc(const struct cpumask *cpu_map);
  7909. +
  7910. +static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
  7911. +                const struct cpumask *cpu_map)
  7912. +{
  7913. +   switch (what) {
  7914. +   case sa_rootdomain:
  7915. +       if (!atomic_read(&d->rd->refcount))
  7916. +           free_rootdomain(&d->rd->rcu); /* fall through */
  7917. +   case sa_sd:
  7918. +       free_percpu(d->sd); /* fall through */
  7919. +   case sa_sd_storage:
  7920. +       __sdt_free(cpu_map); /* fall through */
  7921. +   case sa_none:
  7922. +       break;
  7923. +   }
  7924. +}
  7925. +
  7926. +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
  7927. +                          const struct cpumask *cpu_map)
  7928. +{
  7929. +   memset(d, 0, sizeof(*d));
  7930. +
  7931. +   if (__sdt_alloc(cpu_map))
  7932. +       return sa_sd_storage;
  7933. +   d->sd = alloc_percpu(struct sched_domain *);
  7934. +   if (!d->sd)
  7935. +       return sa_sd_storage;
  7936. +   d->rd = alloc_rootdomain();
  7937. +   if (!d->rd)
  7938. +       return sa_sd;
  7939. +   return sa_rootdomain;
  7940. +}
  7941. +
  7942. +/*
  7943. + * NULL the sd_data elements we've used to build the sched_domain and
  7944. + * sched_group structure so that the subsequent __free_domain_allocs()
  7945. + * will not free the data we're using.
  7946. + */
  7947. +static void claim_allocations(int cpu, struct sched_domain *sd)
  7948. +{
  7949. +   struct sd_data *sdd = sd->private;
  7950. +
  7951. +   WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
  7952. +   *per_cpu_ptr(sdd->sd, cpu) = NULL;
  7953. +
  7954. +   if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
  7955. +       *per_cpu_ptr(sdd->sg, cpu) = NULL;
  7956. +
  7957. +   if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
  7958. +       *per_cpu_ptr(sdd->sgp, cpu) = NULL;
  7959. +}
  7960. +
  7961. +#ifdef CONFIG_SCHED_SMT
  7962. +static const struct cpumask *cpu_smt_mask(int cpu)
  7963. +{
  7964. +   return topology_thread_cpumask(cpu);
  7965. +}
  7966. +#endif
  7967. +
  7968. +/*
  7969. + * Topology list, bottom-up.
  7970. + */
  7971. +static struct sched_domain_topology_level default_topology[] = {
  7972. +#ifdef CONFIG_SCHED_SMT
  7973. +   { sd_init_SIBLING, cpu_smt_mask, },
  7974. +#endif
  7975. +#ifdef CONFIG_SCHED_MC
  7976. +   { sd_init_MC, cpu_coregroup_mask, },
  7977. +#endif
  7978. +#ifdef CONFIG_SCHED_BOOK
  7979. +   { sd_init_BOOK, cpu_book_mask, },
  7980. +#endif
  7981. +   { sd_init_CPU, cpu_cpu_mask, },
  7982. +   { NULL, },
  7983. +};
  7984. +
  7985. +static struct sched_domain_topology_level *sched_domain_topology = default_topology;
  7986. +
  7987. +#ifdef CONFIG_NUMA
  7988. +
  7989. +static int sched_domains_numa_levels;
  7990. +static int *sched_domains_numa_distance;
  7991. +static struct cpumask ***sched_domains_numa_masks;
  7992. +static int sched_domains_curr_level;
  7993. +
  7994. +static inline int sd_local_flags(int level)
  7995. +{
  7996. +   if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
  7997. +       return 0;
  7998. +
  7999. +   return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
  8000. +}
  8001. +
  8002. +static struct sched_domain *
  8003. +sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
  8004. +{
  8005. +   struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
  8006. +   int level = tl->numa_level;
  8007. +   int sd_weight = cpumask_weight(
  8008. +           sched_domains_numa_masks[level][cpu_to_node(cpu)]);
  8009. +
  8010. +   *sd = (struct sched_domain){
  8011. +       .min_interval       = sd_weight,
  8012. +       .max_interval       = 2*sd_weight,
  8013. +       .busy_factor        = 32,
  8014. +       .imbalance_pct      = 125,
  8015. +       .cache_nice_tries   = 2,
  8016. +       .busy_idx       = 3,
  8017. +       .idle_idx       = 2,
  8018. +       .newidle_idx        = 0,
  8019. +       .wake_idx       = 0,
  8020. +       .forkexec_idx       = 0,
  8021. +
  8022. +       .flags          = 1*SD_LOAD_BALANCE
  8023. +                   | 1*SD_BALANCE_NEWIDLE
  8024. +                   | 0*SD_BALANCE_EXEC
  8025. +                   | 0*SD_BALANCE_FORK
  8026. +                   | 0*SD_BALANCE_WAKE
  8027. +                   | 0*SD_WAKE_AFFINE
  8028. +                   | 0*SD_PREFER_LOCAL
  8029. +                   | 0*SD_SHARE_CPUPOWER
  8030. +                   | 0*SD_SHARE_PKG_RESOURCES
  8031. +                   | 1*SD_SERIALIZE
  8032. +                   | 0*SD_PREFER_SIBLING
  8033. +                   | sd_local_flags(level)
  8034. +                   ,
  8035. +       .last_balance       = jiffies,
  8036. +       .balance_interval   = sd_weight,
  8037. +   };
  8038. +   SD_INIT_NAME(sd, NUMA);
  8039. +   sd->private = &tl->data;
  8040. +
  8041. +   /*
  8042. +    * Ugly hack to pass state to sd_numa_mask()...
  8043. +    */
  8044. +   sched_domains_curr_level = tl->numa_level;
  8045. +
  8046. +   return sd;
  8047. +}
  8048. +
  8049. +static const struct cpumask *sd_numa_mask(int cpu)
  8050. +{
  8051. +   return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
  8052. +}
  8053. +
  8054. +static void sched_numa_warn(const char *str)
  8055. +{
  8056. +   static int done = false;
  8057. +   int i,j;
  8058. +
  8059. +   if (done)
  8060. +       return;
  8061. +
  8062. +   done = true;
  8063. +
  8064. +   printk(KERN_WARNING "ERROR: %s\n\n", str);
  8065. +
  8066. +   for (i = 0; i < nr_node_ids; i++) {
  8067. +       printk(KERN_WARNING "  ");
  8068. +       for (j = 0; j < nr_node_ids; j++)
  8069. +           printk(KERN_CONT "%02d ", node_distance(i,j));
  8070. +       printk(KERN_CONT "\n");
  8071. +   }
  8072. +   printk(KERN_WARNING "\n");
  8073. +}
  8074. +
  8075. +static bool find_numa_distance(int distance)
  8076. +{
  8077. +   int i;
  8078. +
  8079. +   if (distance == node_distance(0, 0))
  8080. +       return true;
  8081. +
  8082. +   for (i = 0; i < sched_domains_numa_levels; i++) {
  8083. +       if (sched_domains_numa_distance[i] == distance)
  8084. +           return true;
  8085. +   }
  8086. +
  8087. +   return false;
  8088. +}
  8089. +
  8090. +static void sched_init_numa(void)
  8091. +{
  8092. +   int next_distance, curr_distance = node_distance(0, 0);
  8093. +   struct sched_domain_topology_level *tl;
  8094. +   int level = 0;
  8095. +   int i, j, k;
  8096. +
  8097. +   sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
  8098. +   if (!sched_domains_numa_distance)
  8099. +       return;
  8100. +
  8101. +   /*
  8102. +    * O(nr_nodes^2) deduplicating selection sort -- in order to find the
  8103. +    * unique distances in the node_distance() table.
  8104. +    *
  8105. +    * Assumes node_distance(0,j) includes all distances in
  8106. +    * node_distance(i,j) in order to avoid cubic time.
  8107. +    */
  8108. +   next_distance = curr_distance;
  8109. +   for (i = 0; i < nr_node_ids; i++) {
  8110. +       for (j = 0; j < nr_node_ids; j++) {
  8111. +           for (k = 0; k < nr_node_ids; k++) {
  8112. +               int distance = node_distance(i, k);
  8113. +
  8114. +               if (distance > curr_distance &&
  8115. +                   (distance < next_distance ||
  8116. +                    next_distance == curr_distance))
  8117. +                   next_distance = distance;
  8118. +
  8119. +               /*
  8120. +                * While not a strong assumption it would be nice to know
  8121. +                * about cases where if node A is connected to B, B is not
  8122. +                * equally connected to A.
  8123. +                */
  8124. +               if (sched_debug() && node_distance(k, i) != distance)
  8125. +                   sched_numa_warn("Node-distance not symmetric");
  8126. +
  8127. +               if (sched_debug() && i && !find_numa_distance(distance))
  8128. +                   sched_numa_warn("Node-0 not representative");
  8129. +           }
  8130. +           if (next_distance != curr_distance) {
  8131. +               sched_domains_numa_distance[level++] = next_distance;
  8132. +               sched_domains_numa_levels = level;
  8133. +               curr_distance = next_distance;
  8134. +           } else break;
  8135. +       }
  8136. +
  8137. +       /*
  8138. +        * In case of sched_debug() we verify the above assumption.
  8139. +        */
  8140. +       if (!sched_debug())
  8141. +           break;
  8142. +   }
  8143. +   /*
  8144. +    * 'level' contains the number of unique distances, excluding the
  8145. +    * identity distance node_distance(i,i).
  8146. +    *
  8147. +    * The sched_domains_numa_distance[] array includes the actual distance
  8148. +    * numbers.
  8149. +    */
  8150. +
  8151. +   sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
  8152. +   if (!sched_domains_numa_masks)
  8153. +       return;
  8154. +
  8155. +   /*
  8156. +    * Now for each level, construct a mask per node which contains all
  8157. +    * cpus of nodes that are that many hops away from us.
  8158. +    */
  8159. +   for (i = 0; i < level; i++) {
  8160. +       sched_domains_numa_masks[i] =
  8161. +           kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
  8162. +       if (!sched_domains_numa_masks[i])
  8163. +           return;
  8164. +
  8165. +       for (j = 0; j < nr_node_ids; j++) {
  8166. +           struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
  8167. +           if (!mask)
  8168. +               return;
  8169. +
  8170. +           sched_domains_numa_masks[i][j] = mask;
  8171. +
  8172. +           for (k = 0; k < nr_node_ids; k++) {
  8173. +               if (node_distance(j, k) > sched_domains_numa_distance[i])
  8174. +                   continue;
  8175. +
  8176. +               cpumask_or(mask, mask, cpumask_of_node(k));
  8177. +           }
  8178. +       }
  8179. +   }
  8180. +
  8181. +   tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
  8182. +           sizeof(struct sched_domain_topology_level), GFP_KERNEL);
  8183. +   if (!tl)
  8184. +       return;
  8185. +
  8186. +   /*
  8187. +    * Copy the default topology bits..
  8188. +    */
  8189. +   for (i = 0; default_topology[i].init; i++)
  8190. +       tl[i] = default_topology[i];
  8191. +
  8192. +   /*
  8193. +    * .. and append 'j' levels of NUMA goodness.
  8194. +    */
  8195. +   for (j = 0; j < level; i++, j++) {
  8196. +       tl[i] = (struct sched_domain_topology_level){
  8197. +           .init = sd_numa_init,
  8198. +           .mask = sd_numa_mask,
  8199. +           .flags = SDTL_OVERLAP,
  8200. +           .numa_level = j,
  8201. +       };
  8202. +   }
  8203. +
  8204. +   sched_domain_topology = tl;
  8205. +}
  8206. +#else
  8207. +static inline void sched_init_numa(void)
  8208. +{
  8209. +}
  8210. +#endif /* CONFIG_NUMA */
  8211. +
  8212. +static int __sdt_alloc(const struct cpumask *cpu_map)
  8213. +{
  8214. +   struct sched_domain_topology_level *tl;
  8215. +   int j;
  8216. +
  8217. +   for (tl = sched_domain_topology; tl->init; tl++) {
  8218. +       struct sd_data *sdd = &tl->data;
  8219. +
  8220. +       sdd->sd = alloc_percpu(struct sched_domain *);
  8221. +       if (!sdd->sd)
  8222. +           return -ENOMEM;
  8223. +
  8224. +       sdd->sg = alloc_percpu(struct sched_group *);
  8225. +       if (!sdd->sg)
  8226. +           return -ENOMEM;
  8227. +
  8228. +       sdd->sgp = alloc_percpu(struct sched_group_power *);
  8229. +       if (!sdd->sgp)
  8230. +           return -ENOMEM;
  8231. +
  8232. +       for_each_cpu(j, cpu_map) {
  8233. +           struct sched_domain *sd;
  8234. +           struct sched_group *sg;
  8235. +           struct sched_group_power *sgp;
  8236. +
  8237. +               sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
  8238. +                   GFP_KERNEL, cpu_to_node(j));
  8239. +           if (!sd)
  8240. +               return -ENOMEM;
  8241. +
  8242. +           *per_cpu_ptr(sdd->sd, j) = sd;
  8243. +
  8244. +           sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
  8245. +                   GFP_KERNEL, cpu_to_node(j));
  8246. +           if (!sg)
  8247. +               return -ENOMEM;
  8248. +
  8249. +           sg->next = sg;
  8250. +
  8251. +           *per_cpu_ptr(sdd->sg, j) = sg;
  8252. +
  8253. +           sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
  8254. +                   GFP_KERNEL, cpu_to_node(j));
  8255. +           if (!sgp)
  8256. +               return -ENOMEM;
  8257. +
  8258. +           *per_cpu_ptr(sdd->sgp, j) = sgp;
  8259. +       }
  8260. +   }
  8261. +
  8262. +   return 0;
  8263. +}
  8264. +
  8265. +static void __sdt_free(const struct cpumask *cpu_map)
  8266. +{
  8267. +   struct sched_domain_topology_level *tl;
  8268. +   int j;
  8269. +
  8270. +   for (tl = sched_domain_topology; tl->init; tl++) {
  8271. +       struct sd_data *sdd = &tl->data;
  8272. +
  8273. +       for_each_cpu(j, cpu_map) {
  8274. +           struct sched_domain *sd;
  8275. +
  8276. +           if (sdd->sd) {
  8277. +               sd = *per_cpu_ptr(sdd->sd, j);
  8278. +               if (sd && (sd->flags & SD_OVERLAP))
  8279. +                   free_sched_groups(sd->groups, 0);
  8280. +               kfree(*per_cpu_ptr(sdd->sd, j));
  8281. +           }
  8282. +
  8283. +           if (sdd->sg)
  8284. +               kfree(*per_cpu_ptr(sdd->sg, j));
  8285. +           if (sdd->sgp)
  8286. +               kfree(*per_cpu_ptr(sdd->sgp, j));
  8287. +       }
  8288. +       free_percpu(sdd->sd);
  8289. +       sdd->sd = NULL;
  8290. +       free_percpu(sdd->sg);
  8291. +       sdd->sg = NULL;
  8292. +       free_percpu(sdd->sgp);
  8293. +       sdd->sgp = NULL;
  8294. +   }
  8295. +}
  8296. +
  8297. +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
  8298. +       struct s_data *d, const struct cpumask *cpu_map,
  8299. +       struct sched_domain_attr *attr, struct sched_domain *child,
  8300. +       int cpu)
  8301. +{
  8302. +   struct sched_domain *sd = tl->init(tl, cpu);
  8303. +   if (!sd)
  8304. +       return child;
  8305. +
  8306. +   cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
  8307. +   if (child) {
  8308. +       sd->level = child->level + 1;
  8309. +       sched_domain_level_max = max(sched_domain_level_max, sd->level);
  8310. +       child->parent = sd;
  8311. +   }
  8312. +   sd->child = child;
  8313. +   set_domain_attribute(sd, attr);
  8314. +
  8315. +   return sd;
  8316. +}
  8317. +
  8318. +/*
  8319. + * Build sched domains for a given set of cpus and attach the sched domains
  8320. + * to the individual cpus
  8321. + */
  8322. +static int build_sched_domains(const struct cpumask *cpu_map,
  8323. +                  struct sched_domain_attr *attr)
  8324. +{
  8325. +   enum s_alloc alloc_state = sa_none;
  8326. +   struct sched_domain *sd;
  8327. +   struct s_data d;
  8328. +   int i, ret = -ENOMEM;
  8329. +
  8330. +   alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
  8331. +   if (alloc_state != sa_rootdomain)
  8332. +       goto error;
  8333. +
  8334. +   /* Set up domains for cpus specified by the cpu_map. */
  8335. +   for_each_cpu(i, cpu_map) {
  8336. +       struct sched_domain_topology_level *tl;
  8337. +
  8338. +       sd = NULL;
  8339. +       for (tl = sched_domain_topology; tl->init; tl++) {
  8340. +           sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
  8341. +           if (tl->flags & SDTL_OVERLAP)
  8342. +               sd->flags |= SD_OVERLAP;
  8343. +           if (cpumask_equal(cpu_map, sched_domain_span(sd)))
  8344. +               break;
  8345. +       }
  8346. +
  8347. +       while (sd->child)
  8348. +           sd = sd->child;
  8349. +
  8350. +       *per_cpu_ptr(d.sd, i) = sd;
  8351. +   }
  8352. +
  8353. +   /* Build the groups for the domains */
  8354. +   for_each_cpu(i, cpu_map) {
  8355. +       for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
  8356. +           sd->span_weight = cpumask_weight(sched_domain_span(sd));
  8357. +           if (sd->flags & SD_OVERLAP) {
  8358. +               if (build_overlap_sched_groups(sd, i))
  8359. +                   goto error;
  8360. +           } else {
  8361. +               if (build_sched_groups(sd, i))
  8362. +                   goto error;
  8363. +           }
  8364. +       }
  8365. +   }
  8366. +
  8367. +   /* Claim the allocations in use so __free_domain_allocs() keeps them */
  8368. +   for (i = nr_cpumask_bits-1; i >= 0; i--) {
  8369. +       if (!cpumask_test_cpu(i, cpu_map))
  8370. +           continue;
  8371. +
  8372. +       for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
  8373. +           claim_allocations(i, sd);
  8374. +       }
  8375. +   }
  8376. +
  8377. +   /* Attach the domains */
  8378. +   rcu_read_lock();
  8379. +   for_each_cpu(i, cpu_map) {
  8380. +       sd = *per_cpu_ptr(d.sd, i);
  8381. +       cpu_attach_domain(sd, d.rd, i);
  8382. +   }
  8383. +   rcu_read_unlock();
  8384. +
  8385. +   ret = 0;
  8386. +error:
  8387. +   __free_domain_allocs(&d, alloc_state, cpu_map);
  8388. +   return ret;
  8389. +}
  8390. +
  8391. +static cpumask_var_t *doms_cur;    /* current sched domains */
  8392. +static int ndoms_cur;      /* number of sched domains in 'doms_cur' */
  8393. +static struct sched_domain_attr *dattr_cur;
  8394. +               /* attributes of custom domains in 'doms_cur' */
  8395. +
  8396. +/*
  8397. + * Special case: If a kmalloc of a doms_cur partition (array of
  8398. + * cpumask) fails, then fallback to a single sched domain,
  8399. + * as determined by the single cpumask fallback_doms.
  8400. + */
  8401. +static cpumask_var_t fallback_doms;
  8402. +
  8403. +/*
  8404. + * arch_update_cpu_topology lets virtualized architectures update the
  8405. + * cpu core maps. It is supposed to return 1 if the topology changed
  8406. + * or 0 if it stayed the same.
  8407. + */
  8408. +int __attribute__((weak)) arch_update_cpu_topology(void)
  8409. +{
  8410. +   return 0;
  8411. +}
  8412. +
  8413. +cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
  8414. +{
  8415. +   int i;
  8416. +   cpumask_var_t *doms;
  8417. +
  8418. +   doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
  8419. +   if (!doms)
  8420. +       return NULL;
  8421. +   for (i = 0; i < ndoms; i++) {
  8422. +       if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
  8423. +           free_sched_domains(doms, i);
  8424. +           return NULL;
  8425. +       }
  8426. +   }
  8427. +   return doms;
  8428. +}
  8429. +
  8430. +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
  8431. +{
  8432. +   unsigned int i;
  8433. +   for (i = 0; i < ndoms; i++)
  8434. +       free_cpumask_var(doms[i]);
  8435. +   kfree(doms);
  8436. +}
  8437. +
  8438. +/*
  8439. + * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  8440. + * For now this just excludes isolated cpus, but could be used to
  8441. + * exclude other special cases in the future.
  8442. + */
  8443. +static int init_sched_domains(const struct cpumask *cpu_map)
  8444. +{
  8445. +   int err;
  8446. +
  8447. +   arch_update_cpu_topology();
  8448. +   ndoms_cur = 1;
  8449. +   doms_cur = alloc_sched_domains(ndoms_cur);
  8450. +   if (!doms_cur)
  8451. +       doms_cur = &fallback_doms;
  8452. +   cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
  8453. +   err = build_sched_domains(doms_cur[0], NULL);
  8454. +   register_sched_domain_sysctl();
  8455. +
  8456. +   return err;
  8457. +}
  8458. +
  8459. +/*
  8460. + * Detach sched domains from a group of cpus specified in cpu_map.
  8461. + * These cpus will now be attached to the NULL domain.
  8462. + */
  8463. +static void detach_destroy_domains(const struct cpumask *cpu_map)
  8464. +{
  8465. +   int i;
  8466. +
  8467. +   rcu_read_lock();
  8468. +   for_each_cpu(i, cpu_map)
  8469. +       cpu_attach_domain(NULL, &def_root_domain, i);
  8470. +   rcu_read_unlock();
  8471. +}
  8472. +
  8473. +/* handle null as "default" */
  8474. +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  8475. +           struct sched_domain_attr *new, int idx_new)
  8476. +{
  8477. +   struct sched_domain_attr tmp;
  8478. +
  8479. +   /* fast path */
  8480. +   if (!new && !cur)
  8481. +       return 1;
  8482. +
  8483. +   tmp = SD_ATTR_INIT;
  8484. +   return !memcmp(cur ? (cur + idx_cur) : &tmp,
  8485. +           new ? (new + idx_new) : &tmp,
  8486. +           sizeof(struct sched_domain_attr));
  8487. +}
  8488. +
  8489. +/*
  8490. + * Partition sched domains as specified by the 'ndoms_new'
  8491. + * cpumasks in the array doms_new[] of cpumasks. This compares
  8492. + * doms_new[] to the current sched domain partitioning, doms_cur[].
  8493. + * It destroys each deleted domain and builds each new domain.
  8494. + *
  8495. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
  8496. + * The masks don't intersect (don't overlap.) We should setup one
  8497. + * sched domain for each mask. CPUs not in any of the cpumasks will
  8498. + * not be load balanced. If the same cpumask appears both in the
  8499. + * current 'doms_cur' domains and in the new 'doms_new', we can leave
  8500. + * it as it is.
  8501. + *
  8502. + * The passed in 'doms_new' should be allocated using
  8503. + * alloc_sched_domains.  This routine takes ownership of it and will
  8504. + * free_sched_domains it when done with it. If the caller failed the
  8505. + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
  8506. + * and partition_sched_domains() will fallback to the single partition
  8507. + * 'fallback_doms', it also forces the domains to be rebuilt.
  8508. + *
  8509. + * If doms_new == NULL it will be replaced with cpu_online_mask.
  8510. + * ndoms_new == 0 is a special case for destroying existing domains,
  8511. + * and it will not create the default domain.
  8512. + *
  8513. + * Call with hotplug lock held
  8514. + */
  8515. +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  8516. +                struct sched_domain_attr *dattr_new)
  8517. +{
  8518. +   int i, j, n;
  8519. +   int new_topology;
  8520. +
  8521. +   mutex_lock(&sched_domains_mutex);
  8522. +
  8523. +   /* always unregister in case we don't destroy any domains */
  8524. +   unregister_sched_domain_sysctl();
  8525. +
  8526. +   /* Let architecture update cpu core mappings. */
  8527. +   new_topology = arch_update_cpu_topology();
  8528. +
  8529. +   n = doms_new ? ndoms_new : 0;
  8530. +
  8531. +   /* Destroy deleted domains */
  8532. +   for (i = 0; i < ndoms_cur; i++) {
  8533. +       for (j = 0; j < n && !new_topology; j++) {
  8534. +           if (cpumask_equal(doms_cur[i], doms_new[j])
  8535. +               && dattrs_equal(dattr_cur, i, dattr_new, j))
  8536. +               goto match1;
  8537. +       }
  8538. +       /* no match - a current sched domain not in new doms_new[] */
  8539. +       detach_destroy_domains(doms_cur[i]);
  8540. +match1:
  8541. +       ;
  8542. +   }
  8543. +
  8544. +   if (doms_new == NULL) {
  8545. +       ndoms_cur = 0;
  8546. +       doms_new = &fallback_doms;
  8547. +       cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
  8548. +       WARN_ON_ONCE(dattr_new);
  8549. +   }
  8550. +
  8551. +   /* Build new domains */
  8552. +   for (i = 0; i < ndoms_new; i++) {
  8553. +       for (j = 0; j < ndoms_cur && !new_topology; j++) {
  8554. +           if (cpumask_equal(doms_new[i], doms_cur[j])
  8555. +               && dattrs_equal(dattr_new, i, dattr_cur, j))
  8556. +               goto match2;
  8557. +       }
  8558. +       /* no match - add a new doms_new */
  8559. +       build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
  8560. +match2:
  8561. +       ;
  8562. +   }
  8563. +
  8564. +   /* Remember the new sched domains */
  8565. +   if (doms_cur != &fallback_doms)
  8566. +       free_sched_domains(doms_cur, ndoms_cur);
  8567. +   kfree(dattr_cur);   /* kfree(NULL) is safe */
  8568. +   doms_cur = doms_new;
  8569. +   dattr_cur = dattr_new;
  8570. +   ndoms_cur = ndoms_new;
  8571. +
  8572. +   register_sched_domain_sysctl();
  8573. +
  8574. +   mutex_unlock(&sched_domains_mutex);
  8575. +}
  8576. +
  8577. +/*
  8578. + * Update cpusets according to cpu_active mask.  If cpusets are
  8579. + * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  8580. + * around partition_sched_domains().
  8581. + */
  8582. +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
  8583. +                void *hcpu)
  8584. +{
  8585. +   switch (action & ~CPU_TASKS_FROZEN) {
  8586. +   case CPU_ONLINE:
  8587. +   case CPU_DOWN_FAILED:
  8588. +       cpuset_update_active_cpus();
  8589. +       return NOTIFY_OK;
  8590. +   default:
  8591. +       return NOTIFY_DONE;
  8592. +   }
  8593. +}
  8594. +
  8595. +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
  8596. +                  void *hcpu)
  8597. +{
  8598. +   switch (action & ~CPU_TASKS_FROZEN) {
  8599. +   case CPU_DOWN_PREPARE:
  8600. +       cpuset_update_active_cpus();
  8601. +       return NOTIFY_OK;
  8602. +   default:
  8603. +       return NOTIFY_DONE;
  8604. +   }
  8605. +}
  8606. +
  8607. +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
  8608. +/*
  8609. + * Cheaper version of the below functions in case support for SMT and MC is
  8610. + * compiled in but CPUs have no siblings.
  8611. + */
  8612. +static bool sole_cpu_idle(int cpu)
  8613. +{
  8614. +   return rq_idle(cpu_rq(cpu));
  8615. +}
  8616. +#endif
  8617. +#ifdef CONFIG_SCHED_SMT
  8618. +/* All this CPU's SMT siblings are idle */
  8619. +static bool siblings_cpu_idle(int cpu)
  8620. +{
  8621. +   return cpumask_subset(&(cpu_rq(cpu)->smt_siblings),
  8622. +                 &grq.cpu_idle_map);
  8623. +}
  8624. +#endif
  8625. +#ifdef CONFIG_SCHED_MC
  8626. +/* All this CPU's shared cache siblings are idle */
  8627. +static bool cache_cpu_idle(int cpu)
  8628. +{
  8629. +   return cpumask_subset(&(cpu_rq(cpu)->cache_siblings),
  8630. +                 &grq.cpu_idle_map);
  8631. +}
  8632. +#endif
  8633. +
  8634. +enum sched_domain_level {
  8635. +   SD_LV_NONE = 0,
  8636. +   SD_LV_SIBLING,
  8637. +   SD_LV_MC,
  8638. +   SD_LV_BOOK,
  8639. +   SD_LV_CPU,
  8640. +   SD_LV_NODE,
  8641. +   SD_LV_ALLNODES,
  8642. +   SD_LV_MAX
  8643. +};
  8644. +
  8645. +void __init sched_init_smp(void)
  8646. +{
  8647. +   struct sched_domain *sd;
  8648. +   int cpu;
  8649. +
  8650. +   cpumask_var_t non_isolated_cpus;
  8651. +
  8652. +   alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
  8653. +   alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  8654. +
  8655. +   sched_init_numa();
  8656. +
  8657. +   get_online_cpus();
  8658. +   mutex_lock(&sched_domains_mutex);
  8659. +   init_sched_domains(cpu_active_mask);
  8660. +   cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
  8661. +   if (cpumask_empty(non_isolated_cpus))
  8662. +       cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
  8663. +   mutex_unlock(&sched_domains_mutex);
  8664. +   put_online_cpus();
  8665. +
  8666. +   hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
  8667. +   hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
  8668. +
  8669. +   /* Move init over to a non-isolated CPU */
  8670. +   if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
  8671. +       BUG();
  8672. +   free_cpumask_var(non_isolated_cpus);
  8673. +
  8674. +   grq_wlock_irq();
  8675. +   /*
  8676. +    * Set up the relative cache distance of each online cpu from each
  8677. +    * other in a simple array for quick lookup. Locality is determined
  8678. +    * by the closest sched_domain that CPUs are separated by. CPUs with
  8679. +    * shared cache in SMT and MC are treated as local. Separate CPUs
  8680. +    * (within the same package or physically) within the same node are
  8681. +    * treated as not local. CPUs not even in the same domain (different
  8682. +    * nodes) are treated as very distant.
  8683. +    */
  8684. +   for_each_online_cpu(cpu) {
  8685. +       struct rq *rq = cpu_rq(cpu);
  8686. +       for_each_domain(cpu, sd) {
  8687. +           int locality, other_cpu;
  8688. +
  8689. +#ifdef CONFIG_SCHED_SMT
  8690. +           if (sd->level == SD_LV_SIBLING) {
  8691. +               for_each_cpu_mask(other_cpu, *sched_domain_span(sd))
  8692. +                   cpumask_set_cpu(other_cpu, &rq->smt_siblings);
  8693. +           }
  8694. +#endif
  8695. +#ifdef CONFIG_SCHED_MC
  8696. +           if (sd->level == SD_LV_MC) {
  8697. +               for_each_cpu_mask(other_cpu, *sched_domain_span(sd))
  8698. +                   cpumask_set_cpu(other_cpu, &rq->cache_siblings);
  8699. +           }
  8700. +#endif
  8701. +           if (sd->level <= SD_LV_SIBLING)
  8702. +               locality = 1;
  8703. +           else if (sd->level <= SD_LV_MC)
  8704. +               locality = 2;
  8705. +           else if (sd->level <= SD_LV_NODE)
  8706. +               locality = 3;
  8707. +           else
  8708. +               continue;
  8709. +
  8710. +           for_each_cpu_mask(other_cpu, *sched_domain_span(sd)) {
  8711. +               if (locality < rq->cpu_locality[other_cpu])
  8712. +                   rq->cpu_locality[other_cpu] = locality;
  8713. +           }
  8714. +       }
  8715. +
  8716. +/*
  8717. +        * Each runqueue has its own function in case it doesn't have
  8718. +        * siblings of its own allowing mixed topologies.
  8719. +        */
  8720. +#ifdef CONFIG_SCHED_SMT
  8721. +       if (cpus_weight(rq->smt_siblings) > 1)
  8722. +           rq->siblings_idle = siblings_cpu_idle;
  8723. +#endif
  8724. +#ifdef CONFIG_SCHED_MC
  8725. +       if (cpus_weight(rq->cache_siblings) > 1)
  8726. +           rq->cache_idle = cache_cpu_idle;
  8727. +#endif
  8728. +   }
  8729. +   grq_wunlock_irq();
  8730. +}
  8731. +#else
  8732. +void __init sched_init_smp(void)
  8733. +{
  8734. +}
  8735. +#endif /* CONFIG_SMP */
  8736. +
  8737. +unsigned int sysctl_timer_migration = 1;
  8738. +
  8739. +int in_sched_functions(unsigned long addr)
  8740. +{
  8741. +   return in_lock_functions(addr) ||
  8742. +       (addr >= (unsigned long)__sched_text_start
  8743. +       && addr < (unsigned long)__sched_text_end);
  8744. +}
  8745. +
  8746. +void __init sched_init(void)
  8747. +{
  8748. +   int i;
  8749. +   struct rq *rq;
  8750. +
  8751. +   prio_ratios[0] = 128;
  8752. +   for (i = 1 ; i < PRIO_RANGE ; i++)
  8753. +       prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
  8754. +
  8755. +   urwlock_init(&grq.urw);
  8756. +   grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
  8757. +   grq.niffies = 0;
  8758. +   grq.last_jiffy = jiffies;
  8759. +   raw_spin_lock_init(&grq.iso_lock);
  8760. +   grq.iso_ticks = 0;
  8761. +   grq.iso_refractory = false;
  8762. +   grq.noc = 1;
  8763. +#ifdef CONFIG_SMP
  8764. +   init_defrootdomain();
  8765. +   grq.qnr = grq.idle_cpus = 0;
  8766. +   cpumask_clear(&grq.cpu_idle_map);
  8767. +#else
  8768. +   uprq = &per_cpu(runqueues, 0);
  8769. +#endif
  8770. +   for_each_possible_cpu(i) {
  8771. +       rq = cpu_rq(i);
  8772. +       rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
  8773. +                 rq->iowait_pc = rq->idle_pc = 0;
  8774. +       rq->dither = false;
  8775. +#ifdef CONFIG_SMP
  8776. +       rq->sticky_task = NULL;
  8777. +       rq->last_niffy = 0;
  8778. +       rq->sd = NULL;
  8779. +       rq->rd = NULL;
  8780. +       rq->online = false;
  8781. +       rq->cpu = i;
  8782. +       rq_attach_root(rq, &def_root_domain);
  8783. +#endif
  8784. +       atomic_set(&rq->nr_iowait, 0);
  8785. +   }
  8786. +
  8787. +#ifdef CONFIG_SMP
  8788. +   nr_cpu_ids = i;
  8789. +   /*
  8790. +    * Set the base locality for cpu cache distance calculation to
  8791. +    * "very distant" (4). Make sure the distance from a CPU to itself is 0.
  8792. +    */
  8793. +   for_each_possible_cpu(i) {
  8794. +       int j;
  8795. +
  8796. +       rq = cpu_rq(i);
  8797. +#ifdef CONFIG_SCHED_SMT
  8798. +       cpumask_clear(&rq->smt_siblings);
  8799. +       cpumask_set_cpu(i, &rq->smt_siblings);
  8800. +       rq->siblings_idle = sole_cpu_idle;
  8801. +#endif
  8802. +#ifdef CONFIG_SCHED_MC
  8803. +       cpumask_clear(&rq->cache_siblings);
  8804. +       cpumask_set_cpu(i, &rq->cache_siblings);
  8805. +       rq->cache_idle = sole_cpu_idle;
  8806. +#endif
  8807. +       /* Size by the element type, not by a pointer to it. */
  8808. +       rq->cpu_locality = kmalloc(nr_cpu_ids * sizeof(*rq->cpu_locality),
  8809. +                      GFP_ATOMIC);
  8810. +       for_each_possible_cpu(j) {
  8811. +           if (i == j)
  8812. +               rq->cpu_locality[j] = 0;
  8813. +           else
  8814. +               rq->cpu_locality[j] = 4;
  8815. +       }
  8816. +   }
  8817. +#endif
  8818. +
  8819. +   for (i = 0; i < PRIO_LIMIT; i++)
  8820. +       INIT_LIST_HEAD(grq.queue + i);
  8821. +   /* delimiter for bitsearch */
  8822. +   __set_bit(PRIO_LIMIT, grq.prio_bitmap);
  8823. +
  8824. +#ifdef CONFIG_PREEMPT_NOTIFIERS
  8825. +   INIT_HLIST_HEAD(&init_task.preempt_notifiers);
  8826. +#endif
  8827. +
  8828. +#ifdef CONFIG_RT_MUTEXES
  8829. +   plist_head_init(&init_task.pi_waiters);
  8830. +#endif
  8831. +
  8832. +   /*
  8833. +    * The boot idle thread does lazy MMU switching as well:
  8834. +    */
  8835. +   atomic_inc(&init_mm.mm_count);
  8836. +   enter_lazy_tlb(&init_mm, current);
  8837. +
  8838. +   /*
  8839. +    * Make us the idle thread. Technically, schedule() should not be
  8840. +    * called from this thread, however somewhere below it might be,
  8841. +    * but because we are the idle thread, we just pick up running again
  8842. +    * when this runqueue becomes "idle".
  8843. +    */
  8844. +   init_idle(current, smp_processor_id());
  8845. +
  8846. +#ifdef CONFIG_SMP
  8847. +   zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
  8848. +   /* May be allocated at isolcpus cmdline parse time */
  8849. +   if (cpu_isolated_map == NULL)
  8850. +       zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
  8851. +   idle_thread_set_boot_cpu();
  8852. +#endif /* SMP */
  8853. +}
  8854. +
  8855. +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  8856. +static inline int preempt_count_equals(int preempt_offset)
  8857. +{
  8858. +   int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
  8859. +
  8860. +   return (nested == preempt_offset);
  8861. +}
  8862. +
  8863. +void __might_sleep(const char *file, int line, int preempt_offset)
  8864. +{
  8865. +   static unsigned long prev_jiffy;    /* ratelimiting */
  8866. +
  8867. +   rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
  8868. +   if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
  8869. +       system_state != SYSTEM_RUNNING || oops_in_progress)
  8870. +       return;
  8871. +   if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
  8872. +       return;
  8873. +   prev_jiffy = jiffies;
  8874. +
  8875. +   printk(KERN_ERR
  8876. +       "BUG: sleeping function called from invalid context at %s:%d\n",
  8877. +           file, line);
  8878. +   printk(KERN_ERR
  8879. +       "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
  8880. +           in_atomic(), irqs_disabled(),
  8881. +           current->pid, current->comm);
  8882. +
  8883. +   debug_show_held_locks(current);
  8884. +   if (irqs_disabled())
  8885. +       print_irqtrace_events(current);
  8886. +   dump_stack();
  8887. +}
  8888. +EXPORT_SYMBOL(__might_sleep);
  8889. +#endif
  8890. +
  8891. +#ifdef CONFIG_MAGIC_SYSRQ
  8892. +void normalize_rt_tasks(void)
  8893. +{
  8894. +   struct task_struct *g, *p;
  8895. +   unsigned long flags;
  8896. +   struct rq *rq;
  8897. +   int queued;
  8898. +
  8899. +   read_lock_irq(&tasklist_lock);
  8900. +
  8901. +   do_each_thread(g, p) {
  8902. +       if (!rt_task(p) && !iso_task(p))
  8903. +           continue;
  8904. +
  8905. +       raw_spin_lock_irqsave(&p->pi_lock, flags);
  8906. +       rq = __task_grq_wlock(p);
  8907. +
  8908. +       queued = task_queued(p);
  8909. +       if (queued)
  8910. +           dequeue_task(p);
  8911. +       __setscheduler(p, rq, SCHED_NORMAL, 0);
  8912. +       if (queued) {
  8913. +           enqueue_task(p);
  8914. +           try_preempt(p, rq);
  8915. +       }
  8916. +
  8917. +       __task_grq_wunlock();
  8918. +       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  8919. +   } while_each_thread(g, p);
  8920. +
  8921. +   read_unlock_irq(&tasklist_lock);
  8922. +}
  8923. +#endif /* CONFIG_MAGIC_SYSRQ */
  8924. +
  8925. +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
  8926. +/*
  8927. + * These functions are only useful for the IA64 MCA handling, or kdb.
  8928. + *
  8929. + * They can only be called when the whole system has been
  8930. + * stopped - every CPU needs to be quiescent, and no scheduling
  8931. + * activity can take place. Using them for anything else would
  8932. + * be a serious bug, and as a result, they aren't even visible
  8933. + * under any other configuration.
  8934. + */
  8935. +
  8936. +/**
  8937. + * curr_task - return the current task for a given cpu.
  8938. + * @cpu: the processor in question.
  8939. + *
  8940. + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
  8941. + */
  8942. +struct task_struct *curr_task(int cpu)
  8943. +{
  8944. +   return cpu_curr(cpu);
  8945. +}
  8946. +
  8947. +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
  8948. +
  8949. +#ifdef CONFIG_IA64
  8950. +/**
  8951. + * set_curr_task - set the current task for a given cpu.
  8952. + * @cpu: the processor in question.
  8953. + * @p: the task pointer to set.
  8954. + *
  8955. + * Description: This function must only be used when non-maskable interrupts
  8956. + * are serviced on a separate stack.  It allows the architecture to switch the
  8957. + * notion of the current task on a cpu in a non-blocking manner.  This function
  8958. + * must be called with all CPUs synchronised, and interrupts disabled, and
  8959. + * the caller must save the original value of the current task (see
  8960. + * curr_task() above) and restore that value before reenabling interrupts and
  8961. + * re-starting the system.
  8962. + *
  8963. + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
  8964. + */
  8965. +void set_curr_task(int cpu, struct task_struct *p)
  8966. +{
  8967. +   cpu_curr(cpu) = p;
  8968. +}
  8969. +
  8970. +#endif
  8971. +
  8972. +/*
  8973. + * Use precise platform statistics if available:
  8974. + */
  8975. +#ifdef CONFIG_VIRT_CPU_ACCOUNTING
  8976. +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
  8977. +{
  8978. +   *ut = p->utime;
  8979. +   *st = p->stime;
  8980. +}
  8981. +
  8982. +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
  8983. +{
  8984. +   struct task_cputime cputime;
  8985. +
  8986. +   thread_group_cputime(p, &cputime);
  8987. +
  8988. +   *ut = cputime.utime;
  8989. +   *st = cputime.stime;
  8990. +}
  8991. +#else
  8992. +
  8993. +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
  8994. +{
  8995. +   cputime_t rtime, utime = p->utime, total = utime + p->stime;
  8996. +
  8997. +   rtime = nsecs_to_cputime(p->sched_time);
  8998. +
  8999. +   if (total) {
  9000. +       u64 temp = (u64)rtime;  /* widen first: 32-bit cputime_t wraps */
  9001. +
  9002. +       temp *= (u64)utime;
  9003. +       do_div(temp, total);
  9004. +       utime = (cputime_t)temp;
  9005. +   } else
  9006. +       utime = rtime;
  9007. +
  9008. +   /*
  9009. +    * Compare with previous values, to keep monotonicity:
  9010. +    */
  9011. +   p->prev_utime = max(p->prev_utime, utime);
  9012. +   p->prev_stime = max(p->prev_stime, (rtime - p->prev_utime));
  9013. +
  9014. +   *ut = p->prev_utime;
  9015. +   *st = p->prev_stime;
  9016. +}
  9017. +
  9018. +/*
  9019. + * Must be called with siglock held.
  9020. + */
  9021. +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
  9022. +{
  9023. +   struct signal_struct *sig = p->signal;
  9024. +   struct task_cputime cputime;
  9025. +   cputime_t rtime, utime, total;
  9026. +
  9027. +   thread_group_cputime(p, &cputime);
  9028. +
  9029. +   total = cputime.utime + cputime.stime;
  9030. +   rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
  9031. +
  9032. +   if (total) {
  9033. +       u64 temp = (u64)rtime;  /* widen first: 32-bit cputime_t wraps */
  9034. +
  9035. +       temp *= (u64)cputime.utime;
  9036. +       do_div(temp, total);
  9037. +       utime = (cputime_t)temp;
  9038. +   } else
  9039. +       utime = rtime;
  9040. +
  9041. +   sig->prev_utime = max(sig->prev_utime, utime);
  9042. +   sig->prev_stime = max(sig->prev_stime, (rtime - sig->prev_utime));
  9043. +
  9044. +   *ut = sig->prev_utime;
  9045. +   *st = sig->prev_stime;
  9046. +}
  9047. +#endif
  9048. +
  9049. +inline cputime_t task_gtime(struct task_struct *p)
  9050. +{
  9051. +   return p->gtime;
  9052. +}
  9053. +
  9054. +void __cpuinit init_idle_bootup_task(struct task_struct *idle)
  9055. +{}
  9056. +
  9057. +#ifdef CONFIG_SCHED_DEBUG
  9058. +void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
  9059. +{}
  9060. +
  9061. +void proc_sched_set_task(struct task_struct *p)
  9062. +{}
  9063. +#endif
  9064. +
  9065. +#ifdef CONFIG_SMP
  9066. +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
  9067. +{
  9068. +   return SCHED_LOAD_SCALE;
  9069. +}
  9070. +
  9071. +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
  9072. +{
  9073. +   unsigned long weight = cpumask_weight(sched_domain_span(sd));
  9074. +   unsigned long smt_gain = sd->smt_gain;
  9075. +
  9076. +   smt_gain /= weight;
  9077. +
  9078. +   return smt_gain;
  9079. +}
  9080. +#endif
  9081. diff -ruNb a/kernel/sched/Makefile b/kernel/sched/Makefile
  9082. --- a/kernel/sched/Makefile 2012-10-12 21:48:25.000000000 +0100
  9083. +++ b/kernel/sched/Makefile 2012-10-21 16:28:24.298668760 +0100
  9084. @@ -11,8 +11,12 @@
  9085.  CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
  9086.  endif
  9087.  
  9088. +ifdef CONFIG_SCHED_BFS
  9089. +obj-y += bfs.o clock.o
  9090. +else
  9091.  obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
  9092. -obj-$(CONFIG_SMP) += cpupri.o
  9093.  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
  9094. -obj-$(CONFIG_SCHEDSTATS) += stats.o
  9095.  obj-$(CONFIG_SCHED_DEBUG) += debug.o
  9096. +endif
  9097. +obj-$(CONFIG_SMP) += cpupri.o
  9098. +obj-$(CONFIG_SCHEDSTATS) += stats.o
  9099. diff -ruNb a/kernel/sysctl.c b/kernel/sysctl.c
  9100. --- a/kernel/sysctl.c   2012-10-12 21:48:25.000000000 +0100
  9101. +++ b/kernel/sysctl.c   2012-10-21 16:28:24.299668568 +0100
  9102. @@ -126,7 +126,12 @@
  9103.  static int __maybe_unused two = 2;
  9104.  static int __maybe_unused three = 3;
  9105.  static unsigned long one_ul = 1;
  9106. -static int one_hundred = 100;
  9107. +static int __maybe_unused one_hundred = 100;
  9108. +#ifdef CONFIG_SCHED_BFS
  9109. +extern int rr_interval;
  9110. +extern int sched_iso_cpu;
  9111. +static int __read_mostly one_thousand = 1000;
  9112. +#endif
  9113.  #ifdef CONFIG_PRINTK
  9114.  static int ten_thousand = 10000;
  9115.  #endif
  9116. @@ -241,7 +246,7 @@
  9117.     { }
  9118.  };
  9119.  
  9120. -#ifdef CONFIG_SCHED_DEBUG
  9121. +#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BFS)
  9122.  static int min_sched_granularity_ns = 100000;      /* 100 usecs */
  9123.  static int max_sched_granularity_ns = NSEC_PER_SEC;    /* 1 second */
  9124.  static int min_wakeup_granularity_ns;          /* 0 usecs */
  9125. @@ -256,6 +261,7 @@
  9126.  #endif
  9127.  
  9128.  static struct ctl_table kern_table[] = {
  9129. +#ifndef CONFIG_SCHED_BFS
  9130.     {
  9131.         .procname   = "sched_child_runs_first",
  9132.         .data       = &sysctl_sched_child_runs_first,
  9133. @@ -373,6 +379,7 @@
  9134.         .extra1     = &one,
  9135.     },
  9136.  #endif
  9137. +#endif /* !CONFIG_SCHED_BFS */
  9138.  #ifdef CONFIG_PROVE_LOCKING
  9139.     {
  9140.         .procname   = "prove_locking",
  9141. @@ -840,6 +847,26 @@
  9142.         .proc_handler   = proc_dointvec,
  9143.     },
  9144.  #endif
  9145. +#ifdef CONFIG_SCHED_BFS
  9146. +   {
  9147. +       .procname   = "rr_interval",
  9148. +       .data       = &rr_interval,
  9149. +       .maxlen     = sizeof (int),
  9150. +       .mode       = 0644,
  9151. +       .proc_handler   = &proc_dointvec_minmax,
  9152. +       .extra1     = &one,
  9153. +       .extra2     = &one_thousand,
  9154. +   },
  9155. +   {
  9156. +       .procname   = "iso_cpu",
  9157. +       .data       = &sched_iso_cpu,
  9158. +       .maxlen     = sizeof (int),
  9159. +       .mode       = 0644,
  9160. +       .proc_handler   = &proc_dointvec_minmax,
  9161. +       .extra1     = &zero,
  9162. +       .extra2     = &one_hundred,
  9163. +   },
  9164. +#endif
  9165.  #if defined(CONFIG_S390) && defined(CONFIG_SMP)
  9166.     {
  9167.         .procname   = "spin_retry",
  9168. diff -ruNb a/lib/Kconfig.debug b/lib/Kconfig.debug
  9169. --- a/lib/Kconfig.debug 2012-10-12 21:48:25.000000000 +0100
  9170. +++ b/lib/Kconfig.debug 2012-10-21 16:28:24.299668568 +0100
  9171. @@ -913,7 +913,7 @@
  9172.  
  9173.  config RCU_TORTURE_TEST
  9174.     tristate "torture tests for RCU"
  9175. -   depends on DEBUG_KERNEL
  9176. +   depends on DEBUG_KERNEL && !SCHED_BFS
  9177.     default n
  9178.     help
  9179.       This option provides a kernel module that runs torture tests
  9180. diff -ruNb a/Makefile b/Makefile
  9181. --- a/Makefile  2012-10-12 21:48:25.000000000 +0100
  9182. +++ b/Makefile  2012-10-21 16:28:24.327663196 +0100
  9183. @@ -10,6 +10,10 @@
  9184.  # Comments in this file are targeted only to the developer, do not
  9185.  # expect to learn how to build the kernel reading this file.
  9186.  
  9187. +CKVERSION = -ck1
  9188. +CKNAME = BFS Powered
  9189. +EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION)
  9190. +
  9191.  # Do not:
  9192.  # o  use make's built-in rules and variables
  9193.  #    (this increases performance and avoids hard-to-debug behaviour);
  9194. diff -ruNb a/mm/memory.c b/mm/memory.c
  9195. --- a/mm/memory.c   2012-10-12 21:48:25.000000000 +0100
  9196. +++ b/mm/memory.c   2012-10-21 16:28:24.305667416 +0100
  9197. @@ -3020,7 +3020,7 @@
  9198.     mem_cgroup_commit_charge_swapin(page, ptr);
  9199.  
  9200.     swap_free(entry);
  9201. -   if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
  9202. +   if ((vma->vm_flags & VM_LOCKED) || PageMlocked(page))
  9203.         try_to_free_swap(page);
  9204.     unlock_page(page);
  9205.     if (swapcache) {
  9206. diff -ruNb a/mm/page-writeback.c b/mm/page-writeback.c
  9207. --- a/mm/page-writeback.c   2012-10-12 21:48:25.000000000 +0100
  9208. +++ b/mm/page-writeback.c   2012-10-21 16:28:24.314665690 +0100
  9209. @@ -65,7 +65,7 @@
  9210.  /*
  9211.   * Start background writeback (via writeback threads) at this percentage
  9212.   */
  9213. -int dirty_background_ratio = 10;
  9214. +int dirty_background_ratio = 1;
  9215.  
  9216.  /*
  9217.   * dirty_background_bytes starts at 0 (disabled) so that it is a function of
  9218. @@ -82,7 +82,7 @@
  9219.  /*
  9220.   * The generator of dirty data starts writeback at this percentage
  9221.   */
  9222. -int vm_dirty_ratio = 20;
  9223. +int vm_dirty_ratio = 1;
  9224.  
  9225.  /*
  9226.   * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
  9227. diff -ruNb a/mm/swapfile.c b/mm/swapfile.c
  9228. --- a/mm/swapfile.c 2012-10-12 21:48:25.000000000 +0100
  9229. +++ b/mm/swapfile.c 2012-10-21 16:28:24.306667225 +0100
  9230. @@ -290,7 +290,7 @@
  9231.         scan_base = offset = si->lowest_bit;
  9232.  
  9233.     /* reuse swap entry of cache-only swap if not busy. */
  9234. -   if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
  9235. +   if (si->swap_map[offset] == SWAP_HAS_CACHE) {
  9236.         int swap_was_freed;
  9237.         spin_unlock(&swap_lock);
  9238.         swap_was_freed = __try_to_reclaim_swap(si, offset);
  9239. @@ -379,7 +379,7 @@
  9240.             spin_lock(&swap_lock);
  9241.             goto checks;
  9242.         }
  9243. -       if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
  9244. +       if (si->swap_map[offset] == SWAP_HAS_CACHE) {
  9245.             spin_lock(&swap_lock);
  9246.             goto checks;
  9247.         }
  9248. @@ -394,7 +394,7 @@
  9249.             spin_lock(&swap_lock);
  9250.             goto checks;
  9251.         }
  9252. -       if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
  9253. +       if (si->swap_map[offset] == SWAP_HAS_CACHE) {
  9254.             spin_lock(&swap_lock);
  9255.             goto checks;
  9256.         }
  9257. @@ -709,8 +709,7 @@
  9258.          * Not mapped elsewhere, or swap space full? Free it!
  9259.          * Also recheck PageSwapCache now page is locked (above).
  9260.          */
  9261. -       if (PageSwapCache(page) && !PageWriteback(page) &&
  9262. -               (!page_mapped(page) || vm_swap_full())) {
  9263. +       if (PageSwapCache(page) && !PageWriteback(page)) {
  9264.             delete_from_swap_cache(page);
  9265.             SetPageDirty(page);
  9266.         }
  9267. diff -ruNb a/mm/vmscan.c b/mm/vmscan.c
  9268. --- a/mm/vmscan.c   2012-10-12 21:48:25.000000000 +0100
  9269. +++ b/mm/vmscan.c   2012-10-21 16:28:24.312666074 +0100
  9270. @@ -127,7 +127,7 @@
  9271.  /*
  9272.   * From 0 .. 100.  Higher means more swappy.
  9273.   */
  9274. -int vm_swappiness = 60;
  9275. +int vm_swappiness = 10;
  9276.  long vm_total_pages;   /* The total number of pages which the VM controls */
  9277.  
  9278.  static LIST_HEAD(shrinker_list);
  9279. @@ -928,7 +928,7 @@
  9280.  
  9281.  activate_locked:
  9282.         /* Not a candidate for swapping, so reclaim swap space. */
  9283. -       if (PageSwapCache(page) && vm_swap_full())
  9284. +       if (PageSwapCache(page))
  9285.             try_to_free_swap(page);
  9286.         VM_BUG_ON(PageActive(page));
  9287.         SetPageActive(page);
  9288. @@ -1921,6 +1921,35 @@
  9289.  }
  9290.  
  9291.  /*
  9292. + * Helper functions to adjust nice level of kswapd, based on the priority of
  9293. + * the task (p) that called it. If it is already higher priority we do not
  9294. + * demote its nice level since it is still working on behalf of a higher
  9295. + * priority task. With kernel threads we leave it at nice 0.
  9296. + *
  9297. + * We don't ever run kswapd real time, so if a real time task calls kswapd we
  9298. + * set it to highest SCHED_NORMAL priority.
  9299. + */
  9300. +static inline int effective_sc_prio(struct task_struct *p)
  9301. +{
  9302. +   if (likely(p->mm)) {
  9303. +       if (rt_task(p))
  9304. +           return -20;
  9305. +       if (p->policy == SCHED_IDLEPRIO)
  9306. +           return 19;
  9307. +       return task_nice(p);
  9308. +   }
  9309. +   return 0;
  9310. +}
  9311. +
  9312. +static void set_kswapd_nice(struct task_struct *kswapd, int active)
  9313. +{
  9314. +   long nice = effective_sc_prio(current);
  9315. +
  9316. +   if (task_nice(kswapd) > nice || !active)
  9317. +       set_user_nice(kswapd, nice);
  9318. +}
  9319. +
  9320. +/*
  9321.   * This is the direct reclaim path, for page-allocating processes.  We only
  9322.   * try to reclaim pages from zones which will satisfy the caller's allocation
  9323.   * request.
  9324. @@ -2844,6 +2873,7 @@
  9325.  void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
  9326.  {
  9327.     pg_data_t *pgdat;
  9328. +   int active;
  9329.  
  9330.     if (!populated_zone(zone))
  9331.         return;
  9332. @@ -2855,7 +2885,9 @@
  9333.         pgdat->kswapd_max_order = order;
  9334.         pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
  9335.     }
  9336. -   if (!waitqueue_active(&pgdat->kswapd_wait))
  9337. +   active = waitqueue_active(&pgdat->kswapd_wait);
  9338. +   set_kswapd_nice(pgdat->kswapd, active);
  9339. +   if (!active)
  9340.         return;
  9341.     if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
  9342.         return;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement