2.6.37_zcache_V2.patch

a guest
Feb 13th, 2011
594
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 215.49 KB | None | 0 0
  1. diff -Nrupad linux-2.6.37//arch/x86/kvm/vmx.c linux-2.6.37_vanilla//arch/x86/kvm/vmx.c
  2. --- linux-2.6.37//arch/x86/kvm/vmx.c 2011-01-05 01:50:19.000000000 +0100
  3. +++ linux-2.6.37_vanilla//arch/x86/kvm/vmx.c 2011-02-14 01:20:15.814793213 +0100
  4. @@ -563,7 +563,7 @@ static inline void ept_sync_individual_a
  5. }
  6. }
  7.  
  8. -static unsigned long vmcs_readl(unsigned long field)
  9. +static noinline unsigned long vmcs_readl(unsigned long field)
  10. {
  11. unsigned long value;
  12.  
  13. diff -Nrupad linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-cleancache linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-cleancache
  14. --- linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-cleancache 1970-01-01 01:00:00.000000000 +0100
  15. +++ linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-cleancache 2011-02-14 01:21:43.156792902 +0100
  16. @@ -0,0 +1,11 @@
  17. +What: /sys/kernel/mm/cleancache/
  18. +Date: June 2010
  19. +Contact: Dan Magenheimer <dan.magenheimer@oracle.com>
  20. +Description:
  21. + /sys/kernel/mm/cleancache/ contains a number of files which
  22. + record a count of various cleancache operations
  23. + (sum across all filesystems):
  24. + succ_gets
  25. + failed_gets
  26. + puts
  27. + flushes
  28. diff -Nrupad linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-frontswap linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-frontswap
  29. --- linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-frontswap 1970-01-01 01:00:00.000000000 +0100
  30. +++ linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-frontswap 2011-02-14 01:21:43.156792902 +0100
  31. @@ -0,0 +1,16 @@
  32. +What: /sys/kernel/mm/frontswap/
  33. +Date: June 2010
  34. +Contact: Dan Magenheimer <dan.magenheimer@oracle.com>
  35. +Description:
  36. + /sys/kernel/mm/frontswap/ contains a number of files which
  37. + record a count of various frontswap operations (sum across
  38. + all swap devices):
  39. + succ_puts
  40. + failed_puts
  41. + gets
  42. + flushes
  43. + In addition, reading the curr_pages file shows how many
  44. + pages are currently contained in frontswap and writing this
  45. + file with an integer performs a "partial swapoff", reducing
  46. + the number of frontswap pages to that integer if memory
  47. + constraints permit.
  48. diff -Nrupad linux-2.6.37//Documentation/vm/cleancache.txt linux-2.6.37_vanilla//Documentation/vm/cleancache.txt
  49. --- linux-2.6.37//Documentation/vm/cleancache.txt 1970-01-01 01:00:00.000000000 +0100
  50. +++ linux-2.6.37_vanilla//Documentation/vm/cleancache.txt 2011-02-14 01:21:43.157792932 +0100
  51. @@ -0,0 +1,267 @@
  52. +MOTIVATION
  53. +
  54. +Cleancache is a new optional feature provided by the VFS layer that
  55. +potentially dramatically increases page cache effectiveness for
  56. +many workloads in many environments at a negligible cost.
  57. +
  58. +Cleancache can be thought of as a page-granularity victim cache for clean
  59. +pages that the kernel's pageframe replacement algorithm (PFRA) would like
  60. +to keep around, but can't since there isn't enough memory. So when the
  61. +PFRA "evicts" a page, it first attempts to put it into a synchronous
  62. +concurrency-safe page-oriented "pseudo-RAM" device (such as Xen's
  63. +Transcendent Memory, aka "tmem", or in-kernel compressed memory, aka "zmem",
  64. +or other RAM-like devices) which is not directly accessible or addressable
  65. +by the kernel and is of unknown and possibly time-varying size. And when a
  66. +cleancache-enabled filesystem wishes to access a page in a file on disk,
  67. +it first checks cleancache to see if it already contains it; if it does,
  68. +the page is copied into the kernel and a disk access is avoided.
  69. +
  70. +FAQs are included below.
  71. +
  72. +IMPLEMENTATION OVERVIEW
  73. +
  74. +A cleancache "backend" that interfaces to this pseudo-RAM links itself
  75. +to the kernel's cleancache "frontend" by calling cleancache_register_ops,
  76. +passing a pointer to a cleancache_ops structure with funcs set appropriately.
  77. +Note that cleancache_register_ops returns the previous settings so that
  78. +chaining can be performed if desired. The functions provided must conform to
  79. +certain semantics as follows:
  80. +
  81. +Most important, cleancache is "ephemeral". Pages which are copied into
  82. +cleancache have an indefinite lifetime which is completely unknowable
  83. +by the kernel and so may or may not still be in cleancache at any later time.
  84. +Thus, as its name implies, cleancache is not suitable for dirty pages.
  85. +Cleancache has complete discretion over what pages to preserve and what
  86. +pages to discard and when.
  87. +
  88. +A cleancache-enabled filesystem should call "init_fs" at mount time to obtain a
  89. +pool id which, if positive, must be saved in the filesystem's superblock;
  90. +a negative return value indicates failure. A "put_page" will copy a
  91. +(presumably about-to-be-evicted) page into cleancache and associate it with
  92. +the pool id, a file key, and a page index into the file. (The combination
  93. +of a pool id, a file key, and an index is sometimes called a "handle".)
  94. +A "get_page" will copy the page, if found, from cleancache into kernel memory.
  95. +A "flush_page" will ensure the page no longer is present in cleancache;
  96. +a "flush_inode" will flush all pages associated with the specified file;
  97. +and, when a filesystem is unmounted, a "flush_fs" will flush all pages in
  98. +all files specified by the given pool id and also surrender the pool id.
  99. +
  100. +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache
  101. +to treat the pool as shared using a 128-bit UUID as a key. On systems
  102. +that may run multiple kernels (such as hard partitioned or virtualized
  103. +systems) that may share a clustered filesystem, and where cleancache
  104. +may be shared among those kernels, calls to init_shared_fs that specify the
  105. +same UUID will receive the same pool id, thus allowing the pages to
  106. +be shared. Note that any security requirements must be imposed outside
  107. +of the kernel (e.g. by "tools" that control cleancache). Or a
  108. +cleancache implementation can simply disable init_shared_fs by always
  109. +returning a negative value.
  110. +
  111. +If a get_page is successful on a non-shared pool, the page is flushed (thus
  112. +making cleancache an "exclusive" cache). On a shared pool, the page
  113. +is NOT flushed on a successful get_page so that it remains accessible to
  114. +other sharers. The kernel is responsible for ensuring coherency between
  115. +cleancache (shared or not), the page cache, and the filesystem, using
  116. +cleancache flush operations as required.
  117. +
  118. +Note that cleancache must enforce put-put-get coherency and get-get
  119. +coherency. For the former, if two puts are made to the same handle but
  120. +with different data, say AAA by the first put and BBB by the second, a
  121. +subsequent get can never return the stale data (AAA). For get-get coherency,
  122. +if a get for a given handle fails, subsequent gets for that handle will
  123. +never succeed unless preceded by a successful put with that handle.
  124. +
  125. +Last, cleancache provides no SMP serialization guarantees; if two
  126. +different Linux threads are simultaneously putting and flushing a page
  127. +with the same handle, the results are indeterminate. Callers must
  128. +lock the page to ensure serial behavior.
  129. +
  130. +CLEANCACHE PERFORMANCE METRICS
  131. +
  132. +Cleancache monitoring is done by sysfs files in the
  133. +/sys/kernel/mm/cleancache directory. The effectiveness of cleancache
  134. +can be measured (across all filesystems) with:
  135. +
  136. +succ_gets - number of gets that were successful
  137. +failed_gets - number of gets that failed
  138. +puts - number of puts attempted (all "succeed")
  139. +flushes - number of flushes attempted
  140. +
  141. +A backend implementation may provide additional metrics.
  142. +
  143. +FAQ
  144. +
  145. +1) Where's the value? (Andrew Morton)
  146. +
  147. +Cleancache provides a significant performance benefit to many workloads
  148. +in many environments with negligible overhead by improving the
  149. +effectiveness of the pagecache. Clean pagecache pages are
  150. +saved in pseudo-RAM (RAM that is otherwise not directly addressable to
  151. +the kernel); fetching those pages later avoids "refaults" and thus
  152. +disk reads.
  153. +
  154. +Cleancache (and its sister code "frontswap") provide interfaces for
  155. +a new pseudo-RAM memory type that conceptually lies between fast
  156. +kernel-directly-addressable RAM and slower DMA/asynchronous devices.
  157. +Disallowing direct kernel or userland reads/writes to this pseudo-RAM
  158. +is ideal when data is transformed to a different form and size (such
  159. +as with compression) or secretly moved (as might be useful for write-
  160. +balancing for some RAM-like devices). Evicted page-cache pages (and
  161. +swap pages) are a great use for this kind of slower-than-RAM-but-much-
  162. +faster-than-disk pseudo-RAM and the cleancache (and frontswap)
  163. +"page-object-oriented" specification provides a nice way to read and
  164. +write -- and indirectly "name" -- the pages.
  165. +
  166. +In the virtual case, the whole point of virtualization is to statistically
  167. +multiplex physical resources across the varying demands of multiple
  168. +virtual machines. This is really hard to do with RAM and efforts to
  169. +do it well with no kernel change have essentially failed (except in some
  170. +well-publicized special-case workloads). Cleancache -- and frontswap --
  171. +with a fairly small impact on the kernel, provide a huge amount
  172. +of flexibility for more dynamic, flexible RAM multiplexing.
  173. +Specifically, the Xen Transcendent Memory backend allows otherwise
  174. +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
  175. +virtual machines, but the pages can be compressed and deduplicated to
  176. +optimize RAM utilization. And when guest OS's are induced to surrender
  177. +underutilized RAM (e.g. with "self-ballooning"), page cache pages
  178. +are the first to go, and cleancache allows those pages to be
  179. +saved and reclaimed if overall host system memory conditions allow.
  180. +
  181. +2) Why does cleancache have its sticky fingers so deep inside the
  182. + filesystems and VFS? (Andrew Morton and Christoph Hellwig)
  183. +
  184. +The core hooks for cleancache in VFS are in most cases a single line
  185. +and the minimum set are placed precisely where needed to maintain
  186. +coherency (via cleancache_flush operations) between cleancache,
  187. +the page cache, and disk. All hooks compile into nothingness if
  188. +cleancache is config'ed off and turn into a function-pointer-
  189. +compare-to-NULL if config'ed on but no backend claims the ops
  190. +functions, or to a compare-struct-element-to-negative if a
  191. +backend claims the ops functions but a filesystem doesn't enable
  192. +cleancache.
  193. +
  194. +Some filesystems are built entirely on top of VFS and the hooks
  195. +in VFS are sufficient, so they don't require an "init_fs" hook; the
  196. +initial implementation of cleancache didn't provide this hook.
  197. +But for some filesystems (such as btrfs), the VFS hooks are
  198. +incomplete and one or more hooks in fs-specific code are required.
  199. +And for some other filesystems, such as tmpfs, cleancache may
  200. +be counterproductive. So it seemed prudent to require a filesystem
  201. +to "opt in" to use cleancache, which requires adding a hook in
  202. +each filesystem. Some filesystems are not yet supported by cleancache
  203. +simply because they haven't been tested. The existing set should
  204. +be sufficient to validate the concept, the opt-in approach means
  205. +that untested filesystems are not affected, and the hooks in the
  206. +existing filesystems should make it very easy to add more
  207. +filesystems in the future.
  208. +
  209. +The total impact of the hooks on existing fs and mm files is 43
  210. +lines added (not counting comments and blank lines).
  211. +
  212. +3) Why not make cleancache asynchronous and batched so it can
  213. + more easily interface with real devices with DMA instead
  214. + of copying each individual page? (Minchan Kim)
  215. +
  216. +The one-page-at-a-time copy semantics simplifies the implementation
  217. +on both the frontend and backend and also allows the backend to
  218. +do fancy things on-the-fly like page compression and
  219. +page deduplication. And since the data is "gone" (copied into/out
  220. +of the pageframe) before the cleancache get/put call returns,
  221. +a great deal of race conditions and potential coherency issues
  222. +are avoided. While the interface seems odd for a "real device"
  223. +or for real kernel-addressable RAM, it makes perfect sense for
  224. +pseudo-RAM.
  225. +
  226. +4) Why is non-shared cleancache "exclusive"? And where is the
  227. + page "flushed" after a "get"? (Minchan Kim)
  228. +
  229. +The main reason is to free up memory in pseudo-RAM and to avoid
  230. +unnecessary cleancache_flush calls. If you want inclusive,
  231. +the page can be "put" immediately following the "get". If
  232. +put-after-get for inclusive becomes common, the interface could
  233. +be easily extended to add a "get_no_flush" call.
  234. +
  235. +The flush is done by the cleancache backend implementation.
  236. +
  237. +5) What's the performance impact?
  238. +
  239. +Performance analysis has been presented at OLS'09 and LCA'10.
  240. +Briefly, performance gains can be significant on most workloads,
  241. +especially when memory pressure is high (e.g. when RAM is
  242. +overcommitted in a virtual workload); and because the hooks are
  243. +invoked primarily in place of or in addition to a disk read/write,
  244. +overhead is negligible even in worst case workloads. Basically
  245. +cleancache replaces I/O with memory-copy-CPU-overhead; on older
  246. +single-core systems with slow memory-copy speeds, cleancache
  247. +has little value, but in newer multicore machines, especially
  248. +consolidated/virtualized machines, it has great value.
  249. +
  250. +6) How do I add cleancache support for filesystem X? (Boaz Harrosh)
  251. +
  252. +Filesystems that are well-behaved and conform to certain
  253. +restrictions can utilize cleancache simply by making a call to
  254. +cleancache_init_fs at mount time. Unusual, misbehaving, or
  255. +poorly layered filesystems must either add additional hooks
  256. +and/or undergo extensive additional testing... or should just
  257. +not enable the optional cleancache.
  258. +
  259. +Some points for a filesystem to consider:
  260. +
  261. +- The FS should be block-device-based (e.g. a ram-based FS such
  262. + as tmpfs should not enable cleancache)
  263. +- To ensure coherency/correctness, the FS must ensure that all
  264. + file removal or truncation operations either go through VFS or
  265. + add hooks to do the equivalent cleancache "flush" operations
  266. +- To ensure coherency/correctness, either inode numbers must
  267. + be unique across the lifetime of the on-disk file OR the
  268. + FS must provide an "encode_fh" function.
  269. +- The FS must call the VFS superblock alloc and deactivate routines
  270. + or add hooks to do the equivalent cleancache calls done there.
  271. +- To maximize performance, all pages fetched from the FS should
  272. + go through the do_mpage_readpage routine or the FS should add
  273. + hooks to do the equivalent (cf. btrfs)
  274. +- Currently, the FS blocksize must be the same as PAGE_SIZE. This
  275. + is not an architectural restriction, but no backends currently
  276. + support anything different.
  277. +- A clustered FS should invoke the "init_shared_fs" cleancache
  278. + hook to get best performance for some backends.
  279. +
  280. +7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
  281. +
  282. +If cleancache would use the inode virtual address instead of
  283. +inode/filehandle, the pool id could be eliminated. But, this
  284. +won't work because cleancache retains pagecache data pages
  285. +persistently even when the inode has been pruned from the
  286. +inode unused list, and only flushes the data page if the file
  287. +gets removed/truncated. So if cleancache used the inode kva,
  288. +there would be potential coherency issues if/when the inode
  289. +kva is reused for a different file. Alternately, if cleancache
  290. +flushed the pages when the inode kva was freed, much of the value
  291. +of cleancache would be lost because the cache of pages in cleancache
  292. +is potentially much larger than the kernel pagecache and is most
  293. +useful if the pages survive inode cache removal.
  294. +
  295. +8) Why is a global variable required?
  296. +
  297. +The cleancache_enabled flag is checked in all of the frequently-used
  298. +cleancache hooks. The alternative is a function call to check a static
  299. +variable. Since cleancache is enabled dynamically at runtime, systems
  300. +that don't enable cleancache would suffer thousands (possibly
  301. +tens-of-thousands) of unnecessary function calls per second. So the
  302. +global variable allows cleancache to be enabled by default at compile
  303. +time, but have insignificant performance impact when cleancache remains
  304. +disabled at runtime.
  305. +
  306. +9) Does cleancache work with KVM?
  307. +
  308. +The memory model of KVM is sufficiently different that a cleancache
  309. +backend may have little value for KVM. This remains to be tested,
  310. +especially in an overcommitted system.
  311. +
  312. +10) Does cleancache work in userspace? It sounds useful for
  313. + memory hungry caches like web browsers. (Jamie Lokier)
  314. +
  315. +No plans yet, though we agree it sounds useful, at least for
  316. +apps that bypass the page cache (e.g. O_DIRECT).
  317. +
  318. +Last updated: Dan Magenheimer, September 2 2010
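
To make the ops-registration flow described in cleancache.txt concrete,
here is a minimal sketch of a backend wiring itself up. The operation
names (init_fs, init_shared_fs, get_page, put_page, flush_page,
flush_inode, flush_fs) come from the text above; the exact signatures
and the cleancache_filekey type are illustrative assumptions, not
copied from the cleancache.h added by this patch.

/* Hypothetical cleancache backend sketch; signatures are assumptions. */
static int demo_init_fs(size_t pagesize)
{
	return 0;	/* pool id; negative would mean "refuse this fs" */
}

static int demo_get_page(int pool_id, struct cleancache_filekey key,
			 pgoff_t index, struct page *page)
{
	return -1;	/* miss: a get may only succeed after a prior put */
}

static void demo_put_page(int pool_id, struct cleancache_filekey key,
			  pgoff_t index, struct page *page)
{
	/* copy the page now; the backend may still discard it any time */
}

static void demo_flush_page(int pool_id, struct cleancache_filekey key,
			    pgoff_t index)
{
	/* after this, a get for the handle must fail */
}

static struct cleancache_ops demo_ops = {
	.init_fs = demo_init_fs,
	.get_page = demo_get_page,
	.put_page = demo_put_page,
	.flush_page = demo_flush_page,
	/* .init_shared_fs, .flush_inode, .flush_fs omitted for brevity */
};

static struct cleancache_ops demo_old_ops;

static void demo_register(void)
{
	/* registration returns the previous ops so backends can chain */
	demo_old_ops = cleancache_register_ops(&demo_ops);
}
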
  319. diff -Nrupad linux-2.6.37//Documentation/vm/frontswap.txt linux-2.6.37_vanilla//Documentation/vm/frontswap.txt
  320. --- linux-2.6.37//Documentation/vm/frontswap.txt 1970-01-01 01:00:00.000000000 +0100
  321. +++ linux-2.6.37_vanilla//Documentation/vm/frontswap.txt 2011-02-14 01:21:43.158792960 +0100
  322. @@ -0,0 +1,209 @@
  323. +Frontswap provides a page-accessible-memory (PAM) interface for swap pages.
  324. +In some environments, dramatic performance savings may be obtained because
  325. +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
  326. +
  327. +Frontswap is so named because it can be thought of as the opposite of
  328. +a "backing" store for a swap device. The storage is assumed to be
  329. +a synchronous concurrency-safe page-oriented pseudo-RAM device (such as
  330. +Xen's Transcendent Memory, aka "tmem", or in-kernel compressed memory,
  331. +aka "zmem", or other RAM-like devices) which is not directly accessible
  332. +or addressable by the kernel and is of unknown and possibly time-varying
  333. +size. This pseudo-RAM device links itself to frontswap by calling
  334. +frontswap_register_ops to set the frontswap_ops funcs appropriately and
  335. +the functions it provides must conform to certain policies as follows:
  336. +
  337. +An "init" prepares the pseudo-RAM to receive frontswap pages associated
  338. +with the specified swap device number (aka "type"). A "put_page" will
  339. +copy the page to pseudo-RAM and associate it with the type and offset
  340. +associated with the page. A "get_page" will copy the page, if found,
  341. +from pseudo-RAM into kernel memory, but will NOT remove the page from
  342. +pseudo-RAM. A "flush_page" will remove the page from pseudo-RAM and a
  343. +"flush_area" will remove ALL pages associated with the swap type
  344. +(e.g., like swapoff) and notify the pseudo-RAM device to refuse
  345. +further puts with that swap type.
  346. +
  347. +Once a page is successfully put, a matching get on the page will always
  348. +succeed. So when the kernel finds itself in a situation where it needs
  349. +to swap out a page, it first attempts to use frontswap. If the put returns
  350. +non-zero, the data has been successfully saved to pseudo-RAM and
  351. +a disk write and, if the data is later read back, a disk read are avoided.
  352. +If a put returns zero, pseudo-RAM has rejected the data, and the page can
  353. +be written to swap as usual.
  354. +
  355. +Note that if a page is put and the page already exists in pseudo-RAM
  356. +(a "duplicate" put), either the put succeeds and the data is overwritten,
  357. +or the put fails AND the page is flushed. This ensures stale data may
  358. +never be obtained from pseudo-RAM.
  359. +
  360. +Monitoring and control of frontswap is done by sysfs files in the
  361. +/sys/kernel/mm/frontswap directory. The effectiveness of frontswap can
  362. +be measured (across all swap devices) with:
  363. +
  364. +curr_pages - number of pages currently contained in frontswap
  365. +failed_puts - how many put attempts have failed
  366. +gets - how many gets were attempted (all should succeed)
  367. +succ_puts - how many put attempts have succeeded
  368. +flushes - how many flushes were attempted
  369. +
  370. +The number can be reduced by root by writing an integer target to curr_pages,
  371. +which results in a "partial swapoff", thus reducing the number of frontswap
  372. +pages to that target if memory constraints permit.
  373. +
  374. +FAQ
  375. +
  376. +1) Where's the value?
  377. +
  378. +When a workload starts swapping, performance falls through the floor.
  379. +Frontswap significantly increases performance in many such workloads by
  380. +providing a clean, dynamic interface to read and write swap pages to
  381. +pseudo-RAM -- RAM that is otherwise not directly addressable to the kernel.
  382. +This interface is ideal when data is transformed to a different form
  383. +and size (such as with compression) or secretly moved (as might be
  384. +useful for write-balancing for some RAM-like devices). Swap pages (and
  385. +evicted page-cache pages) are a great use for this kind of slower-than-RAM-
  386. +but-much-faster-than-disk pseudo-RAM and the frontswap (and cleancache)
  387. +"page-object-oriented" specification provides a nice way to read
  388. +and write -- and indirectly "name" -- the pages.
  389. +
  390. +In the virtual case, the whole point of virtualization is to statistically
  391. +multiplex physical resources across the varying demands of multiple
  392. +virtual machines. This is really hard to do with RAM and efforts to do
  393. +it well with no kernel changes have essentially failed (except in some
  394. +well-publicized special-case workloads). Frontswap -- and cleancache --
  395. +with a fairly small impact on the kernel, provide a huge amount
  396. +of flexibility for more dynamic, flexible RAM multiplexing.
  397. +Specifically, the Xen Transcendent Memory backend allows otherwise
  398. +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
  399. +virtual machines, but the pages can be compressed and deduplicated to
  400. +optimize RAM utilization. And when guest OS's are induced to surrender
  401. +underutilized RAM (e.g. with "self-ballooning"), sudden unexpected
  402. +memory pressure may result in swapping; frontswap allows those pages
  403. +to be swapped to and from hypervisor RAM if overall host system memory
  404. +conditions allow.
  405. +
  406. +2) Sure there may be performance advantages in some situations, but
  407. + what's the space/time overhead of frontswap?
  408. +
  409. +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
  410. +nothingness and the only overhead is a few extra bytes per swapon'ed
  411. +swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
  412. +registers, there is one extra check of a global variable against zero for
  413. +every swap page read or written. If CONFIG_FRONTSWAP is enabled
  414. +AND a frontswap backend registers AND the backend fails every "put"
  415. +request (i.e. provides no memory despite claiming it might),
  416. +CPU overhead is still negligible -- and since every frontswap fail
  417. +precedes a swap page write-to-disk, the system is highly likely
  418. +to be I/O bound and using a small fraction of a percent of a CPU
  419. +will be irrelevant anyway.
  420. +
  421. +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
  422. +registers, one bit is allocated for every swap page for every swap
  423. +device that is swapon'd. This is added to the EIGHT bits (which
  424. +were sixteen until about 2.6.34) that the kernel already allocates
  425. +for every swap page for every swap device that is swapon'd. (Hugh
  426. +Dickins has observed that frontswap could probably steal one of
  427. +the existing eight bits, but let's worry about that minor optimization
  428. +later.) For very large swap disks (which are rare) on a standard
  429. +4K pagesize, this is 1MB per 32GB swap.
  430. +
  431. +3) OK, how about a quick overview of what this frontswap patch does
  432. + in terms that a kernel hacker can grok?
  433. +
  434. +Let's assume that a frontswap "backend" has registered during
  435. +kernel initialization; this registration indicates that this
  436. +frontswap backend has access to some "memory" that is not directly
  437. +accessible by the kernel. Exactly how much memory it provides is
  438. +entirely dynamic and random.
  439. +
  440. +Whenever a swap-device is swapon'd, frontswap_init() is called,
  441. +passing the swap device number (aka "type") as a parameter.
  442. +This notifies frontswap to expect attempts to "put" swap pages
  443. +associated with that number.
  444. +
  445. +Whenever the swap subsystem is readying a page to write to a swap
  446. +device (cf. swap_writepage()), frontswap_put_page is called. Frontswap
  447. +consults with the frontswap backend and if the backend says
  448. +it does NOT have room, frontswap_put_page returns 0 and the page is
  449. +swapped as normal. Note that the response from the frontswap
  450. +backend is essentially random; it may choose to never accept a
  451. +page, it could accept every ninth page, or it might accept every
  452. +page. But if the backend does accept a page, the data from the page
  453. +has already been copied and associated with the type and offset,
  454. +and the backend guarantees the persistence of the data. In this case,
  455. +frontswap sets a bit in the "frontswap_map" for the swap device
  456. +corresponding to the page offset on the swap device to which it would
  457. +otherwise have written the data.
  458. +
  459. +When the swap subsystem needs to swap-in a page (swap_readpage()),
  460. +it first calls frontswap_get_page() which checks the frontswap_map to
  461. +see if the page was earlier accepted by the frontswap backend. If
  462. +it was, the page of data is filled from the frontswap backend and
  463. +the swap-in is complete. If not, the normal swap-in code is
  464. +executed to obtain the page of data from the real swap device.
  465. +
  466. +So every time the frontswap backend accepts a page, a swap device read
  467. +and (potentially) a swap device write are replaced by a "frontswap backend
  468. +put" and (possibly) a "frontswap backend get", which are presumably much
  469. +faster.
  470. +
  471. +4) Can't frontswap be configured as a "special" swap device that is
  472. + just higher priority than any real swap device (e.g. like zswap)?
  473. +
  474. +No. Recall that acceptance of any swap page by the frontswap
  475. +backend is entirely unpredictable. This is critical to the definition
  476. +of frontswap because it grants completely dynamic discretion to the
  477. +backend. But since any "put" might fail, there must always be a real
  478. +slot on a real swap device to swap the page. Thus frontswap must be
  479. +implemented as a "shadow" to every swapon'd device with the potential
  480. +capability of holding every page that the swap device might have held
  481. +and the possibility that it might hold no pages at all.
  482. +On the downside, this also means that frontswap cannot contain more
  483. +pages than the total of swapon'd swap devices. For example, if NO
  484. +swap device is configured on some installation, frontswap is useless.
  485. +
  486. +Further, frontswap is entirely synchronous whereas a real swap
  487. +device is, by definition, asynchronous and uses block I/O. The
  488. +block I/O layer is not only unnecessary, but may perform "optimizations"
  489. +that are inappropriate for a RAM-oriented device including delaying
  490. +the write of some pages for a significant amount of time.
  491. +Synchrony is required to ensure the dynamicity of the backend.
  492. +
  493. +In a virtualized environment, the dynamicity allows the hypervisor
  494. +(or host OS) to do "intelligent overcommit". For example, it can
  495. +choose to accept pages only until host-swapping might be imminent,
  496. +then force guests to do their own swapping.
  497. +
  498. +5) Why this weird definition about "duplicate puts"? If a page
  499. + has been previously successfully put, can't it always be
  500. + successfully overwritten?
  501. +
  502. +Nearly always it can, but no, sometimes it cannot. Consider an example
  503. +where data is compressed and the original 4K page has been compressed
  504. +to 1K. Now an attempt is made to overwrite the page with data that
  505. +is non-compressible and so would take the entire 4K. But the backend
  506. +has no more space. In this case, the put must be rejected. Whenever
  507. +frontswap rejects a put that would overwrite, it also must flush
  508. +the old data and ensure that it is no longer accessible. Since the
  509. +swap subsystem then writes the new data to the real swap device,
  510. +this is the correct course of action to ensure coherency.
  511. +
  512. +6) What is frontswap_shrink for?
  513. +
  514. +When the (non-frontswap) swap subsystem swaps out a page to a real
  515. +swap device, that page is only taking up low-value pre-allocated disk
  516. +space. But if frontswap has placed a page in pseudo-RAM, that
  517. +page may be taking up valuable real estate. The frontswap_shrink
  518. +routine allows a process outside of the swap subsystem (such as
  519. +a userland service via the sysfs interface, or a kernel thread)
  520. +to force pages out of the memory managed by frontswap and back into
  521. +kernel-addressable memory.
  522. +
  523. +7) Why does the frontswap patch create the new include file swapfile.h?
  524. +
  525. +The frontswap code depends on some swap-subsystem-internal data
  526. +structures that have, over the years, moved back and forth between
  527. +static and global. This seemed a reasonable compromise: Define
  528. +them as global but declare them in a new include file that isn't
  529. +included by the large number of source files that include swap.h.
  530. +
  531. +Dan Magenheimer, September 21 2010
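
The backend side of the frontswap interface described above can be
sketched the same way. The five operations (init, put_page, get_page,
flush_page, flush_area) and the success convention (non-zero from a put
means "accepted") are taken from the text; the signatures are
assumptions for illustration only.

/* Hypothetical frontswap backend sketch; signatures are assumptions. */
static void demo_fs_init(unsigned type)
{
	/* expect puts for this swap device number from now on */
}

static int demo_fs_put_page(unsigned type, pgoff_t offset,
			    struct page *page)
{
	/*
	 * Accept or reject at will, but on a rejected duplicate put the
	 * old copy must be flushed so no stale data can be returned.
	 */
	return 0;	/* 0 = rejected; page goes to the real swap device */
}

static int demo_fs_get_page(unsigned type, pgoff_t offset,
			    struct page *page)
{
	return -1;	/* must succeed for any page previously accepted */
}

static void demo_fs_flush_page(unsigned type, pgoff_t offset)
{
}

static void demo_fs_flush_area(unsigned type)
{
	/* like swapoff: drop every page of this type, refuse new puts */
}

static struct frontswap_ops demo_fs_ops = {
	.init = demo_fs_init,
	.put_page = demo_fs_put_page,
	.get_page = demo_fs_get_page,
	.flush_page = demo_fs_flush_page,
	.flush_area = demo_fs_flush_area,
};

static void demo_fs_register(void)
{
	frontswap_register_ops(&demo_fs_ops);
}
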
  532. diff -Nrupad linux-2.6.37//drivers/media/radio/radio-aimslab.c linux-2.6.37_vanilla//drivers/media/radio/radio-aimslab.c
  533. --- linux-2.6.37//drivers/media/radio/radio-aimslab.c 2011-01-05 01:50:19.000000000 +0100
  534. +++ linux-2.6.37_vanilla//drivers/media/radio/radio-aimslab.c 2011-02-14 01:20:15.814793213 +0100
  535. @@ -71,7 +71,7 @@ static struct rtrack rtrack_card;
  536.  
  537. /* local things */
  538.  
  539. -static void sleep_delay(long n)
  540. +static noinline void sleep_delay(long n)
  541. {
  542. /* Sleep nicely for 'n' uS */
  543. int d = n / msecs_to_jiffies(1000);
  544. diff -Nrupad linux-2.6.37//drivers/staging/Kconfig linux-2.6.37_vanilla//drivers/staging/Kconfig
  545. --- linux-2.6.37//drivers/staging/Kconfig 2011-01-05 01:50:19.000000000 +0100
  546. +++ linux-2.6.37_vanilla//drivers/staging/Kconfig 2011-02-14 01:21:43.158792960 +0100
  547. @@ -123,6 +123,8 @@ source "drivers/staging/iio/Kconfig"
  548.  
  549. source "drivers/staging/zram/Kconfig"
  550.  
  551. +source "drivers/staging/zcache/Kconfig"
  552. +
  553. source "drivers/staging/wlags49_h2/Kconfig"
  554.  
  555. source "drivers/staging/wlags49_h25/Kconfig"
  556. diff -Nrupad linux-2.6.37//drivers/staging/Makefile linux-2.6.37_vanilla//drivers/staging/Makefile
  557. --- linux-2.6.37//drivers/staging/Makefile 2011-01-05 01:50:19.000000000 +0100
  558. +++ linux-2.6.37_vanilla//drivers/staging/Makefile 2011-02-14 01:21:43.158792960 +0100
  559. @@ -44,6 +44,7 @@ obj-$(CONFIG_VME_BUS) += vme/
  560. obj-$(CONFIG_MRST_RAR_HANDLER) += memrar/
  561. obj-$(CONFIG_IIO) += iio/
  562. obj-$(CONFIG_ZRAM) += zram/
  563. +obj-$(CONFIG_ZCACHE) += zcache/
  564. obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/
  565. obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/
  566. obj-$(CONFIG_BATMAN_ADV) += batman-adv/
  567. diff -Nrupad linux-2.6.37//drivers/staging/zcache/Kconfig linux-2.6.37_vanilla//drivers/staging/zcache/Kconfig
  568. --- linux-2.6.37//drivers/staging/zcache/Kconfig 1970-01-01 01:00:00.000000000 +0100
  569. +++ linux-2.6.37_vanilla//drivers/staging/zcache/Kconfig 2011-02-14 01:21:43.158792960 +0100
  570. @@ -0,0 +1,13 @@
  571. +config ZCACHE
  572. + tristate "Dynamic compression of swap pages and clean pagecache pages"
  573. + depends on CLEANCACHE || FRONTSWAP
  574. + select XVMALLOC
  575. + select LZO_COMPRESS
  576. + select LZO_DECOMPRESS
  577. + default n
  578. + help
  579. + Zcache doubles RAM efficiency while providing a significant
  580. + performance boost on many workloads. Zcache uses lzo1x
  581. + compression and an in-kernel implementation of transcendent
  582. + memory to store clean page cache pages and swap in RAM,
  583. + providing a noticeable reduction in disk I/O.
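
Per the "depends on CLEANCACHE || FRONTSWAP" line above, zcache only
builds when at least one frontend is enabled. A plausible .config
fragment (option names taken from this patch; XVMALLOC and the LZO
options are pulled in automatically via "select"):

CONFIG_CLEANCACHE=y
CONFIG_FRONTSWAP=y
CONFIG_ZCACHE=y
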
  584. diff -Nrupad linux-2.6.37//drivers/staging/zcache/Makefile linux-2.6.37_vanilla//drivers/staging/zcache/Makefile
  585. --- linux-2.6.37//drivers/staging/zcache/Makefile 1970-01-01 01:00:00.000000000 +0100
  586. +++ linux-2.6.37_vanilla//drivers/staging/zcache/Makefile 2011-02-14 01:21:43.159792985 +0100
  587. @@ -0,0 +1 @@
  588. +obj-$(CONFIG_ZCACHE) += zcache.o tmem.o
  589. diff -Nrupad linux-2.6.37//drivers/staging/zcache/tmem.c linux-2.6.37_vanilla//drivers/staging/zcache/tmem.c
  590. --- linux-2.6.37//drivers/staging/zcache/tmem.c 1970-01-01 01:00:00.000000000 +0100
  591. +++ linux-2.6.37_vanilla//drivers/staging/zcache/tmem.c 2011-02-14 01:21:43.160793007 +0100
  592. @@ -0,0 +1,710 @@
  593. +/*
  594. + * In-kernel transcendent memory (generic implementation)
  595. + *
  596. + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
  597. + *
  598. + * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
  599. + * "handles" (triples containing a pool id, an object id, and an index) to
  600. + * pages in a page-accessible memory (PAM). Tmem references the PAM pages via
  601. + * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
  602. + * set of functions (pamops). Each pampd contains some representation of
  603. + * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
  604. + * pages and must be able to insert, find, and delete these pages at a
  605. + * potential frequency of thousands per second concurrently across many CPUs
  606. + * (and, if used with KVM, across many vcpus across many guests).
  607. + * Tmem is tracked with a hierarchy of data structures, organized by
  608. + * the elements in a handle-tuple: pool_id, object_id, and page index.
  609. + * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
  610. + * Each pool contains a hash table of rb_trees of tmem_objs. Each
  611. + * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
  612. + * nodes called tmem_objnodes. Each leaf pointer in this tree points to
  613. + * a pampd, which is accessible only through a small set of callbacks
  614. + * registered by the PAM implementation (see tmem_register_pamops). Tmem
  615. + * does all memory allocation via a set of callbacks registered by the tmem
  616. + * host implementation (e.g. see tmem_register_hostops).
  617. + */
  618. +
  619. +#include <linux/list.h>
  620. +#include <linux/spinlock.h>
  621. +#include <linux/atomic.h>
  622. +
  623. +#include "tmem.h"
  624. +
  625. +/* data structure sentinels used for debugging... see tmem.h */
  626. +#define POOL_SENTINEL 0x87658765
  627. +#define OBJ_SENTINEL 0x12345678
  628. +#define OBJNODE_SENTINEL 0xfedcba09
  629. +
  630. +/*
  631. + * A tmem host implementation must use this function to register callbacks
  632. + * for memory allocation.
  633. + */
  634. +static struct tmem_hostops tmem_hostops;
  635. +
  636. +static void tmem_objnode_tree_init(void);
  637. +
  638. +void tmem_register_hostops(struct tmem_hostops *m)
  639. +{
  640. + tmem_objnode_tree_init();
  641. + tmem_hostops = *m;
  642. +}
  643. +
  644. +/*
  645. + * A tmem host implementation must use this function to register
  646. + * callbacks for a page-accessible memory (PAM) implementation
  647. + */
  648. +static struct tmem_pamops tmem_pamops;
  649. +
  650. +void tmem_register_pamops(struct tmem_pamops *m)
  651. +{
  652. + tmem_pamops = *m;
  653. +}
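
Taken together, a tmem host wires in both callback tables before any
other tmem call. A hypothetical sketch follows, using only the hostops
members this file dereferences (obj_alloc, obj_free, objnode_alloc,
objnode_free); the exact prototypes live in tmem.h, which this excerpt
does not show, so treat these signatures as assumptions.

/* Hypothetical host glue; prototypes inferred from the calls in tmem.c. */
static struct tmem_obj *demo_obj_alloc(struct tmem_pool *pool)
{
	/* GFP_ATOMIC: tmem calls these under the hashbucket spinlock */
	return kmalloc(sizeof(struct tmem_obj), GFP_ATOMIC);
}

static void demo_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
{
	kfree(obj);
}

static struct tmem_objnode *demo_objnode_alloc(struct tmem_pool *pool)
{
	return kmalloc(sizeof(struct tmem_objnode), GFP_ATOMIC);
}

static void demo_objnode_free(struct tmem_objnode *objnode,
			      struct tmem_pool *pool)
{
	kfree(objnode);
}

static struct tmem_hostops demo_hostops = {
	.obj_alloc = demo_obj_alloc,
	.obj_free = demo_obj_free,
	.objnode_alloc = demo_objnode_alloc,
	.objnode_free = demo_objnode_free,
};

/* pampd callbacks (create/get_data/free) elided; see tmem_put/tmem_get */

static void demo_tmem_host_init(struct tmem_pamops *pamops)
{
	tmem_register_hostops(&demo_hostops);
	tmem_register_pamops(pamops);
}
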
  654. +
  655. +/*
  656. + * Oids are potentially very sparse and tmem_objs may have an indeterminately
  657. + * short life, being added and deleted at a relatively high frequency.
  658. + * So an rb_tree is an ideal data structure to manage tmem_objs. But because
  659. + * of the potentially huge number of tmem_objs, each pool manages a hashtable
  660. + * of rb_trees to reduce search, insert, delete, and rebalancing time.
  661. + * Each hashbucket also has a lock to manage concurrent access.
  662. + *
  663. + * The following routines manage tmem_objs. When any tmem_obj is accessed,
  664. + * the hashbucket lock must be held.
  665. + */
  666. +
  667. +/* searches for object==oid in pool, returns locked object if found */
  668. +static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
  669. + struct tmem_oid *oidp)
  670. +{
  671. + struct rb_node *rbnode;
  672. + struct tmem_obj *obj;
  673. +
  674. + rbnode = hb->obj_rb_root.rb_node;
  675. + while (rbnode) {
  676. + BUG_ON(RB_EMPTY_NODE(rbnode));
  677. + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
  678. + switch (tmem_oid_compare(oidp, &obj->oid)) {
  679. + case 0: /* equal */
  680. + goto out;
  681. + case -1:
  682. + rbnode = rbnode->rb_left;
  683. + break;
  684. + case 1:
  685. + rbnode = rbnode->rb_right;
  686. + break;
  687. + }
  688. + }
  689. + obj = NULL;
  690. +out:
  691. + return obj;
  692. +}
  693. +
  694. +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
  695. +
  696. +/* free an object that has no more pampds in it */
  697. +static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
  698. +{
  699. + struct tmem_pool *pool;
  700. +
  701. + BUG_ON(obj == NULL);
  702. + ASSERT_SENTINEL(obj, OBJ);
  703. + BUG_ON(obj->pampd_count > 0);
  704. + pool = obj->pool;
  705. + BUG_ON(pool == NULL);
  706. + if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
  707. + tmem_pampd_destroy_all_in_obj(obj);
  708. + BUG_ON(obj->objnode_tree_root != NULL);
  709. + BUG_ON((long)obj->objnode_count != 0);
  710. + atomic_dec(&pool->obj_count);
  711. + BUG_ON(atomic_read(&pool->obj_count) < 0);
  712. + INVERT_SENTINEL(obj, OBJ);
  713. + obj->pool = NULL;
  714. + tmem_oid_set_invalid(&obj->oid);
  715. + rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
  716. +}
  717. +
  718. +/*
  719. + * initialize and insert a tmem_object_root (called only if find failed)
  720. + */
  721. +static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
  722. + struct tmem_pool *pool,
  723. + struct tmem_oid *oidp)
  724. +{
  725. + struct rb_root *root = &hb->obj_rb_root;
  726. + struct rb_node **new = &(root->rb_node), *parent = NULL;
  727. + struct tmem_obj *this;
  728. +
  729. + BUG_ON(pool == NULL);
  730. + atomic_inc(&pool->obj_count);
  731. + obj->objnode_tree_height = 0;
  732. + obj->objnode_tree_root = NULL;
  733. + obj->pool = pool;
  734. + obj->oid = *oidp;
  735. + obj->objnode_count = 0;
  736. + obj->pampd_count = 0;
  737. + SET_SENTINEL(obj, OBJ);
  738. + while (*new) {
  739. + BUG_ON(RB_EMPTY_NODE(*new));
  740. + this = rb_entry(*new, struct tmem_obj, rb_tree_node);
  741. + parent = *new;
  742. + switch (tmem_oid_compare(oidp, &this->oid)) {
  743. + case 0:
  744. + BUG(); /* already present; should never happen! */
  745. + break;
  746. + case -1:
  747. + new = &(*new)->rb_left;
  748. + break;
  749. + case 1:
  750. + new = &(*new)->rb_right;
  751. + break;
  752. + }
  753. + }
  754. + rb_link_node(&obj->rb_tree_node, parent, new);
  755. + rb_insert_color(&obj->rb_tree_node, root);
  756. +}
  757. +
  758. +/*
  759. + * Tmem is managed as a set of tmem_pools with certain attributes, such as
  760. + * "ephemeral" vs "persistent". These attributes apply to all tmem_objs
  761. + * and all pampds that belong to a tmem_pool. A tmem_pool is created
  762. + * or deleted relatively rarely (for example, when a filesystem is
  763. + * mounted or unmounted).
  764. + */
  765. +
  766. +/* flush all data from a pool and, optionally, free it */
  767. +static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
  768. +{
  769. + struct rb_node *rbnode;
  770. + struct tmem_obj *obj;
  771. + struct tmem_hashbucket *hb = &pool->hashbucket[0];
  772. + int i;
  773. +
  774. + BUG_ON(pool == NULL);
  775. + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
  776. + spin_lock(&hb->lock);
  777. + rbnode = rb_first(&hb->obj_rb_root);
  778. + while (rbnode != NULL) {
  779. + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
  780. + rbnode = rb_next(rbnode);
  781. + tmem_pampd_destroy_all_in_obj(obj);
  782. + tmem_obj_free(obj, hb);
  783. + (*tmem_hostops.obj_free)(obj, pool);
  784. + }
  785. + spin_unlock(&hb->lock);
  786. + }
  787. + if (destroy)
  788. + list_del(&pool->pool_list);
  789. +}
  790. +
  791. +/*
  792. + * A tmem_obj contains a radix-tree-like tree in which the intermediate
  793. + * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation
  794. + * is very specialized and tuned for specific uses and is not particularly
  795. + * suited for use from this code, though some code from the core algorithms has
  796. + * been reused, thus the copyright notices below). Each tmem_objnode contains
  797. + * a set of pointers which point to either a set of intermediate tmem_objnodes
  798. + * or a set of pampds.
  799. + *
  800. + * Portions Copyright (C) 2001 Momchil Velikov
  801. + * Portions Copyright (C) 2001 Christoph Hellwig
  802. + * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
  803. + */
  804. +
  805. +struct tmem_objnode_tree_path {
  806. + struct tmem_objnode *objnode;
  807. + int offset;
  808. +};
  809. +
  810. +/* objnode height_to_maxindex translation */
  811. +static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
  812. +
  813. +static void tmem_objnode_tree_init(void)
  814. +{
  815. + unsigned int ht, tmp;
  816. +
  817. + for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
  818. + tmp = ht * OBJNODE_TREE_MAP_SHIFT;
  819. + if (tmp >= OBJNODE_TREE_INDEX_BITS)
  820. + tmem_objnode_tree_h2max[ht] = ~0UL;
  821. + else
  822. + tmem_objnode_tree_h2max[ht] =
  823. + (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
  824. + }
  825. +}
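
To make the height-to-maxindex math concrete: assuming
OBJNODE_TREE_MAP_SHIFT is 6 and OBJNODE_TREE_INDEX_BITS is 64 (both
values live in tmem.h, so these are assumptions), the loop above yields:

ht = 0: tmp = 0  -> (~0UL >> 63) >> 1 = 0      (root slot holds 1 pampd)
ht = 1: tmp = 6  -> (~0UL >> 57) >> 1 = 63     (indices 0..63)
ht = 2: tmp = 12 -> (~0UL >> 51) >> 1 = 4095   (indices 0..4095)

i.e. each additional level of tmem_objnodes multiplies the index
capacity by 2^OBJNODE_TREE_MAP_SHIFT = 64.
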
  826. +
  827. +static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
  828. +{
  829. + struct tmem_objnode *objnode;
  830. +
  831. + ASSERT_SENTINEL(obj, OBJ);
  832. + BUG_ON(obj->pool == NULL);
  833. + ASSERT_SENTINEL(obj->pool, POOL);
  834. + objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
  835. + if (unlikely(objnode == NULL))
  836. + goto out;
  837. + objnode->obj = obj;
  838. + SET_SENTINEL(objnode, OBJNODE);
  839. + memset(&objnode->slots, 0, sizeof(objnode->slots));
  840. + objnode->slots_in_use = 0;
  841. + obj->objnode_count++;
  842. +out:
  843. + return objnode;
  844. +}
  845. +
  846. +static void tmem_objnode_free(struct tmem_objnode *objnode)
  847. +{
  848. + struct tmem_pool *pool;
  849. + int i;
  850. +
  851. + BUG_ON(objnode == NULL);
  852. + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
  853. + BUG_ON(objnode->slots[i] != NULL);
  854. + ASSERT_SENTINEL(objnode, OBJNODE);
  855. + INVERT_SENTINEL(objnode, OBJNODE);
  856. + BUG_ON(objnode->obj == NULL);
  857. + ASSERT_SENTINEL(objnode->obj, OBJ);
  858. + pool = objnode->obj->pool;
  859. + BUG_ON(pool == NULL);
  860. + ASSERT_SENTINEL(pool, POOL);
  861. + objnode->obj->objnode_count--;
  862. + objnode->obj = NULL;
  863. + (*tmem_hostops.objnode_free)(objnode, pool);
  864. +}
  865. +
  866. +/*
  867. + * lookup index in object and return associated pampd (or NULL if not found)
  868. + */
  869. +static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
  870. +{
  871. + unsigned int height, shift;
  872. + struct tmem_objnode **slot = NULL;
  873. +
  874. + BUG_ON(obj == NULL);
  875. + ASSERT_SENTINEL(obj, OBJ);
  876. + BUG_ON(obj->pool == NULL);
  877. + ASSERT_SENTINEL(obj->pool, POOL);
  878. +
  879. + height = obj->objnode_tree_height;
  880. + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
  881. + goto out;
  882. + if (height == 0 && obj->objnode_tree_root) {
  883. + slot = &obj->objnode_tree_root;
  884. + goto out;
  885. + }
  886. + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
  887. + slot = &obj->objnode_tree_root;
  888. + while (height > 0) {
  889. + if (*slot == NULL)
  890. + goto out;
  891. + slot = (struct tmem_objnode **)
  892. + ((*slot)->slots +
  893. + ((index >> shift) & OBJNODE_TREE_MAP_MASK));
  894. + shift -= OBJNODE_TREE_MAP_SHIFT;
  895. + height--;
  896. + }
  897. +out:
  898. + return slot != NULL ? *slot : NULL;
  899. +}
  900. +
  901. +static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
  902. + void *pampd)
  903. +{
  904. + int ret = 0;
  905. + struct tmem_objnode *objnode = NULL, *newnode, *slot;
  906. + unsigned int height, shift;
  907. + int offset = 0;
  908. +
  909. + /* if necessary, extend the tree to be higher */
  910. + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
  911. + height = obj->objnode_tree_height + 1;
  912. + if (index > tmem_objnode_tree_h2max[height])
  913. + while (index > tmem_objnode_tree_h2max[height])
  914. + height++;
  915. + if (obj->objnode_tree_root == NULL) {
  916. + obj->objnode_tree_height = height;
  917. + goto insert;
  918. + }
  919. + do {
  920. + newnode = tmem_objnode_alloc(obj);
  921. + if (!newnode) {
  922. + ret = -ENOMEM;
  923. + goto out;
  924. + }
  925. + newnode->slots[0] = obj->objnode_tree_root;
  926. + newnode->slots_in_use = 1;
  927. + obj->objnode_tree_root = newnode;
  928. + obj->objnode_tree_height++;
  929. + } while (height > obj->objnode_tree_height);
  930. + }
  931. +insert:
  932. + slot = obj->objnode_tree_root;
  933. + height = obj->objnode_tree_height;
  934. + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
  935. + while (height > 0) {
  936. + if (slot == NULL) {
  937. + /* add a child objnode. */
  938. + slot = tmem_objnode_alloc(obj);
  939. + if (!slot) {
  940. + ret = -ENOMEM;
  941. + goto out;
  942. + }
  943. + if (objnode) {
  945. + objnode->slots[offset] = slot;
  946. + objnode->slots_in_use++;
  947. + } else
  948. + obj->objnode_tree_root = slot;
  949. + }
  950. + /* go down a level */
  951. + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
  952. + objnode = slot;
  953. + slot = objnode->slots[offset];
  954. + shift -= OBJNODE_TREE_MAP_SHIFT;
  955. + height--;
  956. + }
  957. + BUG_ON(slot != NULL);
  958. + if (objnode) {
  959. + objnode->slots_in_use++;
  960. + objnode->slots[offset] = pampd;
  961. + } else
  962. + obj->objnode_tree_root = pampd;
  963. + obj->pampd_count++;
  964. +out:
  965. + return ret;
  966. +}
  967. +
  968. +static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
  969. +{
  970. + struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
  971. + struct tmem_objnode_tree_path *pathp = path;
  972. + struct tmem_objnode *slot = NULL;
  973. + unsigned int height, shift;
  974. + int offset;
  975. +
  976. + BUG_ON(obj == NULL);
  977. + ASSERT_SENTINEL(obj, OBJ);
  978. + BUG_ON(obj->pool == NULL);
  979. + ASSERT_SENTINEL(obj->pool, POOL);
  980. + height = obj->objnode_tree_height;
  981. + if (index > tmem_objnode_tree_h2max[height])
  982. + goto out;
  983. + slot = obj->objnode_tree_root;
  984. + if (height == 0 && obj->objnode_tree_root) {
  985. + obj->objnode_tree_root = NULL;
  986. + goto out;
  987. + }
  988. + shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
  989. + pathp->objnode = NULL;
  990. + do {
  991. + if (slot == NULL)
  992. + goto out;
  993. + pathp++;
  994. + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
  995. + pathp->offset = offset;
  996. + pathp->objnode = slot;
  997. + slot = slot->slots[offset];
  998. + shift -= OBJNODE_TREE_MAP_SHIFT;
  999. + height--;
  1000. + } while (height > 0);
  1001. + if (slot == NULL)
  1002. + goto out;
  1003. + while (pathp->objnode) {
  1004. + pathp->objnode->slots[pathp->offset] = NULL;
  1005. + pathp->objnode->slots_in_use--;
  1006. + if (pathp->objnode->slots_in_use) {
  1007. + if (pathp->objnode == obj->objnode_tree_root) {
  1008. + while (obj->objnode_tree_height > 0 &&
  1009. + obj->objnode_tree_root->slots_in_use == 1 &&
  1010. + obj->objnode_tree_root->slots[0]) {
  1011. + struct tmem_objnode *to_free =
  1012. + obj->objnode_tree_root;
  1013. +
  1014. + obj->objnode_tree_root =
  1015. + to_free->slots[0];
  1016. + obj->objnode_tree_height--;
  1017. + to_free->slots[0] = NULL;
  1018. + to_free->slots_in_use = 0;
  1019. + tmem_objnode_free(to_free);
  1020. + }
  1021. + }
  1022. + goto out;
  1023. + }
  1024. + tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
  1025. + pathp--;
  1026. + }
  1027. + obj->objnode_tree_height = 0;
  1028. + obj->objnode_tree_root = NULL;
  1029. +
  1030. +out:
  1031. + if (slot != NULL)
  1032. + obj->pampd_count--;
  1033. + BUG_ON(obj->pampd_count < 0);
  1034. + return slot;
  1035. +}
  1036. +
  1037. +/* recursively walk the objnode_tree destroying pampds and objnodes */
  1038. +static void tmem_objnode_node_destroy(struct tmem_obj *obj,
  1039. + struct tmem_objnode *objnode,
  1040. + unsigned int ht)
  1041. +{
  1042. + int i;
  1043. +
  1044. + if (ht == 0)
  1045. + return;
  1046. + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
  1047. + if (objnode->slots[i]) {
  1048. + if (ht == 1) {
  1049. + obj->pampd_count--;
  1050. + (*tmem_pamops.free)(objnode->slots[i],
  1051. + obj->pool);
  1052. + objnode->slots[i] = NULL;
  1053. + continue;
  1054. + }
  1055. + tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
  1056. + tmem_objnode_free(objnode->slots[i]);
  1057. + objnode->slots[i] = NULL;
  1058. + }
  1059. + }
  1060. +}
  1061. +
  1062. +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
  1063. +{
  1064. + if (obj->objnode_tree_root == NULL)
  1065. + return;
  1066. + if (obj->objnode_tree_height == 0) {
  1067. + obj->pampd_count--;
  1068. + (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool);
  1069. + } else {
  1070. + tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
  1071. + obj->objnode_tree_height);
  1072. + tmem_objnode_free(obj->objnode_tree_root);
  1073. + obj->objnode_tree_height = 0;
  1074. + }
  1075. + obj->objnode_tree_root = NULL;
  1076. +}
  1077. +
  1078. +/*
  1079. + * Tmem is operated on by a set of well-defined actions:
  1080. + * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
  1081. + * (The tmem ABI allows for subpages and exchanges but these operations
  1082. + * are not included in this implementation.)
  1083. + *
  1084. + * These "tmem core" operations are implemented in the following functions.
  1085. + */
  1086. +
  1087. +/*
  1088. + * "Put" a page, e.g. copy a page from the kernel into newly allocated
  1089. + * PAM space (if such space is available). Tmem_put is complicated by
  1090. + * a corner case: What if a page with matching handle already exists in
  1091. + * tmem? To guarantee coherency, one of two actions is necessary: Either
  1092. + * the data for the page must be overwritten, or the page must be
  1093. + * "flushed" so that the data is not accessible to a subsequent "get".
  1094. + * Since these "duplicate puts" are relatively rare, this implementation
  1095. + * always flushes for simplicity.
  1096. + */
  1097. +int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
  1098. + struct page *page)
  1099. +{
  1100. + struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
  1101. + void *pampd = NULL, *pampd_del = NULL;
  1102. + int ret = -ENOMEM;
  1103. + bool ephemeral;
  1104. + struct tmem_hashbucket *hb;
  1105. +
  1106. + ephemeral = is_ephemeral(pool);
  1107. + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
  1108. + spin_lock(&hb->lock);
  1109. + obj = objfound = tmem_obj_find(hb, oidp);
  1110. + if (obj != NULL) {
  1111. + pampd = tmem_pampd_lookup_in_obj(objfound, index);
  1112. + if (pampd != NULL) {
  1113. + /* if found, is a dup put, flush the old one */
  1114. + pampd_del = tmem_pampd_delete_from_obj(obj, index);
  1115. + BUG_ON(pampd_del != pampd);
  1116. + (*tmem_pamops.free)(pampd, pool);
  1117. + if (obj->pampd_count == 0) {
  1118. + objnew = obj;
  1119. + objfound = NULL;
  1120. + }
  1121. + pampd = NULL;
  1122. + }
  1123. + } else {
  1124. + obj = objnew = (*tmem_hostops.obj_alloc)(pool);
  1125. + if (unlikely(obj == NULL)) {
  1126. + ret = -ENOMEM;
  1127. + goto out;
  1128. + }
  1129. + tmem_obj_init(obj, hb, pool, oidp);
  1130. + }
  1131. + BUG_ON(obj == NULL);
  1132. + BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
  1133. + pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page);
  1134. + if (unlikely(pampd == NULL))
  1135. + goto free;
  1136. + ret = tmem_pampd_add_to_obj(obj, index, pampd);
  1137. + if (unlikely(ret == -ENOMEM))
  1138. + /* may have partially built objnode tree ("stump") */
  1139. + goto delete_and_free;
  1140. + goto out;
  1141. +
  1142. +delete_and_free:
  1143. + (void)tmem_pampd_delete_from_obj(obj, index);
  1144. +free:
  1145. + if (pampd)
  1146. + (*tmem_pamops.free)(pampd, pool);
  1147. + if (objnew) {
  1148. + tmem_obj_free(objnew, hb);
  1149. + (*tmem_hostops.obj_free)(objnew, pool);
  1150. + }
  1151. +out:
  1152. + spin_unlock(&hb->lock);
  1153. + return ret;
  1154. +}
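
A sketch of how a frontend might drive tmem_put with a handle. The
layout of struct tmem_oid (assumed here to be three 64-bit words) is
defined in tmem.h, which this excerpt does not show, so treat it as an
assumption:

/* Hypothetical frontend call into tmem_put. */
static int demo_put(struct tmem_pool *pool, uint64_t ino, uint32_t index,
		    struct page *page)
{
	struct tmem_oid oid = { .oid = { ino, 0, 0 } };	/* assumed layout */

	/* 0 on success; -ENOMEM if the host/PAM could not hold the page */
	return tmem_put(pool, &oid, index, page);
}
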
  1155. +
  1156. +/*
  1157. + * "Get" a page, e.g. if one can be found, copy the tmem page with the
  1158. + * matching handle from PAM space to the kernel. By tmem definition,
  1159. + * when a "get" is successful on an ephemeral page, the page is "flushed",
  1160. + * and when a "get" is successful on a persistent page, the page is retained
  1161. + * in tmem. Note that to preserve
  1162. + * coherency, "get" can never be skipped if tmem contains the data.
  1163. + * That is, if a get is done with a certain handle and fails, any
  1164. + * subsequent "get" must also fail (unless of course there is a
  1165. + * "put" done with the same handle).
  1167. + */
  1168. +int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp,
  1169. + uint32_t index, struct page *page)
  1170. +{
  1171. + struct tmem_obj *obj;
  1172. + void *pampd;
  1173. + bool ephemeral = is_ephemeral(pool);
  1174. + int ret = -1;
  1175. + struct tmem_hashbucket *hb;
  1176. +
  1177. + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
  1178. + spin_lock(&hb->lock);
  1179. + obj = tmem_obj_find(hb, oidp);
  1180. + if (obj == NULL)
  1181. + goto out;
  1183. + if (ephemeral)
  1184. + pampd = tmem_pampd_delete_from_obj(obj, index);
  1185. + else
  1186. + pampd = tmem_pampd_lookup_in_obj(obj, index);
  1187. + if (pampd == NULL)
  1188. + goto out;
  1189. + ret = (*tmem_pamops.get_data)(page, pampd, pool);
  1190. + if (ret < 0)
  1191. + goto out;
  1192. + if (ephemeral) {
  1193. + (*tmem_pamops.free)(pampd, pool);
  1194. + if (obj->pampd_count == 0) {
  1195. + tmem_obj_free(obj, hb);
  1196. + (*tmem_hostops.obj_free)(obj, pool);
  1197. + obj = NULL;
  1198. + }
  1199. + }
  1200. + ret = 0;
  1201. +out:
  1202. + spin_unlock(&hb->lock);
  1203. + return ret;
  1204. +}
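Because an ephemeral "get" flushes the entry it returns, a successful get consumes the page; a sketch of the resulting semantics (hypothetical caller code, ephemeral pool assumed):

    tmem_put(pool, &oid, 0, page);   /* cache a clean page */
    tmem_get(pool, &oid, 0, dest1);  /* succeeds; the entry is removed */
    tmem_get(pool, &oid, 0, dest2);  /* fails (-1): already consumed */
    /* with a persistent pool the second get would also succeed, since
     * persistent pages are retained until explicitly flushed */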
  1205. +
  1206. +/*
  1207. + * If a page in tmem matches the handle, "flush" this page from tmem such
  1208. + * that any subsequent "get" does not succeed (unless, of course, there
  1209. + * was another "put" with the same handle).
  1210. + */
  1211. +int tmem_flush_page(struct tmem_pool *pool,
  1212. + struct tmem_oid *oidp, uint32_t index)
  1213. +{
  1214. + struct tmem_obj *obj;
  1215. + void *pampd;
  1216. + int ret = -1;
  1217. + struct tmem_hashbucket *hb;
  1218. +
  1219. + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
  1220. + spin_lock(&hb->lock);
  1221. + obj = tmem_obj_find(hb, oidp);
  1222. + if (obj == NULL)
  1223. + goto out;
  1224. + pampd = tmem_pampd_delete_from_obj(obj, index);
  1225. + if (pampd == NULL)
  1226. + goto out;
  1227. + (*tmem_pamops.free)(pampd, pool);
  1228. + if (obj->pampd_count == 0) {
  1229. + tmem_obj_free(obj, hb);
  1230. + (*tmem_hostops.obj_free)(obj, pool);
  1231. + }
  1232. + ret = 0;
  1233. +
  1234. +out:
  1235. + spin_unlock(&hb->lock);
  1236. + return ret;
  1237. +}
  1238. +
  1239. +/*
  1240. + * "Flush" all pages in tmem matching this oid.
  1241. + */
  1242. +int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
  1243. +{
  1244. + struct tmem_obj *obj;
  1245. + struct tmem_hashbucket *hb;
  1246. + int ret = -1;
  1247. +
  1248. + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
  1249. + spin_lock(&hb->lock);
  1250. + obj = tmem_obj_find(hb, oidp);
  1251. + if (obj == NULL)
  1252. + goto out;
  1253. + tmem_pampd_destroy_all_in_obj(obj);
  1254. + tmem_obj_free(obj, hb);
  1255. + (*tmem_hostops.obj_free)(obj, pool);
  1256. + ret = 0;
  1257. +
  1258. +out:
  1259. + spin_unlock(&hb->lock);
  1260. + return ret;
  1261. +}
  1262. +
  1263. +/*
  1264. + * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
  1265. + * all subsequent access to this tmem_pool.
  1266. + */
  1267. +int tmem_destroy_pool(struct tmem_pool *pool)
  1268. +{
  1269. + int ret = -1;
  1270. +
  1271. + if (pool == NULL)
  1272. + goto out;
  1273. + tmem_pool_flush(pool, 1);
  1274. + ret = 0;
  1275. +out:
  1276. + return ret;
  1277. +}
  1278. +
  1279. +static LIST_HEAD(tmem_global_pool_list);
  1280. +
  1281. +/*
  1282. + * Create a new tmem_pool with the provided flag and return
  1283. + * a pool id provided by the tmem host implementation.
  1284. + */
  1285. +void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
  1286. +{
  1287. + int persistent = flags & TMEM_POOL_PERSIST;
  1288. + int shared = flags & TMEM_POOL_SHARED;
  1289. + struct tmem_hashbucket *hb = &pool->hashbucket[0];
  1290. + int i;
  1291. +
  1292. + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
  1293. + hb->obj_rb_root = RB_ROOT;
  1294. + spin_lock_init(&hb->lock);
  1295. + }
  1296. + INIT_LIST_HEAD(&pool->pool_list);
  1297. + atomic_set(&pool->obj_count, 0);
  1298. + SET_SENTINEL(pool, POOL);
  1299. + list_add_tail(&pool->pool_list, &tmem_global_pool_list);
  1300. + pool->persistent = persistent;
  1301. + pool->shared = shared;
  1302. +}
  1303. diff -Nrupad linux-2.6.37//drivers/staging/zcache/tmem.h linux-2.6.37_vanilla//drivers/staging/zcache/tmem.h
  1304. --- linux-2.6.37//drivers/staging/zcache/tmem.h 1970-01-01 01:00:00.000000000 +0100
  1305. +++ linux-2.6.37_vanilla//drivers/staging/zcache/tmem.h 2011-02-14 01:21:43.160793007 +0100
  1306. @@ -0,0 +1,195 @@
  1307. +/*
  1308. + * tmem.h
  1309. + *
  1310. + * Transcendent memory
  1311. + *
  1312. + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
  1313. + */
  1314. +
  1315. +#ifndef _TMEM_H_
  1316. +#define _TMEM_H_
  1317. +
  1318. +#include <linux/types.h>
  1319. +#include <linux/highmem.h>
  1320. +#include <linux/hash.h>
  1321. +#include <linux/atomic.h>
  1322. +
  1323. +/*
  1324. + * These are pre-defined by the Xen<->Linux ABI
  1325. + */
  1326. +#define TMEM_PUT_PAGE 4
  1327. +#define TMEM_GET_PAGE 5
  1328. +#define TMEM_FLUSH_PAGE 6
  1329. +#define TMEM_FLUSH_OBJECT 7
  1330. +#define TMEM_POOL_PERSIST 1
  1331. +#define TMEM_POOL_SHARED 2
  1332. +#define TMEM_POOL_PRECOMPRESSED 4
  1333. +#define TMEM_POOL_PAGESIZE_SHIFT 4
  1334. +#define TMEM_POOL_PAGESIZE_MASK 0xf
  1335. +#define TMEM_POOL_RESERVED_BITS 0x00ffff00
  1336. +
  1337. +/*
  1338. + * sentinels have proven very useful for debugging but can be removed
  1339. + * or disabled before final merge.
  1340. + */
  1341. +#define SENTINELS
  1342. +#ifdef SENTINELS
  1343. +#define DECL_SENTINEL uint32_t sentinel;
  1344. +#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL)
  1345. +#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL)
  1346. +#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL)
  1347. +#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL)
  1348. +#else
  1349. +#define DECL_SENTINEL
  1350. +#define SET_SENTINEL(_x, _y) do { } while (0)
  1351. +#define INVERT_SENTINEL(_x, _y) do { } while (0)
  1352. +#define ASSERT_SENTINEL(_x, _y) do { } while (0)
  1353. +#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0)
  1354. +#endif
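As a usage sketch of the sentinel machinery: DECL_SENTINEL reserves a field, SET_SENTINEL stamps it at allocation, ASSERT_SENTINEL checks it on use, and INVERT_SENTINEL poisons it at free time so use-after-free is caught. The FOO_SENTINEL value below is hypothetical; each consumer of this header defines its own magic values:

    #define FOO_SENTINEL 0x0f0f0f0f          /* hypothetical magic value */

    struct foo {
        int payload;
        DECL_SENTINEL                        /* expands to: uint32_t sentinel; */
    };

    static void foo_init(struct foo *f)
    {
        SET_SENTINEL(f, FOO);                /* f->sentinel = FOO_SENTINEL */
    }

    static void foo_use(struct foo *f)
    {
        ASSERT_SENTINEL(f, FOO);             /* WARN_ON if the field was clobbered */
    }

    static void foo_free(struct foo *f)
    {
        INVERT_SENTINEL(f, FOO);             /* poison: catches use-after-free */
    }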
  1355. +
  1356. +#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l))
  1357. +
  1358. +/*
  1359. + * A pool is the highest-level data structure managed by tmem and
  1360. + * usually corresponds to a large independent set of pages such as
  1361. + * a filesystem. Each pool has an id, and certain attributes and counters.
  1362. + * It also contains a set of hash buckets, each of which contains an rbtree
  1363. + * of objects and a lock to manage concurrency within the pool.
  1364. + */
  1365. +
  1366. +#define TMEM_HASH_BUCKET_BITS 8
  1367. +#define TMEM_HASH_BUCKETS (1<<TMEM_HASH_BUCKET_BITS)
  1368. +
  1369. +struct tmem_hashbucket {
  1370. + struct rb_root obj_rb_root;
  1371. + spinlock_t lock;
  1372. +};
  1373. +
  1374. +struct tmem_pool {
  1375. + void *client; /* "up" for some clients, avoids table lookup */
  1376. + struct list_head pool_list;
  1377. + uint32_t pool_id;
  1378. + bool persistent;
  1379. + bool shared;
  1380. + atomic_t obj_count;
  1381. + atomic_t refcount;
  1382. + struct tmem_hashbucket hashbucket[TMEM_HASH_BUCKETS];
  1383. + DECL_SENTINEL
  1384. +};
  1385. +
  1386. +#define is_persistent(_p) (_p->persistent)
  1387. +#define is_ephemeral(_p) (!(_p->persistent))
  1388. +
  1389. +/*
  1390. + * An object id ("oid") is large: 192-bits (to ensure, for example, files
  1391. + * in a modern filesystem can be uniquely identified).
  1392. + */
  1393. +
  1394. +struct tmem_oid {
  1395. + uint64_t oid[3];
  1396. +};
  1397. +
  1398. +static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
  1399. +{
  1400. + oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
  1401. +}
  1402. +
  1403. +static inline bool tmem_oid_valid(struct tmem_oid *oidp)
  1404. +{
  1405. + return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL ||
  1406. + oidp->oid[2] != -1UL;
  1407. +}
  1408. +
  1409. +static inline int tmem_oid_compare(struct tmem_oid *left,
  1410. + struct tmem_oid *right)
  1411. +{
  1412. + int ret;
  1413. +
  1414. + if (left->oid[2] == right->oid[2]) {
  1415. + if (left->oid[1] == right->oid[1]) {
  1416. + if (left->oid[0] == right->oid[0])
  1417. + ret = 0;
  1418. + else if (left->oid[0] < right->oid[0])
  1419. + ret = -1;
  1420. + else
  1421. + ret = 1;
  1422. + } else if (left->oid[1] < right->oid[1])
  1423. + ret = -1;
  1424. + else
  1425. + ret = 1;
  1426. + } else if (left->oid[2] < right->oid[2])
  1427. + ret = -1;
  1428. + else
  1429. + ret = 1;
  1430. + return ret;
  1431. +}
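The comparison treats oid[2] as the most-significant word and oid[0] as the least, so ordering is lexicographic from the top. A worked example:

    struct tmem_oid a = { .oid = { 99, 5, 1 } };  /* hi=1, mid=5, lo=99 */
    struct tmem_oid b = { .oid = {  0, 7, 1 } };  /* hi=1, mid=7, lo=0  */
    /* tmem_oid_compare(&a, &b) == -1: oid[2] ties (1 == 1), oid[1]
     * decides (5 < 7), and oid[0] is never consulted */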
  1432. +
  1433. +static inline unsigned tmem_oid_hash(struct tmem_oid *oidp)
  1434. +{
  1435. + return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
  1436. + TMEM_HASH_BUCKET_BITS);
  1437. +}
  1438. +
  1439. +/*
  1440. + * A tmem_obj contains an identifier (oid), pointers to the parent
  1441. + * pool and the rb_tree to which it belongs, counters, and an ordered
  1442. + * set of pampds, structured in a radix-tree-like tree. The intermediate
  1443. + * nodes of the tree are called tmem_objnodes.
  1444. + */
  1445. +
  1446. +struct tmem_objnode;
  1447. +
  1448. +struct tmem_obj {
  1449. + struct tmem_oid oid;
  1450. + struct tmem_pool *pool;
  1451. + struct rb_node rb_tree_node;
  1452. + struct tmem_objnode *objnode_tree_root;
  1453. + unsigned int objnode_tree_height;
  1454. + unsigned long objnode_count;
  1455. + long pampd_count;
  1456. + DECL_SENTINEL
  1457. +};
  1458. +
  1459. +#define OBJNODE_TREE_MAP_SHIFT 6
  1460. +#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT)
  1461. +#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1)
  1462. +#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
  1463. +#define OBJNODE_TREE_MAX_PATH \
  1464. + (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2)
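With these values each objnode covers 6 bits of the index (64 slots per node), so on a kernel with 64-bit longs the geometry works out as follows (a sanity check, not extra code):

    /* OBJNODE_TREE_MAP_SIZE   = 1 << 6   = 64 slots per node
     * OBJNODE_TREE_INDEX_BITS = 8 * 8    = 64
     * OBJNODE_TREE_MAX_PATH   = 64/6 + 2 = 12
     * ceil(64/6) = 11 levels suffice to cover a full-width index,
     * so MAX_PATH leaves one node of slack; the per-cpu preload
     * array in zcache.c is sized with this constant. */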
  1465. +
  1466. +struct tmem_objnode {
  1467. + struct tmem_obj *obj;
  1468. + DECL_SENTINEL
  1469. + void *slots[OBJNODE_TREE_MAP_SIZE];
  1470. + unsigned int slots_in_use;
  1471. +};
  1472. +
  1473. +/* pampd abstract datatype methods provided by the PAM implementation */
  1474. +struct tmem_pamops {
  1475. + void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t,
  1476. + struct page *);
  1477. + int (*get_data)(struct page *, void *, struct tmem_pool *);
  1478. + void (*free)(void *, struct tmem_pool *);
  1479. +};
  1480. +extern void tmem_register_pamops(struct tmem_pamops *m);
  1481. +
  1482. +/* memory allocation methods provided by the host implementation */
  1483. +struct tmem_hostops {
  1484. + struct tmem_obj *(*obj_alloc)(struct tmem_pool *);
  1485. + void (*obj_free)(struct tmem_obj *, struct tmem_pool *);
  1486. + struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *);
  1487. + void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *);
  1488. +};
  1489. +extern void tmem_register_hostops(struct tmem_hostops *m);
  1490. +
  1491. +/* core tmem accessor functions */
  1492. +extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
  1493. + struct page *page);
  1494. +extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
  1495. + struct page *page);
  1496. +extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
  1497. + uint32_t index);
  1498. +extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
  1499. +extern int tmem_destroy_pool(struct tmem_pool *);
  1500. +extern void tmem_new_pool(struct tmem_pool *, uint32_t);
  1501. +#endif /* _TMEM_H_ */
  1502. diff -Nrupad linux-2.6.37//drivers/staging/zcache/zcache.c linux-2.6.37_vanilla//drivers/staging/zcache/zcache.c
  1503. --- linux-2.6.37//drivers/staging/zcache/zcache.c 1970-01-01 01:00:00.000000000 +0100
  1504. +++ linux-2.6.37_vanilla//drivers/staging/zcache/zcache.c 2011-02-14 01:22:00.636793117 +0100
  1505. @@ -0,0 +1,1658 @@
  1506. +/*
  1507. + * zcache.c
  1508. + *
  1509. + * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
  1510. + * Copyright (c) 2010,2011, Nitin Gupta
  1511. + *
  1512. + * Zcache provides an in-kernel "host implementation" for transcendent memory
  1513. + * and, thus indirectly, for cleancache and frontswap. Zcache includes two
  1514. + * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
  1515. + * 1) "compression buddies" ("zbud") is used for ephemeral pages
  1516. + * 2) xvmalloc is used for persistent pages.
  1517. + * Xvmalloc (based on the TLSF allocator) has very low fragmentation
  1518. + * and so maximizes space efficiency, while zbud allows pairs (and potentially,
  1519. + * in the future, more than a pair of) compressed pages to be closely linked
  1520. + * so that reclaiming can be done via the kernel's physical-page-oriented
  1521. + * "shrinker" interface.
  1522. + *
  1523. + * [1] For a definition of page-accessible memory (aka PAM), see:
  1524. + * http://marc.info/?l=linux-mm&m=127811271605009
  1525. + */
  1526. +
  1527. +#include <linux/cpu.h>
  1528. +#include <linux/highmem.h>
  1529. +#include <linux/list.h>
  1530. +#include <linux/lzo.h>
  1531. +#include <linux/slab.h>
  1532. +#include <linux/spinlock.h>
  1533. +#include <linux/types.h>
  1534. +#include <linux/atomic.h>
  1535. +#include "tmem.h"
  1536. +
  1537. +#include "../zram/xvmalloc.h" /* if built in drivers/staging */
  1538. +
  1539. +#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
  1540. +#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
  1541. +#endif
  1542. +#ifdef CONFIG_CLEANCACHE
  1543. +#include <linux/cleancache.h>
  1544. +#endif
  1545. +#ifdef CONFIG_FRONTSWAP
  1546. +#include <linux/frontswap.h>
  1547. +#endif
  1548. +
  1549. +#if 0
  1550. +/* this is more aggressive but may cause other problems? */
  1551. +#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
  1552. +#else
  1553. +#define ZCACHE_GFP_MASK \
  1554. + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
  1555. +#endif
  1556. +
  1557. +/**********
  1558. + * Compression buddies ("zbud") provides for packing two (or, possibly
  1559. + * in the future, more) compressed ephemeral pages into a single "raw"
  1560. + * (physical) page and tracking them with data structures so that
  1561. + * the raw pages can be easily reclaimed.
  1562. + *
  1563. + * A zbud page ("zbpg") is an aligned page containing a list_head,
  1564. + * a lock, and two "zbud headers". The remainder of the physical
  1565. + * page is divided up into aligned 64-byte "chunks" which contain
  1566. + * the compressed data for zero, one, or two zbuds. Each zbpg
  1567. + * resides on: (1) an "unused list" if it has no zbuds; (2) a
  1568. + * "buddied" list if it is fully populated with two zbuds; or
  1569. + * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
  1570. + * the one unbuddied zbud uses. The data inside a zbpg cannot be
  1571. + * read or written unless the zbpg's lock is held.
  1572. + */
  1573. +
  1574. +#define ZBH_SENTINEL 0x43214321
  1575. +#define ZBPG_SENTINEL 0xdeadbeef
  1576. +
  1577. +#define ZBUD_MAX_BUDS 2
  1578. +
  1579. +struct zbud_hdr {
  1580. + uint32_t pool_id;
  1581. + struct tmem_oid oid;
  1582. + uint32_t index;
  1583. + uint16_t size; /* compressed size in bytes, zero means unused */
  1584. + DECL_SENTINEL
  1585. +};
  1586. +
  1587. +struct zbud_page {
  1588. + struct list_head bud_list;
  1589. + spinlock_t lock;
  1590. + struct zbud_hdr buddy[ZBUD_MAX_BUDS];
  1591. + DECL_SENTINEL
  1592. + /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
  1593. +};
  1594. +
  1595. +#define CHUNK_SHIFT 6
  1596. +#define CHUNK_SIZE (1 << CHUNK_SHIFT)
  1597. +#define CHUNK_MASK (~(CHUNK_SIZE-1))
  1598. +#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \
  1599. + CHUNK_MASK) >> CHUNK_SHIFT)
  1600. +#define MAX_CHUNK (NCHUNKS-1)
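For a 4 KiB page the arithmetic works out roughly as follows (a sketch: sizeof(struct zbud_page) is config-dependent, e.g. spinlock debugging enlarges it):

    /* assuming PAGE_SIZE = 4096 and sizeof(struct zbud_page) ~= 104:
     *   (4096 - 104) & CHUNK_MASK = 3968        (round down to 64B)
     *   NCHUNKS   = 3968 >> 6     = 62 chunks available for data
     *   MAX_CHUNK = 61, so one zbud holds at most 61 * 64 = 3904 bytes
     * (zbud_max_buddy_size() below); pages whose compressed size is
     * larger are rejected and counted in zcache_compress_poor. */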
  1601. +
  1602. +static struct {
  1603. + struct list_head list;
  1604. + unsigned count;
  1605. +} zbud_unbuddied[NCHUNKS];
  1606. +/* list N contains pages with N chunks USED and NCHUNKS-N unused */
  1607. +/* element 0 is never used but optimizing that isn't worth it */
  1608. +static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
  1609. +
  1610. +struct list_head zbud_buddied_list;
  1611. +static unsigned long zcache_zbud_buddied_count;
  1612. +
  1613. +/* protects the buddied list and all unbuddied lists */
  1614. +static DEFINE_SPINLOCK(zbud_budlists_spinlock);
  1615. +
  1616. +static LIST_HEAD(zbpg_unused_list);
  1617. +static unsigned long zcache_zbpg_unused_list_count;
  1618. +
  1619. +/* protects the unused page list */
  1620. +static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
  1621. +
  1622. +static atomic_t zcache_zbud_curr_raw_pages;
  1623. +static atomic_t zcache_zbud_curr_zpages;
  1624. +static unsigned long zcache_zbud_curr_zbytes;
  1625. +static unsigned long zcache_zbud_cumul_zpages;
  1626. +static unsigned long zcache_zbud_cumul_zbytes;
  1627. +static unsigned long zcache_compress_poor;
  1628. +
  1629. +/* forward references */
  1630. +static void *zcache_get_free_page(void);
  1631. +static void zcache_free_page(void *p);
  1632. +
  1633. +/*
  1634. + * zbud helper functions
  1635. + */
  1636. +
  1637. +static inline unsigned zbud_max_buddy_size(void)
  1638. +{
  1639. + return MAX_CHUNK << CHUNK_SHIFT;
  1640. +}
  1641. +
  1642. +static inline unsigned zbud_size_to_chunks(unsigned size)
  1643. +{
  1644. + BUG_ON(size == 0 || size > zbud_max_buddy_size());
  1645. + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
  1646. +}
  1647. +
  1648. +static inline int zbud_budnum(struct zbud_hdr *zh)
  1649. +{
  1650. + unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
  1651. + struct zbud_page *zbpg = NULL;
  1652. + unsigned budnum = -1U;
  1653. + int i;
  1654. +
  1655. + for (i = 0; i < ZBUD_MAX_BUDS; i++)
  1656. + if (offset == offsetof(typeof(*zbpg), buddy[i])) {
  1657. + budnum = i;
  1658. + break;
  1659. + }
  1660. + BUG_ON(budnum == -1U);
  1661. + return budnum;
  1662. +}
  1663. +
  1664. +static char *zbud_data(struct zbud_hdr *zh, unsigned size)
  1665. +{
  1666. + struct zbud_page *zbpg;
  1667. + char *p;
  1668. + unsigned budnum;
  1669. +
  1670. + ASSERT_SENTINEL(zh, ZBH);
  1671. + budnum = zbud_budnum(zh);
  1672. + BUG_ON(size == 0 || size > zbud_max_buddy_size());
  1673. + zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
  1674. + ASSERT_SPINLOCK(&zbpg->lock);
  1675. + p = (char *)zbpg;
  1676. + if (budnum == 0)
  1677. + p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
  1678. + CHUNK_MASK);
  1679. + else if (budnum == 1)
  1680. + p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
  1681. + return p;
  1682. +}
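zbud_data() places buddy 0 just past the chunk-aligned page header and buddy 1 flush against the end of the page, so the two compressed pieces grow toward each other:

    /* physical layout of a zbud page:
     *
     * +-----------+----------------+--- ... ---+----------------+
     * | zbud_page | buddy[0] data  |  unused   | buddy[1] data  |
     * | header    | (grows upward) |  chunks   | (end-aligned)  |
     * +-----------+----------------+--- ... ---+----------------+
     * page start                                       PAGE_SIZE
     */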
  1683. +
  1684. +/*
  1685. + * zbud raw page management
  1686. + */
  1687. +
  1688. +static struct zbud_page *zbud_alloc_raw_page(void)
  1689. +{
  1690. + struct zbud_page *zbpg = NULL;
  1691. + struct zbud_hdr *zh0, *zh1;
  1692. + bool recycled = false;
  1693. +
  1694. + /* if any pages on the zbpg list, use one */
  1695. + spin_lock(&zbpg_unused_list_spinlock);
  1696. + if (!list_empty(&zbpg_unused_list)) {
  1697. + zbpg = list_first_entry(&zbpg_unused_list,
  1698. + struct zbud_page, bud_list);
  1699. + list_del_init(&zbpg->bud_list);
  1700. + zcache_zbpg_unused_list_count--;
  1701. + recycled = true;
  1702. + }
  1703. + spin_unlock(&zbpg_unused_list_spinlock);
  1704. + if (zbpg == NULL)
  1705. + /* none on zbpg list, try to get a kernel page */
  1706. + zbpg = zcache_get_free_page();
  1707. + if (likely(zbpg != NULL)) {
  1708. + INIT_LIST_HEAD(&zbpg->bud_list);
  1709. + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
  1710. + spin_lock_init(&zbpg->lock);
  1711. + if (recycled) {
  1712. + ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
  1713. + SET_SENTINEL(zbpg, ZBPG);
  1714. + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
  1715. + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
  1716. + } else {
  1717. + atomic_inc(&zcache_zbud_curr_raw_pages);
  1719. + SET_SENTINEL(zbpg, ZBPG);
  1720. + zh0->size = 0; zh1->size = 0;
  1721. + tmem_oid_set_invalid(&zh0->oid);
  1722. + tmem_oid_set_invalid(&zh1->oid);
  1723. + }
  1724. + }
  1725. + return zbpg;
  1726. +}
  1727. +
  1728. +static void zbud_free_raw_page(struct zbud_page *zbpg)
  1729. +{
  1730. + struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
  1731. +
  1732. + ASSERT_SENTINEL(zbpg, ZBPG);
  1733. + BUG_ON(!list_empty(&zbpg->bud_list));
  1734. + ASSERT_SPINLOCK(&zbpg->lock);
  1735. + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
  1736. + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
  1737. + INVERT_SENTINEL(zbpg, ZBPG);
  1738. + spin_unlock(&zbpg->lock);
  1739. + spin_lock(&zbpg_unused_list_spinlock);
  1740. + list_add(&zbpg->bud_list, &zbpg_unused_list);
  1741. + zcache_zbpg_unused_list_count++;
  1742. + spin_unlock(&zbpg_unused_list_spinlock);
  1743. +}
  1744. +
  1745. +/*
  1746. + * core zbud handling routines
  1747. + */
  1748. +
  1749. +static unsigned zbud_free(struct zbud_hdr *zh)
  1750. +{
  1751. + unsigned size;
  1752. +
  1753. + ASSERT_SENTINEL(zh, ZBH);
  1754. + BUG_ON(!tmem_oid_valid(&zh->oid));
  1755. + size = zh->size;
  1756. + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
  1757. + zh->size = 0;
  1758. + tmem_oid_set_invalid(&zh->oid);
  1759. + INVERT_SENTINEL(zh, ZBH);
  1760. + zcache_zbud_curr_zbytes -= size;
  1761. + atomic_dec(&zcache_zbud_curr_zpages);
  1762. + return size;
  1763. +}
  1764. +
  1765. +static void zbud_free_and_delist(struct zbud_hdr *zh)
  1766. +{
  1767. + unsigned chunks;
  1768. + struct zbud_hdr *zh_other;
  1769. + unsigned budnum = zbud_budnum(zh), size;
  1770. + struct zbud_page *zbpg =
  1771. + container_of(zh, struct zbud_page, buddy[budnum]);
  1772. +
  1773. + spin_lock(&zbpg->lock);
  1774. + if (list_empty(&zbpg->bud_list)) {
  1775. + /* ignore zombie page... see zbud_evict_pages() */
  1776. + spin_unlock(&zbpg->lock);
  1777. + return;
  1778. + }
  1779. + size = zbud_free(zh);
  1780. + ASSERT_SPINLOCK(&zbpg->lock);
  1781. + zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
  1782. + if (zh_other->size == 0) { /* was unbuddied: unlist and free */
  1783. + chunks = zbud_size_to_chunks(size);
  1784. + spin_lock(&zbud_budlists_spinlock);
  1785. + BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
  1786. + list_del_init(&zbpg->bud_list);
  1787. + zbud_unbuddied[chunks].count--;
  1788. + spin_unlock(&zbud_budlists_spinlock);
  1789. + zbud_free_raw_page(zbpg);
  1790. + } else { /* was buddied: move remaining buddy to unbuddied list */
  1791. + chunks = zbud_size_to_chunks(zh_other->size);
  1792. + spin_lock(&zbud_budlists_spinlock);
  1793. + list_del_init(&zbpg->bud_list);
  1794. + zcache_zbud_buddied_count--;
  1795. + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
  1796. + zbud_unbuddied[chunks].count++;
  1797. + spin_unlock(&zbud_budlists_spinlock);
  1798. + spin_unlock(&zbpg->lock);
  1799. + }
  1800. +}
  1801. +
  1802. +static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
  1803. + uint32_t index, struct page *page,
  1804. + void *cdata, unsigned size)
  1805. +{
  1806. + struct zbud_hdr *zh0, *zh1, *zh = NULL;
  1807. + struct zbud_page *zbpg = NULL, *ztmp;
  1808. + unsigned nchunks;
  1809. + char *to;
  1810. + int i, found_good_buddy = 0;
  1811. +
  1812. + nchunks = zbud_size_to_chunks(size);
  1813. + for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
  1814. + spin_lock(&zbud_budlists_spinlock);
  1815. + if (!list_empty(&zbud_unbuddied[i].list)) {
  1816. + list_for_each_entry_safe(zbpg, ztmp,
  1817. + &zbud_unbuddied[i].list, bud_list) {
  1818. + if (spin_trylock(&zbpg->lock)) {
  1819. + found_good_buddy = i;
  1820. + goto found_unbuddied;
  1821. + }
  1822. + }
  1823. + }
  1824. + spin_unlock(&zbud_budlists_spinlock);
  1825. + }
  1826. + /* didn't find a good buddy, try allocating a new page */
  1827. + zbpg = zbud_alloc_raw_page();
  1828. + if (unlikely(zbpg == NULL))
  1829. + goto out;
  1830. + /* ok, have a new page; data was already compressed by the caller, so list it */
  1831. + spin_lock(&zbpg->lock);
  1832. + spin_lock(&zbud_budlists_spinlock);
  1833. + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
  1834. + zbud_unbuddied[nchunks].count++;
  1835. + zh = &zbpg->buddy[0];
  1836. + goto init_zh;
  1837. +
  1838. +found_unbuddied:
  1839. + ASSERT_SPINLOCK(&zbpg->lock);
  1840. + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
  1841. + BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
  1842. + if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
  1843. + ASSERT_SENTINEL(zh0, ZBH);
  1844. + zh = zh1;
  1845. + } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
  1846. + ASSERT_SENTINEL(zh1, ZBH);
  1847. + zh = zh0;
  1848. + } else
  1849. + BUG();
  1850. + list_del_init(&zbpg->bud_list);
  1851. + zbud_unbuddied[found_good_buddy].count--;
  1852. + list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
  1853. + zcache_zbud_buddied_count++;
  1854. +
  1855. +init_zh:
  1856. + SET_SENTINEL(zh, ZBH);
  1857. + zh->size = size;
  1858. + zh->index = index;
  1859. + zh->oid = *oid;
  1860. + zh->pool_id = pool_id;
  1861. + /* can wait to copy the data until the list locks are dropped */
  1862. + spin_unlock(&zbud_budlists_spinlock);
  1863. +
  1864. + to = zbud_data(zh, size);
  1865. + memcpy(to, cdata, size);
  1866. + spin_unlock(&zbpg->lock);
  1867. + zbud_cumul_chunk_counts[nchunks]++;
  1868. + atomic_inc(&zcache_zbud_curr_zpages);
  1869. + zcache_zbud_cumul_zpages++;
  1870. + zcache_zbud_curr_zbytes += size;
  1871. + zcache_zbud_cumul_zbytes += size;
  1872. +out:
  1873. + return zh;
  1874. +}
  1875. +
  1876. +static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
  1877. +{
  1878. + struct zbud_page *zbpg;
  1879. + unsigned budnum = zbud_budnum(zh);
  1880. + size_t out_len = PAGE_SIZE;
  1881. + char *to_va, *from_va;
  1882. + unsigned size;
  1883. + int ret = 0;
  1884. +
  1885. + zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
  1886. + spin_lock(&zbpg->lock);
  1887. + if (list_empty(&zbpg->bud_list)) {
  1888. + /* ignore zombie page... see zbud_evict_pages() */
  1889. + ret = -EINVAL;
  1890. + goto out;
  1891. + }
  1892. + ASSERT_SENTINEL(zh, ZBH);
  1893. + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
  1894. + to_va = kmap_atomic(page, KM_USER0);
  1895. + size = zh->size;
  1896. + from_va = zbud_data(zh, size);
  1897. + ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
  1898. + BUG_ON(ret != LZO_E_OK);
  1899. + BUG_ON(out_len != PAGE_SIZE);
  1900. + kunmap_atomic(to_va, KM_USER0);
  1901. +out:
  1902. + spin_unlock(&zbpg->lock);
  1903. + return ret;
  1904. +}
  1905. +
  1906. +/*
  1907. + * The following routines handle shrinking of ephemeral pages by evicting
  1908. + * pages "least valuable" first.
  1909. + */
  1910. +
  1911. +static unsigned long zcache_evicted_raw_pages;
  1912. +static unsigned long zcache_evicted_buddied_pages;
  1913. +static unsigned long zcache_evicted_unbuddied_pages;
  1914. +
  1915. +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
  1916. +static void zcache_put_pool(struct tmem_pool *pool);
  1917. +
  1918. +/*
  1919. + * Flush and free all zbuds in a zbpg, then free the pageframe
  1920. + */
  1921. +static void zbud_evict_zbpg(struct zbud_page *zbpg)
  1922. +{
  1923. + struct zbud_hdr *zh;
  1924. + int i, j;
  1925. + uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
  1926. + struct tmem_oid oid[ZBUD_MAX_BUDS];
  1927. + struct tmem_pool *pool;
  1928. +
  1929. + ASSERT_SPINLOCK(&zbpg->lock);
  1930. + BUG_ON(!list_empty(&zbpg->bud_list));
  1931. + for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
  1932. + zh = &zbpg->buddy[i];
  1933. + if (zh->size) {
  1934. + pool_id[j] = zh->pool_id;
  1935. + oid[j] = zh->oid;
  1936. + index[j] = zh->index;
  1937. + j++;
  1938. + zbud_free(zh);
  1939. + }
  1940. + }
  1941. + spin_unlock(&zbpg->lock);
  1942. + for (i = 0; i < j; i++) {
  1943. + pool = zcache_get_pool_by_id(pool_id[i]);
  1944. + if (pool != NULL) {
  1945. + tmem_flush_page(pool, &oid[i], index[i]);
  1946. + zcache_put_pool(pool);
  1947. + }
  1948. + }
  1949. + ASSERT_SENTINEL(zbpg, ZBPG);
  1950. + spin_lock(&zbpg->lock);
  1951. + zbud_free_raw_page(zbpg);
  1952. +}
  1953. +
  1954. +/*
  1955. + * Free nr pages. This code is funky because we want to hold the locks
  1956. + * protecting various lists for as short a time as possible, and in some
  1957. + * circumstances the list may change asynchronously when the list lock is
  1958. + * not held. In some cases we also trylock not only to avoid waiting on a
  1959. + * page in use by another cpu, but also to avoid potential deadlock due to
  1960. + * lock inversion.
  1961. + */
  1962. +static void zbud_evict_pages(int nr)
  1963. +{
  1964. + struct zbud_page *zbpg;
  1965. + int i;
  1966. +
  1967. + /* first try freeing any pages on unused list */
  1968. +retry_unused_list:
  1969. + spin_lock_bh(&zbpg_unused_list_spinlock);
  1970. + if (!list_empty(&zbpg_unused_list)) {
  1971. + /* can't walk list here, since it may change when unlocked */
  1972. + zbpg = list_first_entry(&zbpg_unused_list,
  1973. + struct zbud_page, bud_list);
  1974. + list_del_init(&zbpg->bud_list);
  1975. + zcache_zbpg_unused_list_count--;
  1976. + atomic_dec(&zcache_zbud_curr_raw_pages);
  1977. + spin_unlock_bh(&zbpg_unused_list_spinlock);
  1978. + zcache_free_page(zbpg);
  1979. + zcache_evicted_raw_pages++;
  1980. + if (--nr <= 0)
  1981. + goto out;
  1982. + goto retry_unused_list;
  1983. + }
  1984. + spin_unlock_bh(&zbpg_unused_list_spinlock);
  1985. +
  1986. + /* now try freeing unbuddied pages, starting with least space avail */
  1987. + for (i = 0; i < MAX_CHUNK; i++) {
  1988. +retry_unbud_list_i:
  1989. + spin_lock_bh(&zbud_budlists_spinlock);
  1990. + if (list_empty(&zbud_unbuddied[i].list)) {
  1991. + spin_unlock_bh(&zbud_budlists_spinlock);
  1992. + continue;
  1993. + }
  1994. + list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
  1995. + if (unlikely(!spin_trylock(&zbpg->lock)))
  1996. + continue;
  1997. + list_del_init(&zbpg->bud_list);
  1998. + zbud_unbuddied[i].count--;
  1999. + spin_unlock(&zbud_budlists_spinlock);
  2000. + zcache_evicted_unbuddied_pages++;
  2001. + /* want budlists unlocked when doing zbpg eviction */
  2002. + zbud_evict_zbpg(zbpg);
  2003. + local_bh_enable();
  2004. + if (--nr <= 0)
  2005. + goto out;
  2006. + goto retry_unbud_list_i;
  2007. + }
  2008. + spin_unlock_bh(&zbud_budlists_spinlock);
  2009. + }
  2010. +
  2011. + /* as a last resort, free buddied pages */
  2012. +retry_bud_list:
  2013. + spin_lock_bh(&zbud_budlists_spinlock);
  2014. + if (list_empty(&zbud_buddied_list)) {
  2015. + spin_unlock_bh(&zbud_budlists_spinlock);
  2016. + goto out;
  2017. + }
  2018. + list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
  2019. + if (unlikely(!spin_trylock(&zbpg->lock)))
  2020. + continue;
  2021. + list_del_init(&zbpg->bud_list);
  2022. + zcache_zbud_buddied_count--;
  2023. + spin_unlock(&zbud_budlists_spinlock);
  2024. + zcache_evicted_buddied_pages++;
  2025. + /* want budlists unlocked when doing zbpg eviction */
  2026. + zbud_evict_zbpg(zbpg);
  2027. + local_bh_enable();
  2028. + if (--nr <= 0)
  2029. + goto out;
  2030. + goto retry_bud_list;
  2031. + }
  2032. + spin_unlock_bh(&zbud_budlists_spinlock);
  2033. +out:
  2034. + return;
  2035. +}
  2036. +
  2037. +static void zbud_init(void)
  2038. +{
  2039. + int i;
  2040. +
  2041. + INIT_LIST_HEAD(&zbud_buddied_list);
  2042. + zcache_zbud_buddied_count = 0;
  2043. + for (i = 0; i < NCHUNKS; i++) {
  2044. + INIT_LIST_HEAD(&zbud_unbuddied[i].list);
  2045. + zbud_unbuddied[i].count = 0;
  2046. + }
  2047. +}
  2048. +
  2049. +#ifdef CONFIG_SYSFS
  2050. +/*
  2051. + * These sysfs routines show a nice distribution of how many zbpg's are
  2052. + * currently (and have ever been placed) in each unbuddied list. It's fun
  2053. + * to watch but can probably go away before final merge.
  2054. + */
  2055. +static int zbud_show_unbuddied_list_counts(char *buf)
  2056. +{
  2057. + int i;
  2058. + char *p = buf;
  2059. +
  2060. + for (i = 0; i < NCHUNKS - 1; i++)
  2061. + p += sprintf(p, "%u ", zbud_unbuddied[i].count);
  2062. + p += sprintf(p, "%d\n", zbud_unbuddied[i].count);
  2063. + return p - buf;
  2064. +}
  2065. +
  2066. +static int zbud_show_cumul_chunk_counts(char *buf)
  2067. +{
  2068. + unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
  2069. + unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
  2070. + unsigned long total_chunks_lte_42 = 0;
  2071. + char *p = buf;
  2072. +
  2073. + for (i = 0; i < NCHUNKS; i++) {
  2074. + p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
  2075. + chunks += zbud_cumul_chunk_counts[i];
  2076. + total_chunks += zbud_cumul_chunk_counts[i];
  2077. + sum_total_chunks += i * zbud_cumul_chunk_counts[i];
  2078. + if (i == 21)
  2079. + total_chunks_lte_21 = total_chunks;
  2080. + if (i == 32)
  2081. + total_chunks_lte_32 = total_chunks;
  2082. + if (i == 42)
  2083. + total_chunks_lte_42 = total_chunks;
  2084. + }
  2085. + p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
  2086. + total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
  2087. + chunks == 0 ? 0 : sum_total_chunks / chunks);
  2088. + return p - buf;
  2089. +}
  2090. +#endif
  2091. +
  2092. +/**********
  2093. + * This "zv" PAM implementation combines the TLSF-based xvMalloc
  2094. + * with lzo1x compression to maximize the amount of data that can
  2095. + * be packed into a physical page.
  2096. + *
  2097. + * Zv represents a PAM page with the index and object (plus a "size" value
  2098. + * necessary for decompression) immediately preceding the compressed data.
  2099. + */
  2100. +
  2101. +#define ZVH_SENTINEL 0x43214321
  2102. +
  2103. +struct zv_hdr {
  2104. + uint32_t pool_id;
  2105. + struct tmem_oid oid;
  2106. + uint32_t index;
  2107. + DECL_SENTINEL
  2108. +};
  2109. +
  2110. +static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
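An allocated zv is the header followed immediately by the compressed bytes; the compressed length is not stored in the header but recovered from the allocator, as zv_free() and zv_decompress() below do:

    /* zv layout inside an xvmalloc block:
     *
     * +---------------+------------------------------+
     * | struct zv_hdr | compressed data (clen bytes) |
     * +---------------+------------------------------+
     *
     * clen = xv_get_object_size(zv) - sizeof(struct zv_hdr);
     * zv_max_page_size caps clen at 7/8 of a page, since storing
     * data that barely compresses is not worthwhile. */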
  2111. +
  2112. +static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
  2113. + struct tmem_oid *oid, uint32_t index,
  2114. + void *cdata, unsigned clen)
  2115. +{
  2116. + struct page *page;
  2117. + struct zv_hdr *zv = NULL;
  2118. + uint32_t offset;
  2119. + int ret;
  2120. +
  2121. + BUG_ON(!irqs_disabled());
  2122. + ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
  2123. + &page, &offset, ZCACHE_GFP_MASK);
  2124. + if (unlikely(ret))
  2125. + goto out;
  2126. + zv = kmap_atomic(page, KM_USER0) + offset;
  2127. + zv->index = index;
  2128. + zv->oid = *oid;
  2129. + zv->pool_id = pool_id;
  2130. + SET_SENTINEL(zv, ZVH);
  2131. + memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
  2132. + kunmap_atomic(zv, KM_USER0);
  2133. +out:
  2134. + return zv;
  2135. +}
  2136. +
  2137. +static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
  2138. +{
  2139. + unsigned long flags;
  2140. + struct page *page;
  2141. + uint32_t offset;
  2142. + uint16_t size;
  2143. +
  2144. + ASSERT_SENTINEL(zv, ZVH);
  2145. + size = xv_get_object_size(zv) - sizeof(*zv);
  2146. + BUG_ON(size == 0 || size > zv_max_page_size);
  2147. + INVERT_SENTINEL(zv, ZVH);
  2148. + page = virt_to_page(zv);
  2149. + offset = (unsigned long)zv & ~PAGE_MASK;
  2150. + local_irq_save(flags);
  2151. + xv_free(xvpool, page, offset);
  2152. + local_irq_restore(flags);
  2153. +}
  2154. +
  2155. +static void zv_decompress(struct page *page, struct zv_hdr *zv)
  2156. +{
  2157. + size_t clen = PAGE_SIZE;
  2158. + char *to_va;
  2159. + unsigned size;
  2160. + int ret;
  2161. +
  2162. + ASSERT_SENTINEL(zv, ZVH);
  2163. + size = xv_get_object_size(zv) - sizeof(*zv);
  2164. + BUG_ON(size == 0 || size > zv_max_page_size);
  2165. + to_va = kmap_atomic(page, KM_USER0);
  2166. + ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
  2167. + size, to_va, &clen);
  2168. + kunmap_atomic(to_va, KM_USER0);
  2169. + BUG_ON(ret != LZO_E_OK);
  2170. + BUG_ON(clen != PAGE_SIZE);
  2171. +}
  2172. +
  2173. +/*
  2174. + * zcache core code starts here
  2175. + */
  2176. +
  2177. +/* useful stats not collected by cleancache or frontswap */
  2178. +static unsigned long zcache_flush_total;
  2179. +static unsigned long zcache_flush_found;
  2180. +static unsigned long zcache_flobj_total;
  2181. +static unsigned long zcache_flobj_found;
  2182. +static unsigned long zcache_failed_eph_puts;
  2183. +static unsigned long zcache_failed_pers_puts;
  2184. +
  2185. +#define MAX_POOLS_PER_CLIENT 16
  2186. +
  2187. +static struct {
  2188. + struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
  2189. + struct xv_pool *xvpool;
  2190. +} zcache_client;
  2191. +
  2192. +/*
  2193. + * Tmem operations assume the poolid implies the invoking client.
  2194. + * Zcache only has one client (the kernel itself), so translate
  2195. + * the poolid into the tmem_pool allocated for it. A KVM version
  2196. + * of zcache would have one client per guest and each client might
  2197. + * have a poolid==N.
  2198. + */
  2199. +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
  2200. +{
  2201. + struct tmem_pool *pool = NULL;
  2202. +
  2203. + if (poolid < MAX_POOLS_PER_CLIENT) {
  2204. + pool = zcache_client.tmem_pools[poolid];
  2205. + if (pool != NULL)
  2206. + atomic_inc(&pool->refcount);
  2207. + }
  2208. + return pool;
  2209. +}
  2210. +
  2211. +static void zcache_put_pool(struct tmem_pool *pool)
  2212. +{
  2213. + if (pool != NULL)
  2214. + atomic_dec(&pool->refcount);
  2215. +}
  2216. +
  2217. +/* counters for debugging */
  2218. +static unsigned long zcache_failed_get_free_pages;
  2219. +static unsigned long zcache_failed_alloc;
  2220. +static unsigned long zcache_put_to_flush;
  2221. +static unsigned long zcache_aborted_preload;
  2222. +static unsigned long zcache_aborted_shrink;
  2223. +
  2224. +/*
  2225. + * Ensure that memory allocation requests in zcache don't result
  2226. + * in direct reclaim requests via the shrinker, which would cause
  2227. + * an infinite loop. Maybe a GFP flag would be better?
  2228. + */
  2229. +static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);
  2230. +
  2231. +/*
  2232. + * for now, use named slabs so we can easily track usage; later we can
  2233. + * either just use kmalloc, or perhaps add a slab-like allocator
  2234. + * to more carefully manage total memory utilization
  2235. + */
  2236. +static struct kmem_cache *zcache_objnode_cache;
  2237. +static struct kmem_cache *zcache_obj_cache;
  2238. +static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
  2239. +static unsigned long zcache_curr_obj_count_max;
  2240. +static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
  2241. +static unsigned long zcache_curr_objnode_count_max;
  2242. +
  2243. +/*
  2244. + * to avoid memory allocation recursion (e.g. due to direct reclaim), we
  2245. + * preload all necessary data structures so the hostops callbacks never
  2246. + * actually do a malloc
  2247. + */
  2248. +struct zcache_preload {
  2249. + void *page;
  2250. + struct tmem_obj *obj;
  2251. + int nr;
  2252. + struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
  2253. +};
  2254. +static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
  2255. +
  2256. +static int zcache_do_preload(struct tmem_pool *pool)
  2257. +{
  2258. + struct zcache_preload *kp;
  2259. + struct tmem_objnode *objnode;
  2260. + struct tmem_obj *obj;
  2261. + void *page;
  2262. + int ret = -ENOMEM;
  2263. +
  2264. + if (unlikely(zcache_objnode_cache == NULL))
  2265. + goto out;
  2266. + if (unlikely(zcache_obj_cache == NULL))
  2267. + goto out;
  2268. + if (!spin_trylock(&zcache_direct_reclaim_lock)) {
  2269. + zcache_aborted_preload++;
  2270. + goto out;
  2271. + }
  2272. + preempt_disable();
  2273. + kp = &__get_cpu_var(zcache_preloads);
  2274. + while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
  2275. + preempt_enable_no_resched();
  2276. + objnode = kmem_cache_alloc(zcache_objnode_cache,
  2277. + ZCACHE_GFP_MASK);
  2278. + if (unlikely(objnode == NULL)) {
  2279. + zcache_failed_alloc++;
  2280. + goto unlock_out;
  2281. + }
  2282. + preempt_disable();
  2283. + kp = &__get_cpu_var(zcache_preloads);
  2284. + if (kp->nr < ARRAY_SIZE(kp->objnodes))
  2285. + kp->objnodes[kp->nr++] = objnode;
  2286. + else
  2287. + kmem_cache_free(zcache_objnode_cache, objnode);
  2288. + }
  2289. + preempt_enable_no_resched();
  2290. + obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
  2291. + if (unlikely(obj == NULL)) {
  2292. + zcache_failed_alloc++;
  2293. + goto unlock_out;
  2294. + }
  2295. + page = (void *)__get_free_page(ZCACHE_GFP_MASK);
  2296. + if (unlikely(page == NULL)) {
  2297. + zcache_failed_get_free_pages++;
  2298. + kmem_cache_free(zcache_obj_cache, obj);
  2299. + goto unlock_out;
  2300. + }
  2301. + preempt_disable();
  2302. + kp = &__get_cpu_var(zcache_preloads);
  2303. + if (kp->obj == NULL)
  2304. + kp->obj = obj;
  2305. + else
  2306. + kmem_cache_free(zcache_obj_cache, obj);
  2307. + if (kp->page == NULL)
  2308. + kp->page = page;
  2309. + else
  2310. + free_page((unsigned long)page);
  2311. + ret = 0;
  2312. +unlock_out:
  2313. + spin_unlock(&zcache_direct_reclaim_lock);
  2314. +out:
  2315. + return ret;
  2316. +}
  2317. +
  2318. +static void *zcache_get_free_page(void)
  2319. +{
  2320. + struct zcache_preload *kp;
  2321. + void *page;
  2322. +
  2323. + kp = &__get_cpu_var(zcache_preloads);
  2324. + page = kp->page;
  2325. + BUG_ON(page == NULL);
  2326. + kp->page = NULL;
  2327. + return page;
  2328. +}
  2329. +
  2330. +static void zcache_free_page(void *p)
  2331. +{
  2332. + free_page((unsigned long)p);
  2333. +}
  2334. +
  2335. +/*
  2336. + * zcache implementation for tmem host ops
  2337. + */
  2338. +
  2339. +static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
  2340. +{
  2341. + struct tmem_objnode *objnode = NULL;
  2342. + unsigned long count;
  2343. + struct zcache_preload *kp;
  2344. +
  2345. + kp = &__get_cpu_var(zcache_preloads);
  2346. + if (kp->nr <= 0)
  2347. + goto out;
  2348. + objnode = kp->objnodes[kp->nr - 1];
  2349. + BUG_ON(objnode == NULL);
  2350. + kp->objnodes[kp->nr - 1] = NULL;
  2351. + kp->nr--;
  2352. + count = atomic_inc_return(&zcache_curr_objnode_count);
  2353. + if (count > zcache_curr_objnode_count_max)
  2354. + zcache_curr_objnode_count_max = count;
  2355. +out:
  2356. + return objnode;
  2357. +}
  2358. +
  2359. +static void zcache_objnode_free(struct tmem_objnode *objnode,
  2360. + struct tmem_pool *pool)
  2361. +{
  2362. + atomic_dec(&zcache_curr_objnode_count);
  2363. + BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
  2364. + kmem_cache_free(zcache_objnode_cache, objnode);
  2365. +}
  2366. +
  2367. +static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
  2368. +{
  2369. + struct tmem_obj *obj = NULL;
  2370. + unsigned long count;
  2371. + struct zcache_preload *kp;
  2372. +
  2373. + kp = &__get_cpu_var(zcache_preloads);
  2374. + obj = kp->obj;
  2375. + BUG_ON(obj == NULL);
  2376. + kp->obj = NULL;
  2377. + count = atomic_inc_return(&zcache_curr_obj_count);
  2378. + if (count > zcache_curr_obj_count_max)
  2379. + zcache_curr_obj_count_max = count;
  2380. + return obj;
  2381. +}
  2382. +
  2383. +static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
  2384. +{
  2385. + atomic_dec(&zcache_curr_obj_count);
  2386. + BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
  2387. + kmem_cache_free(zcache_obj_cache, obj);
  2388. +}
  2389. +
  2390. +static struct tmem_hostops zcache_hostops = {
  2391. + .obj_alloc = zcache_obj_alloc,
  2392. + .obj_free = zcache_obj_free,
  2393. + .objnode_alloc = zcache_objnode_alloc,
  2394. + .objnode_free = zcache_objnode_free,
  2395. +};
  2396. +
  2397. +/*
  2398. + * zcache implementations for PAM page descriptor ops
  2399. + */
  2400. +
  2401. +static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
  2402. +static unsigned long zcache_curr_eph_pampd_count_max;
  2403. +static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
  2404. +static unsigned long zcache_curr_pers_pampd_count_max;
  2405. +
  2406. +/* forward reference */
  2407. +static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
  2408. +
  2409. +static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
  2410. + uint32_t index, struct page *page)
  2411. +{
  2412. + void *pampd = NULL, *cdata;
  2413. + size_t clen;
  2414. + int ret;
  2415. + bool ephemeral = is_ephemeral(pool);
  2416. + unsigned long count;
  2417. +
  2418. + if (ephemeral) {
  2419. + ret = zcache_compress(page, &cdata, &clen);
  2420. + if (ret == 0)
  2422. + goto out;
  2423. + if (clen == 0 || clen > zbud_max_buddy_size()) {
  2424. + zcache_compress_poor++;
  2425. + goto out;
  2426. + }
  2427. + pampd = (void *)zbud_create(pool->pool_id, oid, index,
  2428. + page, cdata, clen);
  2429. + if (pampd != NULL) {
  2430. + count = atomic_inc_return(&zcache_curr_eph_pampd_count);
  2431. + if (count > zcache_curr_eph_pampd_count_max)
  2432. + zcache_curr_eph_pampd_count_max = count;
  2433. + }
  2434. + } else {
  2435. + /*
  2436. + * FIXME: This is all the "policy" there is for now.
  2437. + * 3/4 totpages should allow ~37% of RAM to be filled with
  2438. + * compressed frontswap pages: at a typical ~2:1 compression ratio,
  + * 3/4 of totalram pampds occupy roughly 3/8 (~37%) of RAM
  2439. + */
  2440. + if (atomic_read(&zcache_curr_pers_pampd_count) >
  2441. + 3 * totalram_pages / 4)
  2442. + goto out;
  2443. + ret = zcache_compress(page, &cdata, &clen);
  2444. + if (ret == 0)
  2445. + goto out;
  2446. + if (clen > zv_max_page_size) {
  2447. + zcache_compress_poor++;
  2448. + goto out;
  2449. + }
  2450. + pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
  2451. + oid, index, cdata, clen);
  2452. + if (pampd == NULL)
  2453. + goto out;
  2454. + count = atomic_inc_return(&zcache_curr_pers_pampd_count);
  2455. + if (count > zcache_curr_pers_pampd_count_max)
  2456. + zcache_curr_pers_pampd_count_max = count;
  2457. + }
  2458. +out:
  2459. + return pampd;
  2460. +}
  2461. +
  2462. +/*
  2463. + * fill the pageframe corresponding to the struct page with the data
  2464. + * from the passed pampd
  2465. + */
  2466. +static int zcache_pampd_get_data(struct page *page, void *pampd,
  2467. + struct tmem_pool *pool)
  2468. +{
  2469. + int ret = 0;
  2470. +
  2471. + if (is_ephemeral(pool))
  2472. + ret = zbud_decompress(page, pampd);
  2473. + else
  2474. + zv_decompress(page, pampd);
  2475. + return ret;
  2476. +}
  2477. +
  2478. +/*
  2479. + * free the pampd and remove it from any zcache lists
  2480. + * pampd must no longer be pointed to from any tmem data structures!
  2481. + */
  2482. +static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
  2483. +{
  2484. + if (is_ephemeral(pool)) {
  2485. + zbud_free_and_delist((struct zbud_hdr *)pampd);
  2486. + atomic_dec(&zcache_curr_eph_pampd_count);
  2487. + BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
  2488. + } else {
  2489. + zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
  2490. + atomic_dec(&zcache_curr_pers_pampd_count);
  2491. + BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
  2492. + }
  2493. +}
  2494. +
  2495. +static struct tmem_pamops zcache_pamops = {
  2496. + .create = zcache_pampd_create,
  2497. + .get_data = zcache_pampd_get_data,
  2498. + .free = zcache_pampd_free,
  2499. +};
  2500. +
  2501. +/*
  2502. + * zcache compression/decompression and related per-cpu stuff
  2503. + */
  2504. +
  2505. +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
  2506. +#define LZO_DSTMEM_PAGE_ORDER 1
  2507. +static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
  2508. +static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
  2509. +
  2510. +static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
  2511. +{
  2512. + int ret = 0;
  2513. + unsigned char *dmem = __get_cpu_var(zcache_dstmem);
  2514. + unsigned char *wmem = __get_cpu_var(zcache_workmem);
  2515. + char *from_va;
  2516. +
  2517. + BUG_ON(!irqs_disabled());
  2518. + if (unlikely(dmem == NULL || wmem == NULL))
  2519. + goto out; /* no buffer, so can't compress */
  2520. + from_va = kmap_atomic(from, KM_USER0);
  2521. + mb();
  2522. + ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
  2523. + BUG_ON(ret != LZO_E_OK);
  2524. + *out_va = dmem;
  2525. + kunmap_atomic(from_va, KM_USER0);
  2526. + ret = 1;
  2527. +out:
  2528. + return ret;
  2529. +}
  2530. +
  2531. +
  2532. +static int zcache_cpu_notifier(struct notifier_block *nb,
  2533. + unsigned long action, void *pcpu)
  2534. +{
  2535. + int cpu = (long)pcpu;
  2536. + struct zcache_preload *kp;
  2537. +
  2538. + switch (action) {
  2539. + case CPU_UP_PREPARE:
  2540. + per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
  2541. + GFP_KERNEL | __GFP_REPEAT,
  2542. + LZO_DSTMEM_PAGE_ORDER);
  2543. + per_cpu(zcache_workmem, cpu) =
  2544. + kzalloc(LZO_WORKMEM_BYTES,
  2545. + GFP_KERNEL | __GFP_REPEAT);
  2546. + break;
  2547. + case CPU_DEAD:
  2548. + case CPU_UP_CANCELED:
  2549. + free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
  2550. + LZO_DSTMEM_PAGE_ORDER);
  2551. + per_cpu(zcache_dstmem, cpu) = NULL;
  2552. + kfree(per_cpu(zcache_workmem, cpu));
  2553. + per_cpu(zcache_workmem, cpu) = NULL;
  2554. + kp = &per_cpu(zcache_preloads, cpu);
  2555. + while (kp->nr) {
  2556. + kmem_cache_free(zcache_objnode_cache,
  2557. + kp->objnodes[kp->nr - 1]);
  2558. + kp->objnodes[kp->nr - 1] = NULL;
  2559. + kp->nr--;
  2560. + }
  2561. + /* clear stale pointers so preload refills them if the cpu returns */
  + if (kp->obj != NULL) {
  + kmem_cache_free(zcache_obj_cache, kp->obj);
  + kp->obj = NULL;
  + }
  2562. + free_page((unsigned long)kp->page);
  + kp->page = NULL;
  2563. + break;
  2564. + default:
  2565. + break;
  2566. + }
  2567. + return NOTIFY_OK;
  2568. +}
  2569. +
  2570. +static struct notifier_block zcache_cpu_notifier_block = {
  2571. + .notifier_call = zcache_cpu_notifier
  2572. +};
  2573. +
  2574. +#ifdef CONFIG_SYSFS
  2575. +#define ZCACHE_SYSFS_RO(_name) \
  2576. + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
  2577. + struct kobj_attribute *attr, char *buf) \
  2578. + { \
  2579. + return sprintf(buf, "%lu\n", zcache_##_name); \
  2580. + } \
  2581. + static struct kobj_attribute zcache_##_name##_attr = { \
  2582. + .attr = { .name = __stringify(_name), .mode = 0444 }, \
  2583. + .show = zcache_##_name##_show, \
  2584. + }
  2585. +
  2586. +#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
  2587. + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
  2588. + struct kobj_attribute *attr, char *buf) \
  2589. + { \
  2590. + return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
  2591. + } \
  2592. + static struct kobj_attribute zcache_##_name##_attr = { \
  2593. + .attr = { .name = __stringify(_name), .mode = 0444 }, \
  2594. + .show = zcache_##_name##_show, \
  2595. + }
  2596. +
  2597. +#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
  2598. + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
  2599. + struct kobj_attribute *attr, char *buf) \
  2600. + { \
  2601. + return _func(buf); \
  2602. + } \
  2603. + static struct kobj_attribute zcache_##_name##_attr = { \
  2604. + .attr = { .name = __stringify(_name), .mode = 0444 }, \
  2605. + .show = zcache_##_name##_show, \
  2606. + }
  2607. +
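Each invocation below stamps out a show routine plus a read-only attribute; for reference, ZCACHE_SYSFS_RO(flush_total) expands (modulo whitespace) to:

    static ssize_t zcache_flush_total_show(struct kobject *kobj,
                    struct kobj_attribute *attr, char *buf)
    {
        return sprintf(buf, "%lu\n", zcache_flush_total);
    }
    static struct kobj_attribute zcache_flush_total_attr = {
        .attr = { .name = "flush_total", .mode = 0444 },
        .show = zcache_flush_total_show,
    };

Once zcache_attr_group (defined below) is registered by the init code, each counter appears as a read-only file, e.g. flush_total, in the group's "zcache" sysfs directory.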
  2608. +ZCACHE_SYSFS_RO(curr_obj_count_max);
  2609. +ZCACHE_SYSFS_RO(curr_objnode_count_max);
  2610. +ZCACHE_SYSFS_RO(flush_total);
  2611. +ZCACHE_SYSFS_RO(flush_found);
  2612. +ZCACHE_SYSFS_RO(flobj_total);
  2613. +ZCACHE_SYSFS_RO(flobj_found);
  2614. +ZCACHE_SYSFS_RO(failed_eph_puts);
  2615. +ZCACHE_SYSFS_RO(failed_pers_puts);
  2616. +ZCACHE_SYSFS_RO(zbud_curr_zbytes);
  2617. +ZCACHE_SYSFS_RO(zbud_cumul_zpages);
  2618. +ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
  2619. +ZCACHE_SYSFS_RO(zbud_buddied_count);
  2620. +ZCACHE_SYSFS_RO(zbpg_unused_list_count);
  2621. +ZCACHE_SYSFS_RO(evicted_raw_pages);
  2622. +ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
  2623. +ZCACHE_SYSFS_RO(evicted_buddied_pages);
  2624. +ZCACHE_SYSFS_RO(failed_get_free_pages);
  2625. +ZCACHE_SYSFS_RO(failed_alloc);
  2626. +ZCACHE_SYSFS_RO(put_to_flush);
  2627. +ZCACHE_SYSFS_RO(aborted_preload);
  2628. +ZCACHE_SYSFS_RO(aborted_shrink);
  2629. +ZCACHE_SYSFS_RO(compress_poor);
  2630. +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
  2631. +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
  2632. +ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
  2633. +ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
  2634. +ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
  2635. + zbud_show_unbuddied_list_counts);
  2636. +ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
  2637. + zbud_show_cumul_chunk_counts);
  2638. +
  2639. +static struct attribute *zcache_attrs[] = {
  2640. + &zcache_curr_obj_count_attr.attr,
  2641. + &zcache_curr_obj_count_max_attr.attr,
  2642. + &zcache_curr_objnode_count_attr.attr,
  2643. + &zcache_curr_objnode_count_max_attr.attr,
  2644. + &zcache_flush_total_attr.attr,
  2645. + &zcache_flobj_total_attr.attr,
  2646. + &zcache_flush_found_attr.attr,
  2647. + &zcache_flobj_found_attr.attr,
  2648. + &zcache_failed_eph_puts_attr.attr,
  2649. + &zcache_failed_pers_puts_attr.attr,
  2650. + &zcache_compress_poor_attr.attr,
  2651. + &zcache_zbud_curr_raw_pages_attr.attr,
  2652. + &zcache_zbud_curr_zpages_attr.attr,
  2653. + &zcache_zbud_curr_zbytes_attr.attr,
  2654. + &zcache_zbud_cumul_zpages_attr.attr,
  2655. + &zcache_zbud_cumul_zbytes_attr.attr,
  2656. + &zcache_zbud_buddied_count_attr.attr,
  2657. + &zcache_zbpg_unused_list_count_attr.attr,
  2658. + &zcache_evicted_raw_pages_attr.attr,
  2659. + &zcache_evicted_unbuddied_pages_attr.attr,
  2660. + &zcache_evicted_buddied_pages_attr.attr,
  2661. + &zcache_failed_get_free_pages_attr.attr,
  2662. + &zcache_failed_alloc_attr.attr,
  2663. + &zcache_put_to_flush_attr.attr,
  2664. + &zcache_aborted_preload_attr.attr,
  2665. + &zcache_aborted_shrink_attr.attr,
  2666. + &zcache_zbud_unbuddied_list_counts_attr.attr,
  2667. + &zcache_zbud_cumul_chunk_counts_attr.attr,
  2668. + NULL,
  2669. +};
  2670. +
  2671. +static struct attribute_group zcache_attr_group = {
  2672. + .attrs = zcache_attrs,
  2673. + .name = "zcache",
  2674. +};
  2675. +
  2676. +#endif /* CONFIG_SYSFS */
  2677. +/*
  2678. + * When zcache is disabled ("frozen"), pools can be created and destroyed,
  2679. + * but all puts (and thus all other operations that require memory allocation)
  2680. + * must fail. If zcache is unfrozen, accepts puts, then frozen again,
  2681. + * data consistency requires all puts while frozen to be converted into
  2682. + * flushes.
  2683. + */
  2684. +static bool zcache_freeze;
  2685. +
  2686. +/*
  2687. + * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
  2688. + */
  2689. +static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
  2690. +{
  2691. + int ret = -1;
  2692. +
  2693. + if (nr >= 0) {
  2694. + if (!(gfp_mask & __GFP_FS))
  2695. + /* does this case really need to be skipped? */
  2696. + goto out;
  2697. + if (spin_trylock(&zcache_direct_reclaim_lock)) {
  2698. + zbud_evict_pages(nr);
  2699. + spin_unlock(&zcache_direct_reclaim_lock);
  2700. + } else
  2701. + zcache_aborted_shrink++;
  2702. + }
  2703. + ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
  2704. +out:
  2705. + return ret;
  2706. +}
  2707. +
  2708. +static struct shrinker zcache_shrinker = {
  2709. + .shrink = shrink_zcache_memory,
  2710. + .seeks = DEFAULT_SEEKS,
  2711. +};
  2712. +
  2713. +/*
  2714. + * zcache shims between cleancache/frontswap ops and tmem
  2715. + */
  2716. +
  2717. +static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
  2718. + uint32_t index, struct page *page)
  2719. +{
  2720. + struct tmem_pool *pool;
  2721. + int ret = -1;
  2722. +
  2723. + BUG_ON(!irqs_disabled());
  2724. + pool = zcache_get_pool_by_id(pool_id);
  2725. + if (unlikely(pool == NULL))
  2726. + goto out;
  2727. + if (!zcache_freeze && zcache_do_preload(pool) == 0) {
  2728. + /* preload does preempt_disable on success */
  2729. + ret = tmem_put(pool, oidp, index, page);
  2730. + if (ret < 0) {
  2731. + if (is_ephemeral(pool))
  2732. + zcache_failed_eph_puts++;
  2733. + else
  2734. + zcache_failed_pers_puts++;
  2735. + }
  2736. + zcache_put_pool(pool);
  2737. + preempt_enable_no_resched();
  2738. + } else {
  2739. + zcache_put_to_flush++;
  2740. + if (atomic_read(&pool->obj_count) > 0)
  2741. + /* the put fails whether the flush succeeds or not */
  2742. + (void)tmem_flush_page(pool, oidp, index);
  2743. + zcache_put_pool(pool);
  2744. + }
  2745. +out:
  2746. + return ret;
  2747. +}
  2748. +
  2749. +static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
  2750. + uint32_t index, struct page *page)
  2751. +{
  2752. + struct tmem_pool *pool;
  2753. + int ret = -1;
  2754. + unsigned long flags;
  2755. +
  2756. + local_irq_save(flags);
  2757. + pool = zcache_get_pool_by_id(pool_id);
  2758. + if (likely(pool != NULL)) {
  2759. + if (atomic_read(&pool->obj_count) > 0)
  2760. + ret = tmem_get(pool, oidp, index, page);
  2761. + zcache_put_pool(pool);
  2762. + }
  2763. + local_irq_restore(flags);
  2764. + return ret;
  2765. +}
  2766. +
  2767. +static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
  2768. +{
  2769. + struct tmem_pool *pool;
  2770. + int ret = -1;
  2771. + unsigned long flags;
  2772. +
  2773. + local_irq_save(flags);
  2774. + zcache_flush_total++;
  2775. + pool = zcache_get_pool_by_id(pool_id);
  2776. + if (likely(pool != NULL)) {
  2777. + if (atomic_read(&pool->obj_count) > 0)
  2778. + ret = tmem_flush_page(pool, oidp, index);
  2779. + zcache_put_pool(pool);
  2780. + }
  2781. + if (ret >= 0)
  2782. + zcache_flush_found++;
  2783. + local_irq_restore(flags);
  2784. + return ret;
  2785. +}
  2786. +
  2787. +static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
  2788. +{
  2789. + struct tmem_pool *pool;
  2790. + int ret = -1;
  2791. + unsigned long flags;
  2792. +
  2793. + local_irq_save(flags);
  2794. + zcache_flobj_total++;
  2795. + pool = zcache_get_pool_by_id(pool_id);
  2796. + if (likely(pool != NULL)) {
  2797. + if (atomic_read(&pool->obj_count) > 0)
  2798. + ret = tmem_flush_object(pool, oidp);
  2799. + zcache_put_pool(pool);
  2800. + }
  2801. + if (ret >= 0)
  2802. + zcache_flobj_found++;
  2803. + local_irq_restore(flags);
  2804. + return ret;
  2805. +}
  2806. +
  2807. +static int zcache_destroy_pool(int pool_id)
  2808. +{
  2809. + struct tmem_pool *pool = NULL;
  2810. + int ret = -1;
  2811. +
  2812. + if (pool_id < 0)
  2813. + goto out;
  2814. + pool = zcache_client.tmem_pools[pool_id];
  2815. + if (pool == NULL)
  2816. + goto out;
  2817. + zcache_client.tmem_pools[pool_id] = NULL;
  2818. + /* wait for pool activity on other cpus to quiesce */
  2819. + while (atomic_read(&pool->refcount) != 0)
  2820. + ;
  2821. + local_bh_disable();
  2822. + ret = tmem_destroy_pool(pool);
  2823. + local_bh_enable();
  2824. + kfree(pool);
  2825. + pr_info("zcache: destroyed pool id=%d\n", pool_id);
  2826. +out:
  2827. + return ret;
  2828. +}
  2829. +
  2830. +static int zcache_new_pool(uint32_t flags)
  2831. +{
  2832. + int poolid = -1;
  2833. + struct tmem_pool *pool;
  2834. +
  2835. + pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
  2836. + if (pool == NULL) {
  2837. + pr_info("zcache: pool creation failed: out of memory\n");
  2838. + goto out;
  2839. + }
  2840. +
  2841. + for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
  2842. + if (zcache_client.tmem_pools[poolid] == NULL)
  2843. + break;
  2844. + if (poolid >= MAX_POOLS_PER_CLIENT) {
  2845. + pr_info("zcache: pool creation failed: max exceeded\n");
  2846. + kfree(pool);
  2847. + poolid = -1;
  2848. + goto out;
  2849. + }
  2850. + atomic_set(&pool->refcount, 0);
  2851. + pool->client = &zcache_client;
  2852. + pool->pool_id = poolid;
  2853. + tmem_new_pool(pool, flags);
  2854. + zcache_client.tmem_pools[poolid] = pool;
  2855. + pr_info("zcache: created %s tmem pool, id=%d\n",
  2856. + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
  2857. + poolid);
  2858. +out:
  2859. + return poolid;
  2860. +}
  2861. +
  2862. +/**********
  2863. + * Two kernel functionalities can currently be layered on top of tmem:
  2864. + * "cleancache", which is used as a second-chance cache for clean
  2865. + * page cache pages, and "frontswap", which is used for swap pages
  2866. + * to avoid writes to disk.  A generic "shim" is provided here for each
  2867. + * to translate in-kernel semantics to zcache semantics.
  2868. + */
  2869. +
  2870. +#ifdef CONFIG_CLEANCACHE
  2871. +static void zcache_cleancache_put_page(int pool_id,
  2872. + struct cleancache_filekey key,
  2873. + pgoff_t index, struct page *page)
  2874. +{
  2875. + u32 ind = (u32) index;
  2876. + struct tmem_oid oid = *(struct tmem_oid *)&key;
  2877. +
  2878. + if (likely(ind == index))
  2879. + (void)zcache_put_page(pool_id, &oid, index, page);
  2880. +}
  2881. +
  2882. +static int zcache_cleancache_get_page(int pool_id,
  2883. + struct cleancache_filekey key,
  2884. + pgoff_t index, struct page *page)
  2885. +{
  2886. + u32 ind = (u32) index;
  2887. + struct tmem_oid oid = *(struct tmem_oid *)&key;
  2888. + int ret = -1;
  2889. +
  2890. + if (likely(ind == index))
  2891. + ret = zcache_get_page(pool_id, &oid, index, page);
  2892. + return ret;
  2893. +}
  2894. +
  2895. +static void zcache_cleancache_flush_page(int pool_id,
  2896. + struct cleancache_filekey key,
  2897. + pgoff_t index)
  2898. +{
  2899. + u32 ind = (u32) index;
  2900. + struct tmem_oid oid = *(struct tmem_oid *)&key;
  2901. +
  2902. + if (likely(ind == index))
  2903. + (void)zcache_flush_page(pool_id, &oid, ind);
  2904. +}
  2905. +
  2906. +static void zcache_cleancache_flush_inode(int pool_id,
  2907. + struct cleancache_filekey key)
  2908. +{
  2909. + struct tmem_oid oid = *(struct tmem_oid *)&key;
  2910. +
  2911. + (void)zcache_flush_object(pool_id, &oid);
  2912. +}
  2913. +
  2914. +static void zcache_cleancache_flush_fs(int pool_id)
  2915. +{
  2916. + if (pool_id >= 0)
  2917. + (void)zcache_destroy_pool(pool_id);
  2918. +}
  2919. +
  2920. +static int zcache_cleancache_init_fs(size_t pagesize)
  2921. +{
  2922. + BUG_ON(sizeof(struct cleancache_filekey) !=
  2923. + sizeof(struct tmem_oid));
  2924. + BUG_ON(pagesize != PAGE_SIZE);
  2925. + return zcache_new_pool(0);
  2926. +}
  2927. +
  2928. +static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
  2929. +{
  2930. + /* shared pools are unsupported and map to private */
  2931. + BUG_ON(sizeof(struct cleancache_filekey) !=
  2932. + sizeof(struct tmem_oid));
  2933. + BUG_ON(pagesize != PAGE_SIZE);
  2934. + return zcache_new_pool(0);
  2935. +}
  2936. +
  2937. +static struct cleancache_ops zcache_cleancache_ops = {
  2938. + .put_page = zcache_cleancache_put_page,
  2939. + .get_page = zcache_cleancache_get_page,
  2940. + .flush_page = zcache_cleancache_flush_page,
  2941. + .flush_inode = zcache_cleancache_flush_inode,
  2942. + .flush_fs = zcache_cleancache_flush_fs,
  2943. + .init_shared_fs = zcache_cleancache_init_shared_fs,
  2944. + .init_fs = zcache_cleancache_init_fs
  2945. +};
  2946. +
  2947. +struct cleancache_ops zcache_cleancache_register_ops(void)
  2948. +{
  2949. + struct cleancache_ops old_ops =
  2950. + cleancache_register_ops(&zcache_cleancache_ops);
  2951. +
  2952. + return old_ops;
  2953. +}
  2954. +#endif
  2955. +
  2956. +#ifdef CONFIG_FRONTSWAP
  2957. +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
  2958. +static int zcache_frontswap_poolid = -1;
  2959. +
  2960. +/*
  2961. + * Swizzling increases objects per swaptype, increasing tmem concurrency
  2962. + * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
  2963. + */
  2964. +#define SWIZ_BITS 4
  2965. +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
  2966. +#define _oswiz(_type, _ind) (((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
  2967. +#define iswiz(_ind) ((_ind) >> SWIZ_BITS)
  2968. +
  2969. +static inline struct tmem_oid oswiz(unsigned type, u32 ind)
  2970. +{
  2971. + struct tmem_oid oid = { .oid = { 0 } };
  2972. + oid.oid[0] = _oswiz(type, ind);
  2973. + return oid;
  2974. +}
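A worked example of the swizzle (values chosen purely for illustration): with SWIZ_BITS = 4, a frontswap page of type 1 at offset 0x12345 is stored under oid.oid[0] = (1 << 4) | (0x12345 & 0xf) = 0x15 at tmem index 0x12345 >> 4 = 0x1234, so the low four offset bits spread consecutive pages across 16 tmem objects per swap type. The same arithmetic as a standalone userspace check:

    /* Illustrative sanity check of the swizzle math; not part of the patch. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            unsigned type = 1;
            uint32_t ind = 0x12345;

            assert(((type << 4) | (ind & 0xf)) == 0x15);  /* _oswiz() */
            assert((ind >> 4) == 0x1234);                 /* iswiz()  */
            return 0;
    }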
  2975. +
  2976. +static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
  2977. + struct page *page)
  2978. +{
  2979. + u64 ind64 = (u64)offset;
  2980. + u32 ind = (u32)offset;
  2981. + struct tmem_oid oid = oswiz(type, ind);
  2982. + int ret = -1;
  2983. + unsigned long flags;
  2984. +
  2985. + BUG_ON(!PageLocked(page));
  2986. + if (likely(ind64 == ind)) {
  2987. + local_irq_save(flags);
  2988. + ret = zcache_put_page(zcache_frontswap_poolid, &oid,
  2989. + iswiz(ind), page);
  2990. + local_irq_restore(flags);
  2991. + }
  2992. + return ret;
  2993. +}
  2994. +
  2995. +/* returns 0 if the page was successfully gotten from frontswap, -1 if
  2996. + * it was not present (should never happen!) */
  2997. +static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
  2998. + struct page *page)
  2999. +{
  3000. + u64 ind64 = (u64)offset;
  3001. + u32 ind = (u32)offset;
  3002. + struct tmem_oid oid = oswiz(type, ind);
  3003. + int ret = -1;
  3004. +
  3005. + BUG_ON(!PageLocked(page));
  3006. + if (likely(ind64 == ind))
  3007. + ret = zcache_get_page(zcache_frontswap_poolid, &oid,
  3008. + iswiz(ind), page);
  3009. + return ret;
  3010. +}
  3011. +
  3012. +/* flush a single page from frontswap */
  3013. +static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
  3014. +{
  3015. + u64 ind64 = (u64)offset;
  3016. + u32 ind = (u32)offset;
  3017. + struct tmem_oid oid = oswiz(type, ind);
  3018. +
  3019. + if (likely(ind64 == ind))
  3020. + (void)zcache_flush_page(zcache_frontswap_poolid, &oid,
  3021. + iswiz(ind));
  3022. +}
  3023. +
  3024. +/* flush all pages from the passed swaptype */
  3025. +static void zcache_frontswap_flush_area(unsigned type)
  3026. +{
  3027. + struct tmem_oid oid;
  3028. + int ind;
  3029. +
  3030. + for (ind = SWIZ_MASK; ind >= 0; ind--) {
  3031. + oid = oswiz(type, ind);
  3032. + (void)zcache_flush_object(zcache_frontswap_poolid, &oid);
  3033. + }
  3034. +}
  3035. +
  3036. +static void zcache_frontswap_init(unsigned ignored)
  3037. +{
  3038. + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
  3039. + if (zcache_frontswap_poolid < 0)
  3040. + zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
  3041. +}
  3042. +
  3043. +static struct frontswap_ops zcache_frontswap_ops = {
  3044. + .put_page = zcache_frontswap_put_page,
  3045. + .get_page = zcache_frontswap_get_page,
  3046. + .flush_page = zcache_frontswap_flush_page,
  3047. + .flush_area = zcache_frontswap_flush_area,
  3048. + .init = zcache_frontswap_init
  3049. +};
  3050. +
  3051. +struct frontswap_ops zcache_frontswap_register_ops(void)
  3052. +{
  3053. + struct frontswap_ops old_ops =
  3054. + frontswap_register_ops(&zcache_frontswap_ops);
  3055. +
  3056. + return old_ops;
  3057. +}
  3058. +#endif
  3059. +
  3060. +/*
  3061. + * zcache initialization
  3062. + * NOTE: FOR NOW, zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
  3063. + * NOTHING HAPPENS!
  3064. + */
  3065. +
  3066. +static int zcache_enabled;
  3067. +
  3068. +static int __init enable_zcache(char *s)
  3069. +{
  3070. + zcache_enabled = 1;
  3071. + return 1;
  3072. +}
  3073. +__setup("zcache", enable_zcache);
  3074. +
  3075. +/* allow independent boot-time disabling of cleancache and frontswap */
  3076. +
  3077. +static int use_cleancache = 1;
  3078. +
  3079. +static int __init no_cleancache(char *s)
  3080. +{
  3081. + use_cleancache = 0;
  3082. + return 1;
  3083. +}
  3084. +
  3085. +__setup("nocleancache", no_cleancache);
  3086. +
  3087. +static int use_frontswap = 1;
  3088. +
  3089. +static int __init no_frontswap(char *s)
  3090. +{
  3091. + use_frontswap = 0;
  3092. + return 1;
  3093. +}
  3094. +
  3095. +__setup("nofrontswap", no_frontswap);
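Taken together, the __setup() handlers above mean the feature stays inert unless "zcache" appears on the kernel command line, and cleancache or frontswap can then be opted out individually. An illustrative boot entry (paths are examples only):

    kernel /boot/vmlinuz-2.6.37-zcache root=/dev/sda1 zcache nofrontswap

which enables zcache with the cleancache shim while leaving swap pages on disk.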
  3096. +
  3097. +static int __init zcache_init(void)
  3098. +{
  3099. + int ret = 0;
  3100. +
  3101. +#ifdef CONFIG_SYSFS
  3102. + ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
  3103. + if (ret) {
  3104. + pr_err("zcache: can't create sysfs\n");
  3105. + goto out;
  3106. + }
  3107. +#endif /* CONFIG_SYSFS */
  3108. +#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
  3109. + if (zcache_enabled) {
  3110. + unsigned int cpu;
  3111. +
  3112. + tmem_register_hostops(&zcache_hostops);
  3113. + tmem_register_pamops(&zcache_pamops);
  3114. + ret = register_cpu_notifier(&zcache_cpu_notifier_block);
  3115. + if (ret) {
  3116. + pr_err("zcache: can't register cpu notifier\n");
  3117. + goto out;
  3118. + }
  3119. + for_each_online_cpu(cpu) {
  3120. + void *pcpu = (void *)(long)cpu;
  3121. + zcache_cpu_notifier(&zcache_cpu_notifier_block,
  3122. + CPU_UP_PREPARE, pcpu);
  3123. + }
  3124. + }
  3125. + zcache_objnode_cache = kmem_cache_create("zcache_objnode",
  3126. + sizeof(struct tmem_objnode), 0, 0, NULL);
  3127. + zcache_obj_cache = kmem_cache_create("zcache_obj",
  3128. + sizeof(struct tmem_obj), 0, 0, NULL);
  3129. +#endif
  3130. +#ifdef CONFIG_CLEANCACHE
  3131. + if (zcache_enabled && use_cleancache) {
  3132. + struct cleancache_ops old_ops;
  3133. +
  3134. + zbud_init();
  3135. + register_shrinker(&zcache_shrinker);
  3136. + old_ops = zcache_cleancache_register_ops();
  3137. + pr_info("zcache: cleancache enabled using kernel "
  3138. + "transcendent memory and compression buddies\n");
  3139. + if (old_ops.init_fs != NULL)
  3140. + pr_warning("zcache: cleancache_ops overridden\n");
  3141. + }
  3142. +#endif
  3143. +#ifdef CONFIG_FRONTSWAP
  3144. + if (zcache_enabled && use_frontswap) {
  3145. + struct frontswap_ops old_ops;
  3146. +
  3147. + zcache_client.xvpool = xv_create_pool();
  3148. + if (zcache_client.xvpool == NULL) {
  3149. + pr_err("zcache: can't create xvpool\n");
  3150. + goto out;
  3151. + }
  3152. + old_ops = zcache_frontswap_register_ops();
  3153. + pr_info("zcache: frontswap enabled using kernel "
  3154. + "transcendent memory and xvmalloc\n");
  3155. + if (old_ops.init != NULL)
  3156. + pr_warning("zcache: frontswap_ops overridden\n");
  3157. + }
  3158. +#endif
  3159. +out:
  3160. + return ret;
  3161. +}
  3162. +
  3163. +module_init(zcache_init)
  3164. diff -Nrupad linux-2.6.37//drivers/staging/zram/Kconfig linux-2.6.37_vanilla//drivers/staging/zram/Kconfig
  3165. --- linux-2.6.37//drivers/staging/zram/Kconfig 2011-01-05 01:50:19.000000000 +0100
  3166. +++ linux-2.6.37_vanilla//drivers/staging/zram/Kconfig 2011-02-14 01:22:46.470793204 +0100
  3167. @@ -15,3 +15,11 @@ config ZRAM
  3168.  
  3169. See zram.txt for more information.
  3170. Project home: http://compcache.googlecode.com/
  3171. +
  3172. +config ZRAM_DEBUG
  3173. + bool "Compressed RAM block device debug support"
  3174. + depends on ZRAM
  3175. + default n
  3176. + help
  3177. + This option adds additional debugging code to the compressed
  3178. + RAM block device driver.
  3179. diff -Nrupad linux-2.6.37//drivers/staging/zram/xvmalloc.c linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc.c
  3180. --- linux-2.6.37//drivers/staging/zram/xvmalloc.c 2011-01-05 01:50:19.000000000 +0100
  3181. +++ linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc.c 2011-02-14 01:24:56.564792988 +0100
  3182. @@ -10,6 +10,10 @@
  3183. * Released under the terms of GNU General Public License Version 2.0
  3184. */
  3185.  
  3186. +#ifdef CONFIG_ZRAM_DEBUG
  3187. +#define DEBUG
  3188. +#endif
  3189. +
  3190. #include <linux/bitops.h>
  3191. #include <linux/errno.h>
  3192. #include <linux/highmem.h>
  3193. @@ -187,7 +191,7 @@ static void insert_block(struct xv_pool
  3194. slindex = get_index_for_insert(block->size);
  3195. flindex = slindex / BITS_PER_LONG;
  3196.  
  3197. - block->link.prev_page = 0;
  3198. + block->link.prev_page = NULL;
  3199. block->link.prev_offset = 0;
  3200. block->link.next_page = pool->freelist[slindex].page;
  3201. block->link.next_offset = pool->freelist[slindex].offset;
  3202. @@ -200,6 +204,8 @@ static void insert_block(struct xv_pool
  3203. nextblock->link.prev_page = page;
  3204. nextblock->link.prev_offset = offset;
  3205. put_ptr_atomic(nextblock, KM_USER1);
  3206. + /* If there was a next page then the free bits are set. */
  3207. + return;
  3208. }
  3209.  
  3210. __set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
  3211. @@ -207,54 +213,14 @@ static void insert_block(struct xv_pool
  3212. }
  3213.  
  3214. /*
  3215. - * Remove block from head of freelist. Index 'slindex' identifies the freelist.
  3216. - */
  3217. -static void remove_block_head(struct xv_pool *pool,
  3218. - struct block_header *block, u32 slindex)
  3219. -{
  3220. - struct block_header *tmpblock;
  3221. - u32 flindex = slindex / BITS_PER_LONG;
  3222. -
  3223. - pool->freelist[slindex].page = block->link.next_page;
  3224. - pool->freelist[slindex].offset = block->link.next_offset;
  3225. - block->link.prev_page = 0;
  3226. - block->link.prev_offset = 0;
  3227. -
  3228. - if (!pool->freelist[slindex].page) {
  3229. - __clear_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
  3230. - if (!pool->slbitmap[flindex])
  3231. - __clear_bit(flindex, &pool->flbitmap);
  3232. - } else {
  3233. - /*
  3234. - * DEBUG ONLY: We need not reinitialize freelist head previous
  3235. - * pointer to 0 - we never depend on its value. But just for
  3236. - * sanity, lets do it.
  3237. - */
  3238. - tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
  3239. - pool->freelist[slindex].offset, KM_USER1);
  3240. - tmpblock->link.prev_page = 0;
  3241. - tmpblock->link.prev_offset = 0;
  3242. - put_ptr_atomic(tmpblock, KM_USER1);
  3243. - }
  3244. -}
  3245. -
  3246. -/*
  3247. * Remove block from freelist. Index 'slindex' identifies the freelist.
  3248. */
  3249. static void remove_block(struct xv_pool *pool, struct page *page, u32 offset,
  3250. struct block_header *block, u32 slindex)
  3251. {
  3252. - u32 flindex;
  3253. + u32 flindex = slindex / BITS_PER_LONG;
  3254. struct block_header *tmpblock;
  3255.  
  3256. - if (pool->freelist[slindex].page == page
  3257. - && pool->freelist[slindex].offset == offset) {
  3258. - remove_block_head(pool, block, slindex);
  3259. - return;
  3260. - }
  3261. -
  3262. - flindex = slindex / BITS_PER_LONG;
  3263. -
  3264. if (block->link.prev_page) {
  3265. tmpblock = get_ptr_atomic(block->link.prev_page,
  3266. block->link.prev_offset, KM_USER1);
  3267. @@ -270,6 +236,35 @@ static void remove_block(struct xv_pool
  3268. tmpblock->link.prev_offset = block->link.prev_offset;
  3269. put_ptr_atomic(tmpblock, KM_USER1);
  3270. }
  3271. +
  3272. + /* Is this block at the head of the freelist? */
  3273. + if (pool->freelist[slindex].page == page
  3274. + && pool->freelist[slindex].offset == offset) {
  3275. +
  3276. + pool->freelist[slindex].page = block->link.next_page;
  3277. + pool->freelist[slindex].offset = block->link.next_offset;
  3278. +
  3279. + if (pool->freelist[slindex].page) {
  3280. + struct block_header *tmpblock;
  3281. + tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
  3282. + pool->freelist[slindex].offset,
  3283. + KM_USER1);
  3284. + tmpblock->link.prev_page = NULL;
  3285. + tmpblock->link.prev_offset = 0;
  3286. + put_ptr_atomic(tmpblock, KM_USER1);
  3287. + } else {
  3288. + /* This freelist bucket is empty */
  3289. + __clear_bit(slindex % BITS_PER_LONG,
  3290. + &pool->slbitmap[flindex]);
  3291. + if (!pool->slbitmap[flindex])
  3292. + __clear_bit(flindex, &pool->flbitmap);
  3293. + }
  3294. + }
  3295. +
  3296. + block->link.prev_page = NULL;
  3297. + block->link.prev_offset = 0;
  3298. + block->link.next_page = NULL;
  3299. + block->link.next_offset = 0;
  3300. }
  3301.  
  3302. /*
  3303. @@ -378,7 +373,7 @@ int xv_malloc(struct xv_pool *pool, u32
  3304.  
  3305. block = get_ptr_atomic(*page, *offset, KM_USER0);
  3306.  
  3307. - remove_block_head(pool, block, index);
  3308. + remove_block(pool, *page, *offset, block, index);
  3309.  
  3310. /* Split the block if required */
  3311. tmpoffset = *offset + size + XV_ALIGN;
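Net effect of the two hunks above: the special-cased remove_block_head() is folded into remove_block(), which now unlinks the block from its neighbours first, then fixes up the freelist head (clearing the size-class bitmap bits when a bucket empties), and finally clears the removed block's own link fields so no stale page pointers survive; xv_malloc() correspondingly goes through the single remove_block() path.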
  3312. diff -Nrupad linux-2.6.37//drivers/staging/zram/xvmalloc_int.h linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc_int.h
  3313. --- linux-2.6.37//drivers/staging/zram/xvmalloc_int.h 2011-01-05 01:50:19.000000000 +0100
  3314. +++ linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc_int.h 2011-02-14 01:22:50.990793071 +0100
  3315. @@ -19,7 +19,11 @@
  3316. /* User configurable params */
  3317.  
  3318. /* Must be power of two */
  3319. +#ifdef CONFIG_64BIT
  3320. +#define XV_ALIGN_SHIFT 3
  3321. +#else
  3322. #define XV_ALIGN_SHIFT 2
  3323. +#endif
  3324. #define XV_ALIGN (1 << XV_ALIGN_SHIFT)
  3325. #define XV_ALIGN_MASK (XV_ALIGN - 1)
  3326.  
  3327. @@ -27,8 +31,16 @@
  3328. #define XV_MIN_ALLOC_SIZE 32
  3329. #define XV_MAX_ALLOC_SIZE (PAGE_SIZE - XV_ALIGN)
  3330.  
  3331. -/* Free lists are separated by FL_DELTA bytes */
  3332. -#define FL_DELTA_SHIFT 3
  3333. +/*
  3334. + * Free lists are separated by FL_DELTA bytes.
  3335. + * The shift is 3 for 4k pages and 4 for 64k pages; for any
  3336. + * other page size, a conservative (PAGE_SHIFT - 9) is used.
  3337. + */
  3338. +#if PAGE_SHIFT == 16
  3339. +#define FL_DELTA_SHIFT 4
  3340. +#else
  3341. +#define FL_DELTA_SHIFT (PAGE_SHIFT - 9)
  3342. +#endif
  3343. #define FL_DELTA (1 << FL_DELTA_SHIFT)
  3344. #define FL_DELTA_MASK (FL_DELTA - 1)
  3345. #define NUM_FREE_LISTS ((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \
  3346. @@ -75,12 +87,9 @@ struct block_header {
  3347. struct xv_pool {
  3348. ulong flbitmap;
  3349. ulong slbitmap[MAX_FLI];
  3350. - spinlock_t lock;
  3351. -
  3352. + u64 total_pages; /* stats */
  3353. struct freelist_entry freelist[NUM_FREE_LISTS];
  3354. -
  3355. - /* stats */
  3356. - u64 total_pages;
  3357. + spinlock_t lock;
  3358. };
  3359.  
  3360. #endif
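Working the constants above through for the common case (assuming 4k pages on 64-bit, and the usual "/ FL_DELTA + 1" continuation of the NUM_FREE_LISTS macro): XV_ALIGN_SHIFT = 3 gives XV_ALIGN = 8 and XV_MAX_ALLOC_SIZE = 4096 - 8 = 4088; FL_DELTA_SHIFT = PAGE_SHIFT - 9 = 3 gives FL_DELTA = 8; so NUM_FREE_LISTS = (4088 - 32) / 8 + 1 = 508 size classes spaced 8 bytes apart. The 64k special case pins the spacing at 16 bytes rather than the 128 bytes the generic formula would produce.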
  3361. diff -Nrupad linux-2.6.37//drivers/staging/zram/zram_drv.c linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.c
  3362. --- linux-2.6.37//drivers/staging/zram/zram_drv.c 2011-01-05 01:50:19.000000000 +0100
  3363. +++ linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.c 2011-02-14 01:24:29.924793006 +0100
  3364. @@ -15,6 +15,10 @@
  3365. #define KMSG_COMPONENT "zram"
  3366. #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  3367.  
  3368. +#ifdef CONFIG_ZRAM_DEBUG
  3369. +#define DEBUG
  3370. +#endif
  3371. +
  3372. #include <linux/module.h>
  3373. #include <linux/kernel.h>
  3374. #include <linux/bio.h>
  3375. @@ -227,6 +231,7 @@ static int zram_read(struct zram *zram,
  3376.  
  3377. if (zram_test_flag(zram, index, ZRAM_ZERO)) {
  3378. handle_zero_page(page);
  3379. + index++;
  3380. continue;
  3381. }
  3382.  
  3383. @@ -234,13 +239,15 @@ static int zram_read(struct zram *zram,
  3384. if (unlikely(!zram->table[index].page)) {
  3385. pr_debug("Read before write: sector=%lu, size=%u",
  3386. (ulong)(bio->bi_sector), bio->bi_size);
  3387. - /* Do nothing */
  3388. + handle_zero_page(page);
  3389. + index++;
  3390. continue;
  3391. }
  3392.  
  3393. /* Page is stored uncompressed since it's incompressible */
  3394. if (unlikely(zram_test_flag(zram, index, ZRAM_UNCOMPRESSED))) {
  3395. handle_uncompressed_page(zram, page, index);
  3396. + index++;
  3397. continue;
  3398. }
  3399.  
  3400. @@ -320,6 +327,7 @@ static int zram_write(struct zram *zram,
  3401. mutex_unlock(&zram->lock);
  3402. zram_stat_inc(&zram->stats.pages_zero);
  3403. zram_set_flag(zram, index, ZRAM_ZERO);
  3404. + index++;
  3405. continue;
  3406. }
  3407.  
  3408. @@ -621,7 +629,8 @@ static int create_device(struct zram *zr
  3409. * and n*PAGE_SIZED sized I/O requests.
  3410. */
  3411. blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
  3412. - blk_queue_logical_block_size(zram->disk->queue, PAGE_SIZE);
  3413. + blk_queue_logical_block_size(zram->disk->queue,
  3414. + ZRAM_LOGICAL_BLOCK_SIZE);
  3415. blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
  3416. blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
  3417.  
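Rationale for the logical-block-size hunk above (inferred from the surrounding queue setup, not stated in the patch): the logical block size is what filesystems and partitioning tools must align to, so pinning it at 4096 keeps a zram device usable on architectures with larger pages (e.g. 64k), while the physical-block/io_min/io_opt hints continue to advertise PAGE_SIZE I/O as optimal.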
  3418. diff -Nrupad linux-2.6.37//drivers/staging/zram/zram_drv.h linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.h
  3419. --- linux-2.6.37//drivers/staging/zram/zram_drv.h 2011-01-05 01:50:19.000000000 +0100
  3420. +++ linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.h 2011-02-14 01:22:38.055793098 +0100
  3421. @@ -61,6 +61,7 @@ static const unsigned max_zpage_size = P
  3422. #define SECTOR_SIZE (1 << SECTOR_SHIFT)
  3423. #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
  3424. #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT)
  3425. +#define ZRAM_LOGICAL_BLOCK_SIZE 4096
  3426.  
  3427. /* Flags for zram pages (table[page_no].flags) */
  3428. enum zram_pageflags {
  3429. diff -Nrupad linux-2.6.37//fs/btrfs/extent_io.c linux-2.6.37_vanilla//fs/btrfs/extent_io.c
  3430. --- linux-2.6.37//fs/btrfs/extent_io.c 2011-01-05 01:50:19.000000000 +0100
  3431. +++ linux-2.6.37_vanilla//fs/btrfs/extent_io.c 2011-02-14 01:21:43.164793068 +0100
  3432. @@ -10,6 +10,7 @@
  3433. #include <linux/swap.h>
  3434. #include <linux/writeback.h>
  3435. #include <linux/pagevec.h>
  3436. +#include <linux/cleancache.h>
  3437. #include "extent_io.h"
  3438. #include "extent_map.h"
  3439. #include "compat.h"
  3440. @@ -1981,6 +1982,13 @@ static int __extent_read_full_page(struc
  3441.  
  3442. set_page_extent_mapped(page);
  3443.  
  3444. + if (!PageUptodate(page)) {
  3445. + if (cleancache_get_page(page) == 0) {
  3446. + BUG_ON(blocksize != PAGE_SIZE);
  3447. + goto out;
  3448. + }
  3449. + }
  3450. +
  3451. end = page_end;
  3452. while (1) {
  3453. lock_extent(tree, start, end, GFP_NOFS);
  3454. @@ -2105,6 +2113,7 @@ static int __extent_read_full_page(struc
  3455. cur = cur + iosize;
  3456. page_offset += iosize;
  3457. }
  3458. +out:
  3459. if (!nr) {
  3460. if (!PageError(page))
  3461. SetPageUptodate(page);
  3462. diff -Nrupad linux-2.6.37//fs/btrfs/super.c linux-2.6.37_vanilla//fs/btrfs/super.c
  3463. --- linux-2.6.37//fs/btrfs/super.c 2011-01-05 01:50:19.000000000 +0100
  3464. +++ linux-2.6.37_vanilla//fs/btrfs/super.c 2011-02-14 01:21:43.164793068 +0100
  3465. @@ -39,6 +39,7 @@
  3466. #include <linux/miscdevice.h>
  3467. #include <linux/magic.h>
  3468. #include <linux/slab.h>
  3469. +#include <linux/cleancache.h>
  3470. #include "compat.h"
  3471. #include "ctree.h"
  3472. #include "disk-io.h"
  3473. @@ -494,6 +495,7 @@ static int btrfs_fill_super(struct super
  3474. sb->s_root = root_dentry;
  3475.  
  3476. save_mount_options(sb, data);
  3477. + cleancache_init_fs(sb);
  3478. return 0;
  3479.  
  3480. fail_close:
  3481. diff -Nrupad linux-2.6.37//fs/buffer.c linux-2.6.37_vanilla//fs/buffer.c
  3482. --- linux-2.6.37//fs/buffer.c 2011-01-05 01:50:19.000000000 +0100
  3483. +++ linux-2.6.37_vanilla//fs/buffer.c 2011-02-14 01:21:43.165793086 +0100
  3484. @@ -41,6 +41,7 @@
  3485. #include <linux/bitops.h>
  3486. #include <linux/mpage.h>
  3487. #include <linux/bit_spinlock.h>
  3488. +#include <linux/cleancache.h>
  3489.  
  3490. static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  3491.  
  3492. @@ -277,6 +278,10 @@ void invalidate_bdev(struct block_device
  3493. invalidate_bh_lrus();
  3494. lru_add_drain_all(); /* make sure all lru add caches are flushed */
  3495. invalidate_mapping_pages(mapping, 0, -1);
  3496. + /* 99% of the time, we don't need to flush the cleancache on the bdev.
  3498. + * But, for the strange corners, let's be cautious.
  3498. + */
  3499. + cleancache_flush_inode(mapping);
  3500. }
  3501. EXPORT_SYMBOL(invalidate_bdev);
  3502.  
  3503. diff -Nrupad linux-2.6.37//fs/ext3/super.c linux-2.6.37_vanilla//fs/ext3/super.c
  3504. --- linux-2.6.37//fs/ext3/super.c 2011-01-05 01:50:19.000000000 +0100
  3505. +++ linux-2.6.37_vanilla//fs/ext3/super.c 2011-02-14 01:21:43.166793102 +0100
  3506. @@ -36,6 +36,7 @@
  3507. #include <linux/quotaops.h>
  3508. #include <linux/seq_file.h>
  3509. #include <linux/log2.h>
  3510. +#include <linux/cleancache.h>
  3511.  
  3512. #include <asm/uaccess.h>
  3513.  
  3514. @@ -1343,6 +1344,7 @@ static int ext3_setup_super(struct super
  3515. } else {
  3516. ext3_msg(sb, KERN_INFO, "using internal journal");
  3517. }
  3518. + cleancache_init_fs(sb);
  3519. return res;
  3520. }
  3521.  
  3522. diff -Nrupad linux-2.6.37//fs/ext4/super.c linux-2.6.37_vanilla//fs/ext4/super.c
  3523. --- linux-2.6.37//fs/ext4/super.c 2011-01-05 01:50:19.000000000 +0100
  3524. +++ linux-2.6.37_vanilla//fs/ext4/super.c 2011-02-14 01:21:43.168793127 +0100
  3525. @@ -38,6 +38,7 @@
  3526. #include <linux/ctype.h>
  3527. #include <linux/log2.h>
  3528. #include <linux/crc16.h>
  3529. +#include <linux/cleancache.h>
  3530. #include <asm/uaccess.h>
  3531.  
  3532. #include <linux/kthread.h>
  3533. @@ -1902,6 +1903,7 @@ static int ext4_setup_super(struct super
  3534. EXT4_INODES_PER_GROUP(sb),
  3535. sbi->s_mount_opt);
  3536.  
  3537. + cleancache_init_fs(sb);
  3538. return res;
  3539. }
  3540.  
  3541. diff -Nrupad linux-2.6.37//fs/mpage.c linux-2.6.37_vanilla//fs/mpage.c
  3542. --- linux-2.6.37//fs/mpage.c 2011-01-05 01:50:19.000000000 +0100
  3543. +++ linux-2.6.37_vanilla//fs/mpage.c 2011-02-14 01:21:43.168793127 +0100
  3544. @@ -27,6 +27,7 @@
  3545. #include <linux/writeback.h>
  3546. #include <linux/backing-dev.h>
  3547. #include <linux/pagevec.h>
  3548. +#include <linux/cleancache.h>
  3549.  
  3550. /*
  3551. * I/O completion handler for multipage BIOs.
  3552. @@ -286,6 +287,12 @@ do_mpage_readpage(struct bio *bio, struc
  3553. SetPageMappedToDisk(page);
  3554. }
  3555.  
  3556. + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
  3557. + cleancache_get_page(page) == 0) {
  3558. + SetPageUptodate(page);
  3559. + goto confused;
  3560. + }
  3561. +
  3562. /*
  3563. * This page will go to BIO. Do we need to send this BIO off first?
  3564. */
  3565. diff -Nrupad linux-2.6.37//fs/ocfs2/super.c linux-2.6.37_vanilla//fs/ocfs2/super.c
  3566. --- linux-2.6.37//fs/ocfs2/super.c 2011-01-05 01:50:19.000000000 +0100
  3567. +++ linux-2.6.37_vanilla//fs/ocfs2/super.c 2011-02-14 01:21:43.169793144 +0100
  3568. @@ -41,6 +41,7 @@
  3569. #include <linux/mount.h>
  3570. #include <linux/seq_file.h>
  3571. #include <linux/quotaops.h>
  3572. +#include <linux/cleancache.h>
  3573.  
  3574. #define MLOG_MASK_PREFIX ML_SUPER
  3575. #include <cluster/masklog.h>
  3576. @@ -2366,6 +2367,7 @@ static int ocfs2_initialize_super(struct
  3577. mlog_errno(status);
  3578. goto bail;
  3579. }
  3580. + cleancache_init_shared_fs((char *)&uuid_net_key, sb);
  3581.  
  3582. bail:
  3583. mlog_exit(status);
  3584. diff -Nrupad linux-2.6.37//fs/reiserfs/prints.c linux-2.6.37_vanilla//fs/reiserfs/prints.c
  3585. --- linux-2.6.37//fs/reiserfs/prints.c 2011-01-05 01:50:19.000000000 +0100
  3586. +++ linux-2.6.37_vanilla//fs/reiserfs/prints.c 2011-02-14 01:20:50.468793185 +0100
  3587. @@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh,
  3588. va_list args;
  3589. int mode, first, last;
  3590.  
  3591. - va_start(args, bh);
  3592. -
  3593. if (!bh) {
  3594. printk("print_block: buffer is NULL\n");
  3595. return;
  3596. }
  3597.  
  3598. + va_start(args, bh);
  3599. +
  3600. mode = va_arg(args, int);
  3601. first = va_arg(args, int);
  3602. last = va_arg(args, int);
  3603. diff -Nrupad linux-2.6.37//fs/reiserfs/super.c linux-2.6.37_vanilla//fs/reiserfs/super.c
  3604. --- linux-2.6.37//fs/reiserfs/super.c 2011-01-05 01:50:19.000000000 +0100
  3605. +++ linux-2.6.37_vanilla//fs/reiserfs/super.c 2011-02-14 01:21:07.821793171 +0100
  3606. @@ -237,7 +237,7 @@ static int finish_unfinished(struct supe
  3607. pathrelse(&path);
  3608.  
  3609. inode = reiserfs_iget(s, &obj_key);
  3610. - if (!inode) {
  3611. + if (IS_ERR_OR_NULL(inode)) {
  3612. /* the unlink almost completed, it just did not manage to remove
  3613. "save" link and release objectid */
  3614. reiserfs_warning(s, "vs-2180", "iget failed for %K",
  3615. diff -Nrupad linux-2.6.37//fs/super.c linux-2.6.37_vanilla//fs/super.c
  3616. --- linux-2.6.37//fs/super.c 2011-01-05 01:50:19.000000000 +0100
  3617. +++ linux-2.6.37_vanilla//fs/super.c 2011-02-14 01:21:43.169793144 +0100
  3618. @@ -30,6 +30,7 @@
  3619. #include <linux/idr.h>
  3620. #include <linux/mutex.h>
  3621. #include <linux/backing-dev.h>
  3622. +#include <linux/cleancache.h>
  3623. #include "internal.h"
  3624.  
  3625.  
  3626. @@ -110,6 +111,7 @@ static struct super_block *alloc_super(s
  3627. s->s_maxbytes = MAX_NON_LFS;
  3628. s->s_op = &default_op;
  3629. s->s_time_gran = 1000000000;
  3630. + s->cleancache_poolid = -1;
  3631. }
  3632. out:
  3633. return s;
  3634. @@ -176,6 +178,7 @@ void deactivate_locked_super(struct supe
  3635. struct file_system_type *fs = s->s_type;
  3636. if (atomic_dec_and_test(&s->s_active)) {
  3637. fs->kill_sb(s);
  3638. + cleancache_flush_fs(s);
  3639. put_filesystem(fs);
  3640. put_super(s);
  3641. } else {
  3642. diff -Nrupad linux-2.6.37//include/linux/cleancache.h linux-2.6.37_vanilla//include/linux/cleancache.h
  3643. --- linux-2.6.37//include/linux/cleancache.h 1970-01-01 01:00:00.000000000 +0100
  3644. +++ linux-2.6.37_vanilla//include/linux/cleancache.h 2011-02-14 01:21:43.169793144 +0100
  3645. @@ -0,0 +1,118 @@
  3646. +#ifndef _LINUX_CLEANCACHE_H
  3647. +#define _LINUX_CLEANCACHE_H
  3648. +
  3649. +#include <linux/fs.h>
  3650. +#include <linux/exportfs.h>
  3651. +#include <linux/mm.h>
  3652. +
  3653. +#define CLEANCACHE_KEY_MAX 6
  3654. +
  3655. +/*
  3656. + * cleancache requires every file with a page in cleancache to have a
  3657. + * unique key unless/until the file is removed/truncated. For some
  3658. + * filesystems, the inode number is unique, but for "modern" filesystems
  3659. + * an exportable filehandle is required (see exportfs.h)
  3660. + */
  3661. +struct cleancache_filekey {
  3662. + union {
  3663. + ino_t ino;
  3664. + __u32 fh[CLEANCACHE_KEY_MAX];
  3665. + u32 key[CLEANCACHE_KEY_MAX];
  3666. + } u;
  3667. +};
  3668. +
  3669. +struct cleancache_ops {
  3670. + int (*init_fs)(size_t);
  3671. + int (*init_shared_fs)(char *uuid, size_t);
  3672. + int (*get_page)(int, struct cleancache_filekey,
  3673. + pgoff_t, struct page *);
  3674. + void (*put_page)(int, struct cleancache_filekey,
  3675. + pgoff_t, struct page *);
  3676. + void (*flush_page)(int, struct cleancache_filekey, pgoff_t);
  3677. + void (*flush_inode)(int, struct cleancache_filekey);
  3678. + void (*flush_fs)(int);
  3679. +};
  3680. +
  3681. +extern struct cleancache_ops
  3682. + cleancache_register_ops(struct cleancache_ops *ops);
  3683. +extern void __cleancache_init_fs(struct super_block *);
  3684. +extern void __cleancache_init_shared_fs(char *, struct super_block *);
  3685. +extern int __cleancache_get_page(struct page *);
  3686. +extern void __cleancache_put_page(struct page *);
  3687. +extern void __cleancache_flush_page(struct address_space *, struct page *);
  3688. +extern void __cleancache_flush_inode(struct address_space *);
  3689. +extern void __cleancache_flush_fs(struct super_block *);
  3690. +extern int cleancache_enabled;
  3691. +
  3692. +#ifdef CONFIG_CLEANCACHE
  3693. +#define cleancache_fs_enabled(_page) \
  3694. + ((_page)->mapping->host->i_sb->cleancache_poolid >= 0)
  3695. +#define cleancache_fs_enabled_mapping(_mapping) \
  3696. + ((_mapping)->host->i_sb->cleancache_poolid >= 0)
  3697. +#else
  3698. +#define cleancache_enabled (0)
  3699. +#define cleancache_fs_enabled(_page) (0)
  3700. +#define cleancache_fs_enabled_mapping(_mapping) (0)
  3701. +#endif
  3702. +
  3703. +/*
  3704. + * The shim layer provided by these inline functions allows the compiler
  3705. + * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
  3706. + * is disabled, to a single global variable check if CONFIG_CLEANCACHE
  3707. + * is enabled but no cleancache "backend" has dynamically enabled it,
  3708. + * and, for the most frequent cleancache ops, to a single global variable
  3709. + * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
  3710. + * and a cleancache backend has dynamically enabled cleancache, but the
  3711. + * filesystem referenced by that cleancache op has not enabled cleancache.
  3712. + * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
  3713. + * no measurable performance impact.
  3714. + */
  3715. +
  3716. +static inline void cleancache_init_fs(struct super_block *sb)
  3717. +{
  3718. + if (cleancache_enabled)
  3719. + __cleancache_init_fs(sb);
  3720. +}
  3721. +
  3722. +static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
  3723. +{
  3724. + if (cleancache_enabled)
  3725. + __cleancache_init_shared_fs(uuid, sb);
  3726. +}
  3727. +
  3728. +static inline int cleancache_get_page(struct page *page)
  3729. +{
  3730. + int ret = -1;
  3731. +
  3732. + if (cleancache_enabled && cleancache_fs_enabled(page))
  3733. + ret = __cleancache_get_page(page);
  3734. + return ret;
  3735. +}
  3736. +
  3737. +static inline void cleancache_put_page(struct page *page)
  3738. +{
  3739. + if (cleancache_enabled && cleancache_fs_enabled(page))
  3740. + __cleancache_put_page(page);
  3741. +}
  3742. +
  3743. +static inline void cleancache_flush_page(struct address_space *mapping,
  3744. + struct page *page)
  3745. +{
  3746. + /* careful... page->mapping is NULL sometimes when this is called */
  3747. + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
  3748. + __cleancache_flush_page(mapping, page);
  3749. +}
  3750. +
  3751. +static inline void cleancache_flush_inode(struct address_space *mapping)
  3752. +{
  3753. + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
  3754. + __cleancache_flush_inode(mapping);
  3755. +}
  3756. +
  3757. +static inline void cleancache_flush_fs(struct super_block *sb)
  3758. +{
  3759. + if (cleancache_enabled)
  3760. + __cleancache_flush_fs(sb);
  3761. +}
  3762. +
  3763. +#endif /* _LINUX_CLEANCACHE_H */
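To make the comment above concrete: with CONFIG_CLEANCACHE off, cleancache_enabled is #defined to (0), so a hook such as

    if (cleancache_enabled && cleancache_fs_enabled(page))
            ret = __cleancache_get_page(page);

is constant-false and compiled away entirely; with the config on, each hook costs one global load until a backend registers, after which a single cleancache_poolid comparison decides whether the filesystem at hand has opted in.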
  3764. diff -Nrupad linux-2.6.37//include/linux/frontswap.h linux-2.6.37_vanilla//include/linux/frontswap.h
  3765. --- linux-2.6.37//include/linux/frontswap.h 1970-01-01 01:00:00.000000000 +0100
  3766. +++ linux-2.6.37_vanilla//include/linux/frontswap.h 2011-02-14 01:21:43.169793144 +0100
  3767. @@ -0,0 +1,86 @@
  3768. +#ifndef _LINUX_FRONTSWAP_H
  3769. +#define _LINUX_FRONTSWAP_H
  3770. +
  3771. +#include <linux/swap.h>
  3772. +#include <linux/mm.h>
  3773. +
  3774. +struct frontswap_ops {
  3775. + void (*init)(unsigned);
  3776. + int (*put_page)(unsigned, pgoff_t, struct page *);
  3777. + int (*get_page)(unsigned, pgoff_t, struct page *);
  3778. + void (*flush_page)(unsigned, pgoff_t);
  3779. + void (*flush_area)(unsigned);
  3780. +};
  3781. +
  3782. +extern int frontswap_enabled;
  3783. +extern struct frontswap_ops
  3784. + frontswap_register_ops(struct frontswap_ops *ops);
  3785. +extern void frontswap_shrink(unsigned long);
  3786. +extern unsigned long frontswap_curr_pages(void);
  3787. +
  3788. +extern void frontswap_init(unsigned type);
  3789. +extern int __frontswap_put_page(struct page *page);
  3790. +extern int __frontswap_get_page(struct page *page);
  3791. +extern void __frontswap_flush_page(unsigned, pgoff_t);
  3792. +extern void __frontswap_flush_area(unsigned);
  3793. +
  3794. +#ifndef CONFIG_FRONTSWAP
  3795. +/* all inline routines become no-ops and all externs are ignored */
  3796. +#define frontswap_enabled (0)
  3797. +#endif
  3798. +
  3799. +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
  3800. +{
  3801. + int ret = 0;
  3802. +
  3803. + if (frontswap_enabled && sis->frontswap_map)
  3804. + ret = test_bit(offset % BITS_PER_LONG,
  3805. + &sis->frontswap_map[offset/BITS_PER_LONG]);
  3806. + return ret;
  3807. +}
  3808. +
  3809. +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
  3810. +{
  3811. + if (frontswap_enabled && sis->frontswap_map)
  3812. + set_bit(offset % BITS_PER_LONG,
  3813. + &sis->frontswap_map[offset/BITS_PER_LONG]);
  3814. +}
  3815. +
  3816. +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
  3817. +{
  3818. + if (frontswap_enabled && sis->frontswap_map)
  3819. + clear_bit(offset % BITS_PER_LONG,
  3820. + &sis->frontswap_map[offset/BITS_PER_LONG]);
  3821. +}
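The three helpers above keep one bit of state per swap page: offset selects word offset / BITS_PER_LONG of frontswap_map and bit offset % BITS_PER_LONG within it, so on a 64-bit build offset 70 lands in bit 6 of frontswap_map[1]. The map itself hangs off swap_info_struct (see the swap.h hunk below).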
  3822. +
  3823. +static inline int frontswap_put_page(struct page *page)
  3824. +{
  3825. + int ret = -1;
  3826. +
  3827. + if (frontswap_enabled)
  3828. + ret = __frontswap_put_page(page);
  3829. + return ret;
  3830. +}
  3831. +
  3832. +static inline int frontswap_get_page(struct page *page)
  3833. +{
  3834. + int ret = -1;
  3835. +
  3836. + if (frontswap_enabled)
  3837. + ret = __frontswap_get_page(page);
  3838. + return ret;
  3839. +}
  3840. +
  3841. +static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
  3842. +{
  3843. + if (frontswap_enabled)
  3844. + __frontswap_flush_page(type, offset);
  3845. +}
  3846. +
  3847. +static inline void frontswap_flush_area(unsigned type)
  3848. +{
  3849. + if (frontswap_enabled)
  3850. + __frontswap_flush_area(type);
  3851. +}
  3852. +
  3853. +#endif /* _LINUX_FRONTSWAP_H */
  3854. diff -Nrupad linux-2.6.37//include/linux/fs.h linux-2.6.37_vanilla//include/linux/fs.h
  3855. --- linux-2.6.37//include/linux/fs.h 2011-01-05 01:50:19.000000000 +0100
  3856. +++ linux-2.6.37_vanilla//include/linux/fs.h 2011-02-14 01:21:43.170793149 +0100
  3857. @@ -1417,6 +1417,11 @@ struct super_block {
  3858. * generic_show_options()
  3859. */
  3860. char __rcu *s_options;
  3861. +
  3862. + /*
  3863. + * Saved pool identifier for cleancache (-1 means none)
  3864. + */
  3865. + int cleancache_poolid;
  3866. };
  3867.  
  3868. extern struct timespec current_fs_time(struct super_block *sb);
  3869. diff -Nrupad linux-2.6.37//include/linux/swapfile.h linux-2.6.37_vanilla//include/linux/swapfile.h
  3870. --- linux-2.6.37//include/linux/swapfile.h 1970-01-01 01:00:00.000000000 +0100
  3871. +++ linux-2.6.37_vanilla//include/linux/swapfile.h 2011-02-14 01:21:43.170793149 +0100
  3872. @@ -0,0 +1,13 @@
  3873. +#ifndef _LINUX_SWAPFILE_H
  3874. +#define _LINUX_SWAPFILE_H
  3875. +
  3876. +/*
  3877. + * these were static in swapfile.c but frontswap.c needs them and we don't
  3878. + * want to expose them to the dozens of source files that include swap.h
  3879. + */
  3880. +extern spinlock_t swap_lock;
  3881. +extern struct swap_list_t swap_list;
  3882. +extern struct swap_info_struct *swap_info[];
  3883. +extern int try_to_unuse(unsigned int, bool, unsigned long);
  3884. +
  3885. +#endif /* _LINUX_SWAPFILE_H */
  3886. diff -Nrupad linux-2.6.37//include/linux/swap.h linux-2.6.37_vanilla//include/linux/swap.h
  3887. --- linux-2.6.37//include/linux/swap.h 2011-01-05 01:50:19.000000000 +0100
  3888. +++ linux-2.6.37_vanilla//include/linux/swap.h 2011-02-14 01:21:43.171793147 +0100
  3889. @@ -185,6 +185,8 @@ struct swap_info_struct {
  3890. struct block_device *bdev; /* swap device or bdev of swap file */
  3891. struct file *swap_file; /* seldom referenced */
  3892. unsigned int old_block_size; /* seldom referenced */
  3893. + unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
  3894. + unsigned int frontswap_pages; /* frontswap pages in-use counter */
  3895. };
  3896.  
  3897. struct swap_list_t {
  3898. diff -Nrupad linux-2.6.37//Makefile linux-2.6.37_vanilla//Makefile
  3899. --- linux-2.6.37//Makefile 2011-01-05 01:50:19.000000000 +0100
  3900. +++ linux-2.6.37_vanilla//Makefile 2011-02-14 01:27:32.292792852 +0100
  3901. @@ -1,7 +1,7 @@
  3902. VERSION = 2
  3903. PATCHLEVEL = 6
  3904. SUBLEVEL = 37
  3905. -EXTRAVERSION =
  3906. +EXTRAVERSION = -zcache
  3907. NAME = Flesh-Eating Bats with Fangs
  3908.  
  3909. # *DOCUMENTATION*
  3910. diff -Nrupad linux-2.6.37//Makefile~ linux-2.6.37_vanilla//Makefile~
  3911. --- linux-2.6.37//Makefile~ 1970-01-01 01:00:00.000000000 +0100
  3912. +++ linux-2.6.37_vanilla//Makefile~ 2011-02-14 01:19:18.000000000 +0100
  3913. @@ -0,0 +1,1533 @@
  3914. +VERSION = 2
  3915. +PATCHLEVEL = 6
  3916. +SUBLEVEL = 37
  3917. +EXTRAVERSION =
  3918. +NAME = Flesh-Eating Bats with Fangs
  3919. +
  3920. +# *DOCUMENTATION*
  3921. +# To see a list of typical targets execute "make help"
  3922. +# More info can be located in ./README
  3923. +# Comments in this file are targeted only to the developer, do not
  3924. +# expect to learn how to build the kernel reading this file.
  3925. +
  3926. +# Do not:
  3927. +# o use make's built-in rules and variables
  3928. +# (this increases performance and avoids hard-to-debug behaviour);
  3929. +# o print "Entering directory ...";
  3930. +MAKEFLAGS += -rR --no-print-directory
  3931. +
  3932. +# Avoid funny character set dependencies
  3933. +unexport LC_ALL
  3934. +LC_COLLATE=C
  3935. +LC_NUMERIC=C
  3936. +export LC_COLLATE LC_NUMERIC
  3937. +
  3938. +# We are using a recursive build, so we need to do a little thinking
  3939. +# to get the ordering right.
  3940. +#
  3941. +# Most importantly: sub-Makefiles should only ever modify files in
  3942. +# their own directory. If in some directory we have a dependency on
  3943. +# a file in another dir (which doesn't happen often, but it's often
  3944. +# unavoidable when linking the built-in.o targets which finally
  3945. +# turn into vmlinux), we will call a sub make in that other dir, and
  3946. +# after that we are sure that everything which is in that other dir
  3947. +# is now up to date.
  3948. +#
  3949. +# The only cases where we need to modify files which have global
  3950. +# effects are thus separated out and done before the recursive
  3951. +# descending is started. They are now explicitly listed as the
  3952. +# prepare rule.
  3953. +
  3954. +# To put more focus on warnings, be less verbose as default
  3955. +# Use 'make V=1' to see the full commands
  3956. +
  3957. +ifeq ("$(origin V)", "command line")
  3958. + KBUILD_VERBOSE = $(V)
  3959. +endif
  3960. +ifndef KBUILD_VERBOSE
  3961. + KBUILD_VERBOSE = 0
  3962. +endif
  3963. +
  3964. +# Call a source code checker (by default, "sparse") as part of the
  3965. +# C compilation.
  3966. +#
  3967. +# Use 'make C=1' to enable checking of only re-compiled files.
  3968. +# Use 'make C=2' to enable checking of *all* source files, regardless
  3969. +# of whether they are re-compiled or not.
  3970. +#
  3971. +# See the file "Documentation/sparse.txt" for more details, including
  3972. +# where to get the "sparse" utility.
  3973. +
  3974. +ifeq ("$(origin C)", "command line")
  3975. + KBUILD_CHECKSRC = $(C)
  3976. +endif
  3977. +ifndef KBUILD_CHECKSRC
  3978. + KBUILD_CHECKSRC = 0
  3979. +endif
  3980. +
  3981. +# Use make M=dir to specify directory of external module to build
  3982. +# Old syntax make ... SUBDIRS=$PWD is still supported
  3983. +# Setting the environment variable KBUILD_EXTMOD take precedence
  3984. +ifdef SUBDIRS
  3985. + KBUILD_EXTMOD ?= $(SUBDIRS)
  3986. +endif
  3987. +
  3988. +ifeq ("$(origin M)", "command line")
  3989. + KBUILD_EXTMOD := $(M)
  3990. +endif
  3991. +
  3992. +# kbuild supports saving output files in a separate directory.
  3993. +# To locate output files in a separate directory two syntaxes are supported.
  3994. +# In both cases the working directory must be the root of the kernel src.
  3995. +# 1) O=
  3996. +# Use "make O=dir/to/store/output/files/"
  3997. +#
  3998. +# 2) Set KBUILD_OUTPUT
  3999. +# Set the environment variable KBUILD_OUTPUT to point to the directory
  4000. +# where the output files shall be placed.
  4001. +# export KBUILD_OUTPUT=dir/to/store/output/files/
  4002. +# make
  4003. +#
  4004. +# The O= assignment takes precedence over the KBUILD_OUTPUT environment
  4005. +# variable.
  4006. +
  4007. +
  4008. +# KBUILD_SRC is set on invocation of make in OBJ directory
  4009. +# KBUILD_SRC is not intended to be used by the regular user (for now)
  4010. +ifeq ($(KBUILD_SRC),)
  4011. +
  4012. +# OK, Make called in directory where kernel src resides
  4013. +# Do we want to locate output files in a separate directory?
  4014. +ifeq ("$(origin O)", "command line")
  4015. + KBUILD_OUTPUT := $(O)
  4016. +endif
  4017. +
  4018. +# That's our default target when none is given on the command line
  4019. +PHONY := _all
  4020. +_all:
  4021. +
  4022. +# Cancel implicit rules on top Makefile
  4023. +$(CURDIR)/Makefile Makefile: ;
  4024. +
  4025. +ifneq ($(KBUILD_OUTPUT),)
  4026. +# Invoke a second make in the output directory, passing relevant variables
  4027. +# check that the output directory actually exists
  4028. +saved-output := $(KBUILD_OUTPUT)
  4029. +KBUILD_OUTPUT := $(shell cd $(KBUILD_OUTPUT) && /bin/pwd)
  4030. +$(if $(KBUILD_OUTPUT),, \
  4031. + $(error output directory "$(saved-output)" does not exist))
  4032. +
  4033. +PHONY += $(MAKECMDGOALS) sub-make
  4034. +
  4035. +$(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make
  4036. + $(Q)@:
  4037. +
  4038. +sub-make: FORCE
  4039. + $(if $(KBUILD_VERBOSE:1=),@)$(MAKE) -C $(KBUILD_OUTPUT) \
  4040. + KBUILD_SRC=$(CURDIR) \
  4041. + KBUILD_EXTMOD="$(KBUILD_EXTMOD)" -f $(CURDIR)/Makefile \
  4042. + $(filter-out _all sub-make,$(MAKECMDGOALS))
  4043. +
  4044. +# Leave processing to above invocation of make
  4045. +skip-makefile := 1
  4046. +endif # ifneq ($(KBUILD_OUTPUT),)
  4047. +endif # ifeq ($(KBUILD_SRC),)
  4048. +
  4049. +# We process the rest of the Makefile if this is the final invocation of make
  4050. +ifeq ($(skip-makefile),)
  4051. +
  4052. +# If building an external module we do not care about the all: rule
  4053. +# but instead _all depend on modules
  4054. +PHONY += all
  4055. +ifeq ($(KBUILD_EXTMOD),)
  4056. +_all: all
  4057. +else
  4058. +_all: modules
  4059. +endif
  4060. +
  4061. +srctree := $(if $(KBUILD_SRC),$(KBUILD_SRC),$(CURDIR))
  4062. +objtree := $(CURDIR)
  4063. +src := $(srctree)
  4064. +obj := $(objtree)
  4065. +
  4066. +VPATH := $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
  4067. +
  4068. +export srctree objtree VPATH
  4069. +
  4070. +
  4071. +# SUBARCH tells the usermode build what the underlying arch is. That is set
  4072. +# first, and if a usermode build is happening, the "ARCH=um" on the command
  4073. +# line overrides the setting of ARCH below. If a native build is happening,
  4074. +# then ARCH is assigned, getting whatever value it gets normally, and
  4075. +# SUBARCH is subsequently ignored.
  4076. +
  4077. +SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
  4078. + -e s/arm.*/arm/ -e s/sa110/arm/ \
  4079. + -e s/s390x/s390/ -e s/parisc64/parisc/ \
  4080. + -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
  4081. + -e s/sh[234].*/sh/ )
  4082. +
  4083. +# Cross compiling and selecting different set of gcc/bin-utils
  4084. +# ---------------------------------------------------------------------------
  4085. +#
  4086. +# When performing cross compilation for other architectures ARCH shall be set
  4087. +# to the target architecture. (See arch/* for the possibilities).
  4088. +# ARCH can be set during invocation of make:
  4089. +# make ARCH=ia64
  4090. +# Another way is to have ARCH set in the environment.
  4091. +# The default ARCH is the host where make is executed.
  4092. +
  4093. +# CROSS_COMPILE specify the prefix used for all executables used
  4094. +# during compilation. Only gcc and related bin-utils executables
  4095. +# are prefixed with $(CROSS_COMPILE).
  4096. +# CROSS_COMPILE can be set on the command line
  4097. +# make CROSS_COMPILE=ia64-linux-
  4098. +# Alternatively CROSS_COMPILE can be set in the environment.
  4099. +# A third alternative is to store a setting in .config so that plain
  4100. +# "make" in the configured kernel build directory always uses that.
  4101. +# Default value for CROSS_COMPILE is not to prefix executables
  4102. +# Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile
  4103. +export KBUILD_BUILDHOST := $(SUBARCH)
  4104. +ARCH ?= $(SUBARCH)
  4105. +CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%)
  4106. +
  4107. +# Architecture as present in compile.h
  4108. +UTS_MACHINE := $(ARCH)
  4109. +SRCARCH := $(ARCH)
  4110. +
  4111. +# Additional ARCH settings for x86
  4112. +ifeq ($(ARCH),i386)
  4113. + SRCARCH := x86
  4114. +endif
  4115. +ifeq ($(ARCH),x86_64)
  4116. + SRCARCH := x86
  4117. +endif
  4118. +
  4119. +# Additional ARCH settings for sparc
  4120. +ifeq ($(ARCH),sparc32)
  4121. + SRCARCH := sparc
  4122. +endif
  4123. +ifeq ($(ARCH),sparc64)
  4124. + SRCARCH := sparc
  4125. +endif
  4126. +
  4127. +# Additional ARCH settings for sh
  4128. +ifeq ($(ARCH),sh64)
  4129. + SRCARCH := sh
  4130. +endif
  4131. +
  4132. +# Where to locate arch specific headers
  4133. +hdr-arch := $(SRCARCH)
  4134. +
  4135. +ifeq ($(ARCH),m68knommu)
  4136. + hdr-arch := m68k
  4137. +endif
  4138. +
  4139. +KCONFIG_CONFIG ?= .config
  4140. +
  4141. +# SHELL used by kbuild
  4142. +CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \
  4143. + else if [ -x /bin/bash ]; then echo /bin/bash; \
  4144. + else echo sh; fi ; fi)
  4145. +
  4146. +HOSTCC = gcc
  4147. +HOSTCXX = g++
  4148. +HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer
  4149. +HOSTCXXFLAGS = -O2
  4150. +
  4151. +# Decide whether to build built-in, modular, or both.
  4152. +# Normally, just do built-in.
  4153. +
  4154. +KBUILD_MODULES :=
  4155. +KBUILD_BUILTIN := 1
  4156. +
  4157. +# If we have only "make modules", don't compile built-in objects.
  4158. +# When we're building modules with modversions, we need to consider
  4159. +# the built-in objects during the descend as well, in order to
  4160. +# make sure the checksums are up to date before we record them.
  4161. +
  4162. +ifeq ($(MAKECMDGOALS),modules)
  4163. + KBUILD_BUILTIN := $(if $(CONFIG_MODVERSIONS),1)
  4164. +endif
  4165. +
  4166. +# If we have "make <whatever> modules", compile modules
  4167. +# in addition to whatever we do anyway.
  4168. +# Just "make" or "make all" shall build modules as well
  4169. +
  4170. +ifneq ($(filter all _all modules,$(MAKECMDGOALS)),)
  4171. + KBUILD_MODULES := 1
  4172. +endif
  4173. +
  4174. +ifeq ($(MAKECMDGOALS),)
  4175. + KBUILD_MODULES := 1
  4176. +endif
  4177. +
  4178. +export KBUILD_MODULES KBUILD_BUILTIN
  4179. +export KBUILD_CHECKSRC KBUILD_SRC KBUILD_EXTMOD
  4180. +
  4181. +# Beautify output
  4182. +# ---------------------------------------------------------------------------
  4183. +#
  4184. +# Normally, we echo the whole command before executing it. By making
  4185. +# that echo $($(quiet)$(cmd)), we now have the possibility to set
  4186. +# $(quiet) to choose other forms of output instead, e.g.
  4187. +#
  4188. +# quiet_cmd_cc_o_c = Compiling $(RELDIR)/$@
  4189. +# cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
  4190. +#
  4191. +# If $(quiet) is empty, the whole command will be printed.
  4192. +# If it is set to "quiet_", only the short version will be printed.
  4193. +# If it is set to "silent_", nothing will be printed at all, since
  4194. +# the variable $(silent_cmd_cc_o_c) doesn't exist.
  4195. +#
  4196. +# A simple variant is to prefix commands with $(Q) - that's useful
  4197. +# for commands that shall be hidden in non-verbose mode.
  4198. +#
  4199. +# $(Q)ln $@ :<
  4200. +#
  4201. +# If KBUILD_VERBOSE equals 0 then the above command will be hidden.
  4202. +# If KBUILD_VERBOSE equals 1 then the above command is displayed.
  4203. +
  4204. +ifeq ($(KBUILD_VERBOSE),1)
  4205. + quiet =
  4206. + Q =
  4207. +else
  4208. + quiet=quiet_
  4209. + Q = @
  4210. +endif
  4211. +
  4212. +# If the user is running make -s (silent mode), suppress echoing of
  4213. +# commands
  4214. +
  4215. +ifneq ($(findstring s,$(MAKEFLAGS)),)
  4216. + quiet=silent_
  4217. +endif
  4218. +
  4219. +export quiet Q KBUILD_VERBOSE
  4220. +
  4221. +
  4222. +# Look for make include files relative to root of kernel src
  4223. +MAKEFLAGS += --include-dir=$(srctree)
  4224. +
  4225. +# We need some generic definitions (do not try to remake the file).
  4226. +$(srctree)/scripts/Kbuild.include: ;
  4227. +include $(srctree)/scripts/Kbuild.include
  4228. +
  4229. +# Make variables (CC, etc...)
  4230. +
  4231. +AS = $(CROSS_COMPILE)as
  4232. +LD = $(CROSS_COMPILE)ld
  4233. +CC = $(CROSS_COMPILE)gcc
  4234. +CPP = $(CC) -E
  4235. +AR = $(CROSS_COMPILE)ar
  4236. +NM = $(CROSS_COMPILE)nm
  4237. +STRIP = $(CROSS_COMPILE)strip
  4238. +OBJCOPY = $(CROSS_COMPILE)objcopy
  4239. +OBJDUMP = $(CROSS_COMPILE)objdump
  4240. +AWK = awk
  4241. +GENKSYMS = scripts/genksyms/genksyms
  4242. +INSTALLKERNEL := installkernel
  4243. +DEPMOD = /sbin/depmod
  4244. +KALLSYMS = scripts/kallsyms
  4245. +PERL = perl
  4246. +CHECK = sparse
  4247. +
  4248. +CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
  4249. + -Wbitwise -Wno-return-void $(CF)
  4250. +CFLAGS_MODULE =
  4251. +AFLAGS_MODULE =
  4252. +LDFLAGS_MODULE =
  4253. +CFLAGS_KERNEL =
  4254. +AFLAGS_KERNEL =
  4255. +CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
  4256. +
  4257. +
  4258. +# Use LINUXINCLUDE when you must reference the include/ directory.
  4259. +# Needed to be compatible with the O= option
  4260. +LINUXINCLUDE := -I$(srctree)/arch/$(hdr-arch)/include -Iinclude \
  4261. + $(if $(KBUILD_SRC), -I$(srctree)/include) \
  4262. + -include include/generated/autoconf.h
  4263. +
  4264. +KBUILD_CPPFLAGS := -D__KERNEL__
  4265. +
  4266. +KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
  4267. + -fno-strict-aliasing -fno-common \
  4268. + -Werror-implicit-function-declaration \
  4269. + -Wno-format-security \
  4270. + -fno-delete-null-pointer-checks
  4271. +KBUILD_AFLAGS_KERNEL :=
  4272. +KBUILD_CFLAGS_KERNEL :=
  4273. +KBUILD_AFLAGS := -D__ASSEMBLY__
  4274. +KBUILD_AFLAGS_MODULE := -DMODULE
  4275. +KBUILD_CFLAGS_MODULE := -DMODULE
  4276. +KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
  4277. +
  4278. +# Read KERNELRELEASE from include/config/kernel.release (if it exists)
  4279. +KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
  4280. +KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
  4281. +
  4282. +export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
  4283. +export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
  4284. +export CPP AR NM STRIP OBJCOPY OBJDUMP
  4285. +export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
  4286. +export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
  4287. +
  4288. +export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
  4289. +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
  4290. +export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
  4291. +export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
  4292. +export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
  4293. +
  4294. +# When compiling out-of-tree modules, put MODVERDIR in the module
  4295. +# tree rather than in the kernel tree. The kernel tree might
  4296. +# even be read-only.
  4297. +export MODVERDIR := $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_versions
  4298. +
  4299. +# Files to ignore in find ... statements
  4300. +
  4301. +RCS_FIND_IGNORE := \( -name SCCS -o -name BitKeeper -o -name .svn -o -name CVS -o -name .pc -o -name .hg -o -name .git \) -prune -o
  4302. +export RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn --exclude CVS --exclude .pc --exclude .hg --exclude .git
  4303. +
  4304. +# ===========================================================================
  4305. +# Rules shared between *config targets and build targets
  4306. +
  4307. +# Basic helpers built in scripts/
  4308. +PHONY += scripts_basic
  4309. +scripts_basic:
  4310. + $(Q)$(MAKE) $(build)=scripts/basic
  4311. + $(Q)rm -f .tmp_quiet_recordmcount
  4312. +
  4313. +# To avoid any implicit rule to kick in, define an empty command.
  4314. +scripts/basic/%: scripts_basic ;
  4315. +
  4316. +PHONY += outputmakefile
  4317. +# outputmakefile generates a Makefile in the output directory, if using a
  4318. +# separate output directory. This allows convenient use of make in the
  4319. +# output directory.
  4320. +outputmakefile:
  4321. +ifneq ($(KBUILD_SRC),)
  4322. + $(Q)ln -fsn $(srctree) source
  4323. + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkmakefile \
  4324. + $(srctree) $(objtree) $(VERSION) $(PATCHLEVEL)
  4325. +endif
  4326. +
  4327. +# To make sure we do not include .config for any of the *config targets
  4328. +# catch them early, and hand them over to scripts/kconfig/Makefile
  4329. +# It is allowed to specify more targets when calling make, including
  4330. +# mixing *config targets and build targets.
  4331. +# For example 'make oldconfig all'.
  4332. +# Detect when mixed targets is specified, and make a second invocation
  4333. +# of make so .config is not included in this case either (for *config).
  4334. +
  4335. +no-dot-config-targets := clean mrproper distclean \
  4336. + cscope TAGS tags help %docs check% coccicheck \
  4337. + include/linux/version.h headers_% \
  4338. + kernelversion %src-pkg
  4339. +
  4340. +config-targets := 0
  4341. +mixed-targets := 0
  4342. +dot-config := 1
  4343. +
  4344. +ifneq ($(filter $(no-dot-config-targets), $(MAKECMDGOALS)),)
  4345. + ifeq ($(filter-out $(no-dot-config-targets), $(MAKECMDGOALS)),)
  4346. + dot-config := 0
  4347. + endif
  4348. +endif
  4349. +
  4350. +ifeq ($(KBUILD_EXTMOD),)
  4351. + ifneq ($(filter config %config,$(MAKECMDGOALS)),)
  4352. + config-targets := 1
  4353. + ifneq ($(filter-out config %config,$(MAKECMDGOALS)),)
  4354. + mixed-targets := 1
  4355. + endif
  4356. + endif
  4357. +endif
  4358. +
  4359. +ifeq ($(mixed-targets),1)
  4360. +# ===========================================================================
  4361. +# We're called with mixed targets (*config and build targets).
  4362. +# Handle them one by one.
  4363. +
  4364. +%:: FORCE
  4365. + $(Q)$(MAKE) -C $(srctree) KBUILD_SRC= $@
  4366. +
  4367. +else
  4368. +ifeq ($(config-targets),1)
  4369. +# ===========================================================================
  4370. +# *config targets only - make sure prerequisites are updated, and descend
  4371. +# in scripts/kconfig to make the *config target
  4372. +
  4373. +# Read arch specific Makefile to set KBUILD_DEFCONFIG as needed.
  4374. +# KBUILD_DEFCONFIG may point out an alternative default configuration
  4375. +# used for 'make defconfig'
  4376. +include $(srctree)/arch/$(SRCARCH)/Makefile
  4377. +export KBUILD_DEFCONFIG KBUILD_KCONFIG
  4378. +
  4379. +config: scripts_basic outputmakefile FORCE
  4380. + $(Q)mkdir -p include/linux include/config
  4381. + $(Q)$(MAKE) $(build)=scripts/kconfig $@
  4382. +
  4383. +%config: scripts_basic outputmakefile FORCE
  4384. + $(Q)mkdir -p include/linux include/config
  4385. + $(Q)$(MAKE) $(build)=scripts/kconfig $@
  4386. +
  4387. +else
  4388. +# ===========================================================================
  4389. +# Build targets only - this includes vmlinux, arch specific targets, clean
  4390. +# targets and others. In general all targets except *config targets.
  4391. +
  4392. +ifeq ($(KBUILD_EXTMOD),)
  4393. +# Additional helpers built in scripts/
  4394. +# Carefully list dependencies so we do not try to build scripts twice
  4395. +# in parallel
  4396. +PHONY += scripts
  4397. +scripts: scripts_basic include/config/auto.conf include/config/tristate.conf
  4398. + $(Q)$(MAKE) $(build)=$(@)
  4399. +
  4400. +# Objects we will link into vmlinux / subdirs we need to visit
  4401. +init-y := init/
  4402. +drivers-y := drivers/ sound/ firmware/
  4403. +net-y := net/
  4404. +libs-y := lib/
  4405. +core-y := usr/
  4406. +endif # KBUILD_EXTMOD
  4407. +
  4408. +ifeq ($(dot-config),1)
  4409. +# Read in config
  4410. +-include include/config/auto.conf
  4411. +
  4412. +ifeq ($(KBUILD_EXTMOD),)
  4413. +# Read in dependencies to all Kconfig* files, make sure to run
  4414. +# oldconfig if changes are detected.
  4415. +-include include/config/auto.conf.cmd
  4416. +
  4417. +# To avoid any implicit rule to kick in, define an empty command
  4418. +$(KCONFIG_CONFIG) include/config/auto.conf.cmd: ;
  4419. +
  4420. +# If .config is newer than include/config/auto.conf, someone tinkered
  4421. +# with it and forgot to run make oldconfig.
  4422. +# if auto.conf.cmd is missing then we are probably in a cleaned tree so
  4423. +# we execute the config step to be sure to catch updated Kconfig files
  4424. +include/config/%.conf: $(KCONFIG_CONFIG) include/config/auto.conf.cmd
  4425. + $(Q)$(MAKE) -f $(srctree)/Makefile silentoldconfig
  4426. +else
  4427. +# external modules need include/generated/autoconf.h and include/config/auto.conf
  4428. +# but do not care if they are up-to-date. Use auto.conf to trigger the test
  4429. +PHONY += include/config/auto.conf
  4430. +
  4431. +include/config/auto.conf:
  4432. + $(Q)test -e include/generated/autoconf.h -a -e $@ || ( \
  4433. + echo; \
  4434. + echo " ERROR: Kernel configuration is invalid."; \
  4435. + echo " include/generated/autoconf.h or $@ are missing.";\
  4436. + echo " Run 'make oldconfig && make prepare' on kernel src to fix it."; \
  4437. + echo; \
  4438. + /bin/false)
  4439. +
  4440. +endif # KBUILD_EXTMOD
  4441. +
  4442. +else
  4443. +# Dummy target needed, because used as prerequisite
  4444. +include/config/auto.conf: ;
  4445. +endif # $(dot-config)
  4446. +
  4447. +# The all: target is the default when no target is given on the
  4448. +# command line.
  4449. +# This allows a user to issue only 'make' to build a kernel including modules
  4450. +# Defaults to vmlinux, but the arch makefile usually adds further targets
  4451. +all: vmlinux
  4452. +
  4453. +ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
  4454. +KBUILD_CFLAGS += -Os
  4455. +else
  4456. +KBUILD_CFLAGS += -O2
  4457. +endif
  4458. +
  4459. +include $(srctree)/arch/$(SRCARCH)/Makefile
  4460. +
  4461. +ifneq ($(CONFIG_FRAME_WARN),0)
  4462. +KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN})
  4463. +endif
  4464. +
  4465. +# Force gcc to behave correctly even for buggy distributions
  4466. +ifndef CONFIG_CC_STACKPROTECTOR
  4467. +KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector)
  4468. +endif
  4469. +
  4470. +ifdef CONFIG_FRAME_POINTER
  4471. +KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
  4472. +else
  4473. +# Some targets (ARM with Thumb2, for example), can't be built with frame
  4474. +# pointers. For those, we don't have FUNCTION_TRACER automatically
  4475. +# select FRAME_POINTER. However, FUNCTION_TRACER adds -pg, and this is
  4476. +# incompatible with -fomit-frame-pointer with current GCC, so we don't use
  4477. +# -fomit-frame-pointer with FUNCTION_TRACER.
  4478. +ifndef CONFIG_FUNCTION_TRACER
  4479. +KBUILD_CFLAGS += -fomit-frame-pointer
  4480. +endif
  4481. +endif
  4482. +
  4483. +ifdef CONFIG_DEBUG_INFO
  4484. +KBUILD_CFLAGS += -g
  4485. +KBUILD_AFLAGS += -gdwarf-2
  4486. +endif
  4487. +
  4488. +ifdef CONFIG_DEBUG_INFO_REDUCED
  4489. +KBUILD_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly)
  4490. +endif
  4491. +
  4492. +ifdef CONFIG_FUNCTION_TRACER
  4493. +KBUILD_CFLAGS += -pg
  4494. +ifdef CONFIG_DYNAMIC_FTRACE
  4495. + ifdef CONFIG_HAVE_C_RECORDMCOUNT
  4496. + BUILD_C_RECORDMCOUNT := y
  4497. + export BUILD_C_RECORDMCOUNT
  4498. + endif
  4499. +endif
  4500. +endif
  4501. +
  4502. +# We trigger additional mismatches with less inlining
  4503. +ifdef CONFIG_DEBUG_SECTION_MISMATCH
  4504. +KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
  4505. +endif
  4506. +
  4507. +# arch Makefile may override CC so keep this after arch Makefile is included
  4508. +NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
  4509. +CHECKFLAGS += $(NOSTDINC_FLAGS)
  4510. +
  4511. +# warn about C99 declaration after statement
  4512. +KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,)
  4513. +
  4514. +# disable pointer signed / unsigned warnings in gcc 4.0
  4515. +KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
  4516. +
  4517. +# disable invalid "can't wrap" optimizations for signed / pointers
  4518. +KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
  4519. +
  4520. +# conserve stack if available
  4521. +KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
  4522. +
  4523. +# check for 'asm goto'
  4524. +ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
  4525. + KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
  4526. +endif
  4527. +
  4528. +# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
  4529. +# But warn user when we do so
  4530. +warn-assign = \
  4531. +$(warning "WARNING: Appending $$K$(1) ($(K$(1))) from $(origin K$(1)) to kernel $$$(1)")
  4532. +
  4533. +ifneq ($(KCPPFLAGS),)
  4534. + $(call warn-assign,CPPFLAGS)
  4535. + KBUILD_CPPFLAGS += $(KCPPFLAGS)
  4536. +endif
  4537. +ifneq ($(KAFLAGS),)
  4538. + $(call warn-assign,AFLAGS)
  4539. + KBUILD_AFLAGS += $(KAFLAGS)
  4540. +endif
  4541. +ifneq ($(KCFLAGS),)
  4542. + $(call warn-assign,CFLAGS)
  4543. + KBUILD_CFLAGS += $(KCFLAGS)
  4544. +endif
  4545. +
  4546. +# Use --build-id when available.
  4547. +LDFLAGS_BUILD_ID = $(patsubst -Wl$(comma)%,%,\
  4548. + $(call cc-ldoption, -Wl$(comma)--build-id,))
  4549. +KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID)
  4550. +LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID)
  4551. +
  4552. +ifeq ($(CONFIG_STRIP_ASM_SYMS),y)
  4553. +LDFLAGS_vmlinux += $(call ld-option, -X,)
  4554. +endif
  4555. +
  4556. +# Default kernel image to build when no specific target is given.
  4557. +# KBUILD_IMAGE may be overruled on the command line or
  4558. +# set in the environment
  4559. +# Also any assignments in arch/$(ARCH)/Makefile take precedence over
  4560. +# this default value
  4561. +export KBUILD_IMAGE ?= vmlinux
  4562. +
  4563. +#
  4564. +# INSTALL_PATH specifies where to place the updated kernel and system map
  4565. +# images. Default is /boot, but you can set it to other values
  4566. +export INSTALL_PATH ?= /boot
  4567. +
  4568. +#
  4569. +# INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory
  4570. +# relocations required by build roots. This is not defined in the
  4571. +# makefile but the argument can be passed to make if needed.
  4572. +#
  4573. +
  4574. +MODLIB = $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE)
  4575. +export MODLIB
  4576. +
  4577. +#
  4578. +# INSTALL_MOD_STRIP, if defined, will cause modules to be
  4579. +# stripped after they are installed. If INSTALL_MOD_STRIP is '1', then
  4580. +# the default option --strip-debug will be used. Otherwise,
  4581. +# INSTALL_MOD_STRIP will be used as the options to the strip command.
  4582. +
  4583. +ifdef INSTALL_MOD_STRIP
  4584. +ifeq ($(INSTALL_MOD_STRIP),1)
  4585. +mod_strip_cmd = $(STRIP) --strip-debug
  4586. +else
  4587. +mod_strip_cmd = $(STRIP) $(INSTALL_MOD_STRIP)
  4588. +endif # INSTALL_MOD_STRIP=1
  4589. +else
  4590. +mod_strip_cmd = true
  4591. +endif # INSTALL_MOD_STRIP
  4592. +export mod_strip_cmd
  4593. +
  4594. +
  4595. +ifeq ($(KBUILD_EXTMOD),)
  4596. +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
  4597. +
  4598. +vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
  4599. + $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
  4600. + $(net-y) $(net-m) $(libs-y) $(libs-m)))
  4601. +
  4602. +vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
  4603. + $(init-n) $(init-) \
  4604. + $(core-n) $(core-) $(drivers-n) $(drivers-) \
  4605. + $(net-n) $(net-) $(libs-n) $(libs-))))
  4606. +
  4607. +init-y := $(patsubst %/, %/built-in.o, $(init-y))
  4608. +core-y := $(patsubst %/, %/built-in.o, $(core-y))
  4609. +drivers-y := $(patsubst %/, %/built-in.o, $(drivers-y))
  4610. +net-y := $(patsubst %/, %/built-in.o, $(net-y))
  4611. +libs-y1 := $(patsubst %/, %/lib.a, $(libs-y))
  4612. +libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y))
  4613. +libs-y := $(libs-y1) $(libs-y2)
  4614. +
  4615. +# Build vmlinux
  4616. +# ---------------------------------------------------------------------------
  4617. +# vmlinux is built from the objects selected by $(vmlinux-init) and
  4618. +# $(vmlinux-main). Most are built-in.o files from top-level directories
  4619. +# in the kernel tree, others are specified in arch/$(ARCH)/Makefile.
  4620. +# Ordering when linking is important, and $(vmlinux-init) must be first.
  4621. +#
  4622. +# vmlinux
  4623. +# ^
  4624. +# |
  4625. +# +-< $(vmlinux-init)
  4626. +# | +--< init/version.o + more
  4627. +# |
  4628. +# +--< $(vmlinux-main)
  4629. +# | +--< driver/built-in.o mm/built-in.o + more
  4630. +# |
  4631. +# +-< kallsyms.o (see description in CONFIG_KALLSYMS section)
  4632. +#
  4633. +# vmlinux version (uname -v) cannot be updated during normal
  4634. +# descending-into-subdirs phase since we do not yet know if we need to
  4635. +# update vmlinux.
  4636. +# Therefore this step is delayed until just before final link of vmlinux -
  4637. +# except in the kallsyms case where it is done just before adding the
  4638. +# symbols to the kernel.
  4639. +#
  4640. +# System.map is generated to document addresses of all kernel symbols
  4641. +
  4642. +vmlinux-init := $(head-y) $(init-y)
  4643. +vmlinux-main := $(core-y) $(libs-y) $(drivers-y) $(net-y)
  4644. +vmlinux-all := $(vmlinux-init) $(vmlinux-main)
  4645. +vmlinux-lds := arch/$(SRCARCH)/kernel/vmlinux.lds
  4646. +export KBUILD_VMLINUX_OBJS := $(vmlinux-all)
  4647. +
  4648. +# Rule to link vmlinux - also used during CONFIG_KALLSYMS
  4649. +# May be overridden by arch/$(ARCH)/Makefile
  4650. +quiet_cmd_vmlinux__ ?= LD $@
  4651. + cmd_vmlinux__ ?= $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) -o $@ \
  4652. + -T $(vmlinux-lds) $(vmlinux-init) \
  4653. + --start-group $(vmlinux-main) --end-group \
  4654. + $(filter-out $(vmlinux-lds) $(vmlinux-init) $(vmlinux-main) vmlinux.o FORCE ,$^)
  4655. +
  4656. +# Generate new vmlinux version
  4657. +quiet_cmd_vmlinux_version = GEN .version
  4658. + cmd_vmlinux_version = set -e; \
  4659. + if [ ! -r .version ]; then \
  4660. + rm -f .version; \
  4661. + echo 1 >.version; \
  4662. + else \
  4663. + mv .version .old_version; \
  4664. + expr 0$$(cat .old_version) + 1 >.version; \
  4665. + fi; \
  4666. + $(MAKE) $(build)=init
  4667. +
  4668. +# Generate System.map
  4669. +quiet_cmd_sysmap = SYSMAP
  4670. + cmd_sysmap = $(CONFIG_SHELL) $(srctree)/scripts/mksysmap
  4671. +
  4672. +# Link of vmlinux
  4673. +# If CONFIG_KALLSYMS is set .version is already updated
  4674. +# Generate System.map and verify that the content is consistent
  4675. +# Use + in front of the vmlinux_version rule to silence a warning with make -j2
  4676. +# First command is ':' to allow us to use + in front of the rule
  4677. +define rule_vmlinux__
  4678. + :
  4679. + $(if $(CONFIG_KALLSYMS),,+$(call cmd,vmlinux_version))
  4680. +
  4681. + $(call cmd,vmlinux__)
  4682. + $(Q)echo 'cmd_$@ := $(cmd_vmlinux__)' > $(@D)/.$(@F).cmd
  4683. +
  4684. + $(Q)$(if $($(quiet)cmd_sysmap), \
  4685. + echo ' $($(quiet)cmd_sysmap) System.map' &&) \
  4686. + $(cmd_sysmap) $@ System.map; \
  4687. + if [ $$? -ne 0 ]; then \
  4688. + rm -f $@; \
  4689. + /bin/false; \
  4690. + fi;
  4691. + $(verify_kallsyms)
  4692. +endef
  4693. +
  4694. +
  4695. +ifdef CONFIG_KALLSYMS
  4696. +# Generate section listing all symbols and add it into vmlinux $(kallsyms.o)
  4697. +# It's a three stage process:
  4698. +# o .tmp_vmlinux1 has all symbols and sections, but __kallsyms is
  4699. +# empty
  4700. +# Running kallsyms on that gives us .tmp_kallsyms1.o with
  4701. +# the right size - vmlinux version (uname -v) is updated during this step
  4702. +# o .tmp_vmlinux2 now has a __kallsyms section of the right size,
  4703. +# but due to the added section, some addresses have shifted.
  4704. +# From here, we generate a correct .tmp_kallsyms2.o
  4705. +# o The correct .tmp_kallsyms2.o is linked into the final vmlinux.
  4706. +# o Verify that the System.map from vmlinux matches the map from
  4707. +# .tmp_vmlinux2, just in case we did not generate kallsyms correctly.
  4708. +# o If CONFIG_KALLSYMS_EXTRA_PASS is set, do an extra pass using
  4709. +# .tmp_vmlinux3 and .tmp_kallsyms3.o. This is only meant as a
  4710. +# temporary bypass to allow the kernel to be built while the
  4711. +# maintainers work out what went wrong with kallsyms.
  4712. +
  4713. +ifdef CONFIG_KALLSYMS_EXTRA_PASS
  4714. +last_kallsyms := 3
  4715. +else
  4716. +last_kallsyms := 2
  4717. +endif
  4718. +
  4719. +kallsyms.o := .tmp_kallsyms$(last_kallsyms).o
  4720. +
  4721. +define verify_kallsyms
  4722. + $(Q)$(if $($(quiet)cmd_sysmap), \
  4723. + echo ' $($(quiet)cmd_sysmap) .tmp_System.map' &&) \
  4724. + $(cmd_sysmap) .tmp_vmlinux$(last_kallsyms) .tmp_System.map
  4725. + $(Q)cmp -s System.map .tmp_System.map || \
  4726. + (echo Inconsistent kallsyms data; \
  4727. + echo Try setting CONFIG_KALLSYMS_EXTRA_PASS; \
  4728. + rm .tmp_kallsyms* ; /bin/false )
  4729. +endef
  4730. +
  4731. +# Update vmlinux version before link
  4732. +# Use + in front of this rule to silence a warning about make -j1
  4733. +# First command is ':' to allow us to use + in front of this rule
  4734. +cmd_ksym_ld = $(cmd_vmlinux__)
  4735. +define rule_ksym_ld
  4736. + :
  4737. + +$(call cmd,vmlinux_version)
  4738. + $(call cmd,vmlinux__)
  4739. + $(Q)echo 'cmd_$@ := $(cmd_vmlinux__)' > $(@D)/.$(@F).cmd
  4740. +endef
  4741. +
  4742. +# Generate .S file with all kernel symbols
  4743. +quiet_cmd_kallsyms = KSYM $@
  4744. + cmd_kallsyms = $(NM) -n $< | $(KALLSYMS) \
  4745. + $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) > $@
  4746. +
  4747. +.tmp_kallsyms1.o .tmp_kallsyms2.o .tmp_kallsyms3.o: %.o: %.S scripts FORCE
  4748. + $(call if_changed_dep,as_o_S)
  4749. +
  4750. +.tmp_kallsyms%.S: .tmp_vmlinux% $(KALLSYMS)
  4751. + $(call cmd,kallsyms)
  4752. +
  4753. +# .tmp_vmlinux1 must be complete except kallsyms, so update vmlinux version
  4754. +.tmp_vmlinux1: $(vmlinux-lds) $(vmlinux-all) FORCE
  4755. + $(call if_changed_rule,ksym_ld)
  4756. +
  4757. +.tmp_vmlinux2: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms1.o FORCE
  4758. + $(call if_changed,vmlinux__)
  4759. +
  4760. +.tmp_vmlinux3: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms2.o FORCE
  4761. + $(call if_changed,vmlinux__)
  4762. +
  4763. +# Needs to visit scripts/ before $(KALLSYMS) can be used.
  4764. +$(KALLSYMS): scripts ;
  4765. +
  4766. +# Generate some data for debugging strange kallsyms problems
  4767. +debug_kallsyms: .tmp_map$(last_kallsyms)
  4768. +
  4769. +.tmp_map%: .tmp_vmlinux% FORCE
  4770. + ($(OBJDUMP) -h $< | $(AWK) '/^ +[0-9]/{print $$4 " 0 " $$2}'; $(NM) $<) | sort > $@
  4771. +
  4772. +.tmp_map3: .tmp_map2
  4773. +
  4774. +.tmp_map2: .tmp_map1
  4775. +
  4776. +endif # ifdef CONFIG_KALLSYMS
  4777. +
  4778. +# Do modpost on a prelinked vmlinux. The finally linked vmlinux has
  4779. +# relevant sections renamed as per the linker script.
  4780. +quiet_cmd_vmlinux-modpost = LD $@
  4781. + cmd_vmlinux-modpost = $(LD) $(LDFLAGS) -r -o $@ \
  4782. + $(vmlinux-init) --start-group $(vmlinux-main) --end-group \
  4783. + $(filter-out $(vmlinux-init) $(vmlinux-main) FORCE ,$^)
  4784. +define rule_vmlinux-modpost
  4785. + :
  4786. + +$(call cmd,vmlinux-modpost)
  4787. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost $@
  4788. + $(Q)echo 'cmd_$@ := $(cmd_vmlinux-modpost)' > $(dot-target).cmd
  4789. +endef
  4790. +
  4791. +# vmlinux image - including updated kernel symbols
  4792. +vmlinux: $(vmlinux-lds) $(vmlinux-init) $(vmlinux-main) vmlinux.o $(kallsyms.o) FORCE
  4793. +ifdef CONFIG_HEADERS_CHECK
  4794. + $(Q)$(MAKE) -f $(srctree)/Makefile headers_check
  4795. +endif
  4796. +ifdef CONFIG_SAMPLES
  4797. + $(Q)$(MAKE) $(build)=samples
  4798. +endif
  4799. +ifdef CONFIG_BUILD_DOCSRC
  4800. + $(Q)$(MAKE) $(build)=Documentation
  4801. +endif
  4802. + $(call vmlinux-modpost)
  4803. + $(call if_changed_rule,vmlinux__)
  4804. + $(Q)rm -f .old_version
  4805. +
  4806. +# build vmlinux.o first to catch section mismatch errors early
  4807. +ifdef CONFIG_KALLSYMS
  4808. +.tmp_vmlinux1: vmlinux.o
  4809. +endif
  4810. +
  4811. +modpost-init := $(filter-out init/built-in.o, $(vmlinux-init))
  4812. +vmlinux.o: $(modpost-init) $(vmlinux-main) FORCE
  4813. + $(call if_changed_rule,vmlinux-modpost)
  4814. +
  4815. +# The actual objects are generated when descending,
  4816. +# make sure no implicit rule kicks in
  4817. +$(sort $(vmlinux-init) $(vmlinux-main)) $(vmlinux-lds): $(vmlinux-dirs) ;
  4818. +
  4819. +# Handle descending into subdirectories listed in $(vmlinux-dirs)
  4820. +# Preset locale variables to speed up the build process. Limit locale
  4821. +# tweaks to this spot to avoid wrong language settings when running
  4822. +# make menuconfig etc.
  4823. +# Error messages still appear in the original language
  4824. +
  4825. +PHONY += $(vmlinux-dirs)
  4826. +$(vmlinux-dirs): prepare scripts
  4827. + $(Q)$(MAKE) $(build)=$@
  4828. +
  4829. +# Store (new) KERNELRELEASE string in include/config/kernel.release
  4830. +include/config/kernel.release: include/config/auto.conf FORCE
  4831. + $(Q)rm -f $@
  4832. + $(Q)echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))" > $@
  4833. +
  4834. +
  4835. +# Things we need to do before we recursively start building the kernel
  4836. +# or the modules are listed in "prepare".
  4837. +# A multi level approach is used. prepareN is processed before prepareN-1.
  4838. +# archprepare is used in arch Makefiles; when it is processed, the asm
  4839. +# symlink, version.h and scripts_basic are processed / created.
  4840. +
  4841. +# Listed in dependency order
  4842. +PHONY += prepare archprepare prepare0 prepare1 prepare2 prepare3
  4843. +
  4844. +# prepare3 is used to check if we are building in a separate output directory,
  4845. +# and if so do:
  4846. +# 1) Check that make has not been executed in the kernel src $(srctree)
  4847. +prepare3: include/config/kernel.release
  4848. +ifneq ($(KBUILD_SRC),)
  4849. + @$(kecho) ' Using $(srctree) as source for kernel'
  4850. + $(Q)if [ -f $(srctree)/.config -o -d $(srctree)/include/config ]; then \
  4851. + echo " $(srctree) is not clean, please run 'make mrproper'";\
  4852. + echo " in the '$(srctree)' directory.";\
  4853. + /bin/false; \
  4854. + fi;
  4855. +endif
  4856. +
  4857. +# prepare2 creates a makefile if using a separate output directory
  4858. +prepare2: prepare3 outputmakefile
  4859. +
  4860. +prepare1: prepare2 include/linux/version.h include/generated/utsrelease.h \
  4861. + include/config/auto.conf
  4862. + $(cmd_crmodverdir)
  4863. +
  4864. +archprepare: prepare1 scripts_basic
  4865. +
  4866. +prepare0: archprepare FORCE
  4867. + $(Q)$(MAKE) $(build)=.
  4868. + $(Q)$(MAKE) $(build)=. missing-syscalls
  4869. +
  4870. +# All the preparing..
  4871. +prepare: prepare0
  4872. +
  4873. +# Generate some files
  4874. +# ---------------------------------------------------------------------------
  4875. +
  4876. +# KERNELRELEASE can change from a few different places, meaning version.h
  4877. +# needs to be updated, so this check is forced on all builds
  4878. +
  4879. +uts_len := 64
  4880. +define filechk_utsrelease.h
  4881. + if [ `echo -n "$(KERNELRELEASE)" | wc -c ` -gt $(uts_len) ]; then \
  4882. + echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \
  4883. + exit 1; \
  4884. + fi; \
  4885. + (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
  4886. +endef
  4887. +
  4888. +define filechk_version.h
  4889. + (echo \#define LINUX_VERSION_CODE $(shell \
  4890. + expr $(VERSION) \* 65536 + $(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \
  4891. + echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))';)
  4892. +endef
  4893. +
  4894. +include/linux/version.h: $(srctree)/Makefile FORCE
  4895. + $(call filechk,version.h)
  4896. +
  4897. +include/generated/utsrelease.h: include/config/kernel.release FORCE
  4898. + $(call filechk,utsrelease.h)
  4899. +
  4900. +PHONY += headerdep
  4901. +headerdep:
  4902. + $(Q)find include/ -name '*.h' | xargs --max-args 1 scripts/headerdep.pl
  4903. +
  4904. +# ---------------------------------------------------------------------------
  4905. +
  4906. +PHONY += depend dep
  4907. +depend dep:
  4908. + @echo '*** Warning: make $@ is unnecessary now.'
  4909. +
  4910. +# ---------------------------------------------------------------------------
  4911. +# Firmware install
  4912. +INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware
  4913. +export INSTALL_FW_PATH
  4914. +
  4915. +PHONY += firmware_install
  4916. +firmware_install: FORCE
  4917. + @mkdir -p $(objtree)/firmware
  4918. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install
  4919. +
  4920. +# ---------------------------------------------------------------------------
  4921. +# Kernel headers
  4922. +
  4923. +#Default location for installed headers
  4924. +export INSTALL_HDR_PATH = $(objtree)/usr
  4925. +
  4926. +hdr-inst := -rR -f $(srctree)/scripts/Makefile.headersinst obj
  4927. +
  4928. +# If we do an all arch process set dst to asm-$(hdr-arch)
  4929. +hdr-dst = $(if $(KBUILD_HEADERS), dst=include/asm-$(hdr-arch), dst=include/asm)
  4930. +
  4931. +PHONY += __headers
  4932. +__headers: include/linux/version.h scripts_basic FORCE
  4933. + $(Q)$(MAKE) $(build)=scripts scripts/unifdef
  4934. +
  4935. +PHONY += headers_install_all
  4936. +headers_install_all:
  4937. + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/headers.sh install
  4938. +
  4939. +PHONY += headers_install
  4940. +headers_install: __headers
  4941. + $(if $(wildcard $(srctree)/arch/$(hdr-arch)/include/asm/Kbuild),, \
  4942. + $(error Headers not exportable for the $(SRCARCH) architecture))
  4943. + $(Q)$(MAKE) $(hdr-inst)=include
  4944. + $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/asm $(hdr-dst)
  4945. +
  4946. +PHONY += headers_check_all
  4947. +headers_check_all: headers_install_all
  4948. + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/headers.sh check
  4949. +
  4950. +PHONY += headers_check
  4951. +headers_check: headers_install
  4952. + $(Q)$(MAKE) $(hdr-inst)=include HDRCHECK=1
  4953. + $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/asm $(hdr-dst) HDRCHECK=1
  4954. +
  4955. +# ---------------------------------------------------------------------------
  4956. +# Modules
  4957. +
  4958. +ifdef CONFIG_MODULES
  4959. +
  4960. +# By default, build modules as well
  4961. +
  4962. +all: modules
  4963. +
  4964. +# Build modules
  4965. +#
  4966. +# A module can be listed more than once in obj-m resulting in
  4967. +# duplicate lines in modules.order files. Those are removed
  4968. +# using awk while concatenating to the final file.
  4969. +
  4970. +PHONY += modules
  4971. +modules: $(vmlinux-dirs) $(if $(KBUILD_BUILTIN),vmlinux) modules.builtin
  4972. + $(Q)$(AWK) '!x[$$0]++' $(vmlinux-dirs:%=$(objtree)/%/modules.order) > $(objtree)/modules.order
  4973. + @$(kecho) ' Building modules, stage 2.';
  4974. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
  4975. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_modbuild
  4976. +
  4977. +modules.builtin: $(vmlinux-dirs:%=%/modules.builtin)
  4978. + $(Q)$(AWK) '!x[$$0]++' $^ > $(objtree)/modules.builtin
  4979. +
  4980. +%/modules.builtin: include/config/auto.conf
  4981. + $(Q)$(MAKE) $(modbuiltin)=$*
  4982. +
  4983. +
  4984. +# Target to prepare building external modules
  4985. +PHONY += modules_prepare
  4986. +modules_prepare: prepare scripts
  4987. +
  4988. +# Target to install modules
  4989. +PHONY += modules_install
  4990. +modules_install: _modinst_ _modinst_post
  4991. +
  4992. +PHONY += _modinst_
  4993. +_modinst_:
  4994. + @if [ -z "`$(DEPMOD) -V 2>/dev/null | grep module-init-tools`" ]; then \
  4995. + echo "Warning: you may need to install module-init-tools"; \
  4996. + echo "See http://www.codemonkey.org.uk/docs/post-halloween-2.6.txt";\
  4997. + sleep 1; \
  4998. + fi
  4999. + @rm -rf $(MODLIB)/kernel
  5000. + @rm -f $(MODLIB)/source
  5001. + @mkdir -p $(MODLIB)/kernel
  5002. + @ln -s $(srctree) $(MODLIB)/source
  5003. + @if [ ! $(objtree) -ef $(MODLIB)/build ]; then \
  5004. + rm -f $(MODLIB)/build ; \
  5005. + ln -s $(objtree) $(MODLIB)/build ; \
  5006. + fi
  5007. + @cp -f $(objtree)/modules.order $(MODLIB)/
  5008. + @cp -f $(objtree)/modules.builtin $(MODLIB)/
  5009. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst
  5010. +
  5011. +# This depmod is only for convenience to give the initial
  5012. +# boot a modules.dep even before / is mounted read-write. However the
  5013. +# boot script depmod is the master version.
  5014. +PHONY += _modinst_post
  5015. +_modinst_post: _modinst_
  5016. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_modinst
  5017. + $(call cmd,depmod)
  5018. +
  5019. +else # CONFIG_MODULES
  5020. +
  5021. +# Modules not configured
  5022. +# ---------------------------------------------------------------------------
  5023. +
  5024. +modules modules_install: FORCE
  5025. + @echo
  5026. + @echo "The present kernel configuration has modules disabled."
  5027. + @echo "Type 'make config' and enable loadable module support."
  5028. + @echo "Then build a kernel with module support enabled."
  5029. + @echo
  5030. + @exit 1
  5031. +
  5032. +endif # CONFIG_MODULES
  5033. +
  5034. +###
  5035. +# Cleaning is done on three levels.
  5036. +# make clean Delete most generated files
  5037. +# Leave enough to build external modules
  5038. +# make mrproper Delete the current configuration, and all generated files
  5039. +# make distclean Remove editor backup files, patch leftover files and the like
  5040. +
  5041. +# Directories & files removed with 'make clean'
  5042. +CLEAN_DIRS += $(MODVERDIR)
  5043. +CLEAN_FILES += vmlinux System.map \
  5044. + .tmp_kallsyms* .tmp_version .tmp_vmlinux* .tmp_System.map
  5045. +
  5046. +# Directories & files removed with 'make mrproper'
  5047. +MRPROPER_DIRS += include/config usr/include include/generated
  5048. +MRPROPER_FILES += .config .config.old .version .old_version \
  5049. + include/linux/version.h \
  5050. + Module.symvers tags TAGS cscope*
  5051. +
  5052. +# clean - Delete most, but leave enough to build external modules
  5053. +#
  5054. +clean: rm-dirs := $(CLEAN_DIRS)
  5055. +clean: rm-files := $(CLEAN_FILES)
  5056. +clean-dirs := $(addprefix _clean_, . $(vmlinux-alldirs) Documentation)
  5057. +
  5058. +PHONY += $(clean-dirs) clean archclean
  5059. +$(clean-dirs):
  5060. + $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@)
  5061. +
  5062. +clean: archclean
  5063. +
  5064. +# mrproper - Delete all generated files, including .config
  5065. +#
  5066. +mrproper: rm-dirs := $(wildcard $(MRPROPER_DIRS))
  5067. +mrproper: rm-files := $(wildcard $(MRPROPER_FILES))
  5068. +mrproper-dirs := $(addprefix _mrproper_,Documentation/DocBook scripts)
  5069. +
  5070. +PHONY += $(mrproper-dirs) mrproper archmrproper
  5071. +$(mrproper-dirs):
  5072. + $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@)
  5073. +
  5074. +mrproper: clean archmrproper $(mrproper-dirs)
  5075. + $(call cmd,rmdirs)
  5076. + $(call cmd,rmfiles)
  5077. +
  5078. +# distclean
  5079. +#
  5080. +PHONY += distclean
  5081. +
  5082. +distclean: mrproper
  5083. + @find $(srctree) $(RCS_FIND_IGNORE) \
  5084. + \( -name '*.orig' -o -name '*.rej' -o -name '*~' \
  5085. + -o -name '*.bak' -o -name '#*#' -o -name '.*.orig' \
  5086. + -o -name '.*.rej' -o -size 0 \
  5087. + -o -name '*%' -o -name '.*.cmd' -o -name 'core' \) \
  5088. + -type f -print | xargs rm -f
  5089. +
  5090. +
  5091. +# Packaging of the kernel to various formats
  5092. +# ---------------------------------------------------------------------------
  5093. +# rpm target kept for backward compatibility
  5094. +package-dir := $(srctree)/scripts/package
  5095. +
  5096. +%src-pkg: FORCE
  5097. + $(Q)$(MAKE) $(build)=$(package-dir) $@
  5098. +%pkg: include/config/kernel.release FORCE
  5099. + $(Q)$(MAKE) $(build)=$(package-dir) $@
  5100. +rpm: include/config/kernel.release FORCE
  5101. + $(Q)$(MAKE) $(build)=$(package-dir) $@
  5102. +
  5103. +
  5104. +# Brief documentation of the typical targets used
  5105. +# ---------------------------------------------------------------------------
  5106. +
  5107. +boards := $(wildcard $(srctree)/arch/$(SRCARCH)/configs/*_defconfig)
  5108. +boards := $(notdir $(boards))
  5109. +board-dirs := $(dir $(wildcard $(srctree)/arch/$(SRCARCH)/configs/*/*_defconfig))
  5110. +board-dirs := $(sort $(notdir $(board-dirs:/=)))
  5111. +
  5112. +help:
  5113. + @echo 'Cleaning targets:'
  5114. + @echo ' clean - Remove most generated files but keep the config and'
  5115. + @echo ' enough build support to build external modules'
  5116. + @echo ' mrproper - Remove all generated files + config + various backup files'
  5117. + @echo ' distclean - mrproper + remove editor backup and patch files'
  5118. + @echo ''
  5119. + @echo 'Configuration targets:'
  5120. + @$(MAKE) -f $(srctree)/scripts/kconfig/Makefile help
  5121. + @echo ''
  5122. + @echo 'Other generic targets:'
  5123. + @echo ' all - Build all targets marked with [*]'
  5124. + @echo '* vmlinux - Build the bare kernel'
  5125. + @echo '* modules - Build all modules'
  5126. + @echo ' modules_install - Install all modules to INSTALL_MOD_PATH (default: /)'
  5127. + @echo ' firmware_install- Install all firmware to INSTALL_FW_PATH'
  5128. + @echo ' (default: $$(INSTALL_MOD_PATH)/lib/firmware)'
  5129. + @echo ' dir/ - Build all files in dir and below'
  5130. + @echo ' dir/file.[oisS] - Build specified target only'
  5131. + @echo ' dir/file.lst - Build specified mixed source/assembly target only'
  5132. + @echo ' (requires a recent binutils and recent build (System.map))'
  5133. + @echo ' dir/file.ko - Build module including final link'
  5134. + @echo ' modules_prepare - Set up for building external modules'
  5135. + @echo ' tags/TAGS - Generate tags file for editors'
  5136. + @echo ' cscope - Generate cscope index'
  5137. + @echo ' kernelrelease - Output the release version string'
  5138. + @echo ' kernelversion - Output the version stored in Makefile'
  5139. + @echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \
  5140. + echo ' (default: $(INSTALL_HDR_PATH))'; \
  5141. + echo ''
  5142. + @echo 'Static analysers'
  5143. + @echo ' checkstack - Generate a list of stack hogs'
  5144. + @echo ' namespacecheck - Name space analysis on compiled kernel'
  5145. + @echo ' versioncheck - Sanity check on version.h usage'
  5146. + @echo ' includecheck - Check for duplicate included header files'
  5147. + @echo ' export_report - List the usages of all exported symbols'
  5148. + @echo ' headers_check - Sanity check on exported headers'
  5149. + @echo ' headerdep - Detect inclusion cycles in headers'
  5150. + @$(MAKE) -f $(srctree)/scripts/Makefile.help checker-help
  5151. + @echo ''
  5152. + @echo 'Kernel packaging:'
  5153. + @$(MAKE) $(build)=$(package-dir) help
  5154. + @echo ''
  5155. + @echo 'Documentation targets:'
  5156. + @$(MAKE) -f $(srctree)/Documentation/DocBook/Makefile dochelp
  5157. + @echo ''
  5158. + @echo 'Architecture specific targets ($(SRCARCH)):'
  5159. + @$(if $(archhelp),$(archhelp),\
  5160. + echo ' No architecture specific help defined for $(SRCARCH)')
  5161. + @echo ''
  5162. + @$(if $(boards), \
  5163. + $(foreach b, $(boards), \
  5164. + printf " %-24s - Build for %s\\n" $(b) $(subst _defconfig,,$(b));) \
  5165. + echo '')
  5166. + @$(if $(board-dirs), \
  5167. + $(foreach b, $(board-dirs), \
  5168. + printf " %-16s - Show %s-specific targets\\n" help-$(b) $(b);) \
  5169. + printf " %-16s - Show all of the above\\n" help-boards; \
  5170. + echo '')
  5171. +
  5172. + @echo ' make V=0|1 [targets] 0 => quiet build (default), 1 => verbose build'
  5173. + @echo ' make V=2 [targets] 2 => give reason for rebuild of target'
  5174. + @echo ' make O=dir [targets] Locate all output files in "dir", including .config'
  5175. + @echo ' make C=1 [targets] Check all c source with $$CHECK (sparse by default)'
  5176. + @echo ' make C=2 [targets] Force check of all c source with $$CHECK'
  5177. + @echo ''
  5178. + @echo 'Execute "make" or "make all" to build all targets marked with [*] '
  5179. + @echo 'For further info see the ./README file'
  5180. +
  5181. +
  5182. +help-board-dirs := $(addprefix help-,$(board-dirs))
  5183. +
  5184. +help-boards: $(help-board-dirs)
  5185. +
  5186. +boards-per-dir = $(notdir $(wildcard $(srctree)/arch/$(SRCARCH)/configs/$*/*_defconfig))
  5187. +
  5188. +$(help-board-dirs): help-%:
  5189. + @echo 'Architecture specific targets ($(SRCARCH) $*):'
  5190. + @$(if $(boards-per-dir), \
  5191. + $(foreach b, $(boards-per-dir), \
  5192. + printf " %-24s - Build for %s\\n" $*/$(b) $(subst _defconfig,,$(b));) \
  5193. + echo '')
  5194. +
  5195. +
  5196. +# Documentation targets
  5197. +# ---------------------------------------------------------------------------
  5198. +%docs: scripts_basic FORCE
  5199. + $(Q)$(MAKE) $(build)=Documentation/DocBook $@
  5200. +
  5201. +else # KBUILD_EXTMOD
  5202. +
  5203. +###
  5204. +# External module support.
  5205. +# When building external modules, the kernel used as the basis is
  5206. +# considered read-only: no consistency checks are made and the make
  5207. +# system is not used on the basis kernel. If updates are required
  5208. +# in the basis kernel, ordinary make commands (without M=...) must
  5209. +# be used.
  5210. +#
  5211. +# The following are the only valid targets when building external
  5212. +# modules.
  5213. +# make M=dir clean Delete all automatically generated files
  5214. +# make M=dir modules Make all modules in specified dir
  5215. +# make M=dir Same as 'make M=dir modules'
  5216. +# make M=dir modules_install
  5217. +# Install the modules built in the module directory
  5218. +# Assumes install directory is already created
  5219. +
  5220. +# We are always building modules
  5221. +KBUILD_MODULES := 1
  5222. +PHONY += crmodverdir
  5223. +crmodverdir:
  5224. + $(cmd_crmodverdir)
  5225. +
  5226. +PHONY += $(objtree)/Module.symvers
  5227. +$(objtree)/Module.symvers:
  5228. + @test -e $(objtree)/Module.symvers || ( \
  5229. + echo; \
  5230. + echo " WARNING: Symbol version dump $(objtree)/Module.symvers"; \
  5231. + echo " is missing; modules will have no dependencies and modversions."; \
  5232. + echo )
  5233. +
  5234. +module-dirs := $(addprefix _module_,$(KBUILD_EXTMOD))
  5235. +PHONY += $(module-dirs) modules
  5236. +$(module-dirs): crmodverdir $(objtree)/Module.symvers
  5237. + $(Q)$(MAKE) $(build)=$(patsubst _module_%,%,$@)
  5238. +
  5239. +modules: $(module-dirs)
  5240. + @$(kecho) ' Building modules, stage 2.';
  5241. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
  5242. +
  5243. +PHONY += modules_install
  5244. +modules_install: _emodinst_ _emodinst_post
  5245. +
  5246. +install-dir := $(if $(INSTALL_MOD_DIR),$(INSTALL_MOD_DIR),extra)
  5247. +PHONY += _emodinst_
  5248. +_emodinst_:
  5249. + $(Q)mkdir -p $(MODLIB)/$(install-dir)
  5250. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst
  5251. +
  5252. +PHONY += _emodinst_post
  5253. +_emodinst_post: _emodinst_
  5254. + $(call cmd,depmod)
  5255. +
  5256. +clean-dirs := $(addprefix _clean_,$(KBUILD_EXTMOD))
  5257. +
  5258. +PHONY += $(clean-dirs) clean
  5259. +$(clean-dirs):
  5260. + $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@)
  5261. +
  5262. +clean: rm-dirs := $(MODVERDIR)
  5263. +clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers
  5264. +
  5265. +help:
  5266. + @echo ' Building external modules.'
  5267. + @echo ' Syntax: make -C path/to/kernel/src M=$$PWD target'
  5268. + @echo ''
  5269. + @echo ' modules - default target, build the module(s)'
  5270. + @echo ' modules_install - install the module'
  5271. + @echo ' clean - remove generated files in module directory only'
  5272. + @echo ''
  5273. +
  5274. +# Dummies...
  5275. +PHONY += prepare scripts
  5276. +prepare: ;
  5277. +scripts: ;
  5278. +endif # KBUILD_EXTMOD
  5279. +
  5280. +clean: $(clean-dirs)
  5281. + $(call cmd,rmdirs)
  5282. + $(call cmd,rmfiles)
  5283. + @find $(or $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \
  5284. + \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \
  5285. + -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
  5286. + -o -name '*.symtypes' -o -name 'modules.order' \
  5287. + -o -name modules.builtin -o -name '.tmp_*.o.*' \
  5288. + -o -name '*.gcno' \) -type f -print | xargs rm -f
  5289. +
  5290. +# Generate tags for editors
  5291. +# ---------------------------------------------------------------------------
  5292. +quiet_cmd_tags = GEN $@
  5293. + cmd_tags = $(CONFIG_SHELL) $(srctree)/scripts/tags.sh $@
  5294. +
  5295. +tags TAGS cscope: FORCE
  5296. + $(call cmd,tags)
  5297. +
  5298. +# Scripts to check various things for consistency
  5299. +# ---------------------------------------------------------------------------
  5300. +
  5301. +includecheck:
  5302. + find * $(RCS_FIND_IGNORE) \
  5303. + -name '*.[hcS]' -type f -print | sort \
  5304. + | xargs $(PERL) -w $(srctree)/scripts/checkincludes.pl
  5305. +
  5306. +versioncheck:
  5307. + find * $(RCS_FIND_IGNORE) \
  5308. + -name '*.[hcS]' -type f -print | sort \
  5309. + | xargs $(PERL) -w $(srctree)/scripts/checkversion.pl
  5310. +
  5311. +coccicheck:
  5312. + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/$@
  5313. +
  5314. +namespacecheck:
  5315. + $(PERL) $(srctree)/scripts/namespace.pl
  5316. +
  5317. +export_report:
  5318. + $(PERL) $(srctree)/scripts/export_report.pl
  5319. +
  5320. +endif #ifeq ($(config-targets),1)
  5321. +endif #ifeq ($(mixed-targets),1)
  5322. +
  5323. +PHONY += checkstack kernelrelease kernelversion
  5324. +
  5325. +# UML needs a little special treatment here. It wants to use the host
  5326. +# toolchain, so needs $(SUBARCH) passed to checkstack.pl. Everyone
  5327. +# else wants $(ARCH), including people doing cross-builds, which means
  5328. +# that $(SUBARCH) doesn't work here.
  5329. +ifeq ($(ARCH), um)
  5330. +CHECKSTACK_ARCH := $(SUBARCH)
  5331. +else
  5332. +CHECKSTACK_ARCH := $(ARCH)
  5333. +endif
  5334. +checkstack:
  5335. + $(OBJDUMP) -d vmlinux $$(find . -name '*.ko') | \
  5336. + $(PERL) $(src)/scripts/checkstack.pl $(CHECKSTACK_ARCH)
  5337. +
  5338. +kernelrelease:
  5339. + @echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))"
  5340. +
  5341. +kernelversion:
  5342. + @echo $(KERNELVERSION)
  5343. +
  5344. +# Single targets
  5345. +# ---------------------------------------------------------------------------
  5346. +# Single targets are compatible with:
  5347. +# - build with mixed source and output
  5348. +# - build with separate output dir 'make O=...'
  5349. +# - external modules
  5350. +#
  5351. +# target-dir => where to store outputfile
  5352. +# build-dir => directory in kernel source tree to use
  5353. +
  5354. +ifeq ($(KBUILD_EXTMOD),)
  5355. + build-dir = $(patsubst %/,%,$(dir $@))
  5356. + target-dir = $(dir $@)
  5357. +else
  5358. + zap-slash=$(filter-out .,$(patsubst %/,%,$(dir $@)))
  5359. + build-dir = $(KBUILD_EXTMOD)$(if $(zap-slash),/$(zap-slash))
  5360. + target-dir = $(if $(KBUILD_EXTMOD),$(dir $<),$(dir $@))
  5361. +endif
  5362. +
  5363. +%.s: %.c prepare scripts FORCE
  5364. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5365. +%.i: %.c prepare scripts FORCE
  5366. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5367. +%.o: %.c prepare scripts FORCE
  5368. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5369. +%.lst: %.c prepare scripts FORCE
  5370. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5371. +%.s: %.S prepare scripts FORCE
  5372. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5373. +%.o: %.S prepare scripts FORCE
  5374. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5375. +%.symtypes: %.c prepare scripts FORCE
  5376. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5377. +
  5378. +# Modules
  5379. +/: prepare scripts FORCE
  5380. + $(cmd_crmodverdir)
  5381. + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
  5382. + $(build)=$(build-dir)
  5383. +%/: prepare scripts FORCE
  5384. + $(cmd_crmodverdir)
  5385. + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
  5386. + $(build)=$(build-dir)
  5387. +%.ko: prepare scripts FORCE
  5388. + $(cmd_crmodverdir)
  5389. + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
  5390. + $(build)=$(build-dir) $(@:.ko=.o)
  5391. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
  5392. +
  5393. +# FIXME Should go into a make.lib or something
  5394. +# ===========================================================================
  5395. +
  5396. +quiet_cmd_rmdirs = $(if $(wildcard $(rm-dirs)),CLEAN $(wildcard $(rm-dirs)))
  5397. + cmd_rmdirs = rm -rf $(rm-dirs)
  5398. +
  5399. +quiet_cmd_rmfiles = $(if $(wildcard $(rm-files)),CLEAN $(wildcard $(rm-files)))
  5400. + cmd_rmfiles = rm -f $(rm-files)
  5401. +
  5402. +# Run depmod only if we have System.map and depmod is executable
  5403. +quiet_cmd_depmod = DEPMOD $(KERNELRELEASE)
  5404. + cmd_depmod = \
  5405. + if [ -r System.map -a -x $(DEPMOD) ]; then \
  5406. + $(DEPMOD) -ae -F System.map \
  5407. + $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) ) \
  5408. + $(KERNELRELEASE); \
  5409. + fi
  5410. +
  5411. +# Create temporary dir for module support files
  5412. +# clean it up only when building all modules
  5413. +cmd_crmodverdir = $(Q)mkdir -p $(MODVERDIR) \
  5414. + $(if $(KBUILD_MODULES),; rm -f $(MODVERDIR)/*)
  5415. +
  5416. +a_flags = -Wp,-MD,$(depfile) $(KBUILD_AFLAGS) $(AFLAGS_KERNEL) \
  5417. + $(KBUILD_AFLAGS_KERNEL) \
  5418. + $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(KBUILD_CPPFLAGS) \
  5419. + $(modkern_aflags) $(EXTRA_AFLAGS) $(AFLAGS_$(basetarget).o)
  5420. +
  5421. +quiet_cmd_as_o_S = AS $@
  5422. +cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $<
  5423. +
  5424. +# read all saved command lines
  5425. +
  5426. +targets := $(wildcard $(sort $(targets)))
  5427. +cmd_files := $(wildcard .*.cmd $(foreach f,$(targets),$(dir $(f)).$(notdir $(f)).cmd))
  5428. +
  5429. +ifneq ($(cmd_files),)
  5430. + $(cmd_files): ; # Do not try to update included dependency files
  5431. + include $(cmd_files)
  5432. +endif
  5433. +
  5434. +# Shorthand for $(Q)$(MAKE) -f scripts/Makefile.clean obj=dir
  5435. +# Usage:
  5436. +# $(Q)$(MAKE) $(clean)=dir
  5437. +clean := -f $(if $(KBUILD_SRC),$(srctree)/)scripts/Makefile.clean obj
  5438. +
  5439. +endif # skip-makefile
  5440. +
  5441. +PHONY += FORCE
  5442. +FORCE:
  5443. +
  5444. +# Declare the contents of the .PHONY variable as phony. We keep that
  5445. +# information in a variable so we can use it in if_changed and friends.
  5446. +.PHONY: $(PHONY)
  5447. diff -Nrupad linux-2.6.37//mm/cleancache.c linux-2.6.37_vanilla//mm/cleancache.c
  5448. --- linux-2.6.37//mm/cleancache.c 1970-01-01 01:00:00.000000000 +0100
  5449. +++ linux-2.6.37_vanilla//mm/cleancache.c 2011-02-14 01:21:43.171793147 +0100
  5450. @@ -0,0 +1,258 @@
  5451. +/*
  5452. + * Cleancache frontend
  5453. + *
  5454. + * This code provides the generic "frontend" layer to call a matching
  5455. + * "backend" driver implementation of cleancache. See
  5456. + * Documentation/vm/cleancache.txt for more information.
  5457. + *
  5458. + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
  5459. + * Author: Dan Magenheimer
  5460. + *
  5461. + * This work is licensed under the terms of the GNU GPL, version 2.
  5462. + */
  5463. +
  5464. +#include <linux/module.h>
  5465. +#include <linux/fs.h>
  5466. +#include <linux/exportfs.h>
  5467. +#include <linux/mm.h>
  5468. +#include <linux/cleancache.h>
  5469. +
  5470. +/*
  5471. + * This global enablement flag may be read thousands of times per second
  5472. + * by cleancache_get/put/flush even on systems where cleancache_ops
  5473. + * is not claimed (e.g. cleancache is config'ed on but remains
  5474. + * disabled), so it is preferred to the slower alternative: a function
  5475. + * call that checks a non-global.
  5476. + */
  5477. +int cleancache_enabled;
  5478. +EXPORT_SYMBOL(cleancache_enabled);
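The flag above is cheap precisely because the hooks test it inline before making any out-of-line call. A minimal sketch of that wrapper pattern follows, assuming the header pairs each exported __cleancache_* symbol with a thin inline; the wrapper name here is illustrative, not the patch's include/linux/cleancache.h verbatim:

    /*
     * Sketch only: the inline-wrapper pattern implied by the comment on
     * cleancache_enabled. The disabled case costs one branch on a global;
     * the real work stays out of line in __cleancache_get_page().
     */
    #include <linux/mm.h>
    #include <linux/cleancache.h>

    static inline int cleancache_get_page_inline(struct page *page)
    {
            int ret = -1;

            if (cleancache_enabled)
                    ret = __cleancache_get_page(page);
            return ret;
    }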
  5479. +
  5480. +/*
  5481. + * cleancache_ops is set by cleancache_ops_register to contain the pointers
  5482. + * to the cleancache "backend" implementation functions.
  5483. + */
  5484. +static struct cleancache_ops cleancache_ops;
  5485. +
  5486. +/* useful stats available in /sys/kernel/mm/cleancache */
  5487. +static unsigned long cleancache_succ_gets;
  5488. +static unsigned long cleancache_failed_gets;
  5489. +static unsigned long cleancache_puts;
  5490. +static unsigned long cleancache_flushes;
  5491. +
  5492. +/*
  5493. + * register operations for cleancache, returning the previous ops, thus allowing
  5494. + * detection of multiple backends and possible nesting
  5495. + */
  5496. +struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
  5497. +{
  5498. + struct cleancache_ops old = cleancache_ops;
  5499. +
  5500. + cleancache_ops = *ops;
  5501. + cleancache_enabled = 1;
  5502. + return old;
  5503. +}
  5504. +EXPORT_SYMBOL(cleancache_register_ops);
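To make the registration contract concrete, here is a hedged sketch of how a backend such as zcache might claim the ops. The member names match the function pointers this file dereferences; every my_backend_* symbol is a placeholder:

    #include <linux/module.h>
    #include <linux/cleancache.h>

    /* Hypothetical backend registration; my_backend_* are placeholders. */
    static struct cleancache_ops my_backend_ops = {
            .init_fs        = my_backend_init_fs,
            .init_shared_fs = my_backend_init_shared_fs,
            .get_page       = my_backend_get_page,
            .put_page       = my_backend_put_page,
            .flush_page     = my_backend_flush_page,
            .flush_inode    = my_backend_flush_inode,
            .flush_fs       = my_backend_flush_fs,
    };

    static int __init my_backend_init(void)
    {
            /* the previous ops are returned by value so a second backend
               can detect that someone registered before it */
            struct cleancache_ops old_ops =
                    cleancache_register_ops(&my_backend_ops);

            (void)old_ops;
            return 0;
    }
    module_init(my_backend_init);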
  5505. +
  5506. +/* Called by a cleancache-enabled filesystem at time of mount */
  5507. +void __cleancache_init_fs(struct super_block *sb)
  5508. +{
  5509. + sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
  5510. +}
  5511. +EXPORT_SYMBOL(__cleancache_init_fs);
  5512. +
  5513. +/* Called by a cleancache-enabled clustered filesystem at time of mount */
  5514. +void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
  5515. +{
  5516. + sb->cleancache_poolid =
  5517. + (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
  5518. +}
  5519. +EXPORT_SYMBOL(__cleancache_init_shared_fs);
  5520. +
  5521. +/*
  5522. + * If the filesystem uses exportable filehandles, use the filehandle as
  5523. + * the key, else use the inode number.
  5524. + */
  5525. +static int cleancache_get_key(struct inode *inode,
  5526. + struct cleancache_filekey *key)
  5527. +{
  5528. + int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
  5529. + int maxlen = CLEANCACHE_KEY_MAX;
  5530. + struct super_block *sb = inode->i_sb;
  5531. + struct dentry *d;
  5532. +
  5533. + key->u.ino = inode->i_ino;
  5534. + if (sb->s_export_op != NULL) {
  5535. + fhfn = sb->s_export_op->encode_fh;
  5536. + if (fhfn) {
  5537. + d = list_first_entry(&inode->i_dentry,
  5538. + struct dentry, d_alias);
  5539. + (void)(*fhfn)(d, &key->u.fh[0], &maxlen, 0);
  5540. + if (maxlen > CLEANCACHE_KEY_MAX)
  5541. + return -1;
  5542. + }
  5543. + }
  5544. + return 0;
  5545. +}
  5546. +
  5547. +/*
  5548. + * "Get" data from cleancache associated with the poolid/inode/index
  5549. + * that were specified when the data was put to cleancache and, if
  5550. + * successful, use it to fill the specified page with data and return 0.
  5551. + * The pageframe is left unchanged and -1 is returned if the get fails.
  5552. + * Page must be locked by caller.
  5553. + */
  5554. +int __cleancache_get_page(struct page *page)
  5555. +{
  5556. + int ret = -1;
  5557. + int pool_id;
  5558. + struct cleancache_filekey key = { .u.key = { 0 } };
  5559. +
  5560. + VM_BUG_ON(!PageLocked(page));
  5561. + pool_id = page->mapping->host->i_sb->cleancache_poolid;
  5562. + if (pool_id < 0)
  5563. + goto out;
  5564. +
  5565. + if (cleancache_get_key(page->mapping->host, &key) < 0)
  5566. + goto out;
  5567. +
  5568. + ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
  5569. + if (ret == 0)
  5570. + cleancache_succ_gets++;
  5571. + else
  5572. + cleancache_failed_gets++;
  5573. +out:
  5574. + return ret;
  5575. +}
  5576. +EXPORT_SYMBOL(__cleancache_get_page);
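For context, a hedged sketch of the consuming side: a read path asks cleancache for the page before paying for disk I/O. The function and hook placement are illustrative, not the exact page-cache hook this patch adds elsewhere:

    /* Illustrative caller: try the victim cache before real I/O. */
    static int readpage_with_cleancache(struct file *file, struct page *page)
    {
            /* page is locked here, as __cleancache_get_page() requires */
            if (cleancache_enabled && __cleancache_get_page(page) == 0) {
                    SetPageUptodate(page);  /* hit: filled from cleancache */
                    unlock_page(page);
                    return 0;
            }
            /* miss: fall back to the filesystem's normal readpage */
            return page->mapping->a_ops->readpage(file, page);
    }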
  5577. +
  5578. +/*
  5579. + * "Put" data from a page to cleancache and associate it with the
  5580. + * (previously-obtained per-filesystem) poolid and the page's
  5581. + * inode and page index. Page must be locked. Note that a put_page
  5582. + * always "succeeds", though a subsequent get_page may succeed or fail.
  5583. + */
  5584. +void __cleancache_put_page(struct page *page)
  5585. +{
  5586. + int pool_id;
  5587. + struct cleancache_filekey key = { .u.key = { 0 } };
  5588. +
  5589. + VM_BUG_ON(!PageLocked(page));
  5590. + pool_id = page->mapping->host->i_sb->cleancache_poolid;
  5591. + if (pool_id >= 0 &&
  5592. + cleancache_get_key(page->mapping->host, &key) >= 0) {
  5593. + (*cleancache_ops.put_page)(pool_id, key, page->index, page);
  5594. + cleancache_puts++;
  5595. + }
  5596. +}
  5597. +EXPORT_SYMBOL(__cleancache_put_page);
  5598. +
  5599. +/*
  5600. + * Flush any data from cleancache associated with the poolid and the
  5601. + * page's inode and page index so that a subsequent "get" will fail.
  5602. + */
  5603. +void __cleancache_flush_page(struct address_space *mapping, struct page *page)
  5604. +{
  5605. + /* careful... page->mapping is NULL sometimes when this is called */
  5606. + int pool_id = mapping->host->i_sb->cleancache_poolid;
  5607. + struct cleancache_filekey key = { .u.key = { 0 } };
  5608. +
  5609. + if (pool_id >= 0) {
  5610. + VM_BUG_ON(!PageLocked(page));
  5611. + if (cleancache_get_key(mapping->host, &key) >= 0) {
  5612. + (*cleancache_ops.flush_page)(pool_id, key, page->index);
  5613. + cleancache_flushes++;
  5614. + }
  5615. + }
  5616. +}
  5617. +EXPORT_SYMBOL(__cleancache_flush_page);
  5618. +
  5619. +/*
  5620. + * Flush all data from cleancache associated with the poolid and the
  5621. + * mapping's inode so that all subsequent gets to this poolid/inode
  5622. + * will fail.
  5623. + */
  5624. +void __cleancache_flush_inode(struct address_space *mapping)
  5625. +{
  5626. + int pool_id = mapping->host->i_sb->cleancache_poolid;
  5627. + struct cleancache_filekey key = { .u.key = { 0 } };
  5628. +
  5629. + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
  5630. + (*cleancache_ops.flush_inode)(pool_id, key);
  5631. +}
  5632. +EXPORT_SYMBOL(__cleancache_flush_inode);
  5633. +
  5634. +/*
  5635. + * Called by any cleancache-enabled filesystem at time of unmount;
  5636. + * note that pool_id is surrendered and may be returned by a subsequent
  5637. + * cleancache_init_fs or cleancache_init_shared_fs
  5638. + */
  5639. +void __cleancache_flush_fs(struct super_block *sb)
  5640. +{
  5641. + if (sb->cleancache_poolid >= 0) {
  5642. + int old_poolid = sb->cleancache_poolid;
  5643. + sb->cleancache_poolid = -1;
  5644. + (*cleancache_ops.flush_fs)(old_poolid);
  5645. + }
  5646. +}
  5647. +EXPORT_SYMBOL(__cleancache_flush_fs);
  5648. +
  5649. +#ifdef CONFIG_SYSFS
  5650. +
  5651. +/* see Documentation/ABI/testing/sysfs-kernel-mm-cleancache */
  5652. +
  5653. +#define CLEANCACHE_ATTR_RO(_name) \
  5654. + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  5655. +
  5656. +static ssize_t cleancache_succ_gets_show(struct kobject *kobj,
  5657. + struct kobj_attribute *attr, char *buf)
  5658. +{
  5659. + return sprintf(buf, "%lu\n", cleancache_succ_gets);
  5660. +}
  5661. +CLEANCACHE_ATTR_RO(cleancache_succ_gets);
  5662. +
  5663. +static ssize_t cleancache_failed_gets_show(struct kobject *kobj,
  5664. + struct kobj_attribute *attr, char *buf)
  5665. +{
  5666. + return sprintf(buf, "%lu\n", cleancache_failed_gets);
  5667. +}
  5668. +CLEANCACHE_ATTR_RO(cleancache_failed_gets);
  5669. +
  5670. +static ssize_t cleancache_puts_show(struct kobject *kobj,
  5671. + struct kobj_attribute *attr, char *buf)
  5672. +{
  5673. + return sprintf(buf, "%lu\n", cleancache_puts);
  5674. +}
  5675. +CLEANCACHE_ATTR_RO(cleancache_puts);
  5676. +
  5677. +static ssize_t cleancache_flushes_show(struct kobject *kobj,
  5678. + struct kobj_attribute *attr, char *buf)
  5679. +{
  5680. + return sprintf(buf, "%lu\n", cleancache_flushes);
  5681. +}
  5682. +CLEANCACHE_ATTR_RO(cleancache_flushes);
  5683. +
  5684. +static struct attribute *cleancache_attrs[] = {
  5685. + &cleancache_succ_gets_attr.attr,
  5686. + &cleancache_failed_gets_attr.attr,
  5687. + &cleancache_puts_attr.attr,
  5688. + &cleancache_flushes_attr.attr,
  5689. + NULL,
  5690. +};
  5691. +
  5692. +static struct attribute_group cleancache_attr_group = {
  5693. + .attrs = cleancache_attrs,
  5694. + .name = "cleancache",
  5695. +};
  5696. +
  5697. +#endif /* CONFIG_SYSFS */
  5698. +
  5699. +static int __init init_cleancache(void)
  5700. +{
  5701. +#ifdef CONFIG_SYSFS
  5702. + int err;
  5703. +
  5704. + err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
  5705. +#endif /* CONFIG_SYSFS */
  5706. + return 0;
  5707. +}
  5708. +module_init(init_cleancache);
  5709. diff -Nrupad linux-2.6.37//mm/filemap.c linux-2.6.37_vanilla//mm/filemap.c
  5710. --- linux-2.6.37//mm/filemap.c 2011-01-05 01:50:19.000000000 +0100
  5711. +++ linux-2.6.37_vanilla//mm/filemap.c 2011-02-14 01:21:43.172793144 +0100
  5712. @@ -34,6 +34,7 @@
  5713. #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
  5714. #include <linux/memcontrol.h>
  5715. #include <linux/mm_inline.h> /* for page_is_file_cache() */
  5716. +#include <linux/cleancache.h>
  5717. #include "internal.h"
  5718.  
  5719. /*
  5720. @@ -119,6 +120,16 @@ void __remove_from_page_cache(struct pag
  5721. {
  5722. struct address_space *mapping = page->mapping;
  5723.  
  5724. + /*
  5725. + * if we're uptodate, copy the page out to cleancache; otherwise
  5726. + * invalidate any existing cleancache entries. We can't leave
  5727. + * stale data around in the cleancache once our page is gone
  5728. + */
  5729. + if (PageUptodate(page))
  5730. + cleancache_put_page(page);
  5731. + else
  5732. + cleancache_flush_page(mapping, page);
  5733. +
  5734. radix_tree_delete(&mapping->page_tree, page->index);
  5735. page->mapping = NULL;
  5736. mapping->nrpages--;
  5737. diff -Nrupad linux-2.6.37//mm/frontswap.c linux-2.6.37_vanilla//mm/frontswap.c
  5738. --- linux-2.6.37//mm/frontswap.c 1970-01-01 01:00:00.000000000 +0100
  5739. +++ linux-2.6.37_vanilla//mm/frontswap.c 2011-02-14 01:21:43.172793144 +0100
  5740. @@ -0,0 +1,331 @@
  5741. +/*
  5742. + * Frontswap frontend
  5743. + *
  5744. + * This code provides the generic "frontend" layer to call a matching
  5745. + * "backend" driver implementation of frontswap. See
  5746. + * Documentation/vm/frontswap.txt for more information.
  5747. + *
  5748. + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
  5749. + * Author: Dan Magenheimer
  5750. + *
  5751. + * This work is licensed under the terms of the GNU GPL, version 2.
  5752. + */
  5753. +
  5754. +#include <linux/mm.h>
  5755. +#include <linux/mman.h>
  5756. +#include <linux/sysctl.h>
  5757. +#include <linux/swap.h>
  5758. +#include <linux/swapops.h>
  5759. +#include <linux/proc_fs.h>
  5760. +#include <linux/security.h>
  5761. +#include <linux/capability.h>
  5762. +#include <linux/module.h>
  5763. +#include <linux/uaccess.h>
  5764. +#include <linux/frontswap.h>
  5765. +#include <linux/swapfile.h>
  5766. +
  5767. +/*
  5768. + * frontswap_ops is set by frontswap_register_ops to contain the pointers
  5769. + * to the frontswap "backend" implementation functions.
  5770. + */
  5771. +static struct frontswap_ops frontswap_ops;
  5772. +
  5773. +/*
  5774. + * This global enablement flag reduces overhead on systems where frontswap_ops
  5775. + * has not been registered, so is preferred to the slower alternative: a
  5776. + * function call that checks a non-global.
  5777. + */
  5778. +int frontswap_enabled;
  5779. +EXPORT_SYMBOL(frontswap_enabled);
  5780. +
  5781. +/* useful stats available in /sys/kernel/mm/frontswap */
  5782. +static unsigned long frontswap_gets;
  5783. +static unsigned long frontswap_succ_puts;
  5784. +static unsigned long frontswap_failed_puts;
  5785. +static unsigned long frontswap_flushes;
  5786. +
  5787. +/*
  5788. + * register operations for frontswap, returning the previous ops to allow
  5789. + * detection of multiple backends and possible nesting
  5790. + */
  5791. +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
  5792. +{
  5793. + struct frontswap_ops old = frontswap_ops;
  5794. +
  5795. + frontswap_ops = *ops;
  5796. + frontswap_enabled = 1;
  5797. + return old;
  5798. +}
  5799. +EXPORT_SYMBOL(frontswap_register_ops);
  5800. +
  5801. +/* Called when a swap device is swapon'd */
  5802. +void frontswap_init(unsigned type)
  5803. +{
  5804. + if (frontswap_enabled)
  5805. + (*frontswap_ops.init)(type);
  5806. +}
  5807. +EXPORT_SYMBOL(frontswap_init);
  5808. +
  5809. +/*
  5810. + * "Put" data from a page to frontswap and associate it with the page's
  5811. + * swaptype and offset. Page must be locked and in the swap cache.
  5812. + * If frontswap already contains a page with matching swaptype and
  5813. + * offset, the frontswap implementation may either overwrite the data
  5814. + * and return success or flush the page from frontswap and return failure.
  5815. + */
  5816. +int __frontswap_put_page(struct page *page)
  5817. +{
  5818. + int ret = -1, dup = 0;
  5819. + swp_entry_t entry = { .val = page_private(page), };
  5820. + int type = swp_type(entry);
  5821. + struct swap_info_struct *sis = swap_info[type];
  5822. + pgoff_t offset = swp_offset(entry);
  5823. +
  5824. + BUG_ON(!PageLocked(page));
  5825. + if (frontswap_test(sis, offset))
  5826. + dup = 1;
  5827. + ret = (*frontswap_ops.put_page)(type, offset, page);
  5828. + if (ret == 0) {
  5829. + frontswap_set(sis, offset);
  5830. + frontswap_succ_puts++;
  5831. + if (!dup)
  5832. + sis->frontswap_pages++;
  5833. + } else if (dup) {
  5834. + /*
  5835. + * a failed dup always results in an automatic flush of
  5836. + * the (older) page from frontswap
  5837. + */
  5838. + frontswap_clear(sis, offset);
  5839. + sis->frontswap_pages--;
  5840. + frontswap_failed_puts++;
  5841. + } else
  5842. + frontswap_failed_puts++;
  5843. + return ret;
  5844. +}
  5845. +
  5846. +/*
  5847. + * "Get" data from frontswap associated with swaptype and offset that were
  5848. + * specified when the data was put to frontswap and use it to fill the
  5849. + * specified page with data. Page must be locked and in the swap cache.
  5850. + */
  5851. +int __frontswap_get_page(struct page *page)
  5852. +{
  5853. + int ret = -1;
  5854. + swp_entry_t entry = { .val = page_private(page), };
  5855. + int type = swp_type(entry);
  5856. + struct swap_info_struct *sis = swap_info[type];
  5857. + pgoff_t offset = swp_offset(entry);
  5858. +
  5859. + BUG_ON(!PageLocked(page));
  5860. + if (frontswap_test(sis, offset))
  5861. + ret = (*frontswap_ops.get_page)(type, offset, page);
  5862. + if (ret == 0)
  5863. + frontswap_gets++;
  5864. + return ret;
  5865. +}
  5866. +
  5867. +/*
  5868. + * Flush any data from frontswap associated with the specified swaptype
  5869. + * and offset so that a subsequent "get" will fail.
  5870. + */
  5871. +void __frontswap_flush_page(unsigned type, pgoff_t offset)
  5872. +{
  5873. + struct swap_info_struct *sis = swap_info[type];
  5874. +
  5875. + if (frontswap_test(sis, offset)) {
  5876. + (*frontswap_ops.flush_page)(type, offset);
  5877. + sis->frontswap_pages--;
  5878. + frontswap_clear(sis, offset);
  5879. + frontswap_flushes++;
  5880. + }
  5881. +}
  5882. +
  5883. +/*
  5884. + * Flush all data from frontswap associated with all offsets for the
  5885. + * specified swaptype.
  5886. + */
  5887. +void __frontswap_flush_area(unsigned type)
  5888. +{
  5889. + struct swap_info_struct *sis = swap_info[type];
  5890. +
  5891. + (*frontswap_ops.flush_area)(type);
  5892. + sis->frontswap_pages = 0;
  5893. + memset(sis->frontswap_map, 0, sis->max / sizeof(long));
  5894. +}
  5895. +
  5896. +/*
  5897. + * Frontswap, like a true swap device, may unnecessarily retain pages
  5898. + * under certain circumstances; "shrink" frontswap is essentially a
  5899. + * "partial swapoff" and works by calling try_to_unuse to attempt to
  5900. + * unuse enough frontswap pages to attempt to -- subject to memory
  5901. + * constraints -- reduce the number of pages in frontswap
  5902. + */
  5903. +void frontswap_shrink(unsigned long target_pages)
  5904. +{
  5905. + int wrapped = 0;
  5906. + bool locked = false;
  5907. +
  5908. + for (wrapped = 0; wrapped <= 3; wrapped++) {
  5909. +
  5910. + struct swap_info_struct *si = NULL;
  5911. + unsigned long total_pages = 0, total_pages_to_unuse;
  5912. + unsigned long pages = 0, unuse_pages = 0;
  5913. + int type;
  5914. +
  5915. + /*
  5916. + * we don't want to hold swap_lock while doing a very
  5917. + * lengthy try_to_unuse, but swap_list may change
  5918. + * so restart scan from swap_list.head each time
  5919. + */
  5920. + spin_lock(&swap_lock);
  5921. + locked = true;
  5922. + total_pages = 0;
  5923. + for (type = swap_list.head; type >= 0; type = si->next) {
  5924. + si = swap_info[type];
  5925. + total_pages += si->frontswap_pages;
  5926. + }
  5927. + if (total_pages <= target_pages)
  5928. + goto out;
  5929. + total_pages_to_unuse = total_pages - target_pages;
  5930. + for (type = swap_list.head; type >= 0; type = si->next) {
  5931. + si = swap_info[type];
  5932. + if (total_pages_to_unuse < si->frontswap_pages)
  5933. + pages = unuse_pages = total_pages_to_unuse;
  5934. + else {
  5935. + pages = si->frontswap_pages;
  5936. + unuse_pages = 0; /* unuse all */
  5937. + }
  5938. + if (security_vm_enough_memory_kern(pages))
  5939. + continue;
  5940. + vm_unacct_memory(pages);
  5941. + break;
  5942. + }
  5943. + if (type < 0)
  5944. + goto out;
  5945. + locked = false;
  5946. + spin_unlock(&swap_lock);
  5947. + try_to_unuse(type, true, unuse_pages);
  5948. + }
  5949. +
  5950. +out:
  5951. + if (locked)
  5952. + spin_unlock(&swap_lock);
  5953. + return;
  5954. +}
  5955. +EXPORT_SYMBOL(frontswap_shrink);
  5956. +
  5957. +/*
  5958. + * count and return the number of frontswap pages across all
  5959. + * swap devices. This is exported so that a kernel module can
  5960. + * determine current usage without reading sysfs.
  5961. + */
  5962. +unsigned long frontswap_curr_pages(void)
  5963. +{
  5964. + int type;
  5965. + unsigned long totalpages = 0;
  5966. + struct swap_info_struct *si = NULL;
  5967. +
  5968. + spin_lock(&swap_lock);
  5969. + for (type = swap_list.head; type >= 0; type = si->next) {
  5970. + si = swap_info[type];
  5971. + totalpages += si->frontswap_pages;
  5972. + }
  5973. + spin_unlock(&swap_lock);
  5974. + return totalpages;
  5975. +}
  5976. +EXPORT_SYMBOL(frontswap_curr_pages);
  5977. +
  5978. +#ifdef CONFIG_SYSFS
  5979. +
  5980. +/* see Documentation/ABI/testing/sysfs-kernel-mm-frontswap */
  5981. +
  5982. +#define FRONTSWAP_ATTR_RO(_name) \
  5983. + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  5984. +#define FRONTSWAP_ATTR(_name) \
  5985. + static struct kobj_attribute _name##_attr = \
  5986. + __ATTR(_name, 0644, _name##_show, _name##_store)
  5987. +
  5988. +static ssize_t curr_pages_show(struct kobject *kobj,
  5989. + struct kobj_attribute *attr, char *buf)
  5990. +{
  5991. + return sprintf(buf, "%lu\n", frontswap_curr_pages());
  5992. +}
  5993. +
  5994. +static ssize_t curr_pages_store(struct kobject *kobj,
  5995. + struct kobj_attribute *attr,
  5996. + const char *buf, size_t count)
  5997. +{
  5998. + unsigned long target_pages;
  5999. + int err;
  6000. +
  6001. + err = strict_strtoul(buf, 10, &target_pages);
  6002. + if (err)
  6003. + return -EINVAL;
  6004. +
  6005. + frontswap_shrink(target_pages);
  6006. +
  6007. + return count;
  6008. +}
  6009. +FRONTSWAP_ATTR(curr_pages);
  6010. +
  6011. +static ssize_t succ_puts_show(struct kobject *kobj,
  6012. + struct kobj_attribute *attr, char *buf)
  6013. +{
  6014. + return sprintf(buf, "%lu\n", frontswap_succ_puts);
  6015. +}
  6016. +FRONTSWAP_ATTR_RO(succ_puts);
  6017. +
  6018. +static ssize_t failed_puts_show(struct kobject *kobj,
  6019. + struct kobj_attribute *attr, char *buf)
  6020. +{
  6021. + return sprintf(buf, "%lu\n", frontswap_failed_puts);
  6022. +}
  6023. +FRONTSWAP_ATTR_RO(failed_puts);
  6024. +
  6025. +static ssize_t gets_show(struct kobject *kobj,
  6026. + struct kobj_attribute *attr, char *buf)
  6027. +{
  6028. + return sprintf(buf, "%lu\n", frontswap_gets);
  6029. +}
  6030. +FRONTSWAP_ATTR_RO(gets);
  6031. +
  6032. +static ssize_t flushes_show(struct kobject *kobj,
  6033. + struct kobj_attribute *attr, char *buf)
  6034. +{
  6035. + return sprintf(buf, "%lu\n", frontswap_flushes);
  6036. +}
  6037. +FRONTSWAP_ATTR_RO(flushes);
  6038. +
  6039. +static struct attribute *frontswap_attrs[] = {
  6040. + &curr_pages_attr.attr,
  6041. + &succ_puts_attr.attr,
  6042. + &failed_puts_attr.attr,
  6043. + &gets_attr.attr,
  6044. + &flushes_attr.attr,
  6045. + NULL,
  6046. +};
  6047. +
  6048. +static struct attribute_group frontswap_attr_group = {
  6049. + .attrs = frontswap_attrs,
  6050. + .name = "frontswap",
  6051. +};
  6052. +
  6053. +#endif /* CONFIG_SYSFS */
  6054. +
  6055. +static int __init init_frontswap(void)
  6056. +{
  6057. +#ifdef CONFIG_SYSFS
  6058. + int err;
  6059. +
  6060. + err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
  6061. +#endif /* CONFIG_SYSFS */
  6062. + return 0;
  6063. +}
  6064. +
  6065. +static void __exit exit_frontswap(void)
  6066. +{
  6067. + frontswap_shrink(0UL);
  6068. +}
  6069. +
  6070. +module_init(init_frontswap);
  6071. +module_exit(exit_frontswap);
  6072. diff -Nrupad linux-2.6.37//mm/Kconfig linux-2.6.37_vanilla//mm/Kconfig
  6073. --- linux-2.6.37//mm/Kconfig 2011-01-05 01:50:19.000000000 +0100
  6074. +++ linux-2.6.37_vanilla//mm/Kconfig 2011-02-14 01:21:43.172793144 +0100
  6075. @@ -309,3 +309,41 @@ config NEED_PER_CPU_KM
  6076. depends on !SMP
  6077. bool
  6078. default y
  6079. +
  6080. +config CLEANCACHE
  6081. + bool "Enable cleancache pseudo-RAM driver to cache clean pages"
  6082. + default y
  6083. + help
  6084. + Cleancache can be thought of as a page-granularity victim cache
  6085. + for clean pages that the kernel's pageframe replacement algorithm
  6086. + (PFRA) would like to keep around, but can't since there isn't enough
  6087. + memory. So when the PFRA "evicts" a page, it first attempts to put
  6088. + it into a synchronous concurrency-safe page-oriented pseudo-RAM
  6089. + device (such as Xen's Transcendent Memory, aka "tmem") which is not
  6090. + directly accessible or addressable by the kernel and is of unknown
  6091. + (and possibly time-varying) size. And when a cleancache-enabled
  6092. + filesystem wishes to access a page in a file on disk, it first
  6093. + checks cleancache to see if it already contains it; if it does,
  6094. + the page is copied into the kernel and a disk access is avoided.
  6095. + When a pseudo-RAM device is available, a significant I/O reduction
  6096. + may be achieved. When none is available, all cleancache calls
  6097. + are reduced to a single pointer-compare-against-NULL resulting
  6098. + in a negligible performance hit.
  6099. +
  6100. + If unsure, say Y to enable cleancache.
  6101. +
  6102. +config FRONTSWAP
  6103. + bool "Enable frontswap pseudo-RAM driver to cache swap pages"
  6104. + default y
  6105. + help
  6106. + Frontswap is so named because it can be thought of as the opposite of
  6107. + a "backing" store for a swap device. The storage is assumed to be
  6108. + a synchronous concurrency-safe page-oriented pseudo-RAM device (such
  6109. + as Xen's Transcendent Memory, aka "tmem") which is not directly
  6110. + accessible or addressable by the kernel and is of unknown (and
  6111. + possibly time-varying) size. When a pseudo-RAM device is available,
  6112. + a signficant swap I/O reduction may be achieved. When none is
  6113. + available, all frontswap calls are reduced to a single pointer-
  6114. + compare-against-NULL resulting in a negligible performance hit.
  6115. +
  6116. + If unsure, say Y to enable frontswap.
  6117. diff -Nrupad linux-2.6.37//mm/Makefile linux-2.6.37_vanilla//mm/Makefile
  6118. --- linux-2.6.37//mm/Makefile 2011-01-05 01:50:19.000000000 +0100
  6119. +++ linux-2.6.37_vanilla//mm/Makefile 2011-02-14 01:21:43.172793144 +0100
  6120. @@ -19,6 +19,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.
  6121.  
  6122. obj-$(CONFIG_BOUNCE) += bounce.o
  6123. obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
  6124. +obj-$(CONFIG_FRONTSWAP) += frontswap.o
  6125. obj-$(CONFIG_HAS_DMA) += dmapool.o
  6126. obj-$(CONFIG_HUGETLBFS) += hugetlb.o
  6127. obj-$(CONFIG_NUMA) += mempolicy.o
  6128. @@ -42,3 +43,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-f
  6129. obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
  6130. obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
  6131. obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
  6132. +obj-$(CONFIG_CLEANCACHE) += cleancache.o
  6134. diff -Nrupad linux-2.6.37//mm/page_io.c linux-2.6.37_vanilla//mm/page_io.c
  6135. --- linux-2.6.37//mm/page_io.c 2011-01-05 01:50:19.000000000 +0100
  6136. +++ linux-2.6.37_vanilla//mm/page_io.c 2011-02-14 01:21:43.172793144 +0100
  6137. @@ -18,6 +18,7 @@
  6138. #include <linux/bio.h>
  6139. #include <linux/swapops.h>
  6140. #include <linux/writeback.h>
  6141. +#include <linux/frontswap.h>
  6142. #include <asm/pgtable.h>
  6143.  
  6144. static struct bio *get_swap_bio(gfp_t gfp_flags,
  6145. @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, st
  6146. unlock_page(page);
  6147. goto out;
  6148. }
  6149. + if (frontswap_put_page(page) == 0) {
  6150. + set_page_writeback(page);
  6151. + unlock_page(page);
  6152. + end_page_writeback(page);
  6153. + goto out;
  6154. + }
  6155. bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
  6156. if (bio == NULL) {
  6157. set_page_dirty(page);
  6158. @@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
  6159.  
  6160. VM_BUG_ON(!PageLocked(page));
  6161. VM_BUG_ON(PageUptodate(page));
  6162. + if (frontswap_get_page(page) == 0) {
  6163. + SetPageUptodate(page);
  6164. + unlock_page(page);
  6165. + goto out;
  6166. + }
  6167. bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
  6168. if (bio == NULL) {
  6169. unlock_page(page);
  6170. diff -Nrupad linux-2.6.37//mm/swapfile.c linux-2.6.37_vanilla//mm/swapfile.c
  6171. --- linux-2.6.37//mm/swapfile.c 2011-01-05 01:50:19.000000000 +0100
  6172. +++ linux-2.6.37_vanilla//mm/swapfile.c 2011-02-14 01:21:43.173793142 +0100
  6173. @@ -31,6 +31,8 @@
  6174. #include <linux/syscalls.h>
  6175. #include <linux/memcontrol.h>
  6176. #include <linux/poll.h>
  6177. +#include <linux/frontswap.h>
  6178. +#include <linux/swapfile.h>
  6179.  
  6180. #include <asm/pgtable.h>
  6181. #include <asm/tlbflush.h>
  6182. @@ -42,7 +44,7 @@ static bool swap_count_continued(struct
  6183. static void free_swap_count_continuations(struct swap_info_struct *);
  6184. static sector_t map_swap_entry(swp_entry_t, struct block_device**);
  6185.  
  6186. -static DEFINE_SPINLOCK(swap_lock);
  6187. +DEFINE_SPINLOCK(swap_lock);
  6188. static unsigned int nr_swapfiles;
  6189. long nr_swap_pages;
  6190. long total_swap_pages;
  6191. @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unuse
  6192. static const char Bad_offset[] = "Bad swap offset entry ";
  6193. static const char Unused_offset[] = "Unused swap offset entry ";
  6194.  
  6195. -static struct swap_list_t swap_list = {-1, -1};
  6196. +struct swap_list_t swap_list = {-1, -1};
  6197.  
  6198. -static struct swap_info_struct *swap_info[MAX_SWAPFILES];
  6199. +struct swap_info_struct *swap_info[MAX_SWAPFILES];
  6200.  
  6201. static DEFINE_MUTEX(swapon_mutex);
  6202.  
  6203. @@ -589,6 +591,7 @@ static unsigned char swap_entry_free(str
  6204. swap_list.next = p->type;
  6205. nr_swap_pages++;
  6206. p->inuse_pages--;
  6207. + frontswap_flush_page(p->type, offset);
  6208. if ((p->flags & SWP_BLKDEV) &&
  6209. disk->fops->swap_slot_free_notify)
  6210. disk->fops->swap_slot_free_notify(p->bdev, offset);
  6211. @@ -1052,7 +1055,7 @@ static int unuse_mm(struct mm_struct *mm
  6212. * Recycle to start on reaching the end, returning 0 when empty.
  6213. */
  6214. static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  6215. - unsigned int prev)
  6216. + unsigned int prev, bool frontswap)
  6217. {
  6218. unsigned int max = si->max;
  6219. unsigned int i = prev;
  6220. @@ -1078,6 +1081,12 @@ static unsigned int find_next_to_unuse(s
  6221. prev = 0;
  6222. i = 1;
  6223. }
  6224. + if (frontswap) {
  6225. + if (frontswap_test(si, i))
  6226. + break;
  6227. + else
  6228. + continue;
  6229. + }
  6230. count = si->swap_map[i];
  6231. if (count && swap_count(count) != SWAP_MAP_BAD)
  6232. break;
  6233. @@ -1089,8 +1098,12 @@ static unsigned int find_next_to_unuse(s
  6234. * We completely avoid races by reading each swap page in advance,
  6235. * and then search for the process using it. All the necessary
  6236. * page table adjustments can then be made atomically.
  6237. + *
  6238. + * if the boolean frontswap is true, only unuse pages_to_unuse pages;
  6239. + * pages_to_unuse==0 means all pages
  6240. */
  6241. -static int try_to_unuse(unsigned int type)
  6242. +int try_to_unuse(unsigned int type, bool frontswap,
  6243. + unsigned long pages_to_unuse)
  6244. {
  6245. struct swap_info_struct *si = swap_info[type];
  6246. struct mm_struct *start_mm;
  6247. @@ -1123,7 +1136,7 @@ static int try_to_unuse(unsigned int typ
  6248. * one pass through swap_map is enough, but not necessarily:
  6249. * there are races when an instance of an entry might be missed.
  6250. */
  6251. - while ((i = find_next_to_unuse(si, i)) != 0) {
  6252. + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
  6253. if (signal_pending(current)) {
  6254. retval = -EINTR;
  6255. break;
  6256. @@ -1290,6 +1303,10 @@ static int try_to_unuse(unsigned int typ
  6257. * interactive performance.
  6258. */
  6259. cond_resched();
  6260. + if (frontswap && pages_to_unuse > 0) {
  6261. + if (!--pages_to_unuse)
  6262. + break;
  6263. + }
  6264. }
  6265.  
  6266. mmput(start_mm);
  6267. @@ -1615,7 +1632,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
  6268. spin_unlock(&swap_lock);
  6269.  
  6270. current->flags |= PF_OOM_ORIGIN;
  6271. - err = try_to_unuse(type);
  6272. + err = try_to_unuse(type, false, 0);
  6273. current->flags &= ~PF_OOM_ORIGIN;
  6274.  
  6275. if (err) {
  6276. @@ -1667,9 +1684,12 @@ SYSCALL_DEFINE1(swapoff, const char __us
  6277. swap_map = p->swap_map;
  6278. p->swap_map = NULL;
  6279. p->flags = 0;
  6280. + frontswap_flush_area(type);
  6281. spin_unlock(&swap_lock);
  6282. mutex_unlock(&swapon_mutex);
  6283. vfree(swap_map);
  6284. + if (p->frontswap_map)
  6285. + vfree(p->frontswap_map);
  6286. /* Destroy swap account informatin */
  6287. swap_cgroup_swapoff(type);
  6288.  
  6289. @@ -1864,6 +1884,7 @@ SYSCALL_DEFINE2(swapon, const char __use
  6290. unsigned long maxpages;
  6291. unsigned long swapfilepages;
  6292. unsigned char *swap_map = NULL;
  6293. + unsigned long *frontswap_map = NULL;
  6294. struct page *page = NULL;
  6295. struct inode *inode = NULL;
  6296. int did_down = 0;
  6297. @@ -2085,6 +2106,12 @@ SYSCALL_DEFINE2(swapon, const char __use
  6298. error = -EINVAL;
  6299. goto bad_swap;
  6300. }
  6301. + /* frontswap enabled? set up bit-per-page map for frontswap */
  6302. + if (frontswap_enabled) {
  6303. + frontswap_map = vmalloc(maxpages / sizeof(long));
  6304. + if (frontswap_map)
  6305. + memset(frontswap_map, 0, maxpages / sizeof(long));
  6306. + }
  6307.  
  6308. if (p->bdev) {
  6309. if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
  6310. @@ -2103,16 +2130,18 @@ SYSCALL_DEFINE2(swapon, const char __use
  6311. else
  6312. p->prio = --least_priority;
  6313. p->swap_map = swap_map;
  6314. + p->frontswap_map = frontswap_map;
  6315. p->flags |= SWP_WRITEOK;
  6316. nr_swap_pages += nr_good_pages;
  6317. total_swap_pages += nr_good_pages;
  6318.  
  6319. printk(KERN_INFO "Adding %uk swap on %s. "
  6320. - "Priority:%d extents:%d across:%lluk %s%s\n",
  6321. + "Priority:%d extents:%d across:%lluk %s%s%s\n",
  6322. nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
  6323. nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
  6324. (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
  6325. - (p->flags & SWP_DISCARDABLE) ? "D" : "");
  6326. + (p->flags & SWP_DISCARDABLE) ? "D" : "",
  6327. + (p->frontswap_map) ? "FS" : "");
  6328.  
  6329. /* insert swap space into swap_list: */
  6330. prev = -1;
  6331. @@ -2126,6 +2155,7 @@ SYSCALL_DEFINE2(swapon, const char __use
  6332. swap_list.head = swap_list.next = type;
  6333. else
  6334. swap_info[prev]->next = type;
  6335. + frontswap_init(type);
  6336. spin_unlock(&swap_lock);
  6337. mutex_unlock(&swapon_mutex);
  6338. atomic_inc(&proc_poll_event);
  6339. @@ -2313,6 +2343,10 @@ int valid_swaphandles(swp_entry_t entry,
  6340. base++;
  6341.  
  6342. spin_lock(&swap_lock);
  6343. + if (frontswap_test(si, target)) {
  6344. + spin_unlock(&swap_lock);
  6345. + return 0;
  6346. + }
  6347. if (end > si->max) /* don't go beyond end of map */
  6348. end = si->max;
  6349.  
  6350. @@ -2323,6 +2357,9 @@ int valid_swaphandles(swp_entry_t entry,
  6351. break;
  6352. if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
  6353. break;
  6354. + /* Don't read in frontswap pages */
  6355. + if (frontswap_test(si, toff))
  6356. + break;
  6357. }
  6358. /* Count contiguous allocated slots below our target */
  6359. for (toff = target; --toff >= base; nr_pages++) {
  6360. @@ -2331,6 +2368,9 @@ int valid_swaphandles(swp_entry_t entry,
  6361. break;
  6362. if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
  6363. break;
  6364. + /* Don't read in frontswap pages */
  6365. + if (frontswap_test(si, toff))
  6366. + break;
  6367. }
  6368. spin_unlock(&swap_lock);
  6369.  
  6370. diff -Nrupad linux-2.6.37//mm/truncate.c linux-2.6.37_vanilla//mm/truncate.c
  6371. --- linux-2.6.37//mm/truncate.c 2011-01-05 01:50:19.000000000 +0100
  6372. +++ linux-2.6.37_vanilla//mm/truncate.c 2011-02-14 01:21:43.174793140 +0100
  6373. @@ -19,6 +19,7 @@
  6374. #include <linux/task_io_accounting_ops.h>
  6375. #include <linux/buffer_head.h> /* grr. try_to_release_page,
  6376. do_invalidatepage */
  6377. +#include <linux/cleancache.h>
  6378. #include "internal.h"
  6379.  
  6380.  
  6381. @@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page
  6382. static inline void truncate_partial_page(struct page *page, unsigned partial)
  6383. {
  6384. zero_user_segment(page, partial, PAGE_CACHE_SIZE);
  6385. + cleancache_flush_page(page->mapping, page);
  6386. if (page_has_private(page))
  6387. do_invalidatepage(page, partial);
  6388. }
  6389. @@ -108,6 +110,10 @@ truncate_complete_page(struct address_sp
  6390. clear_page_mlock(page);
  6391. remove_from_page_cache(page);
  6392. ClearPageMappedToDisk(page);
  6393. + /* this must be after remove_from_page_cache, which
  6394. + * calls cleancache_put_page (and note page->mapping is now NULL)
  6395. + */
  6396. + cleancache_flush_page(mapping, page);
  6397. page_cache_release(page); /* pagecache ref */
  6398. return 0;
  6399. }
  6400. @@ -215,6 +221,7 @@ void truncate_inode_pages_range(struct a
  6401. pgoff_t next;
  6402. int i;
  6403.  
  6404. + cleancache_flush_inode(mapping);
  6405. if (mapping->nrpages == 0)
  6406. return;
  6407.  
  6408. @@ -290,6 +297,7 @@ void truncate_inode_pages_range(struct a
  6409. pagevec_release(&pvec);
  6410. mem_cgroup_uncharge_end();
  6411. }
  6412. + cleancache_flush_inode(mapping);
  6413. }
  6414. EXPORT_SYMBOL(truncate_inode_pages_range);
  6415.  
  6416. @@ -432,6 +440,7 @@ int invalidate_inode_pages2_range(struct
  6417. int did_range_unmap = 0;
  6418. int wrapped = 0;
  6419.  
  6420. + cleancache_flush_inode(mapping);
  6421. pagevec_init(&pvec, 0);
  6422. next = start;
  6423. while (next <= end && !wrapped &&
  6424. @@ -490,6 +499,7 @@ int invalidate_inode_pages2_range(struct
  6425. mem_cgroup_uncharge_end();
  6426. cond_resched();
  6427. }
  6428. + cleancache_flush_inode(mapping);
  6429. return ret;
  6430. }
  6431. EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);