2.6.37_zcache_V2.patch

a guest
Feb 13th, 2011
594
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 215.49 KB | None | 0 0
  1. diff -Nrupad linux-2.6.37//arch/x86/kvm/vmx.c linux-2.6.37_vanilla//arch/x86/kvm/vmx.c
  2. --- linux-2.6.37//arch/x86/kvm/vmx.c 2011-01-05 01:50:19.000000000 +0100
  3. +++ linux-2.6.37_vanilla//arch/x86/kvm/vmx.c 2011-02-14 01:20:15.814793213 +0100
  4. @@ -563,7 +563,7 @@ static inline void ept_sync_individual_a
  5. }
  6. }
  7.  
  8. -static unsigned long vmcs_readl(unsigned long field)
  9. +static noinline unsigned long vmcs_readl(unsigned long field)
  10. {
  11. unsigned long value;
  12.  
  13. diff -Nrupad linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-cleancache linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-cleancache
  14. --- linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-cleancache 1970-01-01 01:00:00.000000000 +0100
  15. +++ linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-cleancache 2011-02-14 01:21:43.156792902 +0100
  16. @@ -0,0 +1,11 @@
  17. +What: /sys/kernel/mm/cleancache/
  18. +Date: June 2010
  19. +Contact: Dan Magenheimer <dan.magenheimer@oracle.com>
  20. +Description:
  21. + /sys/kernel/mm/cleancache/ contains a number of files which
  22. + record a count of various cleancache operations
  23. + (sum across all filesystems):
  24. + succ_gets
  25. + failed_gets
  26. + puts
  27. + flushes
  28. diff -Nrupad linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-frontswap linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-frontswap
  29. --- linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-frontswap 1970-01-01 01:00:00.000000000 +0100
  30. +++ linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-frontswap 2011-02-14 01:21:43.156792902 +0100
  31. @@ -0,0 +1,16 @@
  32. +What: /sys/kernel/mm/frontswap/
  33. +Date: June 2010
  34. +Contact: Dan Magenheimer <dan.magenheimer@oracle.com>
  35. +Description:
  36. + /sys/kernel/mm/frontswap/ contains a number of files which
  37. + record a count of various frontswap operations (sum across
  38. + all swap devices):
  39. + succ_puts
  40. + failed_puts
  41. + gets
  42. + flushes
  43. + In addition, reading the curr_pages file shows how many
  44. + pages are currently contained in frontswap and writing this
  45. + file with an integer performs a "partial swapoff", reducing
  46. + the number of frontswap pages to that integer if memory
  47. + constraints permit.
  48. diff -Nrupad linux-2.6.37//Documentation/vm/cleancache.txt linux-2.6.37_vanilla//Documentation/vm/cleancache.txt
  49. --- linux-2.6.37//Documentation/vm/cleancache.txt 1970-01-01 01:00:00.000000000 +0100
  50. +++ linux-2.6.37_vanilla//Documentation/vm/cleancache.txt 2011-02-14 01:21:43.157792932 +0100
  51. @@ -0,0 +1,267 @@
  52. +MOTIVATION
  53. +
  54. +Cleancache is a new optional feature provided by the VFS layer that
  55. +potentially dramatically increases page cache effectiveness for
  56. +many workloads in many environments at a negligible cost.
  57. +
  58. +Cleancache can be thought of as a page-granularity victim cache for clean
  59. +pages that the kernel's pageframe replacement algorithm (PFRA) would like
  60. +to keep around, but can't since there isn't enough memory. So when the
  61. +PFRA "evicts" a page, it first attempts to put it into a synchronous
  62. +concurrency-safe page-oriented "pseudo-RAM" device (such as Xen's
  63. +Transcendent Memory, aka "tmem", or in-kernel compressed memory, aka "zmem",
  64. +or other RAM-like devices) which is not directly accessible or addressable
  65. +by the kernel and is of unknown and possibly time-varying size. And when a
  66. +cleancache-enabled filesystem wishes to access a page in a file on disk,
  67. +it first checks cleancache to see if it already contains it; if it does,
  68. +the page is copied into the kernel and a disk access is avoided.
  69. +
  70. +FAQs are included below.
  71. +
  72. +IMPLEMENTATION OVERVIEW
  73. +
  74. +A cleancache "backend" that interfaces to this pseudo-RAM links itself
  75. +to the kernel's cleancache "frontend" by calling cleancache_register_ops,
  76. +passing a pointer to a cleancache_ops structure with funcs set appropriately.
  77. +Note that cleancache_register_ops returns the previous settings so that
  78. +chaining can be performed if desired. The functions provided must conform to
  79. +certain semantics as follows:
  80. +
  81. +Most important, cleancache is "ephemeral". Pages which are copied into
  82. +cleancache have an indefinite lifetime which is completely unknowable
  83. +by the kernel and so may or may not still be in cleancache at any later time.
  84. +Thus, as its name implies, cleancache is not suitable for dirty pages.
  85. +Cleancache has complete discretion over what pages to preserve and what
  86. +pages to discard and when.
  87. +
  88. +A cleancache-enabled filesystem should call "init_fs" at mount time to obtain a
  89. +pool id which, if positive, must be saved in the filesystem's superblock;
  90. +a negative return value indicates failure. A "put_page" will copy a
  91. +(presumably about-to-be-evicted) page into cleancache and associate it with
  92. +the pool id, a file key, and a page index into the file. (The combination
  93. +of a pool id, a file key, and an index is sometimes called a "handle".)
  94. +A "get_page" will copy the page, if found, from cleancache into kernel memory.
  95. +A "flush_page" will ensure the page no longer is present in cleancache;
  96. +a "flush_inode" will flush all pages associated with the specified file;
  97. +and, when a filesystem is unmounted, a "flush_fs" will flush all pages in
  98. +all files specified by the given pool id and also surrender the pool id.
  99. +
  100. +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache
  101. +to treat the pool as shared using a 128-bit UUID as a key. On systems
  102. +that may run multiple kernels (such as hard partitioned or virtualized
  103. +systems) that may share a clustered filesystem, and where cleancache
  104. +may be shared among those kernels, calls to init_shared_fs that specify the
  105. +same UUID will receive the same pool id, thus allowing the pages to
  106. +be shared. Note that any security requirements must be imposed outside
  107. +of the kernel (e.g. by "tools" that control cleancache). Or a
  108. +cleancache implementation can simply disable init_shared_fs by always
  109. +returning a negative value.
  110. +
  111. +If a get_page is successful on a non-shared pool, the page is flushed (thus
  112. +making cleancache an "exclusive" cache). On a shared pool, the page
  113. +is NOT flushed on a successful get_page so that it remains accessible to
  114. +other sharers. The kernel is responsible for ensuring coherency between
  115. +cleancache (shared or not), the page cache, and the filesystem, using
  116. +cleancache flush operations as required.
  117. +
  118. +Note that cleancache must enforce put-put-get coherency and get-get
  119. +coherency. For the former, if two puts are made to the same handle but
  120. +with different data, say AAA by the first put and BBB by the second, a
  121. +subsequent get can never return the stale data (AAA). For get-get coherency,
  122. +if a get for a given handle fails, subsequent gets for that handle will
  123. +never succeed unless preceded by a successful put with that handle.
  124. +
  125. +Last, cleancache provides no SMP serialization guarantees; if two
  126. +different Linux threads are simultaneously putting and flushing a page
  127. +with the same handle, the results are indeterminate. Callers must
  128. +lock the page to ensure serial behavior.
  129. +
  130. +CLEANCACHE PERFORMANCE METRICS
  131. +
  132. +Cleancache monitoring is done by sysfs files in the
  133. +/sys/kernel/mm/cleancache directory. The effectiveness of cleancache
  134. +can be measured (across all filesystems) with:
  135. +
  136. +succ_gets - number of gets that were successful
  137. +failed_gets - number of gets that failed
  138. +puts - number of puts attempted (all "succeed")
  139. +flushes - number of flushes attempted
  140. +
  141. +A backend implementation may provide additional metrics.
  142. +
  143. +FAQ
  144. +
  145. +1) Where's the value? (Andrew Morton)
  146. +
  147. +Cleancache provides a significant performance benefit to many workloads
  148. +in many environments with negligible overhead by improving the
  149. +effectiveness of the pagecache. Clean pagecache pages are
  150. +saved in pseudo-RAM (RAM that is otherwise not directly addressable to
  151. +the kernel); fetching those pages later avoids "refaults" and thus
  152. +disk reads.
  153. +
  154. +Cleancache (and its sister code "frontswap") provide interfaces for
  155. +a new pseudo-RAM memory type that conceptually lies between fast
  156. +kernel-directly-addressable RAM and slower DMA/asynchronous devices.
  157. +Disallowing direct kernel or userland reads/writes to this pseudo-RAM
  158. +is ideal when data is transformed to a different form and size (such
  159. +as with compression) or secretly moved (as might be useful for write-
  160. +balancing for some RAM-like devices). Evicted page-cache pages (and
  161. +swap pages) are a great use for this kind of slower-than-RAM-but-much-
  162. +faster-than-disk pseudo-RAM and the cleancache (and frontswap)
  163. +"page-object-oriented" specification provides a nice way to read and
  164. +write -- and indirectly "name" -- the pages.
  165. +
  166. +In the virtual case, the whole point of virtualization is to statistically
  167. +multiplex physical resources across the varying demands of multiple
  168. +virtual machines. This is really hard to do with RAM and efforts to
  169. +do it well with no kernel change have essentially failed (except in some
  170. +well-publicized special-case workloads). Cleancache -- and frontswap --
  171. +with a fairly small impact on the kernel, provide a huge amount
  172. +of flexibility for more dynamic, flexible RAM multiplexing.
  173. +Specifically, the Xen Transcendent Memory backend allows otherwise
  174. +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
  175. +virtual machines, but the pages can be compressed and deduplicated to
  176. +optimize RAM utilization. And when guest OS's are induced to surrender
  177. +underutilized RAM (e.g. with "self-ballooning"), page cache pages
  178. +are the first to go, and cleancache allows those pages to be
  179. +saved and reclaimed if overall host system memory conditions allow.
  180. +
  181. +2) Why does cleancache have its sticky fingers so deep inside the
  182. + filesystems and VFS? (Andrew Morton and Christoph Hellwig)
  183. +
  184. +The core hooks for cleancache in VFS are in most cases a single line
  185. +and the minimum set are placed precisely where needed to maintain
  186. +coherency (via cleancache_flush operations) between cleancache,
  187. +the page cache, and disk. All hooks compile into nothingness if
  188. +cleancache is config'ed off and turn into a function-pointer-
  189. +compare-to-NULL if config'ed on but no backend claims the ops
  190. +functions, or to a compare-struct-element-to-negative if a
  191. +backend claims the ops functions but a filesystem doesn't enable
  192. +cleancache.
  193. +
  194. +Some filesystems are built entirely on top of VFS and the hooks
  195. +in VFS are sufficient, so they don't require an "init_fs" hook; the
  196. +initial implementation of cleancache didn't provide this hook.
  197. +But for some filesystems (such as btrfs), the VFS hooks are
  198. +incomplete and one or more hooks in fs-specific code are required.
  199. +And for some other filesystems, such as tmpfs, cleancache may
  200. +be counterproductive. So it seemed prudent to require a filesystem
  201. +to "opt in" to use cleancache, which requires adding a hook in
  202. +each filesystem. Some filesystems are not yet supported by cleancache
  203. +simply because they haven't been tested. The existing set should
  204. +be sufficient to validate the concept, the opt-in approach means
  205. +that untested filesystems are not affected, and the hooks in the
  206. +existing filesystems should make it very easy to add more
  207. +filesystems in the future.
  208. +
  209. +The total impact of the hooks on existing fs and mm files is 43
  210. +lines added (not counting comments and blank lines).
  211. +
  212. +3) Why not make cleancache asynchronous and batched so it can
  213. + more easily interface with real devices with DMA instead
  214. + of copying each individual page? (Minchan Kim)
  215. +
  216. +The one-page-at-a-time copy semantics simplifies the implementation
  217. +on both the frontend and backend and also allows the backend to
  218. +do fancy things on-the-fly like page compression and
  219. +page deduplication. And since the data is "gone" (copied into/out
  220. +of the pageframe) before the cleancache get/put call returns,
  221. +a great deal of race conditions and potential coherency issues
  222. +are avoided. While the interface seems odd for a "real device"
  223. +or for real kernel-addressable RAM, it makes perfect sense for
  224. +pseudo-RAM.
  225. +
  226. +4) Why is non-shared cleancache "exclusive"? And where is the
  227. + page "flushed" after a "get"? (Minchan Kim)
  228. +
  229. +The main reason is to free up memory in pseudo-RAM and to avoid
  230. +unnecessary cleancache_flush calls. If you want inclusive,
  231. +the page can be "put" immediately following the "get". If
  232. +put-after-get for inclusive becomes common, the interface could
  233. +be easily extended to add a "get_no_flush" call.
  234. +
  235. +The flush is done by the cleancache backend implementation.
  236. +
  237. +5) What's the performance impact?
  238. +
  239. +Performance analysis has been presented at OLS'09 and LCA'10.
  240. +Briefly, performance gains can be significant on most workloads,
  241. +especially when memory pressure is high (e.g. when RAM is
  242. +overcommitted in a virtual workload); and because the hooks are
  243. +invoked primarily in place of or in addition to a disk read/write,
  244. +overhead is negligible even in worst case workloads. Basically
  245. +cleancache replaces I/O with memory-copy-CPU-overhead; on older
  246. +single-core systems with slow memory-copy speeds, cleancache
  247. +has little value, but in newer multicore machines, especially
  248. +consolidated/virtualized machines, it has great value.
  249. +
  250. +6) How do I add cleancache support for filesystem X? (Boaz Harrosh)
  251. +
  252. +Filesystems that are well-behaved and conform to certain
  253. +restrictions can utilize cleancache simply by making a call to
  254. +cleancache_init_fs at mount time. Unusual, misbehaving, or
  255. +poorly layered filesystems must either add additional hooks
  256. +and/or undergo extensive additional testing... or should just
  257. +not enable the optional cleancache.
  258. +
  259. +Some points for a filesystem to consider:
  260. +
  261. +- The FS should be block-device-based (e.g. a ram-based FS such
  262. + as tmpfs should not enable cleancache)
  263. +- To ensure coherency/correctness, the FS must ensure that all
  264. + file removal or truncation operations either go through VFS or
  265. + add hooks to do the equivalent cleancache "flush" operations
  266. +- To ensure coherency/correctness, either inode numbers must
  267. + be unique across the lifetime of the on-disk file OR the
  268. + FS must provide an "encode_fh" function.
  269. +- The FS must call the VFS superblock alloc and deactivate routines
  270. + or add hooks to do the equivalent cleancache calls done there.
  271. +- To maximize performance, all pages fetched from the FS should
  272. + go through the do_mpage_readpage routine or the FS should add
  273. + hooks to do the equivalent (cf. btrfs)
  274. +- Currently, the FS blocksize must be the same as PAGE_SIZE. This
  275. + is not an architectural restriction, but no backends currently
  276. + support anything different.
  277. +- A clustered FS should invoke the "init_shared_fs" cleancache
  278. + hook to get best performance for some backends.
  279. +
  280. +7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
  281. +
  282. +If cleancache would use the inode virtual address instead of
  283. +inode/filehandle, the pool id could be eliminated. But, this
  284. +won't work because cleancache retains pagecache data pages
  285. +persistently even when the inode has been pruned from the
  286. +inode unused list, and only flushes the data page if the file
  287. +gets removed/truncated. So if cleancache used the inode kva,
  288. +there would be potential coherency issues if/when the inode
  289. +kva is reused for a different file. Alternately, if cleancache
  290. +flushed the pages when the inode kva was freed, much of the value
  291. +of cleancache would be lost because the cache of pages in cleancache
  292. +is potentially much larger than the kernel pagecache and is most
  293. +useful if the pages survive inode cache removal.
  294. +
  295. +8) Why is a global variable required?
  296. +
  297. +The cleancache_enabled flag is checked in all of the frequently-used
  298. +cleancache hooks. The alternative is a function call to check a static
  299. +variable. Since cleancache is enabled dynamically at runtime, systems
  300. +that don't enable cleancache would suffer thousands (possibly
  301. +tens-of-thousands) of unnecessary function calls per second. So the
  302. +global variable allows cleancache to be enabled by default at compile
  303. +time, but have insignificant performance impact when cleancache remains
  304. +disabled at runtime.
  305. +
  306. +9) Does cleancache work with KVM?
  307. +
  308. +The memory model of KVM is sufficiently different that a cleancache
  309. +backend may have little value for KVM. This remains to be tested,
  310. +especially in an overcommitted system.
  311. +
  312. +10) Does cleancache work in userspace? It sounds useful for
  313. + memory hungry caches like web browsers. (Jamie Lokier)
  314. +
  315. +No plans yet, though we agree it sounds useful, at least for
  316. +apps that bypass the page cache (e.g. O_DIRECT).
  317. +
  318. +Last updated: Dan Magenheimer, September 2 2010
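
To make the ops-registration flow described in cleancache.txt concrete,
here is a minimal sketch of a backend wiring itself up. The operation
names (init_fs, init_shared_fs, get_page, put_page, flush_page,
flush_inode, flush_fs) come from the text above; the exact signatures
and the cleancache_filekey type are illustrative assumptions, not
copied from the cleancache.h added by this patch.

/* Hypothetical cleancache backend sketch; signatures are assumptions. */
static int demo_init_fs(size_t pagesize)
{
	return 0;	/* pool id; negative would mean "refuse this fs" */
}

static int demo_get_page(int pool_id, struct cleancache_filekey key,
			 pgoff_t index, struct page *page)
{
	return -1;	/* miss: a get may only succeed after a prior put */
}

static void demo_put_page(int pool_id, struct cleancache_filekey key,
			  pgoff_t index, struct page *page)
{
	/* copy the page now; the backend may still discard it any time */
}

static void demo_flush_page(int pool_id, struct cleancache_filekey key,
			    pgoff_t index)
{
	/* after this, a get for the handle must fail */
}

static struct cleancache_ops demo_ops = {
	.init_fs = demo_init_fs,
	.get_page = demo_get_page,
	.put_page = demo_put_page,
	.flush_page = demo_flush_page,
	/* .init_shared_fs, .flush_inode, .flush_fs omitted for brevity */
};

static struct cleancache_ops demo_old_ops;

static void demo_register(void)
{
	/* registration returns the previous ops so backends can chain */
	demo_old_ops = cleancache_register_ops(&demo_ops);
}
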
  319. diff -Nrupad linux-2.6.37//Documentation/vm/frontswap.txt linux-2.6.37_vanilla//Documentation/vm/frontswap.txt
  320. --- linux-2.6.37//Documentation/vm/frontswap.txt 1970-01-01 01:00:00.000000000 +0100
  321. +++ linux-2.6.37_vanilla//Documentation/vm/frontswap.txt 2011-02-14 01:21:43.158792960 +0100
  322. @@ -0,0 +1,209 @@
  323. +Frontswap provides a page-accessible-memory (PAM) interface for swap pages.
  324. +In some environments, dramatic performance savings may be obtained because
  325. +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
  326. +
  327. +Frontswap is so named because it can be thought of as the opposite of
  328. +a "backing" store for a swap device. The storage is assumed to be
  329. +a synchronous concurrency-safe page-oriented pseudo-RAM device (such as
  330. +Xen's Transcendent Memory, aka "tmem", or in-kernel compressed memory,
  331. +aka "zmem", or other RAM-like devices) which is not directly accessible
  332. +or addressable by the kernel and is of unknown and possibly time-varying
  333. +size. This pseudo-RAM device links itself to frontswap by calling
  334. +frontswap_register_ops to set the frontswap_ops funcs appropriately and
  335. +the functions it provides must conform to certain policies as follows:
  336. +
  337. +An "init" prepares the pseudo-RAM to receive frontswap pages associated
  338. +with the specified swap device number (aka "type"). A "put_page" will
  339. +copy the page to pseudo-RAM and associate it with the type and offset
  340. +associated with the page. A "get_page" will copy the page, if found,
  341. +from pseudo-RAM into kernel memory, but will NOT remove the page from
  342. +pseudo-RAM. A "flush_page" will remove the page from pseudo-RAM and a
  343. +"flush_area" will remove ALL pages associated with the swap type
  344. +(e.g., like swapoff) and notify the pseudo-RAM device to refuse
  345. +further puts with that swap type.
  346. +
  347. +Once a page is successfully put, a matching get on the page will always
  348. +succeed. So when the kernel finds itself in a situation where it needs
  349. +to swap out a page, it first attempts to use frontswap. If the put returns
  350. +non-zero, the data has been successfully saved to pseudo-RAM and
  351. +a disk write and, if the data is later read back, a disk read are avoided.
  352. +If a put returns zero, pseudo-RAM has rejected the data, and the page can
  353. +be written to swap as usual.
  354. +
  355. +Note that if a page is put and the page already exists in pseudo-RAM
  356. +(a "duplicate" put), either the put succeeds and the data is overwritten,
  357. +or the put fails AND the page is flushed. This ensures stale data may
  358. +never be obtained from pseudo-RAM.
  359. +
  360. +Monitoring and control of frontswap is done by sysfs files in the
  361. +/sys/kernel/mm/frontswap directory. The effectiveness of frontswap can
  362. +be measured (across all swap devices) with:
  363. +
  364. +curr_pages - number of pages currently contained in frontswap
  365. +failed_puts - how many put attempts have failed
  366. +gets - how many gets were attempted (all should succeed)
  367. +succ_puts - how many put attempts have succeeded
  368. +flushes - how many flushes were attempted
  369. +
  370. +The number can be reduced by root by writing an integer target to curr_pages,
  371. +which results in a "partial swapoff", thus reducing the number of frontswap
  372. +pages to that target if memory constraints permit.
  373. +
  374. +FAQ
  375. +
  376. +1) Where's the value?
  377. +
  378. +When a workload starts swapping, performance falls through the floor.
  379. +Frontswap significantly increases performance in many such workloads by
  380. +providing a clean, dynamic interface to read and write swap pages to
  381. +pseudo-RAM -- RAM that is otherwise not directly addressable to the kernel.
  382. +This interface is ideal when data is transformed to a different form
  383. +and size (such as with compression) or secretly moved (as might be
  384. +useful for write-balancing for some RAM-like devices). Swap pages (and
  385. +evicted page-cache pages) are a great use for this kind of slower-than-RAM-
  386. +but-much-faster-than-disk pseudo-RAM and the frontswap (and cleancache)
  387. +"page-object-oriented" specification provides a nice way to read
  388. +and write -- and indirectly "name" -- the pages.
  389. +
  390. +In the virtual case, the whole point of virtualization is to statistically
  391. +multiplex physical resources across the varying demands of multiple
  392. +virtual machines. This is really hard to do with RAM and efforts to do
  393. +it well with no kernel changes have essentially failed (except in some
  394. +well-publicized special-case workloads). Frontswap -- and cleancache --
  395. +with a fairly small impact on the kernel, provide a huge amount
  396. +of flexibility for more dynamic, flexible RAM multiplexing.
  397. +Specifically, the Xen Transcendent Memory backend allows otherwise
  398. +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
  399. +virtual machines, but the pages can be compressed and deduplicated to
  400. +optimize RAM utilization. And when guest OS's are induced to surrender
  401. +underutilized RAM (e.g. with "self-ballooning"), sudden unexpected
  402. +memory pressure may result in swapping; frontswap allows those pages
  403. +to be swapped to and from hypervisor RAM if overall host system memory
  404. +conditions allow.
  405. +
  406. +2) Sure there may be performance advantages in some situations, but
  407. + what's the space/time overhead of frontswap?
  408. +
  409. +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
  410. +nothingness and the only overhead is a few extra bytes per swapon'ed
  411. +swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
  412. +registers, there is one extra check of a global variable against zero for
  413. +every swap page read or written. If CONFIG_FRONTSWAP is enabled
  414. +AND a frontswap backend registers AND the backend fails every "put"
  415. +request (i.e. provides no memory despite claiming it might),
  416. +CPU overhead is still negligible -- and since every frontswap fail
  417. +precedes a swap page write-to-disk, the system is highly likely
  418. +to be I/O bound and using a small fraction of a percent of a CPU
  419. +will be irrelevant anyway.
  420. +
  421. +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
  422. +registers, one bit is allocated for every swap page for every swap
  423. +device that is swapon'd. This is added to the EIGHT bits (which
  424. +were sixteen until about 2.6.34) that the kernel already allocates
  425. +for every swap page for every swap device that is swapon'd. (Hugh
  426. +Dickins has observed that frontswap could probably steal one of
  427. +the existing eight bits, but let's worry about that minor optimization
  428. +later.) For very large swap disks (which are rare) on a standard
  429. +4K pagesize, this is 1MB per 32GB swap.
  430. +
  431. +3) OK, how about a quick overview of what this frontswap patch does
  432. + in terms that a kernel hacker can grok?
  433. +
  434. +Let's assume that a frontswap "backend" has registered during
  435. +kernel initialization; this registration indicates that this
  436. +frontswap backend has access to some "memory" that is not directly
  437. +accessible by the kernel. Exactly how much memory it provides is
  438. +entirely dynamic and random.
  439. +
  440. +Whenever a swap-device is swapon'd, frontswap_init() is called,
  441. +passing the swap device number (aka "type") as a parameter.
  442. +This notifies frontswap to expect attempts to "put" swap pages
  443. +associated with that number.
  444. +
  445. +Whenever the swap subsystem is readying a page to write to a swap
  446. +device (cf. swap_writepage()), frontswap_put_page is called. Frontswap
  447. +consults with the frontswap backend and if the backend says
  448. +it does NOT have room, frontswap_put_page returns 0 and the page is
  449. +swapped as normal. Note that the response from the frontswap
  450. +backend is essentially random; it may choose to never accept a
  451. +page, it could accept every ninth page, or it might accept every
  452. +page. But if the backend does accept a page, the data from the page
  453. +has already been copied and associated with the type and offset,
  454. +and the backend guarantees the persistence of the data. In this case,
  455. +frontswap sets a bit in the "frontswap_map" for the swap device
  456. +corresponding to the page offset on the swap device to which it would
  457. +otherwise have written the data.
  458. +
  459. +When the swap subsystem needs to swap-in a page (swap_readpage()),
  460. +it first calls frontswap_get_page() which checks the frontswap_map to
  461. +see if the page was earlier accepted by the frontswap backend. If
  462. +it was, the page of data is filled from the frontswap backend and
  463. +the swap-in is complete. If not, the normal swap-in code is
  464. +executed to obtain the page of data from the real swap device.
  465. +
  466. +So every time the frontswap backend accepts a page, a swap device read
  467. +and (potentially) a swap device write are replaced by a "frontswap backend
  468. +put" and (possibly) a "frontswap backend get", which are presumably much
  469. +faster.
  470. +
  471. +4) Can't frontswap be configured as a "special" swap device that is
  472. + just higher priority than any real swap device (e.g. like zswap)?
  473. +
  474. +No. Recall that acceptance of any swap page by the frontswap
  475. +backend is entirely unpredictable. This is critical to the definition
  476. +of frontswap because it grants completely dynamic discretion to the
  477. +backend. But since any "put" might fail, there must always be a real
  478. +slot on a real swap device to swap the page. Thus frontswap must be
  479. +implemented as a "shadow" to every swapon'd device with the potential
  480. +capability of holding every page that the swap device might have held
  481. +and the possibility that it might hold no pages at all.
  482. +On the downside, this also means that frontswap cannot contain more
  483. +pages than the total of swapon'd swap devices. For example, if NO
  484. +swap device is configured on some installation, frontswap is useless.
  485. +
  486. +Further, frontswap is entirely synchronous whereas a real swap
  487. +device is, by definition, asynchronous and uses block I/O. The
  488. +block I/O layer is not only unnecessary, but may perform "optimizations"
  489. +that are inappropriate for a RAM-oriented device including delaying
  490. +the write of some pages for a significant amount of time.
  491. +Synchrony is required to ensure the dynamicity of the backend.
  492. +
  493. +In a virtualized environment, the dynamicity allows the hypervisor
  494. +(or host OS) to do "intelligent overcommit". For example, it can
  495. +choose to accept pages only until host-swapping might be imminent,
  496. +then force guests to do their own swapping.
  497. +
  498. +5) Why this weird definition about "duplicate puts"? If a page
  499. + has been previously successfully put, can't it always be
  500. + successfully overwritten?
  501. +
  502. +Nearly always it can, but no, sometimes it cannot. Consider an example
  503. +where data is compressed and the original 4K page has been compressed
  504. +to 1K. Now an attempt is made to overwrite the page with data that
  505. +is non-compressible and so would take the entire 4K. But the backend
  506. +has no more space. In this case, the put must be rejected. Whenever
  507. +frontswap rejects a put that would overwrite, it also must flush
  508. +the old data and ensure that it is no longer accessible. Since the
  509. +swap subsystem then writes the new data to the real swap device,
  510. +this is the correct course of action to ensure coherency.
  511. +
  512. +6) What is frontswap_shrink for?
  513. +
  514. +When the (non-frontswap) swap subsystem swaps out a page to a real
  515. +swap device, that page is only taking up low-value pre-allocated disk
  516. +space. But if frontswap has placed a page in pseudo-RAM, that
  517. +page may be taking up valuable real estate. The frontswap_shrink
  518. +routine allows a process outside of the swap subsystem (such as
  519. +a userland service via the sysfs interface, or a kernel thread)
  520. +to force pages out of the memory managed by frontswap and back into
  521. +kernel-addressable memory.
  522. +
  523. +7) Why does the frontswap patch create the new include file swapfile.h?
  524. +
  525. +The frontswap code depends on some swap-subsystem-internal data
  526. +structures that have, over the years, moved back and forth between
  527. +static and global. This seemed a reasonable compromise: Define
  528. +them as global but declare them in a new include file that isn't
  529. +included by the large number of source files that include swap.h.
  530. +
  531. +Dan Magenheimer, September 21 2010
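
The backend side of the frontswap interface described above can be
sketched the same way. The five operations (init, put_page, get_page,
flush_page, flush_area) and the success convention (non-zero from a put
means "accepted") are taken from the text; the signatures are
assumptions for illustration only.

/* Hypothetical frontswap backend sketch; signatures are assumptions. */
static void demo_fs_init(unsigned type)
{
	/* expect puts for this swap device number from now on */
}

static int demo_fs_put_page(unsigned type, pgoff_t offset,
			    struct page *page)
{
	/*
	 * Accept or reject at will, but on a rejected duplicate put the
	 * old copy must be flushed so no stale data can be returned.
	 */
	return 0;	/* 0 = rejected; page goes to the real swap device */
}

static int demo_fs_get_page(unsigned type, pgoff_t offset,
			    struct page *page)
{
	return -1;	/* must succeed for any page previously accepted */
}

static void demo_fs_flush_page(unsigned type, pgoff_t offset)
{
}

static void demo_fs_flush_area(unsigned type)
{
	/* like swapoff: drop every page of this type, refuse new puts */
}

static struct frontswap_ops demo_fs_ops = {
	.init = demo_fs_init,
	.put_page = demo_fs_put_page,
	.get_page = demo_fs_get_page,
	.flush_page = demo_fs_flush_page,
	.flush_area = demo_fs_flush_area,
};

static void demo_fs_register(void)
{
	frontswap_register_ops(&demo_fs_ops);
}
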
  532. diff -Nrupad linux-2.6.37//drivers/media/radio/radio-aimslab.c linux-2.6.37_vanilla//drivers/media/radio/radio-aimslab.c
  533. --- linux-2.6.37//drivers/media/radio/radio-aimslab.c 2011-01-05 01:50:19.000000000 +0100
  534. +++ linux-2.6.37_vanilla//drivers/media/radio/radio-aimslab.c 2011-02-14 01:20:15.814793213 +0100
  535. @@ -71,7 +71,7 @@ static struct rtrack rtrack_card;
  536.  
  537. /* local things */
  538.  
  539. -static void sleep_delay(long n)
  540. +static noinline void sleep_delay(long n)
  541. {
  542. /* Sleep nicely for 'n' uS */
  543. int d = n / msecs_to_jiffies(1000);
  544. diff -Nrupad linux-2.6.37//drivers/staging/Kconfig linux-2.6.37_vanilla//drivers/staging/Kconfig
  545. --- linux-2.6.37//drivers/staging/Kconfig 2011-01-05 01:50:19.000000000 +0100
  546. +++ linux-2.6.37_vanilla//drivers/staging/Kconfig 2011-02-14 01:21:43.158792960 +0100
  547. @@ -123,6 +123,8 @@ source "drivers/staging/iio/Kconfig"
  548.  
  549. source "drivers/staging/zram/Kconfig"
  550.  
  551. +source "drivers/staging/zcache/Kconfig"
  552. +
  553. source "drivers/staging/wlags49_h2/Kconfig"
  554.  
  555. source "drivers/staging/wlags49_h25/Kconfig"
  556. diff -Nrupad linux-2.6.37//drivers/staging/Makefile linux-2.6.37_vanilla//drivers/staging/Makefile
  557. --- linux-2.6.37//drivers/staging/Makefile 2011-01-05 01:50:19.000000000 +0100
  558. +++ linux-2.6.37_vanilla//drivers/staging/Makefile 2011-02-14 01:21:43.158792960 +0100
  559. @@ -44,6 +44,7 @@ obj-$(CONFIG_VME_BUS) += vme/
  560. obj-$(CONFIG_MRST_RAR_HANDLER) += memrar/
  561. obj-$(CONFIG_IIO) += iio/
  562. obj-$(CONFIG_ZRAM) += zram/
  563. +obj-$(CONFIG_ZCACHE) += zcache/
  564. obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/
  565. obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/
  566. obj-$(CONFIG_BATMAN_ADV) += batman-adv/
  567. diff -Nrupad linux-2.6.37//drivers/staging/zcache/Kconfig linux-2.6.37_vanilla//drivers/staging/zcache/Kconfig
  568. --- linux-2.6.37//drivers/staging/zcache/Kconfig 1970-01-01 01:00:00.000000000 +0100
  569. +++ linux-2.6.37_vanilla//drivers/staging/zcache/Kconfig 2011-02-14 01:21:43.158792960 +0100
  570. @@ -0,0 +1,13 @@
  571. +config ZCACHE
  572. + tristate "Dynamic compression of swap pages and clean pagecache pages"
  573. + depends on CLEANCACHE || FRONTSWAP
  574. + select XVMALLOC
  575. + select LZO_COMPRESS
  576. + select LZO_DECOMPRESS
  577. + default n
  578. + help
  579. + Zcache doubles RAM efficiency while providing a significant
  580. + performance boost on many workloads. Zcache uses lzo1x
  581. + compression and an in-kernel implementation of transcendent
  582. + memory to store clean page cache pages and swap in RAM,
  583. + providing a noticeable reduction in disk I/O.
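
Per the "depends on CLEANCACHE || FRONTSWAP" line above, zcache only
builds when at least one frontend is enabled. A plausible .config
fragment (option names taken from this patch; XVMALLOC and the LZO
options are pulled in automatically via "select"):

CONFIG_CLEANCACHE=y
CONFIG_FRONTSWAP=y
CONFIG_ZCACHE=y
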
  584. diff -Nrupad linux-2.6.37//drivers/staging/zcache/Makefile linux-2.6.37_vanilla//drivers/staging/zcache/Makefile
  585. --- linux-2.6.37//drivers/staging/zcache/Makefile 1970-01-01 01:00:00.000000000 +0100
  586. +++ linux-2.6.37_vanilla//drivers/staging/zcache/Makefile 2011-02-14 01:21:43.159792985 +0100
  587. @@ -0,0 +1 @@
  588. +obj-$(CONFIG_ZCACHE) += zcache.o tmem.o
  589. diff -Nrupad linux-2.6.37//drivers/staging/zcache/tmem.c linux-2.6.37_vanilla//drivers/staging/zcache/tmem.c
  590. --- linux-2.6.37//drivers/staging/zcache/tmem.c 1970-01-01 01:00:00.000000000 +0100
  591. +++ linux-2.6.37_vanilla//drivers/staging/zcache/tmem.c 2011-02-14 01:21:43.160793007 +0100
  592. @@ -0,0 +1,710 @@
  593. +/*
  594. + * In-kernel transcendent memory (generic implementation)
  595. + *
  596. + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
  597. + *
  598. + * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
  599. + * "handles" (triples containing a pool id, an object id, and an index) to
  600. + * pages in a page-accessible memory (PAM). Tmem references the PAM pages via
  601. + * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
  602. + * set of functions (pamops). Each pampd contains some representation of
  603. + * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
  604. + * pages and must be able to insert, find, and delete these pages at a
  605. + * potential frequency of thousands per second concurrently across many CPUs
  606. + * (and, if used with KVM, across many vcpus across many guests).
  607. + * Tmem is tracked with a hierarchy of data structures, organized by
  608. + * the elements in a handle-tuple: pool_id, object_id, and page index.
  609. + * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
  610. + * Each pool contains a hash table of rb_trees of tmem_objs. Each
  611. + * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
  612. + * nodes called tmem_objnodes. Each leaf pointer in this tree points to
  613. + * a pampd, which is accessible only through a small set of callbacks
  614. + * registered by the PAM implementation (see tmem_register_pamops). Tmem
  615. + * does all memory allocation via a set of callbacks registered by the tmem
  616. + * host implementation (e.g. see tmem_register_hostops).
  617. + */
  618. +
  619. +#include <linux/list.h>
  620. +#include <linux/spinlock.h>
  621. +#include <linux/atomic.h>
  622. +
  623. +#include "tmem.h"
  624. +
  625. +/* data structure sentinels used for debugging... see tmem.h */
  626. +#define POOL_SENTINEL 0x87658765
  627. +#define OBJ_SENTINEL 0x12345678
  628. +#define OBJNODE_SENTINEL 0xfedcba09
  629. +
  630. +/*
  631. + * A tmem host implementation must use this function to register callbacks
  632. + * for memory allocation.
  633. + */
  634. +static struct tmem_hostops tmem_hostops;
  635. +
  636. +static void tmem_objnode_tree_init(void);
  637. +
  638. +void tmem_register_hostops(struct tmem_hostops *m)
  639. +{
  640. + tmem_objnode_tree_init();
  641. + tmem_hostops = *m;
  642. +}
  643. +
  644. +/*
  645. + * A tmem host implementation must use this function to register
  646. + * callbacks for a page-accessible memory (PAM) implementation
  647. + */
  648. +static struct tmem_pamops tmem_pamops;
  649. +
  650. +void tmem_register_pamops(struct tmem_pamops *m)
  651. +{
  652. + tmem_pamops = *m;
  653. +}
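
Taken together, a tmem host wires in both callback tables before any
other tmem call. A hypothetical sketch follows, using only the hostops
members this file dereferences (obj_alloc, obj_free, objnode_alloc,
objnode_free); the exact prototypes live in tmem.h, which this excerpt
does not show, so treat these signatures as assumptions.

/* Hypothetical host glue; prototypes inferred from the calls in tmem.c. */
static struct tmem_obj *demo_obj_alloc(struct tmem_pool *pool)
{
	/* GFP_ATOMIC: tmem calls these under the hashbucket spinlock */
	return kmalloc(sizeof(struct tmem_obj), GFP_ATOMIC);
}

static void demo_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
{
	kfree(obj);
}

static struct tmem_objnode *demo_objnode_alloc(struct tmem_pool *pool)
{
	return kmalloc(sizeof(struct tmem_objnode), GFP_ATOMIC);
}

static void demo_objnode_free(struct tmem_objnode *objnode,
			      struct tmem_pool *pool)
{
	kfree(objnode);
}

static struct tmem_hostops demo_hostops = {
	.obj_alloc = demo_obj_alloc,
	.obj_free = demo_obj_free,
	.objnode_alloc = demo_objnode_alloc,
	.objnode_free = demo_objnode_free,
};

/* pampd callbacks (create/get_data/free) elided; see tmem_put/tmem_get */

static void demo_tmem_host_init(struct tmem_pamops *pamops)
{
	tmem_register_hostops(&demo_hostops);
	tmem_register_pamops(pamops);
}
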
  654. +
  655. +/*
  656. + * Oids are potentially very sparse and tmem_objs may have an indeterminately
  657. + * short life, being added and deleted at a relatively high frequency.
  658. + * So an rb_tree is an ideal data structure to manage tmem_objs. But because
  659. + * of the potentially huge number of tmem_objs, each pool manages a hashtable
  660. + * of rb_trees to reduce search, insert, delete, and rebalancing time.
  661. + * Each hashbucket also has a lock to manage concurrent access.
  662. + *
  663. + * The following routines manage tmem_objs. When any tmem_obj is accessed,
  664. + * the hashbucket lock must be held.
  665. + */
  666. +
  667. +/* searches for object==oid in pool, returns locked object if found */
  668. +static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
  669. + struct tmem_oid *oidp)
  670. +{
  671. + struct rb_node *rbnode;
  672. + struct tmem_obj *obj;
  673. +
  674. + rbnode = hb->obj_rb_root.rb_node;
  675. + while (rbnode) {
  676. + BUG_ON(RB_EMPTY_NODE(rbnode));
  677. + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
  678. + switch (tmem_oid_compare(oidp, &obj->oid)) {
  679. + case 0: /* equal */
  680. + goto out;
  681. + case -1:
  682. + rbnode = rbnode->rb_left;
  683. + break;
  684. + case 1:
  685. + rbnode = rbnode->rb_right;
  686. + break;
  687. + }
  688. + }
  689. + obj = NULL;
  690. +out:
  691. + return obj;
  692. +}
  693. +
  694. +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
  695. +
  696. +/* free an object that has no more pampds in it */
  697. +static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
  698. +{
  699. + struct tmem_pool *pool;
  700. +
  701. + BUG_ON(obj == NULL);
  702. + ASSERT_SENTINEL(obj, OBJ);
  703. + BUG_ON(obj->pampd_count > 0);
  704. + pool = obj->pool;
  705. + BUG_ON(pool == NULL);
  706. + if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
  707. + tmem_pampd_destroy_all_in_obj(obj);
  708. + BUG_ON(obj->objnode_tree_root != NULL);
  709. + BUG_ON((long)obj->objnode_count != 0);
  710. + atomic_dec(&pool->obj_count);
  711. + BUG_ON(atomic_read(&pool->obj_count) < 0);
  712. + INVERT_SENTINEL(obj, OBJ);
  713. + obj->pool = NULL;
  714. + tmem_oid_set_invalid(&obj->oid);
  715. + rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
  716. +}
  717. +
  718. +/*
  719. + * initialize and insert a tmem_object_root (called only if find failed)
  720. + */
  721. +static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
  722. + struct tmem_pool *pool,
  723. + struct tmem_oid *oidp)
  724. +{
  725. + struct rb_root *root = &hb->obj_rb_root;
  726. + struct rb_node **new = &(root->rb_node), *parent = NULL;
  727. + struct tmem_obj *this;
  728. +
  729. + BUG_ON(pool == NULL);
  730. + atomic_inc(&pool->obj_count);
  731. + obj->objnode_tree_height = 0;
  732. + obj->objnode_tree_root = NULL;
  733. + obj->pool = pool;
  734. + obj->oid = *oidp;
  735. + obj->objnode_count = 0;
  736. + obj->pampd_count = 0;
  737. + SET_SENTINEL(obj, OBJ);
  738. + while (*new) {
  739. + BUG_ON(RB_EMPTY_NODE(*new));
  740. + this = rb_entry(*new, struct tmem_obj, rb_tree_node);
  741. + parent = *new;
  742. + switch (tmem_oid_compare(oidp, &this->oid)) {
  743. + case 0:
  744. + BUG(); /* already present; should never happen! */
  745. + break;
  746. + case -1:
  747. + new = &(*new)->rb_left;
  748. + break;
  749. + case 1:
  750. + new = &(*new)->rb_right;
  751. + break;
  752. + }
  753. + }
  754. + rb_link_node(&obj->rb_tree_node, parent, new);
  755. + rb_insert_color(&obj->rb_tree_node, root);
  756. +}
  757. +
  758. +/*
  759. + * Tmem is managed as a set of tmem_pools with certain attributes, such as
  760. + * "ephemeral" vs "persistent". These attributes apply to all tmem_objs
  761. + * and all pampds that belong to a tmem_pool. A tmem_pool is created
  762. + * or deleted relatively rarely (for example, when a filesystem is
  763. + * mounted or unmounted).
  764. + */
  765. +
  766. +/* flush all data from a pool and, optionally, free it */
  767. +static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
  768. +{
  769. + struct rb_node *rbnode;
  770. + struct tmem_obj *obj;
  771. + struct tmem_hashbucket *hb = &pool->hashbucket[0];
  772. + int i;
  773. +
  774. + BUG_ON(pool == NULL);
  775. + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
  776. + spin_lock(&hb->lock);
  777. + rbnode = rb_first(&hb->obj_rb_root);
  778. + while (rbnode != NULL) {
  779. + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
  780. + rbnode = rb_next(rbnode);
  781. + tmem_pampd_destroy_all_in_obj(obj);
  782. + tmem_obj_free(obj, hb);
  783. + (*tmem_hostops.obj_free)(obj, pool);
  784. + }
  785. + spin_unlock(&hb->lock);
  786. + }
  787. + if (destroy)
  788. + list_del(&pool->pool_list);
  789. +}
  790. +
  791. +/*
  792. + * A tmem_obj contains a radix-tree-like tree in which the intermediate
  793. + * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation
  794. + * is very specialized and tuned for specific uses and is not particularly
  795. + * suited for use from this code, though some code from the core algorithms has
  796. + * been reused, thus the copyright notices below). Each tmem_objnode contains
  797. + * a set of pointers which point to either a set of intermediate tmem_objnodes
  798. + * or a set of pampds.
  799. + *
  800. + * Portions Copyright (C) 2001 Momchil Velikov
  801. + * Portions Copyright (C) 2001 Christoph Hellwig
  802. + * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
  803. + */
  804. +
  805. +struct tmem_objnode_tree_path {
  806. + struct tmem_objnode *objnode;
  807. + int offset;
  808. +};
  809. +
  810. +/* objnode height_to_maxindex translation */
  811. +static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
  812. +
  813. +static void tmem_objnode_tree_init(void)
  814. +{
  815. + unsigned int ht, tmp;
  816. +
  817. + for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
  818. + tmp = ht * OBJNODE_TREE_MAP_SHIFT;
  819. + if (tmp >= OBJNODE_TREE_INDEX_BITS)
  820. + tmem_objnode_tree_h2max[ht] = ~0UL;
  821. + else
  822. + tmem_objnode_tree_h2max[ht] =
  823. + (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
  824. + }
  825. +}
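
To make the height-to-maxindex math concrete: assuming
OBJNODE_TREE_MAP_SHIFT is 6 and OBJNODE_TREE_INDEX_BITS is 64 (both
values live in tmem.h, so these are assumptions), the loop above yields:

ht = 0: tmp = 0  -> (~0UL >> 63) >> 1 = 0      (root slot holds 1 pampd)
ht = 1: tmp = 6  -> (~0UL >> 57) >> 1 = 63     (indices 0..63)
ht = 2: tmp = 12 -> (~0UL >> 51) >> 1 = 4095   (indices 0..4095)

i.e. each additional level of tmem_objnodes multiplies the index
capacity by 2^OBJNODE_TREE_MAP_SHIFT = 64.
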
  826. +
  827. +static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
  828. +{
  829. + struct tmem_objnode *objnode;
  830. +
  831. + ASSERT_SENTINEL(obj, OBJ);
  832. + BUG_ON(obj->pool == NULL);
  833. + ASSERT_SENTINEL(obj->pool, POOL);
  834. + objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
  835. + if (unlikely(objnode == NULL))
  836. + goto out;
  837. + objnode->obj = obj;
  838. + SET_SENTINEL(objnode, OBJNODE);
  839. + memset(&objnode->slots, 0, sizeof(objnode->slots));
  840. + objnode->slots_in_use = 0;
  841. + obj->objnode_count++;
  842. +out:
  843. + return objnode;
  844. +}
  845. +
  846. +static void tmem_objnode_free(struct tmem_objnode *objnode)
  847. +{
  848. + struct tmem_pool *pool;
  849. + int i;
  850. +
  851. + BUG_ON(objnode == NULL);
  852. + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
  853. + BUG_ON(objnode->slots[i] != NULL);
  854. + ASSERT_SENTINEL(objnode, OBJNODE);
  855. + INVERT_SENTINEL(objnode, OBJNODE);
  856. + BUG_ON(objnode->obj == NULL);
  857. + ASSERT_SENTINEL(objnode->obj, OBJ);
  858. + pool = objnode->obj->pool;
  859. + BUG_ON(pool == NULL);
  860. + ASSERT_SENTINEL(pool, POOL);
  861. + objnode->obj->objnode_count--;
  862. + objnode->obj = NULL;
  863. + (*tmem_hostops.objnode_free)(objnode, pool);
  864. +}
  865. +
  866. +/*
  867. + * lookup index in object and return associated pampd (or NULL if not found)
  868. + */
  869. +static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
  870. +{
  871. + unsigned int height, shift;
  872. + struct tmem_objnode **slot = NULL;
  873. +
  874. + BUG_ON(obj == NULL);
  875. + ASSERT_SENTINEL(obj, OBJ);
  876. + BUG_ON(obj->pool == NULL);
  877. + ASSERT_SENTINEL(obj->pool, POOL);
  878. +
  879. + height = obj->objnode_tree_height;
  880. + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
  881. + goto out;
  882. + if (height == 0 && obj->objnode_tree_root) {
  883. + slot = &obj->objnode_tree_root;
  884. + goto out;
  885. + }
  886. + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
  887. + slot = &obj->objnode_tree_root;
  888. + while (height > 0) {
  889. + if (*slot == NULL)
  890. + goto out;
  891. + slot = (struct tmem_objnode **)
  892. + ((*slot)->slots +
  893. + ((index >> shift) & OBJNODE_TREE_MAP_MASK));
  894. + shift -= OBJNODE_TREE_MAP_SHIFT;
  895. + height--;
  896. + }
  897. +out:
  898. + return slot != NULL ? *slot : NULL;
  899. +}
  900. +
  901. +static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
  902. + void *pampd)
  903. +{
  904. + int ret = 0;
  905. + struct tmem_objnode *objnode = NULL, *newnode, *slot;
  906. + unsigned int height, shift;
  907. + int offset = 0;
  908. +
  909. + /* if necessary, extend the tree to be higher */
  910. + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
  911. + height = obj->objnode_tree_height + 1;
  912. + if (index > tmem_objnode_tree_h2max[height])
  913. + while (index > tmem_objnode_tree_h2max[height])
  914. + height++;
  915. + if (obj->objnode_tree_root == NULL) {
  916. + obj->objnode_tree_height = height;
  917. + goto insert;
  918. + }
  919. + do {
  920. + newnode = tmem_objnode_alloc(obj);
  921. + if (!newnode) {
  922. + ret = -ENOMEM;
  923. + goto out;
  924. + }
  925. + newnode->slots[0] = obj->objnode_tree_root;
  926. + newnode->slots_in_use = 1;
  927. + obj->objnode_tree_root = newnode;
  928. + obj->objnode_tree_height++;
  929. + } while (height > obj->objnode_tree_height);
  930. + }
  931. +insert:
  932. + slot = obj->objnode_tree_root;
  933. + height = obj->objnode_tree_height;
  934. + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
  935. + while (height > 0) {
  936. + if (slot == NULL) {
  937. + /* add a child objnode. */
  938. + slot = tmem_objnode_alloc(obj);
  939. + if (!slot) {
  940. + ret = -ENOMEM;
  941. + goto out;
  942. + }
  943. + if (objnode) {
  945. + objnode->slots[offset] = slot;
  946. + objnode->slots_in_use++;
  947. + } else
  948. + obj->objnode_tree_root = slot;
  949. + }
  950. + /* go down a level */
  951. + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
  952. + objnode = slot;
  953. + slot = objnode->slots[offset];
  954. + shift -= OBJNODE_TREE_MAP_SHIFT;
  955. + height--;
  956. + }
  957. + BUG_ON(slot != NULL);
  958. + if (objnode) {
  959. + objnode->slots_in_use++;
  960. + objnode->slots[offset] = pampd;
  961. + } else
  962. + obj->objnode_tree_root = pampd;
  963. + obj->pampd_count++;
  964. +out:
  965. + return ret;
  966. +}
  967. +
  968. +static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
  969. +{
  970. + struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
  971. + struct tmem_objnode_tree_path *pathp = path;
  972. + struct tmem_objnode *slot = NULL;
  973. + unsigned int height, shift;
  974. + int offset;
  975. +
  976. + BUG_ON(obj == NULL);
  977. + ASSERT_SENTINEL(obj, OBJ);
  978. + BUG_ON(obj->pool == NULL);
  979. + ASSERT_SENTINEL(obj->pool, POOL);
  980. + height = obj->objnode_tree_height;
  981. + if (index > tmem_objnode_tree_h2max[height])
  982. + goto out;
  983. + slot = obj->objnode_tree_root;
  984. + if (height == 0 && obj->objnode_tree_root) {
  985. + obj->objnode_tree_root = NULL;
  986. + goto out;
  987. + }
  988. + shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
  989. + pathp->objnode = NULL;
  990. + do {
  991. + if (slot == NULL)
  992. + goto out;
  993. + pathp++;
  994. + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
  995. + pathp->offset = offset;
  996. + pathp->objnode = slot;
  997. + slot = slot->slots[offset];
  998. + shift -= OBJNODE_TREE_MAP_SHIFT;
  999. + height--;
  1000. + } while (height > 0);
  1001. + if (slot == NULL)
  1002. + goto out;
  1003. + while (pathp->objnode) {
  1004. + pathp->objnode->slots[pathp->offset] = NULL;
  1005. + pathp->objnode->slots_in_use--;
  1006. + if (pathp->objnode->slots_in_use) {
  1007. + if (pathp->objnode == obj->objnode_tree_root) {
  1008. + while (obj->objnode_tree_height > 0 &&
  1009. + obj->objnode_tree_root->slots_in_use == 1 &&
  1010. + obj->objnode_tree_root->slots[0]) {
  1011. + struct tmem_objnode *to_free =
  1012. + obj->objnode_tree_root;
  1013. +
  1014. + obj->objnode_tree_root =
  1015. + to_free->slots[0];
  1016. + obj->objnode_tree_height--;
  1017. + to_free->slots[0] = NULL;
  1018. + to_free->slots_in_use = 0;
  1019. + tmem_objnode_free(to_free);
  1020. + }
  1021. + }
  1022. + goto out;
  1023. + }
  1024. + tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
  1025. + pathp--;
  1026. + }
  1027. + obj->objnode_tree_height = 0;
  1028. + obj->objnode_tree_root = NULL;
  1029. +
  1030. +out:
  1031. + if (slot != NULL)
  1032. + obj->pampd_count--;
  1033. + BUG_ON(obj->pampd_count < 0);
  1034. + return slot;
  1035. +}
  1036. +
  1037. +/* recursively walk the objnode_tree destroying pampds and objnodes */
  1038. +static void tmem_objnode_node_destroy(struct tmem_obj *obj,
  1039. + struct tmem_objnode *objnode,
  1040. + unsigned int ht)
  1041. +{
  1042. + int i;
  1043. +
  1044. + if (ht == 0)
  1045. + return;
  1046. + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
  1047. + if (objnode->slots[i]) {
  1048. + if (ht == 1) {
  1049. + obj->pampd_count--;
  1050. + (*tmem_pamops.free)(objnode->slots[i],
  1051. + obj->pool);
  1052. + objnode->slots[i] = NULL;
  1053. + continue;
  1054. + }
  1055. + tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
  1056. + tmem_objnode_free(objnode->slots[i]);
  1057. + objnode->slots[i] = NULL;
  1058. + }
  1059. + }
  1060. +}
  1061. +
  1062. +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
  1063. +{
  1064. + if (obj->objnode_tree_root == NULL)
  1065. + return;
  1066. + if (obj->objnode_tree_height == 0) {
  1067. + obj->pampd_count--;
  1068. + (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool);
  1069. + } else {
  1070. + tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
  1071. + obj->objnode_tree_height);
  1072. + tmem_objnode_free(obj->objnode_tree_root);
  1073. + obj->objnode_tree_height = 0;
  1074. + }
  1075. + obj->objnode_tree_root = NULL;
  1076. +}
  1077. +
  1078. +/*
  1079. + * Tmem is operated on by a set of well-defined actions:
  1080. + * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
  1081. + * (The tmem ABI allows for subpages and exchanges but these operations
  1082. + * are not included in this implementation.)
  1083. + *
  1084. + * These "tmem core" operations are implemented in the following functions.
  1085. + */
  1086. +
  1087. +/*
  1088. + * "Put" a page, e.g. copy a page from the kernel into newly allocated
  1089. + * PAM space (if such space is available). Tmem_put is complicated by
  1090. + * a corner case: What if a page with matching handle already exists in
  1091. + * tmem? To guarantee coherency, one of two actions is necessary: Either
  1092. + * the data for the page must be overwritten, or the page must be
  1093. + * "flushed" so that the data is not accessible to a subsequent "get".
  1094. + * Since these "duplicate puts" are relatively rare, this implementation
  1095. + * always flushes for simplicity.
  1096. + */
  1097. +int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
  1098. + struct page *page)
  1099. +{
  1100. + struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
  1101. + void *pampd = NULL, *pampd_del = NULL;
  1102. + int ret = -ENOMEM;
  1103. + bool ephemeral;
  1104. + struct tmem_hashbucket *hb;
  1105. +
  1106. + ephemeral = is_ephemeral(pool);
  1107. + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
  1108. + spin_lock(&hb->lock);
  1109. + obj = objfound = tmem_obj_find(hb, oidp);
  1110. + if (obj != NULL) {
  1111. + pampd = tmem_pampd_lookup_in_obj(objfound, index);
  1112. + if (pampd != NULL) {
  1113. + /* if found, is a dup put, flush the old one */
  1114. + pampd_del = tmem_pampd_delete_from_obj(obj, index);
  1115. + BUG_ON(pampd_del != pampd);
  1116. + (*tmem_pamops.free)(pampd, pool);
  1117. + if (obj->pampd_count == 0) {
  1118. + objnew = obj;
  1119. + objfound = NULL;
  1120. + }
  1121. + pampd = NULL;
  1122. + }
  1123. + } else {
  1124. + obj = objnew = (*tmem_hostops.obj_alloc)(pool);
  1125. + if (unlikely(obj == NULL)) {
  1126. + ret = -ENOMEM;
  1127. + goto out;
  1128. + }
  1129. + tmem_obj_init(obj, hb, pool, oidp);
  1130. + }
  1131. + BUG_ON(obj == NULL);
  1132. + BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
  1133. + pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page);
  1134. + if (unlikely(pampd == NULL))
  1135. + goto free;
  1136. + ret = tmem_pampd_add_to_obj(obj, index, pampd);
  1137. + if (unlikely(ret == -ENOMEM))
  1138. + /* may have partially built objnode tree ("stump") */
  1139. + goto delete_and_free;
  1140. + goto out;
  1141. +
  1142. +delete_and_free:
  1143. + (void)tmem_pampd_delete_from_obj(obj, index);
  1144. +free:
  1145. + if (pampd)
  1146. + (*tmem_pamops.free)(pampd, pool);
  1147. + if (objnew) {
  1148. + tmem_obj_free(objnew, hb);
  1149. + (*tmem_hostops.obj_free)(objnew, pool);
  1150. + }
  1151. +out:
  1152. + spin_unlock(&hb->lock);
  1153. + return ret;
  1154. +}
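
A sketch of how a frontend might drive tmem_put with a handle. The
layout of struct tmem_oid (assumed here to be three 64-bit words) is
defined in tmem.h, which this excerpt does not show, so treat it as an
assumption:

/* Hypothetical frontend call into tmem_put. */
static int demo_put(struct tmem_pool *pool, uint64_t ino, uint32_t index,
		    struct page *page)
{
	struct tmem_oid oid = { .oid = { ino, 0, 0 } };	/* assumed layout */

	/* 0 on success; -ENOMEM if the host/PAM could not hold the page */
	return tmem_put(pool, &oid, index, page);
}
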
  1155. +
  1156. +/*
  1157. + * "Get" a page, e.g. if one can be found, copy the tmem page with the
  1158. + * matching handle from PAM space to the kernel. By tmem definition,
  1159. + * when a "get" is successful on an ephemeral page, the page is "flushed",
  1160. + * and when a "get" is successful on a persistent page, the page is retained
  1161. + * in tmem. Note that to preserve
  1162. + * coherency, "get" can never be skipped if tmem contains the data.
  1163. + * That is, if a get is done with a certain handle and fails, any
  1164. + * subsequent "get" must also fail (unless of course there is a
  1165. + * "put" done with the same handle).
  1167. + */
  1168. +int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp,
  1169. + uint32_t index, struct page *page)
  1170. +{
  1171. + struct tmem_obj *obj;
  1172. + void *pampd;
  1173. + bool ephemeral = is_ephemeral(pool);
  1174. + int ret = -1;
  1175. + struct tmem_hashbucket *hb;
  1176. +
  1177. + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
  1178. + spin_lock(&hb->lock);
  1179. + obj = tmem_obj_find(hb, oidp);
  1180. + if (obj == NULL)
  1181. + goto out;
  1183. + if (ephemeral)
  1184. + pampd = tmem_pampd_delete_from_obj(obj, index);
  1185. + else
  1186. + pampd = tmem_pampd_lookup_in_obj(obj, index);
  1187. + if (pampd == NULL)
  1188. + goto out;
  1189. + ret = (*tmem_pamops.get_data)(page, pampd, pool);
  1190. + if (ret < 0)
  1191. + goto out;
  1192. + if (ephemeral) {
  1193. + (*tmem_pamops.free)(pampd, pool);
  1194. + if (obj->pampd_count == 0) {
  1195. + tmem_obj_free(obj, hb);
  1196. + (*tmem_hostops.obj_free)(obj, pool);
  1197. + obj = NULL;
  1198. + }
  1199. + }
  1200. + ret = 0;
  1201. +out:
  1202. + spin_unlock(&hb->lock);
  1203. + return ret;
  1204. +}
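Because an ephemeral "get" flushes the entry it returns, a successful get consumes the page; a sketch of the resulting semantics (hypothetical caller code, ephemeral pool assumed):

    tmem_put(pool, &oid, 0, page);   /* cache a clean page */
    tmem_get(pool, &oid, 0, dest1);  /* succeeds; the entry is removed */
    tmem_get(pool, &oid, 0, dest2);  /* fails (-1): already consumed */
    /* with a persistent pool the second get would also succeed, since
     * persistent pages are retained until explicitly flushed */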
  1205. +
  1206. +/*
  1207. + * If a page in tmem matches the handle, "flush" this page from tmem such
  1208. + * that any subsequent "get" does not succeed (unless, of course, there
  1209. + * was another "put" with the same handle).
  1210. + */
  1211. +int tmem_flush_page(struct tmem_pool *pool,
  1212. + struct tmem_oid *oidp, uint32_t index)
  1213. +{
  1214. + struct tmem_obj *obj;
  1215. + void *pampd;
  1216. + int ret = -1;
  1217. + struct tmem_hashbucket *hb;
  1218. +
  1219. + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
  1220. + spin_lock(&hb->lock);
  1221. + obj = tmem_obj_find(hb, oidp);
  1222. + if (obj == NULL)
  1223. + goto out;
  1224. + pampd = tmem_pampd_delete_from_obj(obj, index);
  1225. + if (pampd == NULL)
  1226. + goto out;
  1227. + (*tmem_pamops.free)(pampd, pool);
  1228. + if (obj->pampd_count == 0) {
  1229. + tmem_obj_free(obj, hb);
  1230. + (*tmem_hostops.obj_free)(obj, pool);
  1231. + }
  1232. + ret = 0;
  1233. +
  1234. +out:
  1235. + spin_unlock(&hb->lock);
  1236. + return ret;
  1237. +}
  1238. +
  1239. +/*
  1240. + * "Flush" all pages in tmem matching this oid.
  1241. + */
  1242. +int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
  1243. +{
  1244. + struct tmem_obj *obj;
  1245. + struct tmem_hashbucket *hb;
  1246. + int ret = -1;
  1247. +
  1248. + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
  1249. + spin_lock(&hb->lock);
  1250. + obj = tmem_obj_find(hb, oidp);
  1251. + if (obj == NULL)
  1252. + goto out;
  1253. + tmem_pampd_destroy_all_in_obj(obj);
  1254. + tmem_obj_free(obj, hb);
  1255. + (*tmem_hostops.obj_free)(obj, pool);
  1256. + ret = 0;
  1257. +
  1258. +out:
  1259. + spin_unlock(&hb->lock);
  1260. + return ret;
  1261. +}
  1262. +
  1263. +/*
  1264. + * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
  1265. + * all subsequent access to this tmem_pool.
  1266. + */
  1267. +int tmem_destroy_pool(struct tmem_pool *pool)
  1268. +{
  1269. + int ret = -1;
  1270. +
  1271. + if (pool == NULL)
  1272. + goto out;
  1273. + tmem_pool_flush(pool, 1);
  1274. + ret = 0;
  1275. +out:
  1276. + return ret;
  1277. +}
  1278. +
  1279. +static LIST_HEAD(tmem_global_pool_list);
  1280. +
  1281. +/*
  1282. + * Create a new tmem_pool with the provided flag and return
  1283. + * a pool id provided by the tmem host implementation.
  1284. + */
  1285. +void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
  1286. +{
  1287. + int persistent = flags & TMEM_POOL_PERSIST;
  1288. + int shared = flags & TMEM_POOL_SHARED;
  1289. + struct tmem_hashbucket *hb = &pool->hashbucket[0];
  1290. + int i;
  1291. +
  1292. + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
  1293. + hb->obj_rb_root = RB_ROOT;
  1294. + spin_lock_init(&hb->lock);
  1295. + }
  1296. + INIT_LIST_HEAD(&pool->pool_list);
  1297. + atomic_set(&pool->obj_count, 0);
  1298. + SET_SENTINEL(pool, POOL);
  1299. + list_add_tail(&pool->pool_list, &tmem_global_pool_list);
  1300. + pool->persistent = persistent;
  1301. + pool->shared = shared;
  1302. +}
  1303. diff -Nrupad linux-2.6.37//drivers/staging/zcache/tmem.h linux-2.6.37_vanilla//drivers/staging/zcache/tmem.h
  1304. --- linux-2.6.37//drivers/staging/zcache/tmem.h 1970-01-01 01:00:00.000000000 +0100
  1305. +++ linux-2.6.37_vanilla//drivers/staging/zcache/tmem.h 2011-02-14 01:21:43.160793007 +0100
  1306. @@ -0,0 +1,195 @@
  1307. +/*
  1308. + * tmem.h
  1309. + *
  1310. + * Transcendent memory
  1311. + *
  1312. + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
  1313. + */
  1314. +
  1315. +#ifndef _TMEM_H_
  1316. +#define _TMEM_H_
  1317. +
  1318. +#include <linux/types.h>
  1319. +#include <linux/highmem.h>
  1320. +#include <linux/hash.h>
  1321. +#include <linux/atomic.h>
  1322. +
  1323. +/*
  1324. + * These are pre-defined by the Xen<->Linux ABI
  1325. + */
  1326. +#define TMEM_PUT_PAGE 4
  1327. +#define TMEM_GET_PAGE 5
  1328. +#define TMEM_FLUSH_PAGE 6
  1329. +#define TMEM_FLUSH_OBJECT 7
  1330. +#define TMEM_POOL_PERSIST 1
  1331. +#define TMEM_POOL_SHARED 2
  1332. +#define TMEM_POOL_PRECOMPRESSED 4
  1333. +#define TMEM_POOL_PAGESIZE_SHIFT 4
  1334. +#define TMEM_POOL_PAGESIZE_MASK 0xf
  1335. +#define TMEM_POOL_RESERVED_BITS 0x00ffff00
  1336. +
  1337. +/*
  1338. + * sentinels have proven very useful for debugging but can be removed
  1339. + * or disabled before final merge.
  1340. + */
  1341. +#define SENTINELS
  1342. +#ifdef SENTINELS
  1343. +#define DECL_SENTINEL uint32_t sentinel;
  1344. +#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL)
  1345. +#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL)
  1346. +#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL)
  1347. +#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL)
  1348. +#else
  1349. +#define DECL_SENTINEL
  1350. +#define SET_SENTINEL(_x, _y) do { } while (0)
  1351. +#define INVERT_SENTINEL(_x, _y) do { } while (0)
  1352. +#define ASSERT_SENTINEL(_x, _y) do { } while (0)
  1353. +#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0)
  1354. +#endif
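As a usage sketch of the sentinel machinery: DECL_SENTINEL reserves a field, SET_SENTINEL stamps it at allocation, ASSERT_SENTINEL checks it on use, and INVERT_SENTINEL poisons it at free time so use-after-free is caught. The FOO_SENTINEL value below is hypothetical; each consumer of this header defines its own magic values:

    #define FOO_SENTINEL 0x0f0f0f0f          /* hypothetical magic value */

    struct foo {
        int payload;
        DECL_SENTINEL                        /* expands to: uint32_t sentinel; */
    };

    static void foo_init(struct foo *f)
    {
        SET_SENTINEL(f, FOO);                /* f->sentinel = FOO_SENTINEL */
    }

    static void foo_use(struct foo *f)
    {
        ASSERT_SENTINEL(f, FOO);             /* WARN_ON if the field was clobbered */
    }

    static void foo_free(struct foo *f)
    {
        INVERT_SENTINEL(f, FOO);             /* poison: catches use-after-free */
    }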
  1355. +
  1356. +#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l))
  1357. +
  1358. +/*
  1359. + * A pool is the highest-level data structure managed by tmem and
  1360. + * usually corresponds to a large independent set of pages such as
  1361. + * a filesystem. Each pool has an id, and certain attributes and counters.
  1362. + * It also contains a set of hash buckets, each of which contains an rbtree
  1363. + * of objects and a lock to manage concurrency within the pool.
  1364. + */
  1365. +
  1366. +#define TMEM_HASH_BUCKET_BITS 8
  1367. +#define TMEM_HASH_BUCKETS (1<<TMEM_HASH_BUCKET_BITS)
  1368. +
  1369. +struct tmem_hashbucket {
  1370. + struct rb_root obj_rb_root;
  1371. + spinlock_t lock;
  1372. +};
  1373. +
  1374. +struct tmem_pool {
  1375. + void *client; /* "up" for some clients, avoids table lookup */
  1376. + struct list_head pool_list;
  1377. + uint32_t pool_id;
  1378. + bool persistent;
  1379. + bool shared;
  1380. + atomic_t obj_count;
  1381. + atomic_t refcount;
  1382. + struct tmem_hashbucket hashbucket[TMEM_HASH_BUCKETS];
  1383. + DECL_SENTINEL
  1384. +};
  1385. +
  1386. +#define is_persistent(_p) (_p->persistent)
  1387. +#define is_ephemeral(_p) (!(_p->persistent))
  1388. +
  1389. +/*
  1390. + * An object id ("oid") is large: 192-bits (to ensure, for example, files
  1391. + * in a modern filesystem can be uniquely identified).
  1392. + */
  1393. +
  1394. +struct tmem_oid {
  1395. + uint64_t oid[3];
  1396. +};
  1397. +
  1398. +static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
  1399. +{
  1400. + oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
  1401. +}
  1402. +
  1403. +static inline bool tmem_oid_valid(struct tmem_oid *oidp)
  1404. +{
  1405. + return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL ||
  1406. + oidp->oid[2] != -1UL;
  1407. +}
  1408. +
  1409. +static inline int tmem_oid_compare(struct tmem_oid *left,
  1410. + struct tmem_oid *right)
  1411. +{
  1412. + int ret;
  1413. +
  1414. + if (left->oid[2] == right->oid[2]) {
  1415. + if (left->oid[1] == right->oid[1]) {
  1416. + if (left->oid[0] == right->oid[0])
  1417. + ret = 0;
  1418. + else if (left->oid[0] < right->oid[0])
  1419. + ret = -1;
  1420. + else
  1421. + ret = 1;
  1422. + } else if (left->oid[1] < right->oid[1])
  1423. + ret = -1;
  1424. + else
  1425. + ret = 1;
  1426. + } else if (left->oid[2] < right->oid[2])
  1427. + ret = -1;
  1428. + else
  1429. + ret = 1;
  1430. + return ret;
  1431. +}
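The comparison treats oid[2] as the most-significant word and oid[0] as the least, so ordering is lexicographic from the top. A worked example:

    struct tmem_oid a = { .oid = { 99, 5, 1 } };  /* hi=1, mid=5, lo=99 */
    struct tmem_oid b = { .oid = {  0, 7, 1 } };  /* hi=1, mid=7, lo=0  */
    /* tmem_oid_compare(&a, &b) == -1: oid[2] ties (1 == 1), oid[1]
     * decides (5 < 7), and oid[0] is never consulted */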
  1432. +
  1433. +static inline unsigned tmem_oid_hash(struct tmem_oid *oidp)
  1434. +{
  1435. + return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
  1436. + TMEM_HASH_BUCKET_BITS);
  1437. +}
  1438. +
  1439. +/*
  1440. + * A tmem_obj contains an identifier (oid), pointers to the parent
  1441. + * pool and the rb_tree to which it belongs, counters, and an ordered
  1442. + * set of pampds, structured in a radix-tree-like tree. The intermediate
  1443. + * nodes of the tree are called tmem_objnodes.
  1444. + */
  1445. +
  1446. +struct tmem_objnode;
  1447. +
  1448. +struct tmem_obj {
  1449. + struct tmem_oid oid;
  1450. + struct tmem_pool *pool;
  1451. + struct rb_node rb_tree_node;
  1452. + struct tmem_objnode *objnode_tree_root;
  1453. + unsigned int objnode_tree_height;
  1454. + unsigned long objnode_count;
  1455. + long pampd_count;
  1456. + DECL_SENTINEL
  1457. +};
  1458. +
  1459. +#define OBJNODE_TREE_MAP_SHIFT 6
  1460. +#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT)
  1461. +#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1)
  1462. +#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
  1463. +#define OBJNODE_TREE_MAX_PATH \
  1464. + (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2)
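With these values each objnode covers 6 bits of the index (64 slots per node), so on a kernel with 64-bit longs the geometry works out as follows (a sanity check, not extra code):

    /* OBJNODE_TREE_MAP_SIZE   = 1 << 6   = 64 slots per node
     * OBJNODE_TREE_INDEX_BITS = 8 * 8    = 64
     * OBJNODE_TREE_MAX_PATH   = 64/6 + 2 = 12
     * ceil(64/6) = 11 levels suffice to cover a full-width index,
     * so MAX_PATH leaves one node of slack; the per-cpu preload
     * array in zcache.c is sized with this constant. */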
  1465. +
  1466. +struct tmem_objnode {
  1467. + struct tmem_obj *obj;
  1468. + DECL_SENTINEL
  1469. + void *slots[OBJNODE_TREE_MAP_SIZE];
  1470. + unsigned int slots_in_use;
  1471. +};
  1472. +
  1473. +/* pampd abstract datatype methods provided by the PAM implementation */
  1474. +struct tmem_pamops {
  1475. + void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t,
  1476. + struct page *);
  1477. + int (*get_data)(struct page *, void *, struct tmem_pool *);
  1478. + void (*free)(void *, struct tmem_pool *);
  1479. +};
  1480. +extern void tmem_register_pamops(struct tmem_pamops *m);
  1481. +
  1482. +/* memory allocation methods provided by the host implementation */
  1483. +struct tmem_hostops {
  1484. + struct tmem_obj *(*obj_alloc)(struct tmem_pool *);
  1485. + void (*obj_free)(struct tmem_obj *, struct tmem_pool *);
  1486. + struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *);
  1487. + void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *);
  1488. +};
  1489. +extern void tmem_register_hostops(struct tmem_hostops *m);
  1490. +
  1491. +/* core tmem accessor functions */
  1492. +extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
  1493. + struct page *page);
  1494. +extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
  1495. + struct page *page);
  1496. +extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
  1497. + uint32_t index);
  1498. +extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
  1499. +extern int tmem_destroy_pool(struct tmem_pool *);
  1500. +extern void tmem_new_pool(struct tmem_pool *, uint32_t);
  1501. +#endif /* _TMEM_H_ */
  1502. diff -Nrupad linux-2.6.37//drivers/staging/zcache/zcache.c linux-2.6.37_vanilla//drivers/staging/zcache/zcache.c
  1503. --- linux-2.6.37//drivers/staging/zcache/zcache.c 1970-01-01 01:00:00.000000000 +0100
  1504. +++ linux-2.6.37_vanilla//drivers/staging/zcache/zcache.c 2011-02-14 01:22:00.636793117 +0100
  1505. @@ -0,0 +1,1658 @@
  1506. +/*
  1507. + * zcache.c
  1508. + *
  1509. + * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
  1510. + * Copyright (c) 2010,2011, Nitin Gupta
  1511. + *
  1512. + * Zcache provides an in-kernel "host implementation" for transcendent memory
  1513. + * and, thus indirectly, for cleancache and frontswap. Zcache includes two
  1514. + * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
  1515. + * 1) "compression buddies" ("zbud") is used for ephemeral pages
  1516. + * 2) xvmalloc is used for persistent pages.
  1517. + * Xvmalloc (based on the TLSF allocator) has very low fragmentation
  1518. + * and so maximizes space efficiency, while zbud allows pairs (and potentially,
  1519. + * in the future, more than a pair of) compressed pages to be closely linked
  1520. + * so that reclaiming can be done via the kernel's physical-page-oriented
  1521. + * "shrinker" interface.
  1522. + *
  1523. + * [1] For a definition of page-accessible memory (aka PAM), see:
  1524. + * http://marc.info/?l=linux-mm&m=127811271605009
  1525. + */
  1526. +
  1527. +#include <linux/cpu.h>
  1528. +#include <linux/highmem.h>
  1529. +#include <linux/list.h>
  1530. +#include <linux/lzo.h>
  1531. +#include <linux/slab.h>
  1532. +#include <linux/spinlock.h>
  1533. +#include <linux/types.h>
  1534. +#include <linux/atomic.h>
  1535. +#include "tmem.h"
  1536. +
  1537. +#include "../zram/xvmalloc.h" /* if built in drivers/staging */
  1538. +
  1539. +#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
  1540. +#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
  1541. +#endif
  1542. +#ifdef CONFIG_CLEANCACHE
  1543. +#include <linux/cleancache.h>
  1544. +#endif
  1545. +#ifdef CONFIG_FRONTSWAP
  1546. +#include <linux/frontswap.h>
  1547. +#endif
  1548. +
  1549. +#if 0
  1550. +/* this is more aggressive but may cause other problems? */
  1551. +#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
  1552. +#else
  1553. +#define ZCACHE_GFP_MASK \
  1554. + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
  1555. +#endif
  1556. +
  1557. +/**********
  1558. + * Compression buddies ("zbud") provides for packing two (or, possibly
  1559. + * in the future, more) compressed ephemeral pages into a single "raw"
  1560. + * (physical) page and tracking them with data structures so that
  1561. + * the raw pages can be easily reclaimed.
  1562. + *
  1563. + * A zbud page ("zbpg") is an aligned page containing a list_head,
  1564. + * a lock, and two "zbud headers". The remainder of the physical
  1565. + * page is divided up into aligned 64-byte "chunks" which contain
  1566. + * the compressed data for zero, one, or two zbuds. Each zbpg
  1567. + * resides on: (1) an "unused list" if it has no zbuds; (2) a
  1568. + * "buddied" list if it is fully populated with two zbuds; or
  1569. + * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
  1570. + * the one unbuddied zbud uses. The data inside a zbpg cannot be
  1571. + * read or written unless the zbpg's lock is held.
  1572. + */
  1573. +
  1574. +#define ZBH_SENTINEL 0x43214321
  1575. +#define ZBPG_SENTINEL 0xdeadbeef
  1576. +
  1577. +#define ZBUD_MAX_BUDS 2
  1578. +
  1579. +struct zbud_hdr {
  1580. + uint32_t pool_id;
  1581. + struct tmem_oid oid;
  1582. + uint32_t index;
  1583. + uint16_t size; /* compressed size in bytes, zero means unused */
  1584. + DECL_SENTINEL
  1585. +};
  1586. +
  1587. +struct zbud_page {
  1588. + struct list_head bud_list;
  1589. + spinlock_t lock;
  1590. + struct zbud_hdr buddy[ZBUD_MAX_BUDS];
  1591. + DECL_SENTINEL
  1592. + /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
  1593. +};
  1594. +
  1595. +#define CHUNK_SHIFT 6
  1596. +#define CHUNK_SIZE (1 << CHUNK_SHIFT)
  1597. +#define CHUNK_MASK (~(CHUNK_SIZE-1))
  1598. +#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \
  1599. + CHUNK_MASK) >> CHUNK_SHIFT)
  1600. +#define MAX_CHUNK (NCHUNKS-1)
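For a 4 KiB page the arithmetic works out roughly as follows (a sketch: sizeof(struct zbud_page) is config-dependent, e.g. spinlock debugging enlarges it):

    /* assuming PAGE_SIZE = 4096 and sizeof(struct zbud_page) ~= 104:
     *   (4096 - 104) & CHUNK_MASK = 3968        (round down to 64B)
     *   NCHUNKS   = 3968 >> 6     = 62 chunks available for data
     *   MAX_CHUNK = 61, so one zbud holds at most 61 * 64 = 3904 bytes
     * (zbud_max_buddy_size() below); pages whose compressed size is
     * larger are rejected and counted in zcache_compress_poor. */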
  1601. +
  1602. +static struct {
  1603. + struct list_head list;
  1604. + unsigned count;
  1605. +} zbud_unbuddied[NCHUNKS];
  1606. +/* list N contains pages with N chunks USED and NCHUNKS-N unused */
  1607. +/* element 0 is never used but optimizing that isn't worth it */
  1608. +static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
  1609. +
  1610. +struct list_head zbud_buddied_list;
  1611. +static unsigned long zcache_zbud_buddied_count;
  1612. +
  1613. +/* protects the buddied list and all unbuddied lists */
  1614. +static DEFINE_SPINLOCK(zbud_budlists_spinlock);
  1615. +
  1616. +static LIST_HEAD(zbpg_unused_list);
  1617. +static unsigned long zcache_zbpg_unused_list_count;
  1618. +
  1619. +/* protects the unused page list */
  1620. +static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
  1621. +
  1622. +static atomic_t zcache_zbud_curr_raw_pages;
  1623. +static atomic_t zcache_zbud_curr_zpages;
  1624. +static unsigned long zcache_zbud_curr_zbytes;
  1625. +static unsigned long zcache_zbud_cumul_zpages;
  1626. +static unsigned long zcache_zbud_cumul_zbytes;
  1627. +static unsigned long zcache_compress_poor;
  1628. +
  1629. +/* forward references */
  1630. +static void *zcache_get_free_page(void);
  1631. +static void zcache_free_page(void *p);
  1632. +
  1633. +/*
  1634. + * zbud helper functions
  1635. + */
  1636. +
  1637. +static inline unsigned zbud_max_buddy_size(void)
  1638. +{
  1639. + return MAX_CHUNK << CHUNK_SHIFT;
  1640. +}
  1641. +
  1642. +static inline unsigned zbud_size_to_chunks(unsigned size)
  1643. +{
  1644. + BUG_ON(size == 0 || size > zbud_max_buddy_size());
  1645. + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
  1646. +}
  1647. +
  1648. +static inline int zbud_budnum(struct zbud_hdr *zh)
  1649. +{
  1650. + unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
  1651. + struct zbud_page *zbpg = NULL;
  1652. + unsigned budnum = -1U;
  1653. + int i;
  1654. +
  1655. + for (i = 0; i < ZBUD_MAX_BUDS; i++)
  1656. + if (offset == offsetof(typeof(*zbpg), buddy[i])) {
  1657. + budnum = i;
  1658. + break;
  1659. + }
  1660. + BUG_ON(budnum == -1U);
  1661. + return budnum;
  1662. +}
  1663. +
  1664. +static char *zbud_data(struct zbud_hdr *zh, unsigned size)
  1665. +{
  1666. + struct zbud_page *zbpg;
  1667. + char *p;
  1668. + unsigned budnum;
  1669. +
  1670. + ASSERT_SENTINEL(zh, ZBH);
  1671. + budnum = zbud_budnum(zh);
  1672. + BUG_ON(size == 0 || size > zbud_max_buddy_size());
  1673. + zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
  1674. + ASSERT_SPINLOCK(&zbpg->lock);
  1675. + p = (char *)zbpg;
  1676. + if (budnum == 0)
  1677. + p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
  1678. + CHUNK_MASK);
  1679. + else if (budnum == 1)
  1680. + p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
  1681. + return p;
  1682. +}
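zbud_data() places buddy 0 just past the chunk-aligned page header and buddy 1 flush against the end of the page, so the two compressed pieces grow toward each other:

    /* physical layout of a zbud page:
     *
     * +-----------+----------------+--- ... ---+----------------+
     * | zbud_page | buddy[0] data  |  unused   | buddy[1] data  |
     * | header    | (grows upward) |  chunks   | (end-aligned)  |
     * +-----------+----------------+--- ... ---+----------------+
     * page start                                       PAGE_SIZE
     */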
  1683. +
  1684. +/*
  1685. + * zbud raw page management
  1686. + */
  1687. +
  1688. +static struct zbud_page *zbud_alloc_raw_page(void)
  1689. +{
  1690. + struct zbud_page *zbpg = NULL;
  1691. + struct zbud_hdr *zh0, *zh1;
  1692. + bool recycled = false;
  1693. +
  1694. + /* if any pages on the zbpg list, use one */
  1695. + spin_lock(&zbpg_unused_list_spinlock);
  1696. + if (!list_empty(&zbpg_unused_list)) {
  1697. + zbpg = list_first_entry(&zbpg_unused_list,
  1698. + struct zbud_page, bud_list);
  1699. + list_del_init(&zbpg->bud_list);
  1700. + zcache_zbpg_unused_list_count--;
  1701. + recycled = true;
  1702. + }
  1703. + spin_unlock(&zbpg_unused_list_spinlock);
  1704. + if (zbpg == NULL)
  1705. + /* none on zbpg list, try to get a kernel page */
  1706. + zbpg = zcache_get_free_page();
  1707. + if (likely(zbpg != NULL)) {
  1708. + INIT_LIST_HEAD(&zbpg->bud_list);
  1709. + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
  1710. + spin_lock_init(&zbpg->lock);
  1711. + if (recycled) {
  1712. + ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
  1713. + SET_SENTINEL(zbpg, ZBPG);
  1714. + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
  1715. + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
  1716. + } else {
  1717. + atomic_inc(&zcache_zbud_curr_raw_pages);
  1719. + SET_SENTINEL(zbpg, ZBPG);
  1720. + zh0->size = 0; zh1->size = 0;
  1721. + tmem_oid_set_invalid(&zh0->oid);
  1722. + tmem_oid_set_invalid(&zh1->oid);
  1723. + }
  1724. + }
  1725. + return zbpg;
  1726. +}
  1727. +
  1728. +static void zbud_free_raw_page(struct zbud_page *zbpg)
  1729. +{
  1730. + struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
  1731. +
  1732. + ASSERT_SENTINEL(zbpg, ZBPG);
  1733. + BUG_ON(!list_empty(&zbpg->bud_list));
  1734. + ASSERT_SPINLOCK(&zbpg->lock);
  1735. + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
  1736. + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
  1737. + INVERT_SENTINEL(zbpg, ZBPG);
  1738. + spin_unlock(&zbpg->lock);
  1739. + spin_lock(&zbpg_unused_list_spinlock);
  1740. + list_add(&zbpg->bud_list, &zbpg_unused_list);
  1741. + zcache_zbpg_unused_list_count++;
  1742. + spin_unlock(&zbpg_unused_list_spinlock);
  1743. +}
  1744. +
  1745. +/*
  1746. + * core zbud handling routines
  1747. + */
  1748. +
  1749. +static unsigned zbud_free(struct zbud_hdr *zh)
  1750. +{
  1751. + unsigned size;
  1752. +
  1753. + ASSERT_SENTINEL(zh, ZBH);
  1754. + BUG_ON(!tmem_oid_valid(&zh->oid));
  1755. + size = zh->size;
  1756. + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
  1757. + zh->size = 0;
  1758. + tmem_oid_set_invalid(&zh->oid);
  1759. + INVERT_SENTINEL(zh, ZBH);
  1760. + zcache_zbud_curr_zbytes -= size;
  1761. + atomic_dec(&zcache_zbud_curr_zpages);
  1762. + return size;
  1763. +}
  1764. +
  1765. +static void zbud_free_and_delist(struct zbud_hdr *zh)
  1766. +{
  1767. + unsigned chunks;
  1768. + struct zbud_hdr *zh_other;
  1769. + unsigned budnum = zbud_budnum(zh), size;
  1770. + struct zbud_page *zbpg =
  1771. + container_of(zh, struct zbud_page, buddy[budnum]);
  1772. +
  1773. + spin_lock(&zbpg->lock);
  1774. + if (list_empty(&zbpg->bud_list)) {
  1775. + /* ignore zombie page... see zbud_evict_pages() */
  1776. + spin_unlock(&zbpg->lock);
  1777. + return;
  1778. + }
  1779. + size = zbud_free(zh);
  1780. + ASSERT_SPINLOCK(&zbpg->lock);
  1781. + zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
  1782. + if (zh_other->size == 0) { /* was unbuddied: unlist and free */
  1783. + chunks = zbud_size_to_chunks(size);
  1784. + spin_lock(&zbud_budlists_spinlock);
  1785. + BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
  1786. + list_del_init(&zbpg->bud_list);
  1787. + zbud_unbuddied[chunks].count--;
  1788. + spin_unlock(&zbud_budlists_spinlock);
  1789. + zbud_free_raw_page(zbpg);
  1790. + } else { /* was buddied: move remaining buddy to unbuddied list */
  1791. + chunks = zbud_size_to_chunks(zh_other->size);
  1792. + spin_lock(&zbud_budlists_spinlock);
  1793. + list_del_init(&zbpg->bud_list);
  1794. + zcache_zbud_buddied_count--;
  1795. + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
  1796. + zbud_unbuddied[chunks].count++;
  1797. + spin_unlock(&zbud_budlists_spinlock);
  1798. + spin_unlock(&zbpg->lock);
  1799. + }
  1800. +}
  1801. +
  1802. +static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
  1803. + uint32_t index, struct page *page,
  1804. + void *cdata, unsigned size)
  1805. +{
  1806. + struct zbud_hdr *zh0, *zh1, *zh = NULL;
  1807. + struct zbud_page *zbpg = NULL, *ztmp;
  1808. + unsigned nchunks;
  1809. + char *to;
  1810. + int i, found_good_buddy = 0;
  1811. +
  1812. + nchunks = zbud_size_to_chunks(size);
  1813. + for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
  1814. + spin_lock(&zbud_budlists_spinlock);
  1815. + if (!list_empty(&zbud_unbuddied[i].list)) {
  1816. + list_for_each_entry_safe(zbpg, ztmp,
  1817. + &zbud_unbuddied[i].list, bud_list) {
  1818. + if (spin_trylock(&zbpg->lock)) {
  1819. + found_good_buddy = i;
  1820. + goto found_unbuddied;
  1821. + }
  1822. + }
  1823. + }
  1824. + spin_unlock(&zbud_budlists_spinlock);
  1825. + }
  1826. + /* didn't find a good buddy, try allocating a new page */
  1827. + zbpg = zbud_alloc_raw_page();
  1828. + if (unlikely(zbpg == NULL))
  1829. + goto out;
  1830. + /* ok, have a new page; data was already compressed by the caller, so list it */
  1831. + spin_lock(&zbpg->lock);
  1832. + spin_lock(&zbud_budlists_spinlock);
  1833. + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
  1834. + zbud_unbuddied[nchunks].count++;
  1835. + zh = &zbpg->buddy[0];
  1836. + goto init_zh;
  1837. +
  1838. +found_unbuddied:
  1839. + ASSERT_SPINLOCK(&zbpg->lock);
  1840. + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
  1841. + BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
  1842. + if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
  1843. + ASSERT_SENTINEL(zh0, ZBH);
  1844. + zh = zh1;
  1845. + } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
  1846. + ASSERT_SENTINEL(zh1, ZBH);
  1847. + zh = zh0;
  1848. + } else
  1849. + BUG();
  1850. + list_del_init(&zbpg->bud_list);
  1851. + zbud_unbuddied[found_good_buddy].count--;
  1852. + list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
  1853. + zcache_zbud_buddied_count++;
  1854. +
  1855. +init_zh:
  1856. + SET_SENTINEL(zh, ZBH);
  1857. + zh->size = size;
  1858. + zh->index = index;
  1859. + zh->oid = *oid;
  1860. + zh->pool_id = pool_id;
  1861. + /* can wait to copy the data until the list locks are dropped */
  1862. + spin_unlock(&zbud_budlists_spinlock);
  1863. +
  1864. + to = zbud_data(zh, size);
  1865. + memcpy(to, cdata, size);
  1866. + spin_unlock(&zbpg->lock);
  1867. + zbud_cumul_chunk_counts[nchunks]++;
  1868. + atomic_inc(&zcache_zbud_curr_zpages);
  1869. + zcache_zbud_cumul_zpages++;
  1870. + zcache_zbud_curr_zbytes += size;
  1871. + zcache_zbud_cumul_zbytes += size;
  1872. +out:
  1873. + return zh;
  1874. +}
  1875. +
  1876. +static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
  1877. +{
  1878. + struct zbud_page *zbpg;
  1879. + unsigned budnum = zbud_budnum(zh);
  1880. + size_t out_len = PAGE_SIZE;
  1881. + char *to_va, *from_va;
  1882. + unsigned size;
  1883. + int ret = 0;
  1884. +
  1885. + zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
  1886. + spin_lock(&zbpg->lock);
  1887. + if (list_empty(&zbpg->bud_list)) {
  1888. + /* ignore zombie page... see zbud_evict_pages() */
  1889. + ret = -EINVAL;
  1890. + goto out;
  1891. + }
  1892. + ASSERT_SENTINEL(zh, ZBH);
  1893. + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
  1894. + to_va = kmap_atomic(page, KM_USER0);
  1895. + size = zh->size;
  1896. + from_va = zbud_data(zh, size);
  1897. + ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
  1898. + BUG_ON(ret != LZO_E_OK);
  1899. + BUG_ON(out_len != PAGE_SIZE);
  1900. + kunmap_atomic(to_va, KM_USER0);
  1901. +out:
  1902. + spin_unlock(&zbpg->lock);
  1903. + return ret;
  1904. +}
  1905. +
  1906. +/*
  1907. + * The following routines handle shrinking of ephemeral pages by evicting
  1908. + * pages "least valuable" first.
  1909. + */
  1910. +
  1911. +static unsigned long zcache_evicted_raw_pages;
  1912. +static unsigned long zcache_evicted_buddied_pages;
  1913. +static unsigned long zcache_evicted_unbuddied_pages;
  1914. +
  1915. +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
  1916. +static void zcache_put_pool(struct tmem_pool *pool);
  1917. +
  1918. +/*
  1919. + * Flush and free all zbuds in a zbpg, then free the pageframe
  1920. + */
  1921. +static void zbud_evict_zbpg(struct zbud_page *zbpg)
  1922. +{
  1923. + struct zbud_hdr *zh;
  1924. + int i, j;
  1925. + uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
  1926. + struct tmem_oid oid[ZBUD_MAX_BUDS];
  1927. + struct tmem_pool *pool;
  1928. +
  1929. + ASSERT_SPINLOCK(&zbpg->lock);
  1930. + BUG_ON(!list_empty(&zbpg->bud_list));
  1931. + for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
  1932. + zh = &zbpg->buddy[i];
  1933. + if (zh->size) {
  1934. + pool_id[j] = zh->pool_id;
  1935. + oid[j] = zh->oid;
  1936. + index[j] = zh->index;
  1937. + j++;
  1938. + zbud_free(zh);
  1939. + }
  1940. + }
  1941. + spin_unlock(&zbpg->lock);
  1942. + for (i = 0; i < j; i++) {
  1943. + pool = zcache_get_pool_by_id(pool_id[i]);
  1944. + if (pool != NULL) {
  1945. + tmem_flush_page(pool, &oid[i], index[i]);
  1946. + zcache_put_pool(pool);
  1947. + }
  1948. + }
  1949. + ASSERT_SENTINEL(zbpg, ZBPG);
  1950. + spin_lock(&zbpg->lock);
  1951. + zbud_free_raw_page(zbpg);
  1952. +}
  1953. +
  1954. +/*
  1955. + * Free nr pages. This code is funky because we want to hold the locks
  1956. + * protecting various lists for as short a time as possible, and in some
  1957. + * circumstances the list may change asynchronously when the list lock is
  1958. + * not held. In some cases we also trylock not only to avoid waiting on a
  1959. + * page in use by another cpu, but also to avoid potential deadlock due to
  1960. + * lock inversion.
  1961. + */
  1962. +static void zbud_evict_pages(int nr)
  1963. +{
  1964. + struct zbud_page *zbpg;
  1965. + int i;
  1966. +
  1967. + /* first try freeing any pages on unused list */
  1968. +retry_unused_list:
  1969. + spin_lock_bh(&zbpg_unused_list_spinlock);
  1970. + if (!list_empty(&zbpg_unused_list)) {
  1971. + /* can't walk list here, since it may change when unlocked */
  1972. + zbpg = list_first_entry(&zbpg_unused_list,
  1973. + struct zbud_page, bud_list);
  1974. + list_del_init(&zbpg->bud_list);
  1975. + zcache_zbpg_unused_list_count--;
  1976. + atomic_dec(&zcache_zbud_curr_raw_pages);
  1977. + spin_unlock_bh(&zbpg_unused_list_spinlock);
  1978. + zcache_free_page(zbpg);
  1979. + zcache_evicted_raw_pages++;
  1980. + if (--nr <= 0)
  1981. + goto out;
  1982. + goto retry_unused_list;
  1983. + }
  1984. + spin_unlock_bh(&zbpg_unused_list_spinlock);
  1985. +
  1986. + /* now try freeing unbuddied pages, starting with least space avail */
  1987. + for (i = 0; i < MAX_CHUNK; i++) {
  1988. +retry_unbud_list_i:
  1989. + spin_lock_bh(&zbud_budlists_spinlock);
  1990. + if (list_empty(&zbud_unbuddied[i].list)) {
  1991. + spin_unlock_bh(&zbud_budlists_spinlock);
  1992. + continue;
  1993. + }
  1994. + list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
  1995. + if (unlikely(!spin_trylock(&zbpg->lock)))
  1996. + continue;
  1997. + list_del_init(&zbpg->bud_list);
  1998. + zbud_unbuddied[i].count--;
  1999. + spin_unlock(&zbud_budlists_spinlock);
  2000. + zcache_evicted_unbuddied_pages++;
  2001. + /* want budlists unlocked when doing zbpg eviction */
  2002. + zbud_evict_zbpg(zbpg);
  2003. + local_bh_enable();
  2004. + if (--nr <= 0)
  2005. + goto out;
  2006. + goto retry_unbud_list_i;
  2007. + }
  2008. + spin_unlock_bh(&zbud_budlists_spinlock);
  2009. + }
  2010. +
  2011. + /* as a last resort, free buddied pages */
  2012. +retry_bud_list:
  2013. + spin_lock_bh(&zbud_budlists_spinlock);
  2014. + if (list_empty(&zbud_buddied_list)) {
  2015. + spin_unlock_bh(&zbud_budlists_spinlock);
  2016. + goto out;
  2017. + }
  2018. + list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
  2019. + if (unlikely(!spin_trylock(&zbpg->lock)))
  2020. + continue;
  2021. + list_del_init(&zbpg->bud_list);
  2022. + zcache_zbud_buddied_count--;
  2023. + spin_unlock(&zbud_budlists_spinlock);
  2024. + zcache_evicted_buddied_pages++;
  2025. + /* want budlists unlocked when doing zbpg eviction */
  2026. + zbud_evict_zbpg(zbpg);
  2027. + local_bh_enable();
  2028. + if (--nr <= 0)
  2029. + goto out;
  2030. + goto retry_bud_list;
  2031. + }
  2032. + spin_unlock_bh(&zbud_budlists_spinlock);
  2033. +out:
  2034. + return;
  2035. +}
  2036. +
  2037. +static void zbud_init(void)
  2038. +{
  2039. + int i;
  2040. +
  2041. + INIT_LIST_HEAD(&zbud_buddied_list);
  2042. + zcache_zbud_buddied_count = 0;
  2043. + for (i = 0; i < NCHUNKS; i++) {
  2044. + INIT_LIST_HEAD(&zbud_unbuddied[i].list);
  2045. + zbud_unbuddied[i].count = 0;
  2046. + }
  2047. +}
  2048. +
  2049. +#ifdef CONFIG_SYSFS
  2050. +/*
  2051. + * These sysfs routines show a nice distribution of how many zbpg's are
  2052. + * currently (and have ever been placed) in each unbuddied list. It's fun
  2053. + * to watch but can probably go away before final merge.
  2054. + */
  2055. +static int zbud_show_unbuddied_list_counts(char *buf)
  2056. +{
  2057. + int i;
  2058. + char *p = buf;
  2059. +
  2060. + for (i = 0; i < NCHUNKS - 1; i++)
  2061. + p += sprintf(p, "%u ", zbud_unbuddied[i].count);
  2062. + p += sprintf(p, "%d\n", zbud_unbuddied[i].count);
  2063. + return p - buf;
  2064. +}
  2065. +
  2066. +static int zbud_show_cumul_chunk_counts(char *buf)
  2067. +{
  2068. + unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
  2069. + unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
  2070. + unsigned long total_chunks_lte_42 = 0;
  2071. + char *p = buf;
  2072. +
  2073. + for (i = 0; i < NCHUNKS; i++) {
  2074. + p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
  2075. + chunks += zbud_cumul_chunk_counts[i];
  2076. + total_chunks += zbud_cumul_chunk_counts[i];
  2077. + sum_total_chunks += i * zbud_cumul_chunk_counts[i];
  2078. + if (i == 21)
  2079. + total_chunks_lte_21 = total_chunks;
  2080. + if (i == 32)
  2081. + total_chunks_lte_32 = total_chunks;
  2082. + if (i == 42)
  2083. + total_chunks_lte_42 = total_chunks;
  2084. + }
  2085. + p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
  2086. + total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
  2087. + chunks == 0 ? 0 : sum_total_chunks / chunks);
  2088. + return p - buf;
  2089. +}
  2090. +#endif
  2091. +
  2092. +/**********
  2093. + * This "zv" PAM implementation combines the TLSF-based xvMalloc
  2094. + * with lzo1x compression to maximize the amount of data that can
  2095. + * be packed into a physical page.
  2096. + *
  2097. + * Zv represents a PAM page with the index and object (plus a "size" value
  2098. + * necessary for decompression) immediately preceding the compressed data.
  2099. + */
  2100. +
  2101. +#define ZVH_SENTINEL 0x43214321
  2102. +
  2103. +struct zv_hdr {
  2104. + uint32_t pool_id;
  2105. + struct tmem_oid oid;
  2106. + uint32_t index;
  2107. + DECL_SENTINEL
  2108. +};
  2109. +
  2110. +static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
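An allocated zv is the header followed immediately by the compressed bytes; the compressed length is not stored in the header but recovered from the allocator, as zv_free() and zv_decompress() below do:

    /* zv layout inside an xvmalloc block:
     *
     * +---------------+------------------------------+
     * | struct zv_hdr | compressed data (clen bytes) |
     * +---------------+------------------------------+
     *
     * clen = xv_get_object_size(zv) - sizeof(struct zv_hdr);
     * zv_max_page_size caps clen at 7/8 of a page, since storing
     * data that barely compresses is not worthwhile. */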
  2111. +
  2112. +static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
  2113. + struct tmem_oid *oid, uint32_t index,
  2114. + void *cdata, unsigned clen)
  2115. +{
  2116. + struct page *page;
  2117. + struct zv_hdr *zv = NULL;
  2118. + uint32_t offset;
  2119. + int ret;
  2120. +
  2121. + BUG_ON(!irqs_disabled());
  2122. + ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
  2123. + &page, &offset, ZCACHE_GFP_MASK);
  2124. + if (unlikely(ret))
  2125. + goto out;
  2126. + zv = kmap_atomic(page, KM_USER0) + offset;
  2127. + zv->index = index;
  2128. + zv->oid = *oid;
  2129. + zv->pool_id = pool_id;
  2130. + SET_SENTINEL(zv, ZVH);
  2131. + memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
  2132. + kunmap_atomic(zv, KM_USER0);
  2133. +out:
  2134. + return zv;
  2135. +}
  2136. +
  2137. +static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
  2138. +{
  2139. + unsigned long flags;
  2140. + struct page *page;
  2141. + uint32_t offset;
  2142. + uint16_t size;
  2143. +
  2144. + ASSERT_SENTINEL(zv, ZVH);
  2145. + size = xv_get_object_size(zv) - sizeof(*zv);
  2146. + BUG_ON(size == 0 || size > zv_max_page_size);
  2147. + INVERT_SENTINEL(zv, ZVH);
  2148. + page = virt_to_page(zv);
  2149. + offset = (unsigned long)zv & ~PAGE_MASK;
  2150. + local_irq_save(flags);
  2151. + xv_free(xvpool, page, offset);
  2152. + local_irq_restore(flags);
  2153. +}
  2154. +
  2155. +static void zv_decompress(struct page *page, struct zv_hdr *zv)
  2156. +{
  2157. + size_t clen = PAGE_SIZE;
  2158. + char *to_va;
  2159. + unsigned size;
  2160. + int ret;
  2161. +
  2162. + ASSERT_SENTINEL(zv, ZVH);
  2163. + size = xv_get_object_size(zv) - sizeof(*zv);
  2164. + BUG_ON(size == 0 || size > zv_max_page_size);
  2165. + to_va = kmap_atomic(page, KM_USER0);
  2166. + ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
  2167. + size, to_va, &clen);
  2168. + kunmap_atomic(to_va, KM_USER0);
  2169. + BUG_ON(ret != LZO_E_OK);
  2170. + BUG_ON(clen != PAGE_SIZE);
  2171. +}
  2172. +
  2173. +/*
  2174. + * zcache core code starts here
  2175. + */
  2176. +
  2177. +/* useful stats not collected by cleancache or frontswap */
  2178. +static unsigned long zcache_flush_total;
  2179. +static unsigned long zcache_flush_found;
  2180. +static unsigned long zcache_flobj_total;
  2181. +static unsigned long zcache_flobj_found;
  2182. +static unsigned long zcache_failed_eph_puts;
  2183. +static unsigned long zcache_failed_pers_puts;
  2184. +
  2185. +#define MAX_POOLS_PER_CLIENT 16
  2186. +
  2187. +static struct {
  2188. + struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
  2189. + struct xv_pool *xvpool;
  2190. +} zcache_client;
  2191. +
  2192. +/*
  2193. + * Tmem operations assume the poolid implies the invoking client.
  2194. + * Zcache only has one client (the kernel itself), so translate
  2195. + * the poolid into the tmem_pool allocated for it. A KVM version
  2196. + * of zcache would have one client per guest and each client might
  2197. + * have a poolid==N.
  2198. + */
  2199. +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
  2200. +{
  2201. + struct tmem_pool *pool = NULL;
  2202. +
  2203. + if (poolid < MAX_POOLS_PER_CLIENT) {
  2204. + pool = zcache_client.tmem_pools[poolid];
  2205. + if (pool != NULL)
  2206. + atomic_inc(&pool->refcount);
  2207. + }
  2208. + return pool;
  2209. +}
  2210. +
  2211. +static void zcache_put_pool(struct tmem_pool *pool)
  2212. +{
  2213. + if (pool != NULL)
  2214. + atomic_dec(&pool->refcount);
  2215. +}
  2216. +
  2217. +/* counters for debugging */
  2218. +static unsigned long zcache_failed_get_free_pages;
  2219. +static unsigned long zcache_failed_alloc;
  2220. +static unsigned long zcache_put_to_flush;
  2221. +static unsigned long zcache_aborted_preload;
  2222. +static unsigned long zcache_aborted_shrink;
  2223. +
  2224. +/*
  2225. + * Ensure that memory allocation requests in zcache don't result
  2226. + * in direct reclaim requests via the shrinker, which would cause
  2227. + * an infinite loop. Maybe a GFP flag would be better?
  2228. + */
  2229. +static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);
  2230. +
  2231. +/*
  2232. + * for now, use named slabs so we can easily track usage; later we can
  2233. + * either just use kmalloc, or perhaps add a slab-like allocator
  2234. + * to more carefully manage total memory utilization
  2235. + */
  2236. +static struct kmem_cache *zcache_objnode_cache;
  2237. +static struct kmem_cache *zcache_obj_cache;
  2238. +static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
  2239. +static unsigned long zcache_curr_obj_count_max;
  2240. +static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
  2241. +static unsigned long zcache_curr_objnode_count_max;
  2242. +
  2243. +/*
  2244. + * to avoid memory allocation recursion (e.g. due to direct reclaim), we
  2245. + * preload all necessary data structures so the hostops callbacks never
  2246. + * actually do a malloc
  2247. + */
  2248. +struct zcache_preload {
  2249. + void *page;
  2250. + struct tmem_obj *obj;
  2251. + int nr;
  2252. + struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
  2253. +};
  2254. +static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
  2255. +
  2256. +static int zcache_do_preload(struct tmem_pool *pool)
  2257. +{
  2258. + struct zcache_preload *kp;
  2259. + struct tmem_objnode *objnode;
  2260. + struct tmem_obj *obj;
  2261. + void *page;
  2262. + int ret = -ENOMEM;
  2263. +
  2264. + if (unlikely(zcache_objnode_cache == NULL))
  2265. + goto out;
  2266. + if (unlikely(zcache_obj_cache == NULL))
  2267. + goto out;
  2268. + if (!spin_trylock(&zcache_direct_reclaim_lock)) {
  2269. + zcache_aborted_preload++;
  2270. + goto out;
  2271. + }
  2272. + preempt_disable();
  2273. + kp = &__get_cpu_var(zcache_preloads);
  2274. + while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
  2275. + preempt_enable_no_resched();
  2276. + objnode = kmem_cache_alloc(zcache_objnode_cache,
  2277. + ZCACHE_GFP_MASK);
  2278. + if (unlikely(objnode == NULL)) {
  2279. + zcache_failed_alloc++;
  2280. + goto unlock_out;
  2281. + }
  2282. + preempt_disable();
  2283. + kp = &__get_cpu_var(zcache_preloads);
  2284. + if (kp->nr < ARRAY_SIZE(kp->objnodes))
  2285. + kp->objnodes[kp->nr++] = objnode;
  2286. + else
  2287. + kmem_cache_free(zcache_objnode_cache, objnode);
  2288. + }
  2289. + preempt_enable_no_resched();
  2290. + obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
  2291. + if (unlikely(obj == NULL)) {
  2292. + zcache_failed_alloc++;
  2293. + goto unlock_out;
  2294. + }
  2295. + page = (void *)__get_free_page(ZCACHE_GFP_MASK);
  2296. + if (unlikely(page == NULL)) {
  2297. + zcache_failed_get_free_pages++;
  2298. + kmem_cache_free(zcache_obj_cache, obj);
  2299. + goto unlock_out;
  2300. + }
  2301. + preempt_disable();
  2302. + kp = &__get_cpu_var(zcache_preloads);
  2303. + if (kp->obj == NULL)
  2304. + kp->obj = obj;
  2305. + else
  2306. + kmem_cache_free(zcache_obj_cache, obj);
  2307. + if (kp->page == NULL)
  2308. + kp->page = page;
  2309. + else
  2310. + free_page((unsigned long)page);
  2311. + ret = 0;
  2312. +unlock_out:
  2313. + spin_unlock(&zcache_direct_reclaim_lock);
  2314. +out:
  2315. + return ret;
  2316. +}
  2317. +
  2318. +static void *zcache_get_free_page(void)
  2319. +{
  2320. + struct zcache_preload *kp;
  2321. + void *page;
  2322. +
  2323. + kp = &__get_cpu_var(zcache_preloads);
  2324. + page = kp->page;
  2325. + BUG_ON(page == NULL);
  2326. + kp->page = NULL;
  2327. + return page;
  2328. +}
  2329. +
  2330. +static void zcache_free_page(void *p)
  2331. +{
  2332. + free_page((unsigned long)p);
  2333. +}
  2334. +
  2335. +/*
  2336. + * zcache implementation for tmem host ops
  2337. + */
  2338. +
  2339. +static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
  2340. +{
  2341. + struct tmem_objnode *objnode = NULL;
  2342. + unsigned long count;
  2343. + struct zcache_preload *kp;
  2344. +
  2345. + kp = &__get_cpu_var(zcache_preloads);
  2346. + if (kp->nr <= 0)
  2347. + goto out;
  2348. + objnode = kp->objnodes[kp->nr - 1];
  2349. + BUG_ON(objnode == NULL);
  2350. + kp->objnodes[kp->nr - 1] = NULL;
  2351. + kp->nr--;
  2352. + count = atomic_inc_return(&zcache_curr_objnode_count);
  2353. + if (count > zcache_curr_objnode_count_max)
  2354. + zcache_curr_objnode_count_max = count;
  2355. +out:
  2356. + return objnode;
  2357. +}
  2358. +
  2359. +static void zcache_objnode_free(struct tmem_objnode *objnode,
  2360. + struct tmem_pool *pool)
  2361. +{
  2362. + atomic_dec(&zcache_curr_objnode_count);
  2363. + BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
  2364. + kmem_cache_free(zcache_objnode_cache, objnode);
  2365. +}
  2366. +
  2367. +static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
  2368. +{
  2369. + struct tmem_obj *obj = NULL;
  2370. + unsigned long count;
  2371. + struct zcache_preload *kp;
  2372. +
  2373. + kp = &__get_cpu_var(zcache_preloads);
  2374. + obj = kp->obj;
  2375. + BUG_ON(obj == NULL);
  2376. + kp->obj = NULL;
  2377. + count = atomic_inc_return(&zcache_curr_obj_count);
  2378. + if (count > zcache_curr_obj_count_max)
  2379. + zcache_curr_obj_count_max = count;
  2380. + return obj;
  2381. +}
  2382. +
  2383. +static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
  2384. +{
  2385. + atomic_dec(&zcache_curr_obj_count);
  2386. + BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
  2387. + kmem_cache_free(zcache_obj_cache, obj);
  2388. +}
  2389. +
  2390. +static struct tmem_hostops zcache_hostops = {
  2391. + .obj_alloc = zcache_obj_alloc,
  2392. + .obj_free = zcache_obj_free,
  2393. + .objnode_alloc = zcache_objnode_alloc,
  2394. + .objnode_free = zcache_objnode_free,
  2395. +};
  2396. +
  2397. +/*
  2398. + * zcache implementations for PAM page descriptor ops
  2399. + */
  2400. +
  2401. +static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
  2402. +static unsigned long zcache_curr_eph_pampd_count_max;
  2403. +static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
  2404. +static unsigned long zcache_curr_pers_pampd_count_max;
  2405. +
  2406. +/* forward reference */
  2407. +static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
  2408. +
  2409. +static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
  2410. + uint32_t index, struct page *page)
  2411. +{
  2412. + void *pampd = NULL, *cdata;
  2413. + size_t clen;
  2414. + int ret;
  2415. + bool ephemeral = is_ephemeral(pool);
  2416. + unsigned long count;
  2417. +
  2418. + if (ephemeral) {
  2419. + ret = zcache_compress(page, &cdata, &clen);
  2420. + if (ret == 0)
  2422. + goto out;
  2423. + if (clen == 0 || clen > zbud_max_buddy_size()) {
  2424. + zcache_compress_poor++;
  2425. + goto out;
  2426. + }
  2427. + pampd = (void *)zbud_create(pool->pool_id, oid, index,
  2428. + page, cdata, clen);
  2429. + if (pampd != NULL) {
  2430. + count = atomic_inc_return(&zcache_curr_eph_pampd_count);
  2431. + if (count > zcache_curr_eph_pampd_count_max)
  2432. + zcache_curr_eph_pampd_count_max = count;
  2433. + }
  2434. + } else {
  2435. + /*
  2436. + * FIXME: This is all the "policy" there is for now.
  2437. + * 3/4 totpages should allow ~37% of RAM to be filled with
  2438. + * compressed frontswap pages: at a typical ~2:1 compression ratio,
  + * 3/4 of totalram pampds occupy roughly 3/8 (~37%) of RAM
  2439. + */
  2440. + if (atomic_read(&zcache_curr_pers_pampd_count) >
  2441. + 3 * totalram_pages / 4)
  2442. + goto out;
  2443. + ret = zcache_compress(page, &cdata, &clen);
  2444. + if (ret == 0)
  2445. + goto out;
  2446. + if (clen > zv_max_page_size) {
  2447. + zcache_compress_poor++;
  2448. + goto out;
  2449. + }
  2450. + pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
  2451. + oid, index, cdata, clen);
  2452. + if (pampd == NULL)
  2453. + goto out;
  2454. + count = atomic_inc_return(&zcache_curr_pers_pampd_count);
  2455. + if (count > zcache_curr_pers_pampd_count_max)
  2456. + zcache_curr_pers_pampd_count_max = count;
  2457. + }
  2458. +out:
  2459. + return pampd;
  2460. +}
  2461. +
  2462. +/*
  2463. + * fill the pageframe corresponding to the struct page with the data
  2464. + * from the passed pampd
  2465. + */
  2466. +static int zcache_pampd_get_data(struct page *page, void *pampd,
  2467. + struct tmem_pool *pool)
  2468. +{
  2469. + int ret = 0;
  2470. +
  2471. + if (is_ephemeral(pool))
  2472. + ret = zbud_decompress(page, pampd);
  2473. + else
  2474. + zv_decompress(page, pampd);
  2475. + return ret;
  2476. +}
  2477. +
  2478. +/*
  2479. + * free the pampd and remove it from any zcache lists
  2480. + * pampd must no longer be pointed to from any tmem data structures!
  2481. + */
  2482. +static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
  2483. +{
  2484. + if (is_ephemeral(pool)) {
  2485. + zbud_free_and_delist((struct zbud_hdr *)pampd);
  2486. + atomic_dec(&zcache_curr_eph_pampd_count);
  2487. + BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
  2488. + } else {
  2489. + zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
  2490. + atomic_dec(&zcache_curr_pers_pampd_count);
  2491. + BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
  2492. + }
  2493. +}
  2494. +
  2495. +static struct tmem_pamops zcache_pamops = {
  2496. + .create = zcache_pampd_create,
  2497. + .get_data = zcache_pampd_get_data,
  2498. + .free = zcache_pampd_free,
  2499. +};
  2500. +
  2501. +/*
  2502. + * zcache compression/decompression and related per-cpu stuff
  2503. + */
  2504. +
  2505. +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
  2506. +#define LZO_DSTMEM_PAGE_ORDER 1
  2507. +static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
  2508. +static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
  2509. +
  2510. +static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
  2511. +{
  2512. + int ret = 0;
  2513. + unsigned char *dmem = __get_cpu_var(zcache_dstmem);
  2514. + unsigned char *wmem = __get_cpu_var(zcache_workmem);
  2515. + char *from_va;
  2516. +
  2517. + BUG_ON(!irqs_disabled());
  2518. + if (unlikely(dmem == NULL || wmem == NULL))
  2519. + goto out; /* no buffer, so can't compress */
  2520. + from_va = kmap_atomic(from, KM_USER0);
  2521. + mb();
  2522. + ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
  2523. + BUG_ON(ret != LZO_E_OK);
  2524. + *out_va = dmem;
  2525. + kunmap_atomic(from_va, KM_USER0);
  2526. + ret = 1;
  2527. +out:
  2528. + return ret;
  2529. +}
  2530. +
  2531. +
  2532. +static int zcache_cpu_notifier(struct notifier_block *nb,
  2533. + unsigned long action, void *pcpu)
  2534. +{
  2535. + int cpu = (long)pcpu;
  2536. + struct zcache_preload *kp;
  2537. +
  2538. + switch (action) {
  2539. + case CPU_UP_PREPARE:
  2540. + per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
  2541. + GFP_KERNEL | __GFP_REPEAT,
  2542. + LZO_DSTMEM_PAGE_ORDER);
  2543. + per_cpu(zcache_workmem, cpu) =
  2544. + kzalloc(LZO_WORKMEM_BYTES,
  2545. + GFP_KERNEL | __GFP_REPEAT);
  2546. + break;
  2547. + case CPU_DEAD:
  2548. + case CPU_UP_CANCELED:
  2549. + free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
  2550. + LZO_DSTMEM_PAGE_ORDER);
  2551. + per_cpu(zcache_dstmem, cpu) = NULL;
  2552. + kfree(per_cpu(zcache_workmem, cpu));
  2553. + per_cpu(zcache_workmem, cpu) = NULL;
  2554. + kp = &per_cpu(zcache_preloads, cpu);
  2555. + while (kp->nr) {
  2556. + kmem_cache_free(zcache_objnode_cache,
  2557. + kp->objnodes[kp->nr - 1]);
  2558. + kp->objnodes[kp->nr - 1] = NULL;
  2559. + kp->nr--;
  2560. + }
  2561. + /* clear stale pointers so preload refills them if the cpu returns */
  + if (kp->obj != NULL) {
  + kmem_cache_free(zcache_obj_cache, kp->obj);
  + kp->obj = NULL;
  + }
  2562. + free_page((unsigned long)kp->page);
  + kp->page = NULL;
  2563. + break;
  2564. + default:
  2565. + break;
  2566. + }
  2567. + return NOTIFY_OK;
  2568. +}
  2569. +
  2570. +static struct notifier_block zcache_cpu_notifier_block = {
  2571. + .notifier_call = zcache_cpu_notifier
  2572. +};
  2573. +
  2574. +#ifdef CONFIG_SYSFS
  2575. +#define ZCACHE_SYSFS_RO(_name) \
  2576. + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
  2577. + struct kobj_attribute *attr, char *buf) \
  2578. + { \
  2579. + return sprintf(buf, "%lu\n", zcache_##_name); \
  2580. + } \
  2581. + static struct kobj_attribute zcache_##_name##_attr = { \
  2582. + .attr = { .name = __stringify(_name), .mode = 0444 }, \
  2583. + .show = zcache_##_name##_show, \
  2584. + }
  2585. +
  2586. +#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
  2587. + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
  2588. + struct kobj_attribute *attr, char *buf) \
  2589. + { \
  2590. + return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
  2591. + } \
  2592. + static struct kobj_attribute zcache_##_name##_attr = { \
  2593. + .attr = { .name = __stringify(_name), .mode = 0444 }, \
  2594. + .show = zcache_##_name##_show, \
  2595. + }
  2596. +
  2597. +#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
  2598. + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
  2599. + struct kobj_attribute *attr, char *buf) \
  2600. + { \
  2601. + return _func(buf); \
  2602. + } \
  2603. + static struct kobj_attribute zcache_##_name##_attr = { \
  2604. + .attr = { .name = __stringify(_name), .mode = 0444 }, \
  2605. + .show = zcache_##_name##_show, \
  2606. + }
  2607. +
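Each invocation below stamps out a show routine plus a read-only attribute; for reference, ZCACHE_SYSFS_RO(flush_total) expands (modulo whitespace) to:

    static ssize_t zcache_flush_total_show(struct kobject *kobj,
                    struct kobj_attribute *attr, char *buf)
    {
        return sprintf(buf, "%lu\n", zcache_flush_total);
    }
    static struct kobj_attribute zcache_flush_total_attr = {
        .attr = { .name = "flush_total", .mode = 0444 },
        .show = zcache_flush_total_show,
    };

Once zcache_attr_group (defined below) is registered by the init code, each counter appears as a read-only file, e.g. flush_total, in the group's "zcache" sysfs directory.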
  2608. +ZCACHE_SYSFS_RO(curr_obj_count_max);
  2609. +ZCACHE_SYSFS_RO(curr_objnode_count_max);
  2610. +ZCACHE_SYSFS_RO(flush_total);
  2611. +ZCACHE_SYSFS_RO(flush_found);
  2612. +ZCACHE_SYSFS_RO(flobj_total);
  2613. +ZCACHE_SYSFS_RO(flobj_found);
  2614. +ZCACHE_SYSFS_RO(failed_eph_puts);
  2615. +ZCACHE_SYSFS_RO(failed_pers_puts);
  2616. +ZCACHE_SYSFS_RO(zbud_curr_zbytes);
  2617. +ZCACHE_SYSFS_RO(zbud_cumul_zpages);
  2618. +ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
  2619. +ZCACHE_SYSFS_RO(zbud_buddied_count);
  2620. +ZCACHE_SYSFS_RO(zbpg_unused_list_count);
  2621. +ZCACHE_SYSFS_RO(evicted_raw_pages);
  2622. +ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
  2623. +ZCACHE_SYSFS_RO(evicted_buddied_pages);
  2624. +ZCACHE_SYSFS_RO(failed_get_free_pages);
  2625. +ZCACHE_SYSFS_RO(failed_alloc);
  2626. +ZCACHE_SYSFS_RO(put_to_flush);
  2627. +ZCACHE_SYSFS_RO(aborted_preload);
  2628. +ZCACHE_SYSFS_RO(aborted_shrink);
  2629. +ZCACHE_SYSFS_RO(compress_poor);
  2630. +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
  2631. +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
  2632. +ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
  2633. +ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
  2634. +ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
  2635. + zbud_show_unbuddied_list_counts);
  2636. +ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
  2637. + zbud_show_cumul_chunk_counts);
  2638. +
  2639. +static struct attribute *zcache_attrs[] = {
  2640. + &zcache_curr_obj_count_attr.attr,
  2641. + &zcache_curr_obj_count_max_attr.attr,
  2642. + &zcache_curr_objnode_count_attr.attr,
  2643. + &zcache_curr_objnode_count_max_attr.attr,
  2644. + &zcache_flush_total_attr.attr,
  2645. + &zcache_flobj_total_attr.attr,
  2646. + &zcache_flush_found_attr.attr,
  2647. + &zcache_flobj_found_attr.attr,
  2648. + &zcache_failed_eph_puts_attr.attr,
  2649. + &zcache_failed_pers_puts_attr.attr,
  2650. + &zcache_compress_poor_attr.attr,
  2651. + &zcache_zbud_curr_raw_pages_attr.attr,
  2652. + &zcache_zbud_curr_zpages_attr.attr,
  2653. + &zcache_zbud_curr_zbytes_attr.attr,
  2654. + &zcache_zbud_cumul_zpages_attr.attr,
  2655. + &zcache_zbud_cumul_zbytes_attr.attr,
  2656. + &zcache_zbud_buddied_count_attr.attr,
  2657. + &zcache_zbpg_unused_list_count_attr.attr,
  2658. + &zcache_evicted_raw_pages_attr.attr,
  2659. + &zcache_evicted_unbuddied_pages_attr.attr,
  2660. + &zcache_evicted_buddied_pages_attr.attr,
  2661. + &zcache_failed_get_free_pages_attr.attr,
  2662. + &zcache_failed_alloc_attr.attr,
  2663. + &zcache_put_to_flush_attr.attr,
  2664. + &zcache_aborted_preload_attr.attr,
  2665. + &zcache_aborted_shrink_attr.attr,
  2666. + &zcache_zbud_unbuddied_list_counts_attr.attr,
  2667. + &zcache_zbud_cumul_chunk_counts_attr.attr,
  2668. + NULL,
  2669. +};
  2670. +
  2671. +static struct attribute_group zcache_attr_group = {
  2672. + .attrs = zcache_attrs,
  2673. + .name = "zcache",
  2674. +};
  2675. +
  2676. +#endif /* CONFIG_SYSFS */
  2677. +/*
  2678. + * When zcache is disabled ("frozen"), pools can be created and destroyed,
  2679. + * but all puts (and thus all other operations that require memory allocation)
  2680. + * must fail. If zcache is unfrozen, accepts puts, then frozen again,
  2681. + * data consistency requires all puts while frozen to be converted into
  2682. + * flushes.
  2683. + */
  2684. +static bool zcache_freeze;
  2685. +
  2686. +/*
  2687. + * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
  2688. + */
  2689. +static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
  2690. +{
  2691. + int ret = -1;
  2692. +
  2693. + if (nr >= 0) {
  2694. + if (!(gfp_mask & __GFP_FS))
  2695. + /* does this case really need to be skipped? */
  2696. + goto out;
  2697. + if (spin_trylock(&zcache_direct_reclaim_lock)) {
  2698. + zbud_evict_pages(nr);
  2699. + spin_unlock(&zcache_direct_reclaim_lock);
  2700. + } else
  2701. + zcache_aborted_shrink++;
  2702. + }
  2703. + ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
  2704. +out:
  2705. + return ret;
  2706. +}
  2707. +
  2708. +static struct shrinker zcache_shrinker = {
  2709. + .shrink = shrink_zcache_memory,
  2710. + .seeks = DEFAULT_SEEKS,
  2711. +};
  2712. +
  2713. +/*
  2714. + * zcache shims between cleancache/frontswap ops and tmem
  2715. + */
  2716. +
  2717. +static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
  2718. + uint32_t index, struct page *page)
  2719. +{
  2720. + struct tmem_pool *pool;
  2721. + int ret = -1;
  2722. +
  2723. + BUG_ON(!irqs_disabled());
  2724. + pool = zcache_get_pool_by_id(pool_id);
  2725. + if (unlikely(pool == NULL))
  2726. + goto out;
  2727. + if (!zcache_freeze && zcache_do_preload(pool) == 0) {
  2728. + /* preload does preempt_disable on success */
  2729. + ret = tmem_put(pool, oidp, index, page);
  2730. + if (ret < 0) {
  2731. + if (is_ephemeral(pool))
  2732. + zcache_failed_eph_puts++;
  2733. + else
  2734. + zcache_failed_pers_puts++;
  2735. + }
  2736. + zcache_put_pool(pool);
  2737. + preempt_enable_no_resched();
  2738. + } else {
  2739. + zcache_put_to_flush++;
  2740. + if (atomic_read(&pool->obj_count) > 0)
  2741. + /* the put fails whether the flush succeeds or not */
  2742. + (void)tmem_flush_page(pool, oidp, index);
  2743. + zcache_put_pool(pool);
  2744. + }
  2745. +out:
  2746. + return ret;
  2747. +}
  2748. +
  2749. +static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
  2750. + uint32_t index, struct page *page)
  2751. +{
  2752. + struct tmem_pool *pool;
  2753. + int ret = -1;
  2754. + unsigned long flags;
  2755. +
  2756. + local_irq_save(flags);
  2757. + pool = zcache_get_pool_by_id(pool_id);
  2758. + if (likely(pool != NULL)) {
  2759. + if (atomic_read(&pool->obj_count) > 0)
  2760. + ret = tmem_get(pool, oidp, index, page);
  2761. + zcache_put_pool(pool);
  2762. + }
  2763. + local_irq_restore(flags);
  2764. + return ret;
  2765. +}
  2766. +
  2767. +static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
  2768. +{
  2769. + struct tmem_pool *pool;
  2770. + int ret = -1;
  2771. + unsigned long flags;
  2772. +
  2773. + local_irq_save(flags);
  2774. + zcache_flush_total++;
  2775. + pool = zcache_get_pool_by_id(pool_id);
  2776. + if (likely(pool != NULL)) {
  2777. + if (atomic_read(&pool->obj_count) > 0)
  2778. + ret = tmem_flush_page(pool, oidp, index);
  2779. + zcache_put_pool(pool);
  2780. + }
  2781. + if (ret >= 0)
  2782. + zcache_flush_found++;
  2783. + local_irq_restore(flags);
  2784. + return ret;
  2785. +}
  2786. +
  2787. +static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
  2788. +{
  2789. + struct tmem_pool *pool;
  2790. + int ret = -1;
  2791. + unsigned long flags;
  2792. +
  2793. + local_irq_save(flags);
  2794. + zcache_flobj_total++;
  2795. + pool = zcache_get_pool_by_id(pool_id);
  2796. + if (likely(pool != NULL)) {
  2797. + if (atomic_read(&pool->obj_count) > 0)
  2798. + ret = tmem_flush_object(pool, oidp);
  2799. + zcache_put_pool(pool);
  2800. + }
  2801. + if (ret >= 0)
  2802. + zcache_flobj_found++;
  2803. + local_irq_restore(flags);
  2804. + return ret;
  2805. +}
  2806. +
  2807. +static int zcache_destroy_pool(int pool_id)
  2808. +{
  2809. + struct tmem_pool *pool = NULL;
  2810. + int ret = -1;
  2811. +
  2812. + if (pool_id < 0)
  2813. + goto out;
  2814. + pool = zcache_client.tmem_pools[pool_id];
  2815. + if (pool == NULL)
  2816. + goto out;
  2817. + zcache_client.tmem_pools[pool_id] = NULL;
  2818. + /* wait for pool activity on other cpus to quiesce */
  2819. + while (atomic_read(&pool->refcount) != 0)
  2820. + ;
  2821. + local_bh_disable();
  2822. + ret = tmem_destroy_pool(pool);
  2823. + local_bh_enable();
  2824. + kfree(pool);
  2825. + pr_info("zcache: destroyed pool id=%d\n", pool_id);
  2826. +out:
  2827. + return ret;
  2828. +}
  2829. +
  2830. +static int zcache_new_pool(uint32_t flags)
  2831. +{
  2832. + int poolid = -1;
  2833. + struct tmem_pool *pool;
  2834. +
  2835. + pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
  2836. + if (pool == NULL) {
  2837. + pr_info("zcache: pool creation failed: out of memory\n");
  2838. + goto out;
  2839. + }
  2840. +
  2841. + for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
  2842. + if (zcache_client.tmem_pools[poolid] == NULL)
  2843. + break;
  2844. + if (poolid >= MAX_POOLS_PER_CLIENT) {
  2845. + pr_info("zcache: pool creation failed: max exceeded\n");
  2846. + kfree(pool);
  2847. + poolid = -1;
  2848. + goto out;
  2849. + }
  2850. + atomic_set(&pool->refcount, 0);
  2851. + pool->client = &zcache_client;
  2852. + pool->pool_id = poolid;
  2853. + tmem_new_pool(pool, flags);
  2854. + zcache_client.tmem_pools[poolid] = pool;
  2855. + pr_info("zcache: created %s tmem pool, id=%d\n",
  2856. + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
  2857. + poolid);
  2858. +out:
  2859. + return poolid;
  2860. +}
  2861. +
  2862. +/**********
  2863. + * Two kernel functionalities can currently be layered on top of tmem:
  2864. + * "cleancache", which is used as a second-chance cache for clean
  2865. + * page cache pages, and "frontswap", which is used for swap pages
  2866. + * to avoid writes to disk.  A generic "shim" is provided here for each
  2867. + * to translate in-kernel semantics to zcache semantics.
  2868. + */
  2869. +
  2870. +#ifdef CONFIG_CLEANCACHE
  2871. +static void zcache_cleancache_put_page(int pool_id,
  2872. + struct cleancache_filekey key,
  2873. + pgoff_t index, struct page *page)
  2874. +{
  2875. + u32 ind = (u32) index;
  2876. + struct tmem_oid oid = *(struct tmem_oid *)&key;
  2877. +
  2878. + if (likely(ind == index))
  2879. + (void)zcache_put_page(pool_id, &oid, index, page);
  2880. +}
  2881. +
  2882. +static int zcache_cleancache_get_page(int pool_id,
  2883. + struct cleancache_filekey key,
  2884. + pgoff_t index, struct page *page)
  2885. +{
  2886. + u32 ind = (u32) index;
  2887. + struct tmem_oid oid = *(struct tmem_oid *)&key;
  2888. + int ret = -1;
  2889. +
  2890. + if (likely(ind == index))
  2891. + ret = zcache_get_page(pool_id, &oid, index, page);
  2892. + return ret;
  2893. +}
  2894. +
  2895. +static void zcache_cleancache_flush_page(int pool_id,
  2896. + struct cleancache_filekey key,
  2897. + pgoff_t index)
  2898. +{
  2899. + u32 ind = (u32) index;
  2900. + struct tmem_oid oid = *(struct tmem_oid *)&key;
  2901. +
  2902. + if (likely(ind == index))
  2903. + (void)zcache_flush_page(pool_id, &oid, ind);
  2904. +}
  2905. +
  2906. +static void zcache_cleancache_flush_inode(int pool_id,
  2907. + struct cleancache_filekey key)
  2908. +{
  2909. + struct tmem_oid oid = *(struct tmem_oid *)&key;
  2910. +
  2911. + (void)zcache_flush_object(pool_id, &oid);
  2912. +}
  2913. +
  2914. +static void zcache_cleancache_flush_fs(int pool_id)
  2915. +{
  2916. + if (pool_id >= 0)
  2917. + (void)zcache_destroy_pool(pool_id);
  2918. +}
  2919. +
  2920. +static int zcache_cleancache_init_fs(size_t pagesize)
  2921. +{
  2922. + BUG_ON(sizeof(struct cleancache_filekey) !=
  2923. + sizeof(struct tmem_oid));
  2924. + BUG_ON(pagesize != PAGE_SIZE);
  2925. + return zcache_new_pool(0);
  2926. +}
  2927. +
  2928. +static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
  2929. +{
  2930. + /* shared pools are unsupported and map to private */
  2931. + BUG_ON(sizeof(struct cleancache_filekey) !=
  2932. + sizeof(struct tmem_oid));
  2933. + BUG_ON(pagesize != PAGE_SIZE);
  2934. + return zcache_new_pool(0);
  2935. +}
  2936. +
  2937. +static struct cleancache_ops zcache_cleancache_ops = {
  2938. + .put_page = zcache_cleancache_put_page,
  2939. + .get_page = zcache_cleancache_get_page,
  2940. + .flush_page = zcache_cleancache_flush_page,
  2941. + .flush_inode = zcache_cleancache_flush_inode,
  2942. + .flush_fs = zcache_cleancache_flush_fs,
  2943. + .init_shared_fs = zcache_cleancache_init_shared_fs,
  2944. + .init_fs = zcache_cleancache_init_fs
  2945. +};
  2946. +
  2947. +struct cleancache_ops zcache_cleancache_register_ops(void)
  2948. +{
  2949. + struct cleancache_ops old_ops =
  2950. + cleancache_register_ops(&zcache_cleancache_ops);
  2951. +
  2952. + return old_ops;
  2953. +}
  2954. +#endif
  2955. +
  2956. +#ifdef CONFIG_FRONTSWAP
  2957. +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
  2958. +static int zcache_frontswap_poolid = -1;
  2959. +
  2960. +/*
  2961. + * Swizzling increases objects per swaptype, increasing tmem concurrency
  2962. + * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
  2963. + */
  2964. +#define SWIZ_BITS 4
  2965. +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
  2966. +#define _oswiz(_type, _ind) (((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
  2967. +#define iswiz(_ind) ((_ind) >> SWIZ_BITS)
  2968. +
  2969. +static inline struct tmem_oid oswiz(unsigned type, u32 ind)
  2970. +{
  2971. + struct tmem_oid oid = { .oid = { 0 } };
  2972. + oid.oid[0] = _oswiz(type, ind);
  2973. + return oid;
  2974. +}
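A worked example of the swizzle (values chosen purely for illustration): with SWIZ_BITS = 4, a frontswap page of type 1 at offset 0x12345 is stored under oid.oid[0] = (1 << 4) | (0x12345 & 0xf) = 0x15 at tmem index 0x12345 >> 4 = 0x1234, so the low four offset bits spread consecutive pages across 16 tmem objects per swap type. The same arithmetic as a standalone userspace check:

    /* Illustrative sanity check of the swizzle math; not part of the patch. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            unsigned type = 1;
            uint32_t ind = 0x12345;

            assert(((type << 4) | (ind & 0xf)) == 0x15);  /* _oswiz() */
            assert((ind >> 4) == 0x1234);                 /* iswiz()  */
            return 0;
    }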
  2975. +
  2976. +static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
  2977. + struct page *page)
  2978. +{
  2979. + u64 ind64 = (u64)offset;
  2980. + u32 ind = (u32)offset;
  2981. + struct tmem_oid oid = oswiz(type, ind);
  2982. + int ret = -1;
  2983. + unsigned long flags;
  2984. +
  2985. + BUG_ON(!PageLocked(page));
  2986. + if (likely(ind64 == ind)) {
  2987. + local_irq_save(flags);
  2988. + ret = zcache_put_page(zcache_frontswap_poolid, &oid,
  2989. + iswiz(ind), page);
  2990. + local_irq_restore(flags);
  2991. + }
  2992. + return ret;
  2993. +}
  2994. +
  2995. +/* returns 0 if the page was successfully gotten from frontswap, -1 if
  2996. + * it was not present (should never happen!) */
  2997. +static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
  2998. + struct page *page)
  2999. +{
  3000. + u64 ind64 = (u64)offset;
  3001. + u32 ind = (u32)offset;
  3002. + struct tmem_oid oid = oswiz(type, ind);
  3003. + int ret = -1;
  3004. +
  3005. + BUG_ON(!PageLocked(page));
  3006. + if (likely(ind64 == ind))
  3007. + ret = zcache_get_page(zcache_frontswap_poolid, &oid,
  3008. + iswiz(ind), page);
  3009. + return ret;
  3010. +}
  3011. +
  3012. +/* flush a single page from frontswap */
  3013. +static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
  3014. +{
  3015. + u64 ind64 = (u64)offset;
  3016. + u32 ind = (u32)offset;
  3017. + struct tmem_oid oid = oswiz(type, ind);
  3018. +
  3019. + if (likely(ind64 == ind))
  3020. + (void)zcache_flush_page(zcache_frontswap_poolid, &oid,
  3021. + iswiz(ind));
  3022. +}
  3023. +
  3024. +/* flush all pages from the passed swaptype */
  3025. +static void zcache_frontswap_flush_area(unsigned type)
  3026. +{
  3027. + struct tmem_oid oid;
  3028. + int ind;
  3029. +
  3030. + for (ind = SWIZ_MASK; ind >= 0; ind--) {
  3031. + oid = oswiz(type, ind);
  3032. + (void)zcache_flush_object(zcache_frontswap_poolid, &oid);
  3033. + }
  3034. +}
  3035. +
  3036. +static void zcache_frontswap_init(unsigned ignored)
  3037. +{
  3038. + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
  3039. + if (zcache_frontswap_poolid < 0)
  3040. + zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
  3041. +}
  3042. +
  3043. +static struct frontswap_ops zcache_frontswap_ops = {
  3044. + .put_page = zcache_frontswap_put_page,
  3045. + .get_page = zcache_frontswap_get_page,
  3046. + .flush_page = zcache_frontswap_flush_page,
  3047. + .flush_area = zcache_frontswap_flush_area,
  3048. + .init = zcache_frontswap_init
  3049. +};
  3050. +
  3051. +struct frontswap_ops zcache_frontswap_register_ops(void)
  3052. +{
  3053. + struct frontswap_ops old_ops =
  3054. + frontswap_register_ops(&zcache_frontswap_ops);
  3055. +
  3056. + return old_ops;
  3057. +}
  3058. +#endif
  3059. +
  3060. +/*
  3061. + * zcache initialization
  3062. + * NOTE: FOR NOW, zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
  3063. + * NOTHING HAPPENS!
  3064. + */
  3065. +
  3066. +static int zcache_enabled;
  3067. +
  3068. +static int __init enable_zcache(char *s)
  3069. +{
  3070. + zcache_enabled = 1;
  3071. + return 1;
  3072. +}
  3073. +__setup("zcache", enable_zcache);
  3074. +
  3075. +/* allow independent boot-time disabling of cleancache and frontswap */
  3076. +
  3077. +static int use_cleancache = 1;
  3078. +
  3079. +static int __init no_cleancache(char *s)
  3080. +{
  3081. + use_cleancache = 0;
  3082. + return 1;
  3083. +}
  3084. +
  3085. +__setup("nocleancache", no_cleancache);
  3086. +
  3087. +static int use_frontswap = 1;
  3088. +
  3089. +static int __init no_frontswap(char *s)
  3090. +{
  3091. + use_frontswap = 0;
  3092. + return 1;
  3093. +}
  3094. +
  3095. +__setup("nofrontswap", no_frontswap);
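Taken together, the __setup() handlers above mean the feature stays inert unless "zcache" appears on the kernel command line, and cleancache or frontswap can then be opted out individually. An illustrative boot entry (paths are examples only):

    kernel /boot/vmlinuz-2.6.37-zcache root=/dev/sda1 zcache nofrontswap

which enables zcache with the cleancache shim while leaving swap pages on disk.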
  3096. +
  3097. +static int __init zcache_init(void)
  3098. +{
  3099. + int ret = 0;
  3100. +
  3101. +#ifdef CONFIG_SYSFS
  3102. + ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
  3103. + if (ret) {
  3104. + pr_err("zcache: can't create sysfs\n");
  3105. + goto out;
  3106. + }
  3107. +#endif /* CONFIG_SYSFS */
  3108. +#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
  3109. + if (zcache_enabled) {
  3110. + unsigned int cpu;
  3111. +
  3112. + tmem_register_hostops(&zcache_hostops);
  3113. + tmem_register_pamops(&zcache_pamops);
  3114. + ret = register_cpu_notifier(&zcache_cpu_notifier_block);
  3115. + if (ret) {
  3116. + pr_err("zcache: can't register cpu notifier\n");
  3117. + goto out;
  3118. + }
  3119. + for_each_online_cpu(cpu) {
  3120. + void *pcpu = (void *)(long)cpu;
  3121. + zcache_cpu_notifier(&zcache_cpu_notifier_block,
  3122. + CPU_UP_PREPARE, pcpu);
  3123. + }
  3124. + }
  3125. + zcache_objnode_cache = kmem_cache_create("zcache_objnode",
  3126. + sizeof(struct tmem_objnode), 0, 0, NULL);
  3127. + zcache_obj_cache = kmem_cache_create("zcache_obj",
  3128. + sizeof(struct tmem_obj), 0, 0, NULL);
  3129. +#endif
  3130. +#ifdef CONFIG_CLEANCACHE
  3131. + if (zcache_enabled && use_cleancache) {
  3132. + struct cleancache_ops old_ops;
  3133. +
  3134. + zbud_init();
  3135. + register_shrinker(&zcache_shrinker);
  3136. + old_ops = zcache_cleancache_register_ops();
  3137. + pr_info("zcache: cleancache enabled using kernel "
  3138. + "transcendent memory and compression buddies\n");
  3139. + if (old_ops.init_fs != NULL)
  3140. + pr_warning("zcache: cleancache_ops overridden\n");
  3141. + }
  3142. +#endif
  3143. +#ifdef CONFIG_FRONTSWAP
  3144. + if (zcache_enabled && use_frontswap) {
  3145. + struct frontswap_ops old_ops;
  3146. +
  3147. + zcache_client.xvpool = xv_create_pool();
  3148. + if (zcache_client.xvpool == NULL) {
  3149. + pr_err("zcache: can't create xvpool\n");
  3150. + goto out;
  3151. + }
  3152. + old_ops = zcache_frontswap_register_ops();
  3153. + pr_info("zcache: frontswap enabled using kernel "
  3154. + "transcendent memory and xvmalloc\n");
  3155. + if (old_ops.init != NULL)
  3156. + pr_warning("zcache: frontswap_ops overridden\n");
  3157. + }
  3158. +#endif
  3159. +out:
  3160. + return ret;
  3161. +}
  3162. +
  3163. +module_init(zcache_init)
  3164. diff -Nrupad linux-2.6.37//drivers/staging/zram/Kconfig linux-2.6.37_vanilla//drivers/staging/zram/Kconfig
  3165. --- linux-2.6.37//drivers/staging/zram/Kconfig 2011-01-05 01:50:19.000000000 +0100
  3166. +++ linux-2.6.37_vanilla//drivers/staging/zram/Kconfig 2011-02-14 01:22:46.470793204 +0100
  3167. @@ -15,3 +15,11 @@ config ZRAM
  3168.  
  3169. See zram.txt for more information.
  3170. Project home: http://compcache.googlecode.com/
  3171. +
  3172. +config ZRAM_DEBUG
  3173. + bool "Compressed RAM block device debug support"
  3174. + depends on ZRAM
  3175. + default n
  3176. + help
  3177. + This option adds additional debugging code to the compressed
  3178. + RAM block device driver.
  3179. diff -Nrupad linux-2.6.37//drivers/staging/zram/xvmalloc.c linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc.c
  3180. --- linux-2.6.37//drivers/staging/zram/xvmalloc.c 2011-01-05 01:50:19.000000000 +0100
  3181. +++ linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc.c 2011-02-14 01:24:56.564792988 +0100
  3182. @@ -10,6 +10,10 @@
  3183. * Released under the terms of GNU General Public License Version 2.0
  3184. */
  3185.  
  3186. +#ifdef CONFIG_ZRAM_DEBUG
  3187. +#define DEBUG
  3188. +#endif
  3189. +
  3190. #include <linux/bitops.h>
  3191. #include <linux/errno.h>
  3192. #include <linux/highmem.h>
  3193. @@ -187,7 +191,7 @@ static void insert_block(struct xv_pool
  3194. slindex = get_index_for_insert(block->size);
  3195. flindex = slindex / BITS_PER_LONG;
  3196.  
  3197. - block->link.prev_page = 0;
  3198. + block->link.prev_page = NULL;
  3199. block->link.prev_offset = 0;
  3200. block->link.next_page = pool->freelist[slindex].page;
  3201. block->link.next_offset = pool->freelist[slindex].offset;
  3202. @@ -200,6 +204,8 @@ static void insert_block(struct xv_pool
  3203. nextblock->link.prev_page = page;
  3204. nextblock->link.prev_offset = offset;
  3205. put_ptr_atomic(nextblock, KM_USER1);
  3206. + /* If there was a next page then the free bits are set. */
  3207. + return;
  3208. }
  3209.  
  3210. __set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
  3211. @@ -207,54 +213,14 @@ static void insert_block(struct xv_pool
  3212. }
  3213.  
  3214. /*
  3215. - * Remove block from head of freelist. Index 'slindex' identifies the freelist.
  3216. - */
  3217. -static void remove_block_head(struct xv_pool *pool,
  3218. - struct block_header *block, u32 slindex)
  3219. -{
  3220. - struct block_header *tmpblock;
  3221. - u32 flindex = slindex / BITS_PER_LONG;
  3222. -
  3223. - pool->freelist[slindex].page = block->link.next_page;
  3224. - pool->freelist[slindex].offset = block->link.next_offset;
  3225. - block->link.prev_page = 0;
  3226. - block->link.prev_offset = 0;
  3227. -
  3228. - if (!pool->freelist[slindex].page) {
  3229. - __clear_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
  3230. - if (!pool->slbitmap[flindex])
  3231. - __clear_bit(flindex, &pool->flbitmap);
  3232. - } else {
  3233. - /*
  3234. - * DEBUG ONLY: We need not reinitialize freelist head previous
  3235. - * pointer to 0 - we never depend on its value. But just for
  3236. - * sanity, lets do it.
  3237. - */
  3238. - tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
  3239. - pool->freelist[slindex].offset, KM_USER1);
  3240. - tmpblock->link.prev_page = 0;
  3241. - tmpblock->link.prev_offset = 0;
  3242. - put_ptr_atomic(tmpblock, KM_USER1);
  3243. - }
  3244. -}
  3245. -
  3246. -/*
  3247. * Remove block from freelist. Index 'slindex' identifies the freelist.
  3248. */
  3249. static void remove_block(struct xv_pool *pool, struct page *page, u32 offset,
  3250. struct block_header *block, u32 slindex)
  3251. {
  3252. - u32 flindex;
  3253. + u32 flindex = slindex / BITS_PER_LONG;
  3254. struct block_header *tmpblock;
  3255.  
  3256. - if (pool->freelist[slindex].page == page
  3257. - && pool->freelist[slindex].offset == offset) {
  3258. - remove_block_head(pool, block, slindex);
  3259. - return;
  3260. - }
  3261. -
  3262. - flindex = slindex / BITS_PER_LONG;
  3263. -
  3264. if (block->link.prev_page) {
  3265. tmpblock = get_ptr_atomic(block->link.prev_page,
  3266. block->link.prev_offset, KM_USER1);
  3267. @@ -270,6 +236,35 @@ static void remove_block(struct xv_pool
  3268. tmpblock->link.prev_offset = block->link.prev_offset;
  3269. put_ptr_atomic(tmpblock, KM_USER1);
  3270. }
  3271. +
  3272. + /* Is this block at the head of the freelist? */
  3273. + if (pool->freelist[slindex].page == page
  3274. + && pool->freelist[slindex].offset == offset) {
  3275. +
  3276. + pool->freelist[slindex].page = block->link.next_page;
  3277. + pool->freelist[slindex].offset = block->link.next_offset;
  3278. +
  3279. + if (pool->freelist[slindex].page) {
  3280. + struct block_header *tmpblock;
  3281. + tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
  3282. + pool->freelist[slindex].offset,
  3283. + KM_USER1);
  3284. + tmpblock->link.prev_page = NULL;
  3285. + tmpblock->link.prev_offset = 0;
  3286. + put_ptr_atomic(tmpblock, KM_USER1);
  3287. + } else {
  3288. + /* This freelist bucket is empty */
  3289. + __clear_bit(slindex % BITS_PER_LONG,
  3290. + &pool->slbitmap[flindex]);
  3291. + if (!pool->slbitmap[flindex])
  3292. + __clear_bit(flindex, &pool->flbitmap);
  3293. + }
  3294. + }
  3295. +
  3296. + block->link.prev_page = NULL;
  3297. + block->link.prev_offset = 0;
  3298. + block->link.next_page = NULL;
  3299. + block->link.next_offset = 0;
  3300. }
  3301.  
  3302. /*
  3303. @@ -378,7 +373,7 @@ int xv_malloc(struct xv_pool *pool, u32
  3304.  
  3305. block = get_ptr_atomic(*page, *offset, KM_USER0);
  3306.  
  3307. - remove_block_head(pool, block, index);
  3308. + remove_block(pool, *page, *offset, block, index);
  3309.  
  3310. /* Split the block if required */
  3311. tmpoffset = *offset + size + XV_ALIGN;
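Net effect of the two hunks above: the special-cased remove_block_head() is folded into remove_block(), which now unlinks the block from its neighbours first, then fixes up the freelist head (clearing the size-class bitmap bits when a bucket empties), and finally clears the removed block's own link fields so no stale page pointers survive; xv_malloc() correspondingly goes through the single remove_block() path.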
  3312. diff -Nrupad linux-2.6.37//drivers/staging/zram/xvmalloc_int.h linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc_int.h
  3313. --- linux-2.6.37//drivers/staging/zram/xvmalloc_int.h 2011-01-05 01:50:19.000000000 +0100
  3314. +++ linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc_int.h 2011-02-14 01:22:50.990793071 +0100
  3315. @@ -19,7 +19,11 @@
  3316. /* User configurable params */
  3317.  
  3318. /* Must be power of two */
  3319. +#ifdef CONFIG_64BIT
  3320. +#define XV_ALIGN_SHIFT 3
  3321. +#else
  3322. #define XV_ALIGN_SHIFT 2
  3323. +#endif
  3324. #define XV_ALIGN (1 << XV_ALIGN_SHIFT)
  3325. #define XV_ALIGN_MASK (XV_ALIGN - 1)
  3326.  
  3327. @@ -27,8 +31,16 @@
  3328. #define XV_MIN_ALLOC_SIZE 32
  3329. #define XV_MAX_ALLOC_SIZE (PAGE_SIZE - XV_ALIGN)
  3330.  
  3331. -/* Free lists are separated by FL_DELTA bytes */
  3332. -#define FL_DELTA_SHIFT 3
  3333. +/*
  3334. + * Free lists are separated by FL_DELTA bytes.
  3335. + * The shift is 3 for 4k pages and 4 for 64k pages; for any
  3336. + * other page size, a conservative (PAGE_SHIFT - 9) is used.
  3337. + */
  3338. +#if PAGE_SHIFT == 16
  3339. +#define FL_DELTA_SHIFT 4
  3340. +#else
  3341. +#define FL_DELTA_SHIFT (PAGE_SHIFT - 9)
  3342. +#endif
  3343. #define FL_DELTA (1 << FL_DELTA_SHIFT)
  3344. #define FL_DELTA_MASK (FL_DELTA - 1)
  3345. #define NUM_FREE_LISTS ((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \
  3346. @@ -75,12 +87,9 @@ struct block_header {
  3347. struct xv_pool {
  3348. ulong flbitmap;
  3349. ulong slbitmap[MAX_FLI];
  3350. - spinlock_t lock;
  3351. -
  3352. + u64 total_pages; /* stats */
  3353. struct freelist_entry freelist[NUM_FREE_LISTS];
  3354. -
  3355. - /* stats */
  3356. - u64 total_pages;
  3357. + spinlock_t lock;
  3358. };
  3359.  
  3360. #endif
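Working the constants above through for the common case (assuming 4k pages on 64-bit, and the usual "/ FL_DELTA + 1" continuation of the NUM_FREE_LISTS macro): XV_ALIGN_SHIFT = 3 gives XV_ALIGN = 8 and XV_MAX_ALLOC_SIZE = 4096 - 8 = 4088; FL_DELTA_SHIFT = PAGE_SHIFT - 9 = 3 gives FL_DELTA = 8; so NUM_FREE_LISTS = (4088 - 32) / 8 + 1 = 508 size classes spaced 8 bytes apart. The 64k special case pins the spacing at 16 bytes rather than the 128 bytes the generic formula would produce.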
  3361. diff -Nrupad linux-2.6.37//drivers/staging/zram/zram_drv.c linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.c
  3362. --- linux-2.6.37//drivers/staging/zram/zram_drv.c 2011-01-05 01:50:19.000000000 +0100
  3363. +++ linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.c 2011-02-14 01:24:29.924793006 +0100
  3364. @@ -15,6 +15,10 @@
  3365. #define KMSG_COMPONENT "zram"
  3366. #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  3367.  
  3368. +#ifdef CONFIG_ZRAM_DEBUG
  3369. +#define DEBUG
  3370. +#endif
  3371. +
  3372. #include <linux/module.h>
  3373. #include <linux/kernel.h>
  3374. #include <linux/bio.h>
  3375. @@ -227,6 +231,7 @@ static int zram_read(struct zram *zram,
  3376.  
  3377. if (zram_test_flag(zram, index, ZRAM_ZERO)) {
  3378. handle_zero_page(page);
  3379. + index++;
  3380. continue;
  3381. }
  3382.  
  3383. @@ -234,13 +239,15 @@ static int zram_read(struct zram *zram,
  3384. if (unlikely(!zram->table[index].page)) {
  3385. pr_debug("Read before write: sector=%lu, size=%u",
  3386. (ulong)(bio->bi_sector), bio->bi_size);
  3387. - /* Do nothing */
  3388. + handle_zero_page(page);
  3389. + index++;
  3390. continue;
  3391. }
  3392.  
  3393. /* Page is stored uncompressed since it's incompressible */
  3394. if (unlikely(zram_test_flag(zram, index, ZRAM_UNCOMPRESSED))) {
  3395. handle_uncompressed_page(zram, page, index);
  3396. + index++;
  3397. continue;
  3398. }
  3399.  
  3400. @@ -320,6 +327,7 @@ static int zram_write(struct zram *zram,
  3401. mutex_unlock(&zram->lock);
  3402. zram_stat_inc(&zram->stats.pages_zero);
  3403. zram_set_flag(zram, index, ZRAM_ZERO);
  3404. + index++;
  3405. continue;
  3406. }
  3407.  
  3408. @@ -621,7 +629,8 @@ static int create_device(struct zram *zr
  3409. * and n*PAGE_SIZED sized I/O requests.
  3410. */
  3411. blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
  3412. - blk_queue_logical_block_size(zram->disk->queue, PAGE_SIZE);
  3413. + blk_queue_logical_block_size(zram->disk->queue,
  3414. + ZRAM_LOGICAL_BLOCK_SIZE);
  3415. blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
  3416. blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
  3417.  
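Rationale for the logical-block-size hunk above (inferred from the surrounding queue setup, not stated in the patch): the logical block size is what filesystems and partitioning tools must align to, so pinning it at 4096 keeps a zram device usable on architectures with larger pages (e.g. 64k), while the physical-block/io_min/io_opt hints continue to advertise PAGE_SIZE I/O as optimal.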
  3418. diff -Nrupad linux-2.6.37//drivers/staging/zram/zram_drv.h linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.h
  3419. --- linux-2.6.37//drivers/staging/zram/zram_drv.h 2011-01-05 01:50:19.000000000 +0100
  3420. +++ linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.h 2011-02-14 01:22:38.055793098 +0100
  3421. @@ -61,6 +61,7 @@ static const unsigned max_zpage_size = P
  3422. #define SECTOR_SIZE (1 << SECTOR_SHIFT)
  3423. #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
  3424. #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT)
  3425. +#define ZRAM_LOGICAL_BLOCK_SIZE 4096
  3426.  
  3427. /* Flags for zram pages (table[page_no].flags) */
  3428. enum zram_pageflags {
  3429. diff -Nrupad linux-2.6.37//fs/btrfs/extent_io.c linux-2.6.37_vanilla//fs/btrfs/extent_io.c
  3430. --- linux-2.6.37//fs/btrfs/extent_io.c 2011-01-05 01:50:19.000000000 +0100
  3431. +++ linux-2.6.37_vanilla//fs/btrfs/extent_io.c 2011-02-14 01:21:43.164793068 +0100
  3432. @@ -10,6 +10,7 @@
  3433. #include <linux/swap.h>
  3434. #include <linux/writeback.h>
  3435. #include <linux/pagevec.h>
  3436. +#include <linux/cleancache.h>
  3437. #include "extent_io.h"
  3438. #include "extent_map.h"
  3439. #include "compat.h"
  3440. @@ -1981,6 +1982,13 @@ static int __extent_read_full_page(struc
  3441.  
  3442. set_page_extent_mapped(page);
  3443.  
  3444. + if (!PageUptodate(page)) {
  3445. + if (cleancache_get_page(page) == 0) {
  3446. + BUG_ON(blocksize != PAGE_SIZE);
  3447. + goto out;
  3448. + }
  3449. + }
  3450. +
  3451. end = page_end;
  3452. while (1) {
  3453. lock_extent(tree, start, end, GFP_NOFS);
  3454. @@ -2105,6 +2113,7 @@ static int __extent_read_full_page(struc
  3455. cur = cur + iosize;
  3456. page_offset += iosize;
  3457. }
  3458. +out:
  3459. if (!nr) {
  3460. if (!PageError(page))
  3461. SetPageUptodate(page);
  3462. diff -Nrupad linux-2.6.37//fs/btrfs/super.c linux-2.6.37_vanilla//fs/btrfs/super.c
  3463. --- linux-2.6.37//fs/btrfs/super.c 2011-01-05 01:50:19.000000000 +0100
  3464. +++ linux-2.6.37_vanilla//fs/btrfs/super.c 2011-02-14 01:21:43.164793068 +0100
  3465. @@ -39,6 +39,7 @@
  3466. #include <linux/miscdevice.h>
  3467. #include <linux/magic.h>
  3468. #include <linux/slab.h>
  3469. +#include <linux/cleancache.h>
  3470. #include "compat.h"
  3471. #include "ctree.h"
  3472. #include "disk-io.h"
  3473. @@ -494,6 +495,7 @@ static int btrfs_fill_super(struct super
  3474. sb->s_root = root_dentry;
  3475.  
  3476. save_mount_options(sb, data);
  3477. + cleancache_init_fs(sb);
  3478. return 0;
  3479.  
  3480. fail_close:
  3481. diff -Nrupad linux-2.6.37//fs/buffer.c linux-2.6.37_vanilla//fs/buffer.c
  3482. --- linux-2.6.37//fs/buffer.c 2011-01-05 01:50:19.000000000 +0100
  3483. +++ linux-2.6.37_vanilla//fs/buffer.c 2011-02-14 01:21:43.165793086 +0100
  3484. @@ -41,6 +41,7 @@
  3485. #include <linux/bitops.h>
  3486. #include <linux/mpage.h>
  3487. #include <linux/bit_spinlock.h>
  3488. +#include <linux/cleancache.h>
  3489.  
  3490. static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  3491.  
  3492. @@ -277,6 +278,10 @@ void invalidate_bdev(struct block_device
  3493. invalidate_bh_lrus();
  3494. lru_add_drain_all(); /* make sure all lru add caches are flushed */
  3495. invalidate_mapping_pages(mapping, 0, -1);
  3496. + /* 99% of the time, we don't need to flush the cleancache on the bdev.
  3498. + * But, for the strange corners, let's be cautious.
  3498. + */
  3499. + cleancache_flush_inode(mapping);
  3500. }
  3501. EXPORT_SYMBOL(invalidate_bdev);
  3502.  
  3503. diff -Nrupad linux-2.6.37//fs/ext3/super.c linux-2.6.37_vanilla//fs/ext3/super.c
  3504. --- linux-2.6.37//fs/ext3/super.c 2011-01-05 01:50:19.000000000 +0100
  3505. +++ linux-2.6.37_vanilla//fs/ext3/super.c 2011-02-14 01:21:43.166793102 +0100
  3506. @@ -36,6 +36,7 @@
  3507. #include <linux/quotaops.h>
  3508. #include <linux/seq_file.h>
  3509. #include <linux/log2.h>
  3510. +#include <linux/cleancache.h>
  3511.  
  3512. #include <asm/uaccess.h>
  3513.  
  3514. @@ -1343,6 +1344,7 @@ static int ext3_setup_super(struct super
  3515. } else {
  3516. ext3_msg(sb, KERN_INFO, "using internal journal");
  3517. }
  3518. + cleancache_init_fs(sb);
  3519. return res;
  3520. }
  3521.  
  3522. diff -Nrupad linux-2.6.37//fs/ext4/super.c linux-2.6.37_vanilla//fs/ext4/super.c
  3523. --- linux-2.6.37//fs/ext4/super.c 2011-01-05 01:50:19.000000000 +0100
  3524. +++ linux-2.6.37_vanilla//fs/ext4/super.c 2011-02-14 01:21:43.168793127 +0100
  3525. @@ -38,6 +38,7 @@
  3526. #include <linux/ctype.h>
  3527. #include <linux/log2.h>
  3528. #include <linux/crc16.h>
  3529. +#include <linux/cleancache.h>
  3530. #include <asm/uaccess.h>
  3531.  
  3532. #include <linux/kthread.h>
  3533. @@ -1902,6 +1903,7 @@ static int ext4_setup_super(struct super
  3534. EXT4_INODES_PER_GROUP(sb),
  3535. sbi->s_mount_opt);
  3536.  
  3537. + cleancache_init_fs(sb);
  3538. return res;
  3539. }
  3540.  
  3541. diff -Nrupad linux-2.6.37//fs/mpage.c linux-2.6.37_vanilla//fs/mpage.c
  3542. --- linux-2.6.37//fs/mpage.c 2011-01-05 01:50:19.000000000 +0100
  3543. +++ linux-2.6.37_vanilla//fs/mpage.c 2011-02-14 01:21:43.168793127 +0100
  3544. @@ -27,6 +27,7 @@
  3545. #include <linux/writeback.h>
  3546. #include <linux/backing-dev.h>
  3547. #include <linux/pagevec.h>
  3548. +#include <linux/cleancache.h>
  3549.  
  3550. /*
  3551. * I/O completion handler for multipage BIOs.
  3552. @@ -286,6 +287,12 @@ do_mpage_readpage(struct bio *bio, struc
  3553. SetPageMappedToDisk(page);
  3554. }
  3555.  
  3556. + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
  3557. + cleancache_get_page(page) == 0) {
  3558. + SetPageUptodate(page);
  3559. + goto confused;
  3560. + }
  3561. +
  3562. /*
  3563. * This page will go to BIO. Do we need to send this BIO off first?
  3564. */
  3565. diff -Nrupad linux-2.6.37//fs/ocfs2/super.c linux-2.6.37_vanilla//fs/ocfs2/super.c
  3566. --- linux-2.6.37//fs/ocfs2/super.c 2011-01-05 01:50:19.000000000 +0100
  3567. +++ linux-2.6.37_vanilla//fs/ocfs2/super.c 2011-02-14 01:21:43.169793144 +0100
  3568. @@ -41,6 +41,7 @@
  3569. #include <linux/mount.h>
  3570. #include <linux/seq_file.h>
  3571. #include <linux/quotaops.h>
  3572. +#include <linux/cleancache.h>
  3573.  
  3574. #define MLOG_MASK_PREFIX ML_SUPER
  3575. #include <cluster/masklog.h>
  3576. @@ -2366,6 +2367,7 @@ static int ocfs2_initialize_super(struct
  3577. mlog_errno(status);
  3578. goto bail;
  3579. }
  3580. + cleancache_init_shared_fs((char *)&uuid_net_key, sb);
  3581.  
  3582. bail:
  3583. mlog_exit(status);
  3584. diff -Nrupad linux-2.6.37//fs/reiserfs/prints.c linux-2.6.37_vanilla//fs/reiserfs/prints.c
  3585. --- linux-2.6.37//fs/reiserfs/prints.c 2011-01-05 01:50:19.000000000 +0100
  3586. +++ linux-2.6.37_vanilla//fs/reiserfs/prints.c 2011-02-14 01:20:50.468793185 +0100
  3587. @@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh,
  3588. va_list args;
  3589. int mode, first, last;
  3590.  
  3591. - va_start(args, bh);
  3592. -
  3593. if (!bh) {
  3594. printk("print_block: buffer is NULL\n");
  3595. return;
  3596. }
  3597.  
  3598. + va_start(args, bh);
  3599. +
  3600. mode = va_arg(args, int);
  3601. first = va_arg(args, int);
  3602. last = va_arg(args, int);
  3603. diff -Nrupad linux-2.6.37//fs/reiserfs/super.c linux-2.6.37_vanilla//fs/reiserfs/super.c
  3604. --- linux-2.6.37//fs/reiserfs/super.c 2011-01-05 01:50:19.000000000 +0100
  3605. +++ linux-2.6.37_vanilla//fs/reiserfs/super.c 2011-02-14 01:21:07.821793171 +0100
  3606. @@ -237,7 +237,7 @@ static int finish_unfinished(struct supe
  3607. pathrelse(&path);
  3608.  
  3609. inode = reiserfs_iget(s, &obj_key);
  3610. - if (!inode) {
  3611. + if (IS_ERR_OR_NULL(inode)) {
  3612. /* the unlink almost completed, it just did not manage to remove
  3613. "save" link and release objectid */
  3614. reiserfs_warning(s, "vs-2180", "iget failed for %K",
  3615. diff -Nrupad linux-2.6.37//fs/super.c linux-2.6.37_vanilla//fs/super.c
  3616. --- linux-2.6.37//fs/super.c 2011-01-05 01:50:19.000000000 +0100
  3617. +++ linux-2.6.37_vanilla//fs/super.c 2011-02-14 01:21:43.169793144 +0100
  3618. @@ -30,6 +30,7 @@
  3619. #include <linux/idr.h>
  3620. #include <linux/mutex.h>
  3621. #include <linux/backing-dev.h>
  3622. +#include <linux/cleancache.h>
  3623. #include "internal.h"
  3624.  
  3625.  
  3626. @@ -110,6 +111,7 @@ static struct super_block *alloc_super(s
  3627. s->s_maxbytes = MAX_NON_LFS;
  3628. s->s_op = &default_op;
  3629. s->s_time_gran = 1000000000;
  3630. + s->cleancache_poolid = -1;
  3631. }
  3632. out:
  3633. return s;
  3634. @@ -176,6 +178,7 @@ void deactivate_locked_super(struct supe
  3635. struct file_system_type *fs = s->s_type;
  3636. if (atomic_dec_and_test(&s->s_active)) {
  3637. fs->kill_sb(s);
  3638. + cleancache_flush_fs(s);
  3639. put_filesystem(fs);
  3640. put_super(s);
  3641. } else {
  3642. diff -Nrupad linux-2.6.37//include/linux/cleancache.h linux-2.6.37_vanilla//include/linux/cleancache.h
  3643. --- linux-2.6.37//include/linux/cleancache.h 1970-01-01 01:00:00.000000000 +0100
  3644. +++ linux-2.6.37_vanilla//include/linux/cleancache.h 2011-02-14 01:21:43.169793144 +0100
  3645. @@ -0,0 +1,118 @@
  3646. +#ifndef _LINUX_CLEANCACHE_H
  3647. +#define _LINUX_CLEANCACHE_H
  3648. +
  3649. +#include <linux/fs.h>
  3650. +#include <linux/exportfs.h>
  3651. +#include <linux/mm.h>
  3652. +
  3653. +#define CLEANCACHE_KEY_MAX 6
  3654. +
  3655. +/*
  3656. + * cleancache requires every file with a page in cleancache to have a
  3657. + * unique key unless/until the file is removed/truncated. For some
  3658. + * filesystems, the inode number is unique, but for "modern" filesystems
  3659. + * an exportable filehandle is required (see exportfs.h)
  3660. + */
  3661. +struct cleancache_filekey {
  3662. + union {
  3663. + ino_t ino;
  3664. + __u32 fh[CLEANCACHE_KEY_MAX];
  3665. + u32 key[CLEANCACHE_KEY_MAX];
  3666. + } u;
  3667. +};
  3668. +
  3669. +struct cleancache_ops {
  3670. + int (*init_fs)(size_t);
  3671. + int (*init_shared_fs)(char *uuid, size_t);
  3672. + int (*get_page)(int, struct cleancache_filekey,
  3673. + pgoff_t, struct page *);
  3674. + void (*put_page)(int, struct cleancache_filekey,
  3675. + pgoff_t, struct page *);
  3676. + void (*flush_page)(int, struct cleancache_filekey, pgoff_t);
  3677. + void (*flush_inode)(int, struct cleancache_filekey);
  3678. + void (*flush_fs)(int);
  3679. +};
  3680. +
  3681. +extern struct cleancache_ops
  3682. + cleancache_register_ops(struct cleancache_ops *ops);
  3683. +extern void __cleancache_init_fs(struct super_block *);
  3684. +extern void __cleancache_init_shared_fs(char *, struct super_block *);
  3685. +extern int __cleancache_get_page(struct page *);
  3686. +extern void __cleancache_put_page(struct page *);
  3687. +extern void __cleancache_flush_page(struct address_space *, struct page *);
  3688. +extern void __cleancache_flush_inode(struct address_space *);
  3689. +extern void __cleancache_flush_fs(struct super_block *);
  3690. +extern int cleancache_enabled;
  3691. +
  3692. +#ifdef CONFIG_CLEANCACHE
  3693. +#define cleancache_fs_enabled(_page) \
  3694. + ((_page)->mapping->host->i_sb->cleancache_poolid >= 0)
  3695. +#define cleancache_fs_enabled_mapping(_mapping) \
  3696. + ((_mapping)->host->i_sb->cleancache_poolid >= 0)
  3697. +#else
  3698. +#define cleancache_enabled (0)
  3699. +#define cleancache_fs_enabled(_page) (0)
  3700. +#define cleancache_fs_enabled_mapping(_mapping) (0)
  3701. +#endif
  3702. +
  3703. +/*
  3704. + * The shim layer provided by these inline functions allows the compiler
  3705. + * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
  3706. + * is disabled, to a single global variable check if CONFIG_CLEANCACHE
  3707. + * is enabled but no cleancache "backend" has dynamically enabled it,
  3708. + * and, for the most frequent cleancache ops, to a single global variable
  3709. + * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
  3710. + * and a cleancache backend has dynamically enabled cleancache, but the
  3711. + * filesystem referenced by that cleancache op has not enabled cleancache.
  3712. + * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
  3713. + * no measurable performance impact.
  3714. + */
  3715. +
  3716. +static inline void cleancache_init_fs(struct super_block *sb)
  3717. +{
  3718. + if (cleancache_enabled)
  3719. + __cleancache_init_fs(sb);
  3720. +}
  3721. +
  3722. +static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
  3723. +{
  3724. + if (cleancache_enabled)
  3725. + __cleancache_init_shared_fs(uuid, sb);
  3726. +}
  3727. +
  3728. +static inline int cleancache_get_page(struct page *page)
  3729. +{
  3730. + int ret = -1;
  3731. +
  3732. + if (cleancache_enabled && cleancache_fs_enabled(page))
  3733. + ret = __cleancache_get_page(page);
  3734. + return ret;
  3735. +}
  3736. +
  3737. +static inline void cleancache_put_page(struct page *page)
  3738. +{
  3739. + if (cleancache_enabled && cleancache_fs_enabled(page))
  3740. + __cleancache_put_page(page);
  3741. +}
  3742. +
  3743. +static inline void cleancache_flush_page(struct address_space *mapping,
  3744. + struct page *page)
  3745. +{
  3746. + /* careful... page->mapping is NULL sometimes when this is called */
  3747. + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
  3748. + __cleancache_flush_page(mapping, page);
  3749. +}
  3750. +
  3751. +static inline void cleancache_flush_inode(struct address_space *mapping)
  3752. +{
  3753. + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
  3754. + __cleancache_flush_inode(mapping);
  3755. +}
  3756. +
  3757. +static inline void cleancache_flush_fs(struct super_block *sb)
  3758. +{
  3759. + if (cleancache_enabled)
  3760. + __cleancache_flush_fs(sb);
  3761. +}
  3762. +
  3763. +#endif /* _LINUX_CLEANCACHE_H */
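To make the comment above concrete: with CONFIG_CLEANCACHE off, cleancache_enabled is #defined to (0), so a hook such as

    if (cleancache_enabled && cleancache_fs_enabled(page))
            ret = __cleancache_get_page(page);

is constant-false and compiled away entirely; with the config on, each hook costs one global load until a backend registers, after which a single cleancache_poolid comparison decides whether the filesystem at hand has opted in.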
  3764. diff -Nrupad linux-2.6.37//include/linux/frontswap.h linux-2.6.37_vanilla//include/linux/frontswap.h
  3765. --- linux-2.6.37//include/linux/frontswap.h 1970-01-01 01:00:00.000000000 +0100
  3766. +++ linux-2.6.37_vanilla//include/linux/frontswap.h 2011-02-14 01:21:43.169793144 +0100
  3767. @@ -0,0 +1,86 @@
  3768. +#ifndef _LINUX_FRONTSWAP_H
  3769. +#define _LINUX_FRONTSWAP_H
  3770. +
  3771. +#include <linux/swap.h>
  3772. +#include <linux/mm.h>
  3773. +
  3774. +struct frontswap_ops {
  3775. + void (*init)(unsigned);
  3776. + int (*put_page)(unsigned, pgoff_t, struct page *);
  3777. + int (*get_page)(unsigned, pgoff_t, struct page *);
  3778. + void (*flush_page)(unsigned, pgoff_t);
  3779. + void (*flush_area)(unsigned);
  3780. +};
  3781. +
  3782. +extern int frontswap_enabled;
  3783. +extern struct frontswap_ops
  3784. + frontswap_register_ops(struct frontswap_ops *ops);
  3785. +extern void frontswap_shrink(unsigned long);
  3786. +extern unsigned long frontswap_curr_pages(void);
  3787. +
  3788. +extern void frontswap_init(unsigned type);
  3789. +extern int __frontswap_put_page(struct page *page);
  3790. +extern int __frontswap_get_page(struct page *page);
  3791. +extern void __frontswap_flush_page(unsigned, pgoff_t);
  3792. +extern void __frontswap_flush_area(unsigned);
  3793. +
  3794. +#ifndef CONFIG_FRONTSWAP
  3795. +/* all inline routines become no-ops and all externs are ignored */
  3796. +#define frontswap_enabled (0)
  3797. +#endif
  3798. +
  3799. +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
  3800. +{
  3801. + int ret = 0;
  3802. +
  3803. + if (frontswap_enabled && sis->frontswap_map)
  3804. + ret = test_bit(offset % BITS_PER_LONG,
  3805. + &sis->frontswap_map[offset/BITS_PER_LONG]);
  3806. + return ret;
  3807. +}
  3808. +
  3809. +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
  3810. +{
  3811. + if (frontswap_enabled && sis->frontswap_map)
  3812. + set_bit(offset % BITS_PER_LONG,
  3813. + &sis->frontswap_map[offset/BITS_PER_LONG]);
  3814. +}
  3815. +
  3816. +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
  3817. +{
  3818. + if (frontswap_enabled && sis->frontswap_map)
  3819. + clear_bit(offset % BITS_PER_LONG,
  3820. + &sis->frontswap_map[offset/BITS_PER_LONG]);
  3821. +}
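The three helpers above keep one bit of state per swap page: offset selects word offset / BITS_PER_LONG of frontswap_map and bit offset % BITS_PER_LONG within it, so on a 64-bit build offset 70 lands in bit 6 of frontswap_map[1]. The map itself hangs off swap_info_struct (see the swap.h hunk below).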
  3822. +
  3823. +static inline int frontswap_put_page(struct page *page)
  3824. +{
  3825. + int ret = -1;
  3826. +
  3827. + if (frontswap_enabled)
  3828. + ret = __frontswap_put_page(page);
  3829. + return ret;
  3830. +}
  3831. +
  3832. +static inline int frontswap_get_page(struct page *page)
  3833. +{
  3834. + int ret = -1;
  3835. +
  3836. + if (frontswap_enabled)
  3837. + ret = __frontswap_get_page(page);
  3838. + return ret;
  3839. +}
  3840. +
  3841. +static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
  3842. +{
  3843. + if (frontswap_enabled)
  3844. + __frontswap_flush_page(type, offset);
  3845. +}
  3846. +
  3847. +static inline void frontswap_flush_area(unsigned type)
  3848. +{
  3849. + if (frontswap_enabled)
  3850. + __frontswap_flush_area(type);
  3851. +}
  3852. +
  3853. +#endif /* _LINUX_FRONTSWAP_H */
  3854. diff -Nrupad linux-2.6.37//include/linux/fs.h linux-2.6.37_vanilla//include/linux/fs.h
  3855. --- linux-2.6.37//include/linux/fs.h 2011-01-05 01:50:19.000000000 +0100
  3856. +++ linux-2.6.37_vanilla//include/linux/fs.h 2011-02-14 01:21:43.170793149 +0100
  3857. @@ -1417,6 +1417,11 @@ struct super_block {
  3858. * generic_show_options()
  3859. */
  3860. char __rcu *s_options;
  3861. +
  3862. + /*
  3863. + * Saved pool identifier for cleancache (-1 means none)
  3864. + */
  3865. + int cleancache_poolid;
  3866. };
  3867.  
  3868. extern struct timespec current_fs_time(struct super_block *sb);
  3869. diff -Nrupad linux-2.6.37//include/linux/swapfile.h linux-2.6.37_vanilla//include/linux/swapfile.h
  3870. --- linux-2.6.37//include/linux/swapfile.h 1970-01-01 01:00:00.000000000 +0100
  3871. +++ linux-2.6.37_vanilla//include/linux/swapfile.h 2011-02-14 01:21:43.170793149 +0100
  3872. @@ -0,0 +1,13 @@
  3873. +#ifndef _LINUX_SWAPFILE_H
  3874. +#define _LINUX_SWAPFILE_H
  3875. +
  3876. +/*
  3877. + * these were static in swapfile.c but frontswap.c needs them and we don't
  3878. + * want to expose them to the dozens of source files that include swap.h
  3879. + */
  3880. +extern spinlock_t swap_lock;
  3881. +extern struct swap_list_t swap_list;
  3882. +extern struct swap_info_struct *swap_info[];
  3883. +extern int try_to_unuse(unsigned int, bool, unsigned long);
  3884. +
  3885. +#endif /* _LINUX_SWAPFILE_H */
  3886. diff -Nrupad linux-2.6.37//include/linux/swap.h linux-2.6.37_vanilla//include/linux/swap.h
  3887. --- linux-2.6.37//include/linux/swap.h 2011-01-05 01:50:19.000000000 +0100
  3888. +++ linux-2.6.37_vanilla//include/linux/swap.h 2011-02-14 01:21:43.171793147 +0100
  3889. @@ -185,6 +185,8 @@ struct swap_info_struct {
  3890. struct block_device *bdev; /* swap device or bdev of swap file */
  3891. struct file *swap_file; /* seldom referenced */
  3892. unsigned int old_block_size; /* seldom referenced */
  3893. + unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
  3894. + unsigned int frontswap_pages; /* frontswap pages in-use counter */
  3895. };
  3896.  
  3897. struct swap_list_t {
  3898. diff -Nrupad linux-2.6.37//Makefile linux-2.6.37_vanilla//Makefile
  3899. --- linux-2.6.37//Makefile 2011-01-05 01:50:19.000000000 +0100
  3900. +++ linux-2.6.37_vanilla//Makefile 2011-02-14 01:27:32.292792852 +0100
  3901. @@ -1,7 +1,7 @@
  3902. VERSION = 2
  3903. PATCHLEVEL = 6
  3904. SUBLEVEL = 37
  3905. -EXTRAVERSION =
  3906. +EXTRAVERSION = -zcache
  3907. NAME = Flesh-Eating Bats with Fangs
  3908.  
  3909. # *DOCUMENTATION*
  3910. diff -Nrupad linux-2.6.37//Makefile~ linux-2.6.37_vanilla//Makefile~
  3911. --- linux-2.6.37//Makefile~ 1970-01-01 01:00:00.000000000 +0100
  3912. +++ linux-2.6.37_vanilla//Makefile~ 2011-02-14 01:19:18.000000000 +0100
  3913. @@ -0,0 +1,1533 @@
  3914. +VERSION = 2
  3915. +PATCHLEVEL = 6
  3916. +SUBLEVEL = 37
  3917. +EXTRAVERSION =
  3918. +NAME = Flesh-Eating Bats with Fangs
  3919. +
  3920. +# *DOCUMENTATION*
  3921. +# To see a list of typical targets execute "make help"
  3922. +# More info can be located in ./README
  3923. +# Comments in this file are targeted only to the developer, do not
  3924. +# expect to learn how to build the kernel reading this file.
  3925. +
  3926. +# Do not:
  3927. +# o use make's built-in rules and variables
  3928. +# (this increases performance and avoids hard-to-debug behaviour);
  3929. +# o print "Entering directory ...";
  3930. +MAKEFLAGS += -rR --no-print-directory
  3931. +
  3932. +# Avoid funny character set dependencies
  3933. +unexport LC_ALL
  3934. +LC_COLLATE=C
  3935. +LC_NUMERIC=C
  3936. +export LC_COLLATE LC_NUMERIC
  3937. +
  3938. +# We are using a recursive build, so we need to do a little thinking
  3939. +# to get the ordering right.
  3940. +#
  3941. +# Most importantly: sub-Makefiles should only ever modify files in
  3942. +# their own directory. If in some directory we have a dependency on
  3943. +# a file in another dir (which doesn't happen often, but it's often
  3944. +# unavoidable when linking the built-in.o targets which finally
  3945. +# turn into vmlinux), we will call a sub make in that other dir, and
  3946. +# after that we are sure that everything which is in that other dir
  3947. +# is now up to date.
  3948. +#
  3949. +# The only cases where we need to modify files which have global
  3950. +# effects are thus separated out and done before the recursive
  3951. +# descending is started. They are now explicitly listed as the
  3952. +# prepare rule.
  3953. +
  3954. +# To put more focus on warnings, be less verbose as default
  3955. +# Use 'make V=1' to see the full commands
  3956. +
  3957. +ifeq ("$(origin V)", "command line")
  3958. + KBUILD_VERBOSE = $(V)
  3959. +endif
  3960. +ifndef KBUILD_VERBOSE
  3961. + KBUILD_VERBOSE = 0
  3962. +endif
  3963. +
  3964. +# Call a source code checker (by default, "sparse") as part of the
  3965. +# C compilation.
  3966. +#
  3967. +# Use 'make C=1' to enable checking of only re-compiled files.
  3968. +# Use 'make C=2' to enable checking of *all* source files, regardless
  3969. +# of whether they are re-compiled or not.
  3970. +#
  3971. +# See the file "Documentation/sparse.txt" for more details, including
  3972. +# where to get the "sparse" utility.
  3973. +
  3974. +ifeq ("$(origin C)", "command line")
  3975. + KBUILD_CHECKSRC = $(C)
  3976. +endif
  3977. +ifndef KBUILD_CHECKSRC
  3978. + KBUILD_CHECKSRC = 0
  3979. +endif
  3980. +
  3981. +# Use make M=dir to specify directory of external module to build
  3982. +# Old syntax make ... SUBDIRS=$PWD is still supported
  3983. +# Setting the environment variable KBUILD_EXTMOD take precedence
  3984. +ifdef SUBDIRS
  3985. + KBUILD_EXTMOD ?= $(SUBDIRS)
  3986. +endif
  3987. +
  3988. +ifeq ("$(origin M)", "command line")
  3989. + KBUILD_EXTMOD := $(M)
  3990. +endif
  3991. +
  3992. +# kbuild supports saving output files in a separate directory.
  3993. +# To locate output files in a separate directory two syntaxes are supported.
  3994. +# In both cases the working directory must be the root of the kernel src.
  3995. +# 1) O=
  3996. +# Use "make O=dir/to/store/output/files/"
  3997. +#
  3998. +# 2) Set KBUILD_OUTPUT
  3999. +# Set the environment variable KBUILD_OUTPUT to point to the directory
  4000. +# where the output files shall be placed.
  4001. +# export KBUILD_OUTPUT=dir/to/store/output/files/
  4002. +# make
  4003. +#
  4004. +# The O= assignment takes precedence over the KBUILD_OUTPUT environment
  4005. +# variable.
  4006. +
  4007. +
  4008. +# KBUILD_SRC is set on invocation of make in OBJ directory
  4009. +# KBUILD_SRC is not intended to be used by the regular user (for now)
  4010. +ifeq ($(KBUILD_SRC),)
  4011. +
  4012. +# OK, Make called in directory where kernel src resides
  4013. +# Do we want to locate output files in a separate directory?
  4014. +ifeq ("$(origin O)", "command line")
  4015. + KBUILD_OUTPUT := $(O)
  4016. +endif
  4017. +
  4018. +# That's our default target when none is given on the command line
  4019. +PHONY := _all
  4020. +_all:
  4021. +
  4022. +# Cancel implicit rules on top Makefile
  4023. +$(CURDIR)/Makefile Makefile: ;
  4024. +
  4025. +ifneq ($(KBUILD_OUTPUT),)
  4026. +# Invoke a second make in the output directory, passing relevant variables
  4027. +# check that the output directory actually exists
  4028. +saved-output := $(KBUILD_OUTPUT)
  4029. +KBUILD_OUTPUT := $(shell cd $(KBUILD_OUTPUT) && /bin/pwd)
  4030. +$(if $(KBUILD_OUTPUT),, \
  4031. + $(error output directory "$(saved-output)" does not exist))
  4032. +
  4033. +PHONY += $(MAKECMDGOALS) sub-make
  4034. +
  4035. +$(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make
  4036. + $(Q)@:
  4037. +
  4038. +sub-make: FORCE
  4039. + $(if $(KBUILD_VERBOSE:1=),@)$(MAKE) -C $(KBUILD_OUTPUT) \
  4040. + KBUILD_SRC=$(CURDIR) \
  4041. + KBUILD_EXTMOD="$(KBUILD_EXTMOD)" -f $(CURDIR)/Makefile \
  4042. + $(filter-out _all sub-make,$(MAKECMDGOALS))
  4043. +
  4044. +# Leave processing to above invocation of make
  4045. +skip-makefile := 1
  4046. +endif # ifneq ($(KBUILD_OUTPUT),)
  4047. +endif # ifeq ($(KBUILD_SRC),)
  4048. +
  4049. +# We process the rest of the Makefile if this is the final invocation of make
  4050. +ifeq ($(skip-makefile),)
  4051. +
  4052. +# If building an external module we do not care about the all: rule
  4053. +# but instead _all depend on modules
  4054. +PHONY += all
  4055. +ifeq ($(KBUILD_EXTMOD),)
  4056. +_all: all
  4057. +else
  4058. +_all: modules
  4059. +endif
  4060. +
  4061. +srctree := $(if $(KBUILD_SRC),$(KBUILD_SRC),$(CURDIR))
  4062. +objtree := $(CURDIR)
  4063. +src := $(srctree)
  4064. +obj := $(objtree)
  4065. +
  4066. +VPATH := $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
  4067. +
  4068. +export srctree objtree VPATH
  4069. +
  4070. +
  4071. +# SUBARCH tells the usermode build what the underlying arch is. That is set
  4072. +# first, and if a usermode build is happening, the "ARCH=um" on the command
  4073. +# line overrides the setting of ARCH below. If a native build is happening,
  4074. +# then ARCH is assigned, getting whatever value it gets normally, and
  4075. +# SUBARCH is subsequently ignored.
  4076. +
  4077. +SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
  4078. + -e s/arm.*/arm/ -e s/sa110/arm/ \
  4079. + -e s/s390x/s390/ -e s/parisc64/parisc/ \
  4080. + -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
  4081. + -e s/sh[234].*/sh/ )
  4082. +
  4083. +# Cross compiling and selecting different set of gcc/bin-utils
  4084. +# ---------------------------------------------------------------------------
  4085. +#
  4086. +# When performing cross compilation for other architectures ARCH shall be set
  4087. +# to the target architecture. (See arch/* for the possibilities).
  4088. +# ARCH can be set during invocation of make:
  4089. +# make ARCH=ia64
  4090. +# Another way is to have ARCH set in the environment.
  4091. +# The default ARCH is the host where make is executed.
  4092. +
  4093. +# CROSS_COMPILE specify the prefix used for all executables used
  4094. +# during compilation. Only gcc and related bin-utils executables
  4095. +# are prefixed with $(CROSS_COMPILE).
  4096. +# CROSS_COMPILE can be set on the command line
  4097. +# make CROSS_COMPILE=ia64-linux-
  4098. +# Alternatively CROSS_COMPILE can be set in the environment.
  4099. +# A third alternative is to store a setting in .config so that plain
  4100. +# "make" in the configured kernel build directory always uses that.
  4101. +# Default value for CROSS_COMPILE is not to prefix executables
  4102. +# Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile
  4103. +export KBUILD_BUILDHOST := $(SUBARCH)
  4104. +ARCH ?= $(SUBARCH)
  4105. +CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%)
  4106. +
  4107. +# Architecture as present in compile.h
  4108. +UTS_MACHINE := $(ARCH)
  4109. +SRCARCH := $(ARCH)
  4110. +
  4111. +# Additional ARCH settings for x86
  4112. +ifeq ($(ARCH),i386)
  4113. + SRCARCH := x86
  4114. +endif
  4115. +ifeq ($(ARCH),x86_64)
  4116. + SRCARCH := x86
  4117. +endif
  4118. +
  4119. +# Additional ARCH settings for sparc
  4120. +ifeq ($(ARCH),sparc32)
  4121. + SRCARCH := sparc
  4122. +endif
  4123. +ifeq ($(ARCH),sparc64)
  4124. + SRCARCH := sparc
  4125. +endif
  4126. +
  4127. +# Additional ARCH settings for sh
  4128. +ifeq ($(ARCH),sh64)
  4129. + SRCARCH := sh
  4130. +endif
  4131. +
  4132. +# Where to locate arch specific headers
  4133. +hdr-arch := $(SRCARCH)
  4134. +
  4135. +ifeq ($(ARCH),m68knommu)
  4136. + hdr-arch := m68k
  4137. +endif
  4138. +
  4139. +KCONFIG_CONFIG ?= .config
  4140. +
  4141. +# SHELL used by kbuild
  4142. +CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \
  4143. + else if [ -x /bin/bash ]; then echo /bin/bash; \
  4144. + else echo sh; fi ; fi)
  4145. +
  4146. +HOSTCC = gcc
  4147. +HOSTCXX = g++
  4148. +HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer
  4149. +HOSTCXXFLAGS = -O2
  4150. +
  4151. +# Decide whether to build built-in, modular, or both.
  4152. +# Normally, just do built-in.
  4153. +
  4154. +KBUILD_MODULES :=
  4155. +KBUILD_BUILTIN := 1
  4156. +
  4157. +# If we have only "make modules", don't compile built-in objects.
  4158. +# When we're building modules with modversions, we need to consider
  4159. +# the built-in objects during the descend as well, in order to
  4160. +# make sure the checksums are up to date before we record them.
  4161. +
  4162. +ifeq ($(MAKECMDGOALS),modules)
  4163. + KBUILD_BUILTIN := $(if $(CONFIG_MODVERSIONS),1)
  4164. +endif
  4165. +
  4166. +# If we have "make <whatever> modules", compile modules
  4167. +# in addition to whatever we do anyway.
  4168. +# Just "make" or "make all" shall build modules as well
  4169. +
  4170. +ifneq ($(filter all _all modules,$(MAKECMDGOALS)),)
  4171. + KBUILD_MODULES := 1
  4172. +endif
  4173. +
  4174. +ifeq ($(MAKECMDGOALS),)
  4175. + KBUILD_MODULES := 1
  4176. +endif
  4177. +
  4178. +export KBUILD_MODULES KBUILD_BUILTIN
  4179. +export KBUILD_CHECKSRC KBUILD_SRC KBUILD_EXTMOD
  4180. +
  4181. +# Beautify output
  4182. +# ---------------------------------------------------------------------------
  4183. +#
  4184. +# Normally, we echo the whole command before executing it. By making
  4185. +# that echo $($(quiet)$(cmd)), we now have the possibility to set
  4186. +# $(quiet) to choose other forms of output instead, e.g.
  4187. +#
  4188. +# quiet_cmd_cc_o_c = Compiling $(RELDIR)/$@
  4189. +# cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
  4190. +#
  4191. +# If $(quiet) is empty, the whole command will be printed.
  4192. +# If it is set to "quiet_", only the short version will be printed.
  4193. +# If it is set to "silent_", nothing will be printed at all, since
  4194. +# the variable $(silent_cmd_cc_o_c) doesn't exist.
  4195. +#
  4196. +# A simple variant is to prefix commands with $(Q) - that's useful
  4197. +# for commands that shall be hidden in non-verbose mode.
  4198. +#
  4199. +# $(Q)ln $@ :<
  4200. +#
  4201. +# If KBUILD_VERBOSE equals 0 then the above command will be hidden.
  4202. +# If KBUILD_VERBOSE equals 1 then the above command is displayed.
  4203. +
  4204. +ifeq ($(KBUILD_VERBOSE),1)
  4205. + quiet =
  4206. + Q =
  4207. +else
  4208. + quiet=quiet_
  4209. + Q = @
  4210. +endif
  4211. +
  4212. +# If the user is running make -s (silent mode), suppress echoing of
  4213. +# commands
  4214. +
  4215. +ifneq ($(findstring s,$(MAKEFLAGS)),)
  4216. + quiet=silent_
  4217. +endif
  4218. +
  4219. +export quiet Q KBUILD_VERBOSE
  4220. +
  4221. +
  4222. +# Look for make include files relative to root of kernel src
  4223. +MAKEFLAGS += --include-dir=$(srctree)
  4224. +
  4225. +# We need some generic definitions (do not try to remake the file).
  4226. +$(srctree)/scripts/Kbuild.include: ;
  4227. +include $(srctree)/scripts/Kbuild.include
  4228. +
  4229. +# Make variables (CC, etc...)
  4230. +
  4231. +AS = $(CROSS_COMPILE)as
  4232. +LD = $(CROSS_COMPILE)ld
  4233. +CC = $(CROSS_COMPILE)gcc
  4234. +CPP = $(CC) -E
  4235. +AR = $(CROSS_COMPILE)ar
  4236. +NM = $(CROSS_COMPILE)nm
  4237. +STRIP = $(CROSS_COMPILE)strip
  4238. +OBJCOPY = $(CROSS_COMPILE)objcopy
  4239. +OBJDUMP = $(CROSS_COMPILE)objdump
  4240. +AWK = awk
  4241. +GENKSYMS = scripts/genksyms/genksyms
  4242. +INSTALLKERNEL := installkernel
  4243. +DEPMOD = /sbin/depmod
  4244. +KALLSYMS = scripts/kallsyms
  4245. +PERL = perl
  4246. +CHECK = sparse
  4247. +
  4248. +CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
  4249. + -Wbitwise -Wno-return-void $(CF)
  4250. +CFLAGS_MODULE =
  4251. +AFLAGS_MODULE =
  4252. +LDFLAGS_MODULE =
  4253. +CFLAGS_KERNEL =
  4254. +AFLAGS_KERNEL =
  4255. +CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
  4256. +
  4257. +
  4258. +# Use LINUXINCLUDE when you must reference the include/ directory.
  4259. +# Needed to be compatible with the O= option
  4260. +LINUXINCLUDE := -I$(srctree)/arch/$(hdr-arch)/include -Iinclude \
  4261. + $(if $(KBUILD_SRC), -I$(srctree)/include) \
  4262. + -include include/generated/autoconf.h
  4263. +
  4264. +KBUILD_CPPFLAGS := -D__KERNEL__
  4265. +
  4266. +KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
  4267. + -fno-strict-aliasing -fno-common \
  4268. + -Werror-implicit-function-declaration \
  4269. + -Wno-format-security \
  4270. + -fno-delete-null-pointer-checks
  4271. +KBUILD_AFLAGS_KERNEL :=
  4272. +KBUILD_CFLAGS_KERNEL :=
  4273. +KBUILD_AFLAGS := -D__ASSEMBLY__
  4274. +KBUILD_AFLAGS_MODULE := -DMODULE
  4275. +KBUILD_CFLAGS_MODULE := -DMODULE
  4276. +KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
  4277. +
  4278. +# Read KERNELRELEASE from include/config/kernel.release (if it exists)
  4279. +KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
  4280. +KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
  4281. +
  4282. +export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
  4283. +export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
  4284. +export CPP AR NM STRIP OBJCOPY OBJDUMP
  4285. +export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
  4286. +export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
  4287. +
  4288. +export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
  4289. +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
  4290. +export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
  4291. +export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
  4292. +export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
  4293. +
  4294. +# When compiling out-of-tree modules, put MODVERDIR in the module
  4295. +# tree rather than in the kernel tree. The kernel tree might
  4296. +# even be read-only.
  4297. +export MODVERDIR := $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_versions
  4298. +
  4299. +# Files to ignore in find ... statements
  4300. +
  4301. +RCS_FIND_IGNORE := \( -name SCCS -o -name BitKeeper -o -name .svn -o -name CVS -o -name .pc -o -name .hg -o -name .git \) -prune -o
  4302. +export RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn --exclude CVS --exclude .pc --exclude .hg --exclude .git
  4303. +
  4304. +# ===========================================================================
  4305. +# Rules shared between *config targets and build targets
  4306. +
  4307. +# Basic helpers built in scripts/
  4308. +PHONY += scripts_basic
  4309. +scripts_basic:
  4310. + $(Q)$(MAKE) $(build)=scripts/basic
  4311. + $(Q)rm -f .tmp_quiet_recordmcount
  4312. +
  4313. +# To avoid any implicit rule to kick in, define an empty command.
  4314. +scripts/basic/%: scripts_basic ;
  4315. +
  4316. +PHONY += outputmakefile
  4317. +# outputmakefile generates a Makefile in the output directory, if using a
  4318. +# separate output directory. This allows convenient use of make in the
  4319. +# output directory.
  4320. +outputmakefile:
  4321. +ifneq ($(KBUILD_SRC),)
  4322. + $(Q)ln -fsn $(srctree) source
  4323. + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkmakefile \
  4324. + $(srctree) $(objtree) $(VERSION) $(PATCHLEVEL)
  4325. +endif
  4326. +
  4327. +# To make sure we do not include .config for any of the *config targets
  4328. +# catch them early, and hand them over to scripts/kconfig/Makefile
  4329. +# It is allowed to specify more targets when calling make, including
  4330. +# mixing *config targets and build targets.
  4331. +# For example 'make oldconfig all'.
  4332. +# Detect when mixed targets is specified, and make a second invocation
  4333. +# of make so .config is not included in this case either (for *config).
  4334. +
  4335. +no-dot-config-targets := clean mrproper distclean \
  4336. + cscope TAGS tags help %docs check% coccicheck \
  4337. + include/linux/version.h headers_% \
  4338. + kernelversion %src-pkg
  4339. +
  4340. +config-targets := 0
  4341. +mixed-targets := 0
  4342. +dot-config := 1
  4343. +
  4344. +ifneq ($(filter $(no-dot-config-targets), $(MAKECMDGOALS)),)
  4345. + ifeq ($(filter-out $(no-dot-config-targets), $(MAKECMDGOALS)),)
  4346. + dot-config := 0
  4347. + endif
  4348. +endif
  4349. +
  4350. +ifeq ($(KBUILD_EXTMOD),)
  4351. + ifneq ($(filter config %config,$(MAKECMDGOALS)),)
  4352. + config-targets := 1
  4353. + ifneq ($(filter-out config %config,$(MAKECMDGOALS)),)
  4354. + mixed-targets := 1
  4355. + endif
  4356. + endif
  4357. +endif
  4358. +
  4359. +ifeq ($(mixed-targets),1)
  4360. +# ===========================================================================
  4361. +# We're called with mixed targets (*config and build targets).
  4362. +# Handle them one by one.
  4363. +
  4364. +%:: FORCE
  4365. + $(Q)$(MAKE) -C $(srctree) KBUILD_SRC= $@
  4366. +
  4367. +else
  4368. +ifeq ($(config-targets),1)
  4369. +# ===========================================================================
  4370. +# *config targets only - make sure prerequisites are updated, and descend
  4371. +# in scripts/kconfig to make the *config target
  4372. +
  4373. +# Read arch specific Makefile to set KBUILD_DEFCONFIG as needed.
  4374. +# KBUILD_DEFCONFIG may point out an alternative default configuration
  4375. +# used for 'make defconfig'
  4376. +include $(srctree)/arch/$(SRCARCH)/Makefile
  4377. +export KBUILD_DEFCONFIG KBUILD_KCONFIG
  4378. +
  4379. +config: scripts_basic outputmakefile FORCE
  4380. + $(Q)mkdir -p include/linux include/config
  4381. + $(Q)$(MAKE) $(build)=scripts/kconfig $@
  4382. +
  4383. +%config: scripts_basic outputmakefile FORCE
  4384. + $(Q)mkdir -p include/linux include/config
  4385. + $(Q)$(MAKE) $(build)=scripts/kconfig $@
  4386. +
  4387. +else
  4388. +# ===========================================================================
  4389. +# Build targets only - this includes vmlinux, arch specific targets, clean
  4390. +# targets and others. In general all targets except *config targets.
  4391. +
  4392. +ifeq ($(KBUILD_EXTMOD),)
  4393. +# Additional helpers built in scripts/
  4394. +# Carefully list dependencies so we do not try to build scripts twice
  4395. +# in parallel
  4396. +PHONY += scripts
  4397. +scripts: scripts_basic include/config/auto.conf include/config/tristate.conf
  4398. + $(Q)$(MAKE) $(build)=$(@)
  4399. +
  4400. +# Objects we will link into vmlinux / subdirs we need to visit
  4401. +init-y := init/
  4402. +drivers-y := drivers/ sound/ firmware/
  4403. +net-y := net/
  4404. +libs-y := lib/
  4405. +core-y := usr/
  4406. +endif # KBUILD_EXTMOD
  4407. +
  4408. +ifeq ($(dot-config),1)
  4409. +# Read in config
  4410. +-include include/config/auto.conf
  4411. +
  4412. +ifeq ($(KBUILD_EXTMOD),)
  4413. +# Read in dependencies to all Kconfig* files, make sure to run
  4414. +# oldconfig if changes are detected.
  4415. +-include include/config/auto.conf.cmd
  4416. +
  4417. +# To avoid any implicit rule to kick in, define an empty command
  4418. +$(KCONFIG_CONFIG) include/config/auto.conf.cmd: ;
  4419. +
  4420. +# If .config is newer than include/config/auto.conf, someone tinkered
  4421. +# with it and forgot to run make oldconfig.
  4422. +# if auto.conf.cmd is missing then we are probably in a cleaned tree so
  4423. +# we execute the config step to be sure to catch updated Kconfig files
  4424. +include/config/%.conf: $(KCONFIG_CONFIG) include/config/auto.conf.cmd
  4425. + $(Q)$(MAKE) -f $(srctree)/Makefile silentoldconfig
  4426. +else
  4427. +# external modules need include/generated/autoconf.h and include/config/auto.conf
  4428. +# but do not care if they are up-to-date. Use auto.conf to trigger the test
  4429. +PHONY += include/config/auto.conf
  4430. +
  4431. +include/config/auto.conf:
  4432. + $(Q)test -e include/generated/autoconf.h -a -e $@ || ( \
  4433. + echo; \
  4434. + echo " ERROR: Kernel configuration is invalid."; \
  4435. + echo " include/generated/autoconf.h or $@ are missing.";\
  4436. + echo " Run 'make oldconfig && make prepare' on kernel src to fix it."; \
  4437. + echo; \
  4438. + /bin/false)
  4439. +
  4440. +endif # KBUILD_EXTMOD
  4441. +
  4442. +else
  4443. +# Dummy target needed, because used as prerequisite
  4444. +include/config/auto.conf: ;
  4445. +endif # $(dot-config)
  4446. +
  4447. +# The all: target is the default when no target is given on the
  4448. +# command line.
  4449. +# This allows a user to issue only 'make' to build a kernel including modules
  4450. +# Defaults to vmlinux, but the arch makefile usually adds further targets
  4451. +all: vmlinux
  4452. +
  4453. +ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
  4454. +KBUILD_CFLAGS += -Os
  4455. +else
  4456. +KBUILD_CFLAGS += -O2
  4457. +endif
  4458. +
  4459. +include $(srctree)/arch/$(SRCARCH)/Makefile
  4460. +
  4461. +ifneq ($(CONFIG_FRAME_WARN),0)
  4462. +KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN})
  4463. +endif
  4464. +
  4465. +# Force gcc to behave correctly even for buggy distributions
  4466. +ifndef CONFIG_CC_STACKPROTECTOR
  4467. +KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector)
  4468. +endif
  4469. +
  4470. +ifdef CONFIG_FRAME_POINTER
  4471. +KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
  4472. +else
  4473. +# Some targets (ARM with Thumb2, for example), can't be built with frame
  4474. +# pointers. For those, we don't have FUNCTION_TRACER automatically
  4475. +# select FRAME_POINTER. However, FUNCTION_TRACER adds -pg, and this is
  4476. +# incompatible with -fomit-frame-pointer with current GCC, so we don't use
  4477. +# -fomit-frame-pointer with FUNCTION_TRACER.
  4478. +ifndef CONFIG_FUNCTION_TRACER
  4479. +KBUILD_CFLAGS += -fomit-frame-pointer
  4480. +endif
  4481. +endif
  4482. +
  4483. +ifdef CONFIG_DEBUG_INFO
  4484. +KBUILD_CFLAGS += -g
  4485. +KBUILD_AFLAGS += -gdwarf-2
  4486. +endif
  4487. +
  4488. +ifdef CONFIG_DEBUG_INFO_REDUCED
  4489. +KBUILD_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly)
  4490. +endif
  4491. +
  4492. +ifdef CONFIG_FUNCTION_TRACER
  4493. +KBUILD_CFLAGS += -pg
  4494. +ifdef CONFIG_DYNAMIC_FTRACE
  4495. + ifdef CONFIG_HAVE_C_RECORDMCOUNT
  4496. + BUILD_C_RECORDMCOUNT := y
  4497. + export BUILD_C_RECORDMCOUNT
  4498. + endif
  4499. +endif
  4500. +endif
  4501. +
  4502. +# We trigger additional mismatches with less inlining
  4503. +ifdef CONFIG_DEBUG_SECTION_MISMATCH
  4504. +KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
  4505. +endif
  4506. +
  4507. +# arch Makefile may override CC so keep this after arch Makefile is included
  4508. +NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
  4509. +CHECKFLAGS += $(NOSTDINC_FLAGS)
  4510. +
  4511. +# warn about C99 declaration after statement
  4512. +KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,)
  4513. +
  4514. +# disable pointer signed / unsigned warnings in gcc 4.0
  4515. +KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
  4516. +
  4517. +# disable invalid "can't wrap" optimizations for signed / pointers
  4518. +KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
  4519. +
  4520. +# conserve stack if available
  4521. +KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
  4522. +
  4523. +# check for 'asm goto'
  4524. +ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
  4525. + KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
  4526. +endif
  4527. +
  4528. +# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
  4529. +# But warn user when we do so
  4530. +warn-assign = \
  4531. +$(warning "WARNING: Appending $$K$(1) ($(K$(1))) from $(origin K$(1)) to kernel $$$(1)")
  4532. +
  4533. +ifneq ($(KCPPFLAGS),)
  4534. + $(call warn-assign,CPPFLAGS)
  4535. + KBUILD_CPPFLAGS += $(KCPPFLAGS)
  4536. +endif
  4537. +ifneq ($(KAFLAGS),)
  4538. + $(call warn-assign,AFLAGS)
  4539. + KBUILD_AFLAGS += $(KAFLAGS)
  4540. +endif
  4541. +ifneq ($(KCFLAGS),)
  4542. + $(call warn-assign,CFLAGS)
  4543. + KBUILD_CFLAGS += $(KCFLAGS)
  4544. +endif
  4545. +
  4546. +# Use --build-id when available.
  4547. +LDFLAGS_BUILD_ID = $(patsubst -Wl$(comma)%,%,\
  4548. + $(call cc-ldoption, -Wl$(comma)--build-id,))
  4549. +KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID)
  4550. +LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID)
  4551. +
  4552. +ifeq ($(CONFIG_STRIP_ASM_SYMS),y)
  4553. +LDFLAGS_vmlinux += $(call ld-option, -X,)
  4554. +endif
  4555. +
  4556. +# Default kernel image to build when no specific target is given.
  4557. +# KBUILD_IMAGE may be overruled on the command line or
  4558. +# set in the environment
  4559. +# Also any assignments in arch/$(ARCH)/Makefile take precedence over
  4560. +# this default value
  4561. +export KBUILD_IMAGE ?= vmlinux
  4562. +
  4563. +#
  4564. +# INSTALL_PATH specifies where to place the updated kernel and system map
  4565. +# images. Default is /boot, but you can set it to other values
  4566. +export INSTALL_PATH ?= /boot
  4567. +
  4568. +#
  4569. +# INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory
  4570. +# relocations required by build roots. This is not defined in the
  4571. +# makefile but the argument can be passed to make if needed.
  4572. +#
  4573. +
  4574. +MODLIB = $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE)
  4575. +export MODLIB
  4576. +
  4577. +#
  4578. +# INSTALL_MOD_STRIP, if defined, will cause modules to be
  4579. +# stripped after they are installed. If INSTALL_MOD_STRIP is '1', then
  4580. +# the default option --strip-debug will be used. Otherwise,
  4581. +# INSTALL_MOD_STRIP will be used as the options to the strip command.
  4582. +
  4583. +ifdef INSTALL_MOD_STRIP
  4584. +ifeq ($(INSTALL_MOD_STRIP),1)
  4585. +mod_strip_cmd = $(STRIP) --strip-debug
  4586. +else
  4587. +mod_strip_cmd = $(STRIP) $(INSTALL_MOD_STRIP)
  4588. +endif # INSTALL_MOD_STRIP=1
  4589. +else
  4590. +mod_strip_cmd = true
  4591. +endif # INSTALL_MOD_STRIP
  4592. +export mod_strip_cmd
  4593. +
  4594. +
  4595. +ifeq ($(KBUILD_EXTMOD),)
  4596. +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
  4597. +
  4598. +vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
  4599. + $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
  4600. + $(net-y) $(net-m) $(libs-y) $(libs-m)))
  4601. +
  4602. +vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
  4603. + $(init-n) $(init-) \
  4604. + $(core-n) $(core-) $(drivers-n) $(drivers-) \
  4605. + $(net-n) $(net-) $(libs-n) $(libs-))))
  4606. +
  4607. +init-y := $(patsubst %/, %/built-in.o, $(init-y))
  4608. +core-y := $(patsubst %/, %/built-in.o, $(core-y))
  4609. +drivers-y := $(patsubst %/, %/built-in.o, $(drivers-y))
  4610. +net-y := $(patsubst %/, %/built-in.o, $(net-y))
  4611. +libs-y1 := $(patsubst %/, %/lib.a, $(libs-y))
  4612. +libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y))
  4613. +libs-y := $(libs-y1) $(libs-y2)
  4614. +
  4615. +# Build vmlinux
  4616. +# ---------------------------------------------------------------------------
  4617. +# vmlinux is built from the objects selected by $(vmlinux-init) and
  4618. +# $(vmlinux-main). Most are built-in.o files from top-level directories
  4619. +# in the kernel tree, others are specified in arch/$(ARCH)/Makefile.
  4620. +# Ordering when linking is important, and $(vmlinux-init) must be first.
  4621. +#
  4622. +# vmlinux
  4623. +# ^
  4624. +# |
  4625. +# +-< $(vmlinux-init)
  4626. +# | +--< init/version.o + more
  4627. +# |
  4628. +# +--< $(vmlinux-main)
  4629. +# | +--< driver/built-in.o mm/built-in.o + more
  4630. +# |
  4631. +# +-< kallsyms.o (see description in CONFIG_KALLSYMS section)
  4632. +#
  4633. +# vmlinux version (uname -v) cannot be updated during normal
  4634. +# descending-into-subdirs phase since we do not yet know if we need to
  4635. +# update vmlinux.
  4636. +# Therefore this step is delayed until just before final link of vmlinux -
  4637. +# except in the kallsyms case where it is done just before adding the
  4638. +# symbols to the kernel.
  4639. +#
  4640. +# System.map is generated to document addresses of all kernel symbols
  4641. +
  4642. +vmlinux-init := $(head-y) $(init-y)
  4643. +vmlinux-main := $(core-y) $(libs-y) $(drivers-y) $(net-y)
  4644. +vmlinux-all := $(vmlinux-init) $(vmlinux-main)
  4645. +vmlinux-lds := arch/$(SRCARCH)/kernel/vmlinux.lds
  4646. +export KBUILD_VMLINUX_OBJS := $(vmlinux-all)
  4647. +
  4648. +# Rule to link vmlinux - also used during CONFIG_KALLSYMS
  4649. +# May be overridden by arch/$(ARCH)/Makefile
  4650. +quiet_cmd_vmlinux__ ?= LD $@
  4651. + cmd_vmlinux__ ?= $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) -o $@ \
  4652. + -T $(vmlinux-lds) $(vmlinux-init) \
  4653. + --start-group $(vmlinux-main) --end-group \
  4654. + $(filter-out $(vmlinux-lds) $(vmlinux-init) $(vmlinux-main) vmlinux.o FORCE ,$^)
  4655. +
  4656. +# Generate new vmlinux version
  4657. +quiet_cmd_vmlinux_version = GEN .version
  4658. + cmd_vmlinux_version = set -e; \
  4659. + if [ ! -r .version ]; then \
  4660. + rm -f .version; \
  4661. + echo 1 >.version; \
  4662. + else \
  4663. + mv .version .old_version; \
  4664. + expr 0$$(cat .old_version) + 1 >.version; \
  4665. + fi; \
  4666. + $(MAKE) $(build)=init
  4667. +
  4668. +# Generate System.map
  4669. +quiet_cmd_sysmap = SYSMAP
  4670. + cmd_sysmap = $(CONFIG_SHELL) $(srctree)/scripts/mksysmap
  4671. +
  4672. +# Link of vmlinux
  4673. +# If CONFIG_KALLSYMS is set .version is already updated
  4674. +# Generate System.map and verify that the content is consistent
  4675. +# Use + in front of the vmlinux_version rule to silence a warning with make -j2
  4676. +# First command is ':' to allow us to use + in front of the rule
  4677. +define rule_vmlinux__
  4678. + :
  4679. + $(if $(CONFIG_KALLSYMS),,+$(call cmd,vmlinux_version))
  4680. +
  4681. + $(call cmd,vmlinux__)
  4682. + $(Q)echo 'cmd_$@ := $(cmd_vmlinux__)' > $(@D)/.$(@F).cmd
  4683. +
  4684. + $(Q)$(if $($(quiet)cmd_sysmap), \
  4685. + echo ' $($(quiet)cmd_sysmap) System.map' &&) \
  4686. + $(cmd_sysmap) $@ System.map; \
  4687. + if [ $$? -ne 0 ]; then \
  4688. + rm -f $@; \
  4689. + /bin/false; \
  4690. + fi;
  4691. + $(verify_kallsyms)
  4692. +endef
  4693. +
  4694. +
  4695. +ifdef CONFIG_KALLSYMS
  4696. +# Generate section listing all symbols and add it into vmlinux $(kallsyms.o)
  4697. +# It's a three stage process:
  4698. +# o .tmp_vmlinux1 has all symbols and sections, but __kallsyms is
  4699. +# empty
  4700. +# Running kallsyms on that gives us .tmp_kallsyms1.o with
  4701. +# the right size - vmlinux version (uname -v) is updated during this step
  4702. +# o .tmp_vmlinux2 now has a __kallsyms section of the right size,
  4703. +# but due to the added section, some addresses have shifted.
  4704. +# From here, we generate a correct .tmp_kallsyms2.o
  4705. +# o The correct .tmp_kallsyms2.o is linked into the final vmlinux.
  4706. +# o Verify that the System.map from vmlinux matches the map from
  4707. +# .tmp_vmlinux2, just in case we did not generate kallsyms correctly.
  4708. +# o If CONFIG_KALLSYMS_EXTRA_PASS is set, do an extra pass using
  4709. +# .tmp_vmlinux3 and .tmp_kallsyms3.o. This is only meant as a
  4710. +# temporary bypass to allow the kernel to be built while the
  4711. +# maintainers work out what went wrong with kallsyms.
  4712. +
  4713. +ifdef CONFIG_KALLSYMS_EXTRA_PASS
  4714. +last_kallsyms := 3
  4715. +else
  4716. +last_kallsyms := 2
  4717. +endif
  4718. +
  4719. +kallsyms.o := .tmp_kallsyms$(last_kallsyms).o
  4720. +
  4721. +define verify_kallsyms
  4722. + $(Q)$(if $($(quiet)cmd_sysmap), \
  4723. + echo ' $($(quiet)cmd_sysmap) .tmp_System.map' &&) \
  4724. + $(cmd_sysmap) .tmp_vmlinux$(last_kallsyms) .tmp_System.map
  4725. + $(Q)cmp -s System.map .tmp_System.map || \
  4726. + (echo Inconsistent kallsyms data; \
  4727. + echo Try setting CONFIG_KALLSYMS_EXTRA_PASS; \
  4728. + rm .tmp_kallsyms* ; /bin/false )
  4729. +endef
  4730. +
  4731. +# Update vmlinux version before link
  4732. +# Use + in front of this rule to silence a warning about make -j1
  4733. +# First command is ':' to allow us to use + in front of this rule
  4734. +cmd_ksym_ld = $(cmd_vmlinux__)
  4735. +define rule_ksym_ld
  4736. + :
  4737. + +$(call cmd,vmlinux_version)
  4738. + $(call cmd,vmlinux__)
  4739. + $(Q)echo 'cmd_$@ := $(cmd_vmlinux__)' > $(@D)/.$(@F).cmd
  4740. +endef
  4741. +
  4742. +# Generate .S file with all kernel symbols
  4743. +quiet_cmd_kallsyms = KSYM $@
  4744. + cmd_kallsyms = $(NM) -n $< | $(KALLSYMS) \
  4745. + $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) > $@
  4746. +
  4747. +.tmp_kallsyms1.o .tmp_kallsyms2.o .tmp_kallsyms3.o: %.o: %.S scripts FORCE
  4748. + $(call if_changed_dep,as_o_S)
  4749. +
  4750. +.tmp_kallsyms%.S: .tmp_vmlinux% $(KALLSYMS)
  4751. + $(call cmd,kallsyms)
  4752. +
  4753. +# .tmp_vmlinux1 must be complete except kallsyms, so update vmlinux version
  4754. +.tmp_vmlinux1: $(vmlinux-lds) $(vmlinux-all) FORCE
  4755. + $(call if_changed_rule,ksym_ld)
  4756. +
  4757. +.tmp_vmlinux2: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms1.o FORCE
  4758. + $(call if_changed,vmlinux__)
  4759. +
  4760. +.tmp_vmlinux3: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms2.o FORCE
  4761. + $(call if_changed,vmlinux__)
  4762. +
  4763. +# Needs to visit scripts/ before $(KALLSYMS) can be used.
  4764. +$(KALLSYMS): scripts ;
  4765. +
  4766. +# Generate some data for debugging strange kallsyms problems
  4767. +debug_kallsyms: .tmp_map$(last_kallsyms)
  4768. +
  4769. +.tmp_map%: .tmp_vmlinux% FORCE
  4770. + ($(OBJDUMP) -h $< | $(AWK) '/^ +[0-9]/{print $$4 " 0 " $$2}'; $(NM) $<) | sort > $@
  4771. +
  4772. +.tmp_map3: .tmp_map2
  4773. +
  4774. +.tmp_map2: .tmp_map1
  4775. +
  4776. +endif # ifdef CONFIG_KALLSYMS
  4777. +
  4778. +# Do modpost on a prelinked vmlinux. The finally linked vmlinux has
  4779. +# relevant sections renamed as per the linker script.
  4780. +quiet_cmd_vmlinux-modpost = LD $@
  4781. + cmd_vmlinux-modpost = $(LD) $(LDFLAGS) -r -o $@ \
  4782. + $(vmlinux-init) --start-group $(vmlinux-main) --end-group \
  4783. + $(filter-out $(vmlinux-init) $(vmlinux-main) FORCE ,$^)
  4784. +define rule_vmlinux-modpost
  4785. + :
  4786. + +$(call cmd,vmlinux-modpost)
  4787. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost $@
  4788. + $(Q)echo 'cmd_$@ := $(cmd_vmlinux-modpost)' > $(dot-target).cmd
  4789. +endef
  4790. +
  4791. +# vmlinux image - including updated kernel symbols
  4792. +vmlinux: $(vmlinux-lds) $(vmlinux-init) $(vmlinux-main) vmlinux.o $(kallsyms.o) FORCE
  4793. +ifdef CONFIG_HEADERS_CHECK
  4794. + $(Q)$(MAKE) -f $(srctree)/Makefile headers_check
  4795. +endif
  4796. +ifdef CONFIG_SAMPLES
  4797. + $(Q)$(MAKE) $(build)=samples
  4798. +endif
  4799. +ifdef CONFIG_BUILD_DOCSRC
  4800. + $(Q)$(MAKE) $(build)=Documentation
  4801. +endif
  4802. + $(call vmlinux-modpost)
  4803. + $(call if_changed_rule,vmlinux__)
  4804. + $(Q)rm -f .old_version
  4805. +
  4806. +# build vmlinux.o first to catch section mismatch errors early
  4807. +ifdef CONFIG_KALLSYMS
  4808. +.tmp_vmlinux1: vmlinux.o
  4809. +endif
  4810. +
  4811. +modpost-init := $(filter-out init/built-in.o, $(vmlinux-init))
  4812. +vmlinux.o: $(modpost-init) $(vmlinux-main) FORCE
  4813. + $(call if_changed_rule,vmlinux-modpost)
  4814. +
  4815. +# The actual objects are generated when descending,
  4816. +# make sure no implicit rule kicks in
  4817. +$(sort $(vmlinux-init) $(vmlinux-main)) $(vmlinux-lds): $(vmlinux-dirs) ;
  4818. +
  4819. +# Handle descending into subdirectories listed in $(vmlinux-dirs)
  4820. +# Preset locale variables to speed up the build process. Limit locale
  4821. +# tweaks to this spot to avoid wrong language settings when running
  4822. +# make menuconfig etc.
  4823. +# Error messages still appear in the original language
  4824. +
  4825. +PHONY += $(vmlinux-dirs)
  4826. +$(vmlinux-dirs): prepare scripts
  4827. + $(Q)$(MAKE) $(build)=$@
  4828. +
  4829. +# Store (new) KERNELRELEASE string in include/config/kernel.release
  4830. +include/config/kernel.release: include/config/auto.conf FORCE
  4831. + $(Q)rm -f $@
  4832. + $(Q)echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))" > $@
  4833. +
  4834. +
  4835. +# Things we need to do before we recursively start building the kernel
  4836. +# or the modules are listed in "prepare".
  4837. +# A multi level approach is used. prepareN is processed before prepareN-1.
  4838. +# archprepare is used in arch Makefiles; when it is processed, the asm
  4839. +# symlink, version.h and scripts_basic are processed / created.
  4840. +
  4841. +# Listed in dependency order
  4842. +PHONY += prepare archprepare prepare0 prepare1 prepare2 prepare3
  4843. +
  4844. +# prepare3 is used to check if we are building in a separate output directory,
  4845. +# and if so do:
  4846. +# 1) Check that make has not been executed in the kernel src $(srctree)
  4847. +prepare3: include/config/kernel.release
  4848. +ifneq ($(KBUILD_SRC),)
  4849. + @$(kecho) ' Using $(srctree) as source for kernel'
  4850. + $(Q)if [ -f $(srctree)/.config -o -d $(srctree)/include/config ]; then \
  4851. + echo " $(srctree) is not clean, please run 'make mrproper'";\
  4852. + echo " in the '$(srctree)' directory.";\
  4853. + /bin/false; \
  4854. + fi;
  4855. +endif
  4856. +
  4857. +# prepare2 creates a makefile if using a separate output directory
  4858. +prepare2: prepare3 outputmakefile
  4859. +
  4860. +prepare1: prepare2 include/linux/version.h include/generated/utsrelease.h \
  4861. + include/config/auto.conf
  4862. + $(cmd_crmodverdir)
  4863. +
  4864. +archprepare: prepare1 scripts_basic
  4865. +
  4866. +prepare0: archprepare FORCE
  4867. + $(Q)$(MAKE) $(build)=.
  4868. + $(Q)$(MAKE) $(build)=. missing-syscalls
  4869. +
  4870. +# All the preparing..
  4871. +prepare: prepare0
  4872. +
  4873. +# Generate some files
  4874. +# ---------------------------------------------------------------------------
  4875. +
  4876. +# KERNELRELEASE can change from a few different places, meaning version.h
  4877. +# needs to be updated, so this check is forced on all builds
  4878. +
  4879. +uts_len := 64
  4880. +define filechk_utsrelease.h
  4881. + if [ `echo -n "$(KERNELRELEASE)" | wc -c ` -gt $(uts_len) ]; then \
  4882. + echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \
  4883. + exit 1; \
  4884. + fi; \
  4885. + (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
  4886. +endef
  4887. +
  4888. +define filechk_version.h
  4889. + (echo \#define LINUX_VERSION_CODE $(shell \
  4890. + expr $(VERSION) \* 65536 + $(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \
  4891. + echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))';)
  4892. +endef
  4893. +
  4894. +include/linux/version.h: $(srctree)/Makefile FORCE
  4895. + $(call filechk,version.h)
  4896. +
  4897. +include/generated/utsrelease.h: include/config/kernel.release FORCE
  4898. + $(call filechk,utsrelease.h)
  4899. +
  4900. +PHONY += headerdep
  4901. +headerdep:
  4902. + $(Q)find include/ -name '*.h' | xargs --max-args 1 scripts/headerdep.pl
  4903. +
  4904. +# ---------------------------------------------------------------------------
  4905. +
  4906. +PHONY += depend dep
  4907. +depend dep:
  4908. + @echo '*** Warning: make $@ is unnecessary now.'
  4909. +
  4910. +# ---------------------------------------------------------------------------
  4911. +# Firmware install
  4912. +INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware
  4913. +export INSTALL_FW_PATH
  4914. +
  4915. +PHONY += firmware_install
  4916. +firmware_install: FORCE
  4917. + @mkdir -p $(objtree)/firmware
  4918. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install
  4919. +
  4920. +# ---------------------------------------------------------------------------
  4921. +# Kernel headers
  4922. +
  4923. +#Default location for installed headers
  4924. +export INSTALL_HDR_PATH = $(objtree)/usr
  4925. +
  4926. +hdr-inst := -rR -f $(srctree)/scripts/Makefile.headersinst obj
  4927. +
  4928. +# If we do an all arch process set dst to asm-$(hdr-arch)
  4929. +hdr-dst = $(if $(KBUILD_HEADERS), dst=include/asm-$(hdr-arch), dst=include/asm)
  4930. +
  4931. +PHONY += __headers
  4932. +__headers: include/linux/version.h scripts_basic FORCE
  4933. + $(Q)$(MAKE) $(build)=scripts scripts/unifdef
  4934. +
  4935. +PHONY += headers_install_all
  4936. +headers_install_all:
  4937. + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/headers.sh install
  4938. +
  4939. +PHONY += headers_install
  4940. +headers_install: __headers
  4941. + $(if $(wildcard $(srctree)/arch/$(hdr-arch)/include/asm/Kbuild),, \
  4942. + $(error Headers not exportable for the $(SRCARCH) architecture))
  4943. + $(Q)$(MAKE) $(hdr-inst)=include
  4944. + $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/asm $(hdr-dst)
  4945. +
  4946. +PHONY += headers_check_all
  4947. +headers_check_all: headers_install_all
  4948. + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/headers.sh check
  4949. +
  4950. +PHONY += headers_check
  4951. +headers_check: headers_install
  4952. + $(Q)$(MAKE) $(hdr-inst)=include HDRCHECK=1
  4953. + $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/asm $(hdr-dst) HDRCHECK=1
  4954. +
  4955. +# ---------------------------------------------------------------------------
  4956. +# Modules
  4957. +
  4958. +ifdef CONFIG_MODULES
  4959. +
  4960. +# By default, build modules as well
  4961. +
  4962. +all: modules
  4963. +
  4964. +# Build modules
  4965. +#
  4966. +# A module can be listed more than once in obj-m resulting in
  4967. +# duplicate lines in modules.order files. Those are removed
  4968. +# using awk while concatenating to the final file.
  4969. +
  4970. +PHONY += modules
  4971. +modules: $(vmlinux-dirs) $(if $(KBUILD_BUILTIN),vmlinux) modules.builtin
  4972. + $(Q)$(AWK) '!x[$$0]++' $(vmlinux-dirs:%=$(objtree)/%/modules.order) > $(objtree)/modules.order
  4973. + @$(kecho) ' Building modules, stage 2.';
  4974. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
  4975. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_modbuild
  4976. +
  4977. +modules.builtin: $(vmlinux-dirs:%=%/modules.builtin)
  4978. + $(Q)$(AWK) '!x[$$0]++' $^ > $(objtree)/modules.builtin
  4979. +
  4980. +%/modules.builtin: include/config/auto.conf
  4981. + $(Q)$(MAKE) $(modbuiltin)=$*
  4982. +
  4983. +
  4984. +# Target to prepare building external modules
  4985. +PHONY += modules_prepare
  4986. +modules_prepare: prepare scripts
  4987. +
  4988. +# Target to install modules
  4989. +PHONY += modules_install
  4990. +modules_install: _modinst_ _modinst_post
  4991. +
  4992. +PHONY += _modinst_
  4993. +_modinst_:
  4994. + @if [ -z "`$(DEPMOD) -V 2>/dev/null | grep module-init-tools`" ]; then \
  4995. + echo "Warning: you may need to install module-init-tools"; \
  4996. + echo "See http://www.codemonkey.org.uk/docs/post-halloween-2.6.txt";\
  4997. + sleep 1; \
  4998. + fi
  4999. + @rm -rf $(MODLIB)/kernel
  5000. + @rm -f $(MODLIB)/source
  5001. + @mkdir -p $(MODLIB)/kernel
  5002. + @ln -s $(srctree) $(MODLIB)/source
  5003. + @if [ ! $(objtree) -ef $(MODLIB)/build ]; then \
  5004. + rm -f $(MODLIB)/build ; \
  5005. + ln -s $(objtree) $(MODLIB)/build ; \
  5006. + fi
  5007. + @cp -f $(objtree)/modules.order $(MODLIB)/
  5008. + @cp -f $(objtree)/modules.builtin $(MODLIB)/
  5009. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst
  5010. +
  5011. +# This depmod is only for convenience to give the initial
  5012. +# boot a modules.dep even before / is mounted read-write. However the
  5013. +# boot script depmod is the master version.
  5014. +PHONY += _modinst_post
  5015. +_modinst_post: _modinst_
  5016. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_modinst
  5017. + $(call cmd,depmod)
  5018. +
  5019. +else # CONFIG_MODULES
  5020. +
  5021. +# Modules not configured
  5022. +# ---------------------------------------------------------------------------
  5023. +
  5024. +modules modules_install: FORCE
  5025. + @echo
  5026. + @echo "The present kernel configuration has modules disabled."
  5027. + @echo "Type 'make config' and enable loadable module support."
  5028. + @echo "Then build a kernel with module support enabled."
  5029. + @echo
  5030. + @exit 1
  5031. +
  5032. +endif # CONFIG_MODULES
  5033. +
  5034. +###
  5035. +# Cleaning is done on three levels.
  5036. +# make clean Delete most generated files
  5037. +# Leave enough to build external modules
  5038. +# make mrproper Delete the current configuration, and all generated files
  5039. +# make distclean Remove editor backup files, patch leftover files and the like
  5040. +
  5041. +# Directories & files removed with 'make clean'
  5042. +CLEAN_DIRS += $(MODVERDIR)
  5043. +CLEAN_FILES += vmlinux System.map \
  5044. + .tmp_kallsyms* .tmp_version .tmp_vmlinux* .tmp_System.map
  5045. +
  5046. +# Directories & files removed with 'make mrproper'
  5047. +MRPROPER_DIRS += include/config usr/include include/generated
  5048. +MRPROPER_FILES += .config .config.old .version .old_version \
  5049. + include/linux/version.h \
  5050. + Module.symvers tags TAGS cscope*
  5051. +
  5052. +# clean - Delete most, but leave enough to build external modules
  5053. +#
  5054. +clean: rm-dirs := $(CLEAN_DIRS)
  5055. +clean: rm-files := $(CLEAN_FILES)
  5056. +clean-dirs := $(addprefix _clean_, . $(vmlinux-alldirs) Documentation)
  5057. +
  5058. +PHONY += $(clean-dirs) clean archclean
  5059. +$(clean-dirs):
  5060. + $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@)
  5061. +
  5062. +clean: archclean
  5063. +
  5064. +# mrproper - Delete all generated files, including .config
  5065. +#
  5066. +mrproper: rm-dirs := $(wildcard $(MRPROPER_DIRS))
  5067. +mrproper: rm-files := $(wildcard $(MRPROPER_FILES))
  5068. +mrproper-dirs := $(addprefix _mrproper_,Documentation/DocBook scripts)
  5069. +
  5070. +PHONY += $(mrproper-dirs) mrproper archmrproper
  5071. +$(mrproper-dirs):
  5072. + $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@)
  5073. +
  5074. +mrproper: clean archmrproper $(mrproper-dirs)
  5075. + $(call cmd,rmdirs)
  5076. + $(call cmd,rmfiles)
  5077. +
  5078. +# distclean
  5079. +#
  5080. +PHONY += distclean
  5081. +
  5082. +distclean: mrproper
  5083. + @find $(srctree) $(RCS_FIND_IGNORE) \
  5084. + \( -name '*.orig' -o -name '*.rej' -o -name '*~' \
  5085. + -o -name '*.bak' -o -name '#*#' -o -name '.*.orig' \
  5086. + -o -name '.*.rej' -o -size 0 \
  5087. + -o -name '*%' -o -name '.*.cmd' -o -name 'core' \) \
  5088. + -type f -print | xargs rm -f
  5089. +
  5090. +
  5091. +# Packaging of the kernel to various formats
  5092. +# ---------------------------------------------------------------------------
  5093. +# rpm target kept for backward compatibility
  5094. +package-dir := $(srctree)/scripts/package
  5095. +
  5096. +%src-pkg: FORCE
  5097. + $(Q)$(MAKE) $(build)=$(package-dir) $@
  5098. +%pkg: include/config/kernel.release FORCE
  5099. + $(Q)$(MAKE) $(build)=$(package-dir) $@
  5100. +rpm: include/config/kernel.release FORCE
  5101. + $(Q)$(MAKE) $(build)=$(package-dir) $@
  5102. +
  5103. +
  5104. +# Brief documentation of the typical targets used
  5105. +# ---------------------------------------------------------------------------
  5106. +
  5107. +boards := $(wildcard $(srctree)/arch/$(SRCARCH)/configs/*_defconfig)
  5108. +boards := $(notdir $(boards))
  5109. +board-dirs := $(dir $(wildcard $(srctree)/arch/$(SRCARCH)/configs/*/*_defconfig))
  5110. +board-dirs := $(sort $(notdir $(board-dirs:/=)))
  5111. +
  5112. +help:
  5113. + @echo 'Cleaning targets:'
  5114. + @echo ' clean - Remove most generated files but keep the config and'
  5115. + @echo ' enough build support to build external modules'
  5116. + @echo ' mrproper - Remove all generated files + config + various backup files'
  5117. + @echo ' distclean - mrproper + remove editor backup and patch files'
  5118. + @echo ''
  5119. + @echo 'Configuration targets:'
  5120. + @$(MAKE) -f $(srctree)/scripts/kconfig/Makefile help
  5121. + @echo ''
  5122. + @echo 'Other generic targets:'
  5123. + @echo ' all - Build all targets marked with [*]'
  5124. + @echo '* vmlinux - Build the bare kernel'
  5125. + @echo '* modules - Build all modules'
  5126. + @echo ' modules_install - Install all modules to INSTALL_MOD_PATH (default: /)'
  5127. + @echo ' firmware_install- Install all firmware to INSTALL_FW_PATH'
  5128. + @echo ' (default: $$(INSTALL_MOD_PATH)/lib/firmware)'
  5129. + @echo ' dir/ - Build all files in dir and below'
  5130. + @echo ' dir/file.[oisS] - Build specified target only'
  5131. + @echo ' dir/file.lst - Build specified mixed source/assembly target only'
  5132. + @echo ' (requires a recent binutils and recent build (System.map))'
  5133. + @echo ' dir/file.ko - Build module including final link'
  5134. + @echo ' modules_prepare - Set up for building external modules'
  5135. + @echo ' tags/TAGS - Generate tags file for editors'
  5136. + @echo ' cscope - Generate cscope index'
  5137. + @echo ' kernelrelease - Output the release version string'
  5138. + @echo ' kernelversion - Output the version stored in Makefile'
  5139. + @echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \
  5140. + echo ' (default: $(INSTALL_HDR_PATH))'; \
  5141. + echo ''
  5142. + @echo 'Static analysers'
  5143. + @echo ' checkstack - Generate a list of stack hogs'
  5144. + @echo ' namespacecheck - Name space analysis on compiled kernel'
  5145. + @echo ' versioncheck - Sanity check on version.h usage'
  5146. + @echo ' includecheck - Check for duplicate included header files'
  5147. + @echo ' export_report - List the usages of all exported symbols'
  5148. + @echo ' headers_check - Sanity check on exported headers'
  5149. + @echo ' headerdep - Detect inclusion cycles in headers'
  5150. + @$(MAKE) -f $(srctree)/scripts/Makefile.help checker-help
  5151. + @echo ''
  5152. + @echo 'Kernel packaging:'
  5153. + @$(MAKE) $(build)=$(package-dir) help
  5154. + @echo ''
  5155. + @echo 'Documentation targets:'
  5156. + @$(MAKE) -f $(srctree)/Documentation/DocBook/Makefile dochelp
  5157. + @echo ''
  5158. + @echo 'Architecture specific targets ($(SRCARCH)):'
  5159. + @$(if $(archhelp),$(archhelp),\
  5160. + echo ' No architecture specific help defined for $(SRCARCH)')
  5161. + @echo ''
  5162. + @$(if $(boards), \
  5163. + $(foreach b, $(boards), \
  5164. + printf " %-24s - Build for %s\\n" $(b) $(subst _defconfig,,$(b));) \
  5165. + echo '')
  5166. + @$(if $(board-dirs), \
  5167. + $(foreach b, $(board-dirs), \
  5168. + printf " %-16s - Show %s-specific targets\\n" help-$(b) $(b);) \
  5169. + printf " %-16s - Show all of the above\\n" help-boards; \
  5170. + echo '')
  5171. +
  5172. + @echo ' make V=0|1 [targets] 0 => quiet build (default), 1 => verbose build'
  5173. + @echo ' make V=2 [targets] 2 => give reason for rebuild of target'
  5174. + @echo ' make O=dir [targets] Locate all output files in "dir", including .config'
  5175. + @echo ' make C=1 [targets] Check all c source with $$CHECK (sparse by default)'
  5176. + @echo ' make C=2 [targets] Force check of all c source with $$CHECK'
  5177. + @echo ''
  5178. + @echo 'Execute "make" or "make all" to build all targets marked with [*] '
  5179. + @echo 'For further info see the ./README file'
  5180. +
  5181. +
  5182. +help-board-dirs := $(addprefix help-,$(board-dirs))
  5183. +
  5184. +help-boards: $(help-board-dirs)
  5185. +
  5186. +boards-per-dir = $(notdir $(wildcard $(srctree)/arch/$(SRCARCH)/configs/$*/*_defconfig))
  5187. +
  5188. +$(help-board-dirs): help-%:
  5189. + @echo 'Architecture specific targets ($(SRCARCH) $*):'
  5190. + @$(if $(boards-per-dir), \
  5191. + $(foreach b, $(boards-per-dir), \
  5192. + printf " %-24s - Build for %s\\n" $*/$(b) $(subst _defconfig,,$(b));) \
  5193. + echo '')
  5194. +
  5195. +
  5196. +# Documentation targets
  5197. +# ---------------------------------------------------------------------------
  5198. +%docs: scripts_basic FORCE
  5199. + $(Q)$(MAKE) $(build)=Documentation/DocBook $@
  5200. +
  5201. +else # KBUILD_EXTMOD
  5202. +
  5203. +###
  5204. +# External module support.
  5205. +# When building external modules, the kernel used as the basis is
  5206. +# considered read-only: no consistency checks are made and the make
  5207. +# system is not used on the basis kernel. If updates are required
  5208. +# in the basis kernel, ordinary make commands (without M=...) must
  5209. +# be used.
  5210. +#
  5211. +# The following are the only valid targets when building external
  5212. +# modules.
  5213. +# make M=dir clean Delete all automatically generated files
  5214. +# make M=dir modules Make all modules in specified dir
  5215. +# make M=dir Same as 'make M=dir modules'
  5216. +# make M=dir modules_install
  5217. +# Install the modules built in the module directory
  5218. +# Assumes install directory is already created
  5219. +
  5220. +# We are always building modules
  5221. +KBUILD_MODULES := 1
  5222. +PHONY += crmodverdir
  5223. +crmodverdir:
  5224. + $(cmd_crmodverdir)
  5225. +
  5226. +PHONY += $(objtree)/Module.symvers
  5227. +$(objtree)/Module.symvers:
  5228. + @test -e $(objtree)/Module.symvers || ( \
  5229. + echo; \
  5230. + echo " WARNING: Symbol version dump $(objtree)/Module.symvers"; \
  5231. + echo " is missing; modules will have no dependencies and modversions."; \
  5232. + echo )
  5233. +
  5234. +module-dirs := $(addprefix _module_,$(KBUILD_EXTMOD))
  5235. +PHONY += $(module-dirs) modules
  5236. +$(module-dirs): crmodverdir $(objtree)/Module.symvers
  5237. + $(Q)$(MAKE) $(build)=$(patsubst _module_%,%,$@)
  5238. +
  5239. +modules: $(module-dirs)
  5240. + @$(kecho) ' Building modules, stage 2.';
  5241. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
  5242. +
  5243. +PHONY += modules_install
  5244. +modules_install: _emodinst_ _emodinst_post
  5245. +
  5246. +install-dir := $(if $(INSTALL_MOD_DIR),$(INSTALL_MOD_DIR),extra)
  5247. +PHONY += _emodinst_
  5248. +_emodinst_:
  5249. + $(Q)mkdir -p $(MODLIB)/$(install-dir)
  5250. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst
  5251. +
  5252. +PHONY += _emodinst_post
  5253. +_emodinst_post: _emodinst_
  5254. + $(call cmd,depmod)
  5255. +
  5256. +clean-dirs := $(addprefix _clean_,$(KBUILD_EXTMOD))
  5257. +
  5258. +PHONY += $(clean-dirs) clean
  5259. +$(clean-dirs):
  5260. + $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@)
  5261. +
  5262. +clean: rm-dirs := $(MODVERDIR)
  5263. +clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers
  5264. +
  5265. +help:
  5266. + @echo ' Building external modules.'
  5267. + @echo ' Syntax: make -C path/to/kernel/src M=$$PWD target'
  5268. + @echo ''
  5269. + @echo ' modules - default target, build the module(s)'
  5270. + @echo ' modules_install - install the module'
  5271. + @echo ' clean - remove generated files in module directory only'
  5272. + @echo ''
  5273. +
  5274. +# Dummies...
  5275. +PHONY += prepare scripts
  5276. +prepare: ;
  5277. +scripts: ;
  5278. +endif # KBUILD_EXTMOD
  5279. +
  5280. +clean: $(clean-dirs)
  5281. + $(call cmd,rmdirs)
  5282. + $(call cmd,rmfiles)
  5283. + @find $(or $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \
  5284. + \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \
  5285. + -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
  5286. + -o -name '*.symtypes' -o -name 'modules.order' \
  5287. + -o -name modules.builtin -o -name '.tmp_*.o.*' \
  5288. + -o -name '*.gcno' \) -type f -print | xargs rm -f
  5289. +
  5290. +# Generate tags for editors
  5291. +# ---------------------------------------------------------------------------
  5292. +quiet_cmd_tags = GEN $@
  5293. + cmd_tags = $(CONFIG_SHELL) $(srctree)/scripts/tags.sh $@
  5294. +
  5295. +tags TAGS cscope: FORCE
  5296. + $(call cmd,tags)
  5297. +
  5298. +# Scripts to check various things for consistency
  5299. +# ---------------------------------------------------------------------------
  5300. +
  5301. +includecheck:
  5302. + find * $(RCS_FIND_IGNORE) \
  5303. + -name '*.[hcS]' -type f -print | sort \
  5304. + | xargs $(PERL) -w $(srctree)/scripts/checkincludes.pl
  5305. +
  5306. +versioncheck:
  5307. + find * $(RCS_FIND_IGNORE) \
  5308. + -name '*.[hcS]' -type f -print | sort \
  5309. + | xargs $(PERL) -w $(srctree)/scripts/checkversion.pl
  5310. +
  5311. +coccicheck:
  5312. + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/$@
  5313. +
  5314. +namespacecheck:
  5315. + $(PERL) $(srctree)/scripts/namespace.pl
  5316. +
  5317. +export_report:
  5318. + $(PERL) $(srctree)/scripts/export_report.pl
  5319. +
  5320. +endif #ifeq ($(config-targets),1)
  5321. +endif #ifeq ($(mixed-targets),1)
  5322. +
  5323. +PHONY += checkstack kernelrelease kernelversion
  5324. +
  5325. +# UML needs a little special treatment here. It wants to use the host
  5326. +# toolchain, so needs $(SUBARCH) passed to checkstack.pl. Everyone
  5327. +# else wants $(ARCH), including people doing cross-builds, which means
  5328. +# that $(SUBARCH) doesn't work here.
  5329. +ifeq ($(ARCH), um)
  5330. +CHECKSTACK_ARCH := $(SUBARCH)
  5331. +else
  5332. +CHECKSTACK_ARCH := $(ARCH)
  5333. +endif
  5334. +checkstack:
  5335. + $(OBJDUMP) -d vmlinux $$(find . -name '*.ko') | \
  5336. + $(PERL) $(src)/scripts/checkstack.pl $(CHECKSTACK_ARCH)
  5337. +
  5338. +kernelrelease:
  5339. + @echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))"
  5340. +
  5341. +kernelversion:
  5342. + @echo $(KERNELVERSION)
  5343. +
  5344. +# Single targets
  5345. +# ---------------------------------------------------------------------------
  5346. +# Single targets are compatible with:
  5347. +# - build with mixed source and output
  5348. +# - build with separate output dir 'make O=...'
  5349. +# - external modules
  5350. +#
  5351. +# target-dir => where to store outputfile
  5352. +# build-dir => directory in kernel source tree to use
  5353. +
  5354. +ifeq ($(KBUILD_EXTMOD),)
  5355. + build-dir = $(patsubst %/,%,$(dir $@))
  5356. + target-dir = $(dir $@)
  5357. +else
  5358. + zap-slash=$(filter-out .,$(patsubst %/,%,$(dir $@)))
  5359. + build-dir = $(KBUILD_EXTMOD)$(if $(zap-slash),/$(zap-slash))
  5360. + target-dir = $(if $(KBUILD_EXTMOD),$(dir $<),$(dir $@))
  5361. +endif
  5362. +
  5363. +%.s: %.c prepare scripts FORCE
  5364. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5365. +%.i: %.c prepare scripts FORCE
  5366. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5367. +%.o: %.c prepare scripts FORCE
  5368. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5369. +%.lst: %.c prepare scripts FORCE
  5370. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5371. +%.s: %.S prepare scripts FORCE
  5372. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5373. +%.o: %.S prepare scripts FORCE
  5374. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5375. +%.symtypes: %.c prepare scripts FORCE
  5376. + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
  5377. +
  5378. +# Modules
  5379. +/: prepare scripts FORCE
  5380. + $(cmd_crmodverdir)
  5381. + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
  5382. + $(build)=$(build-dir)
  5383. +%/: prepare scripts FORCE
  5384. + $(cmd_crmodverdir)
  5385. + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
  5386. + $(build)=$(build-dir)
  5387. +%.ko: prepare scripts FORCE
  5388. + $(cmd_crmodverdir)
  5389. + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
  5390. + $(build)=$(build-dir) $(@:.ko=.o)
  5391. + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
  5392. +
  5393. +# FIXME Should go into a make.lib or something
  5394. +# ===========================================================================
  5395. +
  5396. +quiet_cmd_rmdirs = $(if $(wildcard $(rm-dirs)),CLEAN $(wildcard $(rm-dirs)))
  5397. + cmd_rmdirs = rm -rf $(rm-dirs)
  5398. +
  5399. +quiet_cmd_rmfiles = $(if $(wildcard $(rm-files)),CLEAN $(wildcard $(rm-files)))
  5400. + cmd_rmfiles = rm -f $(rm-files)
  5401. +
  5402. +# Run depmod only if we have System.map and depmod is executable
  5403. +quiet_cmd_depmod = DEPMOD $(KERNELRELEASE)
  5404. + cmd_depmod = \
  5405. + if [ -r System.map -a -x $(DEPMOD) ]; then \
  5406. + $(DEPMOD) -ae -F System.map \
  5407. + $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) ) \
  5408. + $(KERNELRELEASE); \
  5409. + fi
  5410. +
  5411. +# Create temporary dir for module support files
  5412. +# clean it up only when building all modules
  5413. +cmd_crmodverdir = $(Q)mkdir -p $(MODVERDIR) \
  5414. + $(if $(KBUILD_MODULES),; rm -f $(MODVERDIR)/*)
  5415. +
  5416. +a_flags = -Wp,-MD,$(depfile) $(KBUILD_AFLAGS) $(AFLAGS_KERNEL) \
  5417. + $(KBUILD_AFLAGS_KERNEL) \
  5418. + $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(KBUILD_CPPFLAGS) \
  5419. + $(modkern_aflags) $(EXTRA_AFLAGS) $(AFLAGS_$(basetarget).o)
  5420. +
  5421. +quiet_cmd_as_o_S = AS $@
  5422. +cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $<
  5423. +
  5424. +# read all saved command lines
  5425. +
  5426. +targets := $(wildcard $(sort $(targets)))
  5427. +cmd_files := $(wildcard .*.cmd $(foreach f,$(targets),$(dir $(f)).$(notdir $(f)).cmd))
  5428. +
  5429. +ifneq ($(cmd_files),)
  5430. + $(cmd_files): ; # Do not try to update included dependency files
  5431. + include $(cmd_files)
  5432. +endif
  5433. +
  5434. +# Shorthand for $(Q)$(MAKE) -f scripts/Makefile.clean obj=dir
  5435. +# Usage:
  5436. +# $(Q)$(MAKE) $(clean)=dir
  5437. +clean := -f $(if $(KBUILD_SRC),$(srctree)/)scripts/Makefile.clean obj
  5438. +
  5439. +endif # skip-makefile
  5440. +
  5441. +PHONY += FORCE
  5442. +FORCE:
  5443. +
  5444. +# Declare the contents of the .PHONY variable as phony. We keep that
  5445. +# information in a variable so we can use it in if_changed and friends.
  5446. +.PHONY: $(PHONY)
  5447. diff -Nrupad linux-2.6.37//mm/cleancache.c linux-2.6.37_vanilla//mm/cleancache.c
  5448. --- linux-2.6.37//mm/cleancache.c 1970-01-01 01:00:00.000000000 +0100
  5449. +++ linux-2.6.37_vanilla//mm/cleancache.c 2011-02-14 01:21:43.171793147 +0100
  5450. @@ -0,0 +1,258 @@
  5451. +/*
  5452. + * Cleancache frontend
  5453. + *
  5454. + * This code provides the generic "frontend" layer to call a matching
  5455. + * "backend" driver implementation of cleancache. See
  5456. + * Documentation/vm/cleancache.txt for more information.
  5457. + *
  5458. + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
  5459. + * Author: Dan Magenheimer
  5460. + *
  5461. + * This work is licensed under the terms of the GNU GPL, version 2.
  5462. + */
  5463. +
  5464. +#include <linux/module.h>
  5465. +#include <linux/fs.h>
  5466. +#include <linux/exportfs.h>
  5467. +#include <linux/mm.h>
  5468. +#include <linux/cleancache.h>
  5469. +
  5470. +/*
  5471. + * This global enablement flag may be read thousands of times per second
  5472. + * by cleancache_get/put/flush even on systems where cleancache_ops
  5473. + * is not claimed (e.g. cleancache is config'ed on but remains
  5474. + * disabled), so it is preferred to the slower alternative: a function
  5475. + * call that checks a non-global.
  5476. + */
  5477. +int cleancache_enabled;
  5478. +EXPORT_SYMBOL(cleancache_enabled);
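The flag above is cheap precisely because the hooks test it inline before making any out-of-line call. A minimal sketch of that wrapper pattern follows, assuming the header pairs each exported __cleancache_* symbol with a thin inline; the wrapper name here is illustrative, not the patch's include/linux/cleancache.h verbatim:

    /*
     * Sketch only: the inline-wrapper pattern implied by the comment on
     * cleancache_enabled. The disabled case costs one branch on a global;
     * the real work stays out of line in __cleancache_get_page().
     */
    #include <linux/mm.h>
    #include <linux/cleancache.h>

    static inline int cleancache_get_page_inline(struct page *page)
    {
            int ret = -1;

            if (cleancache_enabled)
                    ret = __cleancache_get_page(page);
            return ret;
    }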
  5479. +
  5480. +/*
  5481. + * cleancache_ops is set by cleancache_ops_register to contain the pointers
  5482. + * to the cleancache "backend" implementation functions.
  5483. + */
  5484. +static struct cleancache_ops cleancache_ops;
  5485. +
  5486. +/* useful stats available in /sys/kernel/mm/cleancache */
  5487. +static unsigned long cleancache_succ_gets;
  5488. +static unsigned long cleancache_failed_gets;
  5489. +static unsigned long cleancache_puts;
  5490. +static unsigned long cleancache_flushes;
  5491. +
  5492. +/*
  5493. + * register operations for cleancache, returning the previous ops, thus allowing
  5494. + * detection of multiple backends and possible nesting
  5495. + */
  5496. +struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
  5497. +{
  5498. + struct cleancache_ops old = cleancache_ops;
  5499. +
  5500. + cleancache_ops = *ops;
  5501. + cleancache_enabled = 1;
  5502. + return old;
  5503. +}
  5504. +EXPORT_SYMBOL(cleancache_register_ops);
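To make the registration contract concrete, here is a hedged sketch of how a backend such as zcache might claim the ops. The member names match the function pointers this file dereferences; every my_backend_* symbol is a placeholder:

    #include <linux/module.h>
    #include <linux/cleancache.h>

    /* Hypothetical backend registration; my_backend_* are placeholders. */
    static struct cleancache_ops my_backend_ops = {
            .init_fs        = my_backend_init_fs,
            .init_shared_fs = my_backend_init_shared_fs,
            .get_page       = my_backend_get_page,
            .put_page       = my_backend_put_page,
            .flush_page     = my_backend_flush_page,
            .flush_inode    = my_backend_flush_inode,
            .flush_fs       = my_backend_flush_fs,
    };

    static int __init my_backend_init(void)
    {
            /* the previous ops are returned by value so a second backend
               can detect that someone registered before it */
            struct cleancache_ops old_ops =
                    cleancache_register_ops(&my_backend_ops);

            (void)old_ops;
            return 0;
    }
    module_init(my_backend_init);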
  5505. +
  5506. +/* Called by a cleancache-enabled filesystem at time of mount */
  5507. +void __cleancache_init_fs(struct super_block *sb)
  5508. +{
  5509. + sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
  5510. +}
  5511. +EXPORT_SYMBOL(__cleancache_init_fs);
  5512. +
  5513. +/* Called by a cleancache-enabled clustered filesystem at time of mount */
  5514. +void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
  5515. +{
  5516. + sb->cleancache_poolid =
  5517. + (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
  5518. +}
  5519. +EXPORT_SYMBOL(__cleancache_init_shared_fs);
  5520. +
  5521. +/*
  5522. + * If the filesystem uses exportable filehandles, use the filehandle as
  5523. + * the key, else use the inode number.
  5524. + */
  5525. +static int cleancache_get_key(struct inode *inode,
  5526. + struct cleancache_filekey *key)
  5527. +{
  5528. + int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
  5529. + int maxlen = CLEANCACHE_KEY_MAX;
  5530. + struct super_block *sb = inode->i_sb;
  5531. + struct dentry *d;
  5532. +
  5533. + key->u.ino = inode->i_ino;
  5534. + if (sb->s_export_op != NULL) {
  5535. + fhfn = sb->s_export_op->encode_fh;
  5536. + if (fhfn) {
  5537. + d = list_first_entry(&inode->i_dentry,
  5538. + struct dentry, d_alias);
  5539. + (void)(*fhfn)(d, &key->u.fh[0], &maxlen, 0);
  5540. + if (maxlen > CLEANCACHE_KEY_MAX)
  5541. + return -1;
  5542. + }
  5543. + }
  5544. + return 0;
  5545. +}
  5546. +
  5547. +/*
  5548. + * "Get" data from cleancache associated with the poolid/inode/index
  5549. + * that were specified when the data was put to cleancache and, if
  5550. + * successful, use it to fill the specified page with data and return 0.
  5551. + * The pageframe is left unchanged and -1 is returned if the get fails.
  5552. + * Page must be locked by caller.
  5553. + */
  5554. +int __cleancache_get_page(struct page *page)
  5555. +{
  5556. + int ret = -1;
  5557. + int pool_id;
  5558. + struct cleancache_filekey key = { .u.key = { 0 } };
  5559. +
  5560. + VM_BUG_ON(!PageLocked(page));
  5561. + pool_id = page->mapping->host->i_sb->cleancache_poolid;
  5562. + if (pool_id < 0)
  5563. + goto out;
  5564. +
  5565. + if (cleancache_get_key(page->mapping->host, &key) < 0)
  5566. + goto out;
  5567. +
  5568. + ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
  5569. + if (ret == 0)
  5570. + cleancache_succ_gets++;
  5571. + else
  5572. + cleancache_failed_gets++;
  5573. +out:
  5574. + return ret;
  5575. +}
  5576. +EXPORT_SYMBOL(__cleancache_get_page);
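For context, a hedged sketch of the consuming side: a read path asks cleancache for the page before paying for disk I/O. The function and hook placement are illustrative, not the exact page-cache hook this patch adds elsewhere:

    /* Illustrative caller: try the victim cache before real I/O. */
    static int readpage_with_cleancache(struct file *file, struct page *page)
    {
            /* page is locked here, as __cleancache_get_page() requires */
            if (cleancache_enabled && __cleancache_get_page(page) == 0) {
                    SetPageUptodate(page);  /* hit: filled from cleancache */
                    unlock_page(page);
                    return 0;
            }
            /* miss: fall back to the filesystem's normal readpage */
            return page->mapping->a_ops->readpage(file, page);
    }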
  5577. +
  5578. +/*
  5579. + * "Put" data from a page to cleancache and associate it with the
  5580. + * (previously-obtained per-filesystem) poolid and the page's
  5581. + * inode and page index. Page must be locked. Note that a put_page
  5582. + * always "succeeds", though a subsequent get_page may succeed or fail.
  5583. + */
  5584. +void __cleancache_put_page(struct page *page)
  5585. +{
  5586. + int pool_id;
  5587. + struct cleancache_filekey key = { .u.key = { 0 } };
  5588. +
  5589. + VM_BUG_ON(!PageLocked(page));
  5590. + pool_id = page->mapping->host->i_sb->cleancache_poolid;
  5591. + if (pool_id >= 0 &&
  5592. + cleancache_get_key(page->mapping->host, &key) >= 0) {
  5593. + (*cleancache_ops.put_page)(pool_id, key, page->index, page);
  5594. + cleancache_puts++;
  5595. + }
  5596. +}
  5597. +EXPORT_SYMBOL(__cleancache_put_page);
  5598. +
  5599. +/*
  5600. + * Flush any data from cleancache associated with the poolid and the
  5601. + * page's inode and page index so that a subsequent "get" will fail.
  5602. + */
  5603. +void __cleancache_flush_page(struct address_space *mapping, struct page *page)
  5604. +{
  5605. + /* careful... page->mapping is NULL sometimes when this is called */
  5606. + int pool_id = mapping->host->i_sb->cleancache_poolid;
  5607. + struct cleancache_filekey key = { .u.key = { 0 } };
  5608. +
  5609. + if (pool_id >= 0) {
  5610. + VM_BUG_ON(!PageLocked(page));
  5611. + if (cleancache_get_key(mapping->host, &key) >= 0) {
  5612. + (*cleancache_ops.flush_page)(pool_id, key, page->index);
  5613. + cleancache_flushes++;
  5614. + }
  5615. + }
  5616. +}
  5617. +EXPORT_SYMBOL(__cleancache_flush_page);
  5618. +
  5619. +/*
  5620. + * Flush all data from cleancache associated with the poolid and the
  5621. + * mapping's inode so that all subsequent gets to this poolid/inode
  5622. + * will fail.
  5623. + */
  5624. +void __cleancache_flush_inode(struct address_space *mapping)
  5625. +{
  5626. + int pool_id = mapping->host->i_sb->cleancache_poolid;
  5627. + struct cleancache_filekey key = { .u.key = { 0 } };
  5628. +
  5629. + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
  5630. + (*cleancache_ops.flush_inode)(pool_id, key);
  5631. +}
  5632. +EXPORT_SYMBOL(__cleancache_flush_inode);
  5633. +
  5634. +/*
  5635. + * Called by any cleancache-enabled filesystem at time of unmount;
  5636. + * note that pool_id is surrendered and may be returned by a subsequent
  5637. + * cleancache_init_fs or cleancache_init_shared_fs
  5638. + */
  5639. +void __cleancache_flush_fs(struct super_block *sb)
  5640. +{
  5641. + if (sb->cleancache_poolid >= 0) {
  5642. + int old_poolid = sb->cleancache_poolid;
  5643. + sb->cleancache_poolid = -1;
  5644. + (*cleancache_ops.flush_fs)(old_poolid);
  5645. + }
  5646. +}
  5647. +EXPORT_SYMBOL(__cleancache_flush_fs);
  5648. +
  5649. +#ifdef CONFIG_SYSFS
  5650. +
  5651. +/* see Documentation/ABI/testing/sysfs-kernel-mm-cleancache */
  5652. +
  5653. +#define CLEANCACHE_ATTR_RO(_name) \
  5654. + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  5655. +
  5656. +static ssize_t cleancache_succ_gets_show(struct kobject *kobj,
  5657. + struct kobj_attribute *attr, char *buf)
  5658. +{
  5659. + return sprintf(buf, "%lu\n", cleancache_succ_gets);
  5660. +}
  5661. +CLEANCACHE_ATTR_RO(cleancache_succ_gets);
  5662. +
  5663. +static ssize_t cleancache_failed_gets_show(struct kobject *kobj,
  5664. + struct kobj_attribute *attr, char *buf)
  5665. +{
  5666. + return sprintf(buf, "%lu\n", cleancache_failed_gets);
  5667. +}
  5668. +CLEANCACHE_ATTR_RO(cleancache_failed_gets);
  5669. +
  5670. +static ssize_t cleancache_puts_show(struct kobject *kobj,
  5671. + struct kobj_attribute *attr, char *buf)
  5672. +{
  5673. + return sprintf(buf, "%lu\n", cleancache_puts);
  5674. +}
  5675. +CLEANCACHE_ATTR_RO(cleancache_puts);
  5676. +
  5677. +static ssize_t cleancache_flushes_show(struct kobject *kobj,
  5678. + struct kobj_attribute *attr, char *buf)
  5679. +{
  5680. + return sprintf(buf, "%lu\n", cleancache_flushes);
  5681. +}
  5682. +CLEANCACHE_ATTR_RO(cleancache_flushes);
  5683. +
  5684. +static struct attribute *cleancache_attrs[] = {
  5685. + &cleancache_succ_gets_attr.attr,
  5686. + &cleancache_failed_gets_attr.attr,
  5687. + &cleancache_puts_attr.attr,
  5688. + &cleancache_flushes_attr.attr,
  5689. + NULL,
  5690. +};
  5691. +
  5692. +static struct attribute_group cleancache_attr_group = {
  5693. + .attrs = cleancache_attrs,
  5694. + .name = "cleancache",
  5695. +};
  5696. +
  5697. +#endif /* CONFIG_SYSFS */
  5698. +
  5699. +static int __init init_cleancache(void)
  5700. +{
  5701. +#ifdef CONFIG_SYSFS
  5702. + int err;
  5703. +
  5704. + err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
  5705. +#endif /* CONFIG_SYSFS */
  5706. + return 0;
  5707. +}
  5708. +module_init(init_cleancache);
  5709. diff -Nrupad linux-2.6.37//mm/filemap.c linux-2.6.37_vanilla//mm/filemap.c
  5710. --- linux-2.6.37//mm/filemap.c 2011-01-05 01:50:19.000000000 +0100
  5711. +++ linux-2.6.37_vanilla//mm/filemap.c 2011-02-14 01:21:43.172793144 +0100
  5712. @@ -34,6 +34,7 @@
  5713. #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
  5714. #include <linux/memcontrol.h>
  5715. #include <linux/mm_inline.h> /* for page_is_file_cache() */
  5716. +#include <linux/cleancache.h>
  5717. #include "internal.h"
  5718.  
  5719. /*
  5720. @@ -119,6 +120,16 @@ void __remove_from_page_cache(struct pag
  5721. {
  5722. struct address_space *mapping = page->mapping;
  5723.  
  5724. + /*
  5725. + * if we're uptodate, copy the page out to cleancache; otherwise
  5726. + * invalidate any existing cleancache entries. We can't leave
  5727. + * stale data around in the cleancache once our page is gone
  5728. + */
  5729. + if (PageUptodate(page))
  5730. + cleancache_put_page(page);
  5731. + else
  5732. + cleancache_flush_page(mapping, page);
  5733. +
  5734. radix_tree_delete(&mapping->page_tree, page->index);
  5735. page->mapping = NULL;
  5736. mapping->nrpages--;
  5737. diff -Nrupad linux-2.6.37//mm/frontswap.c linux-2.6.37_vanilla//mm/frontswap.c
  5738. --- linux-2.6.37//mm/frontswap.c 1970-01-01 01:00:00.000000000 +0100
  5739. +++ linux-2.6.37_vanilla//mm/frontswap.c 2011-02-14 01:21:43.172793144 +0100
  5740. @@ -0,0 +1,331 @@
  5741. +/*
  5742. + * Frontswap frontend
  5743. + *
  5744. + * This code provides the generic "frontend" layer to call a matching
  5745. + * "backend" driver implementation of frontswap. See
  5746. + * Documentation/vm/frontswap.txt for more information.
  5747. + *
  5748. + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
  5749. + * Author: Dan Magenheimer
  5750. + *
  5751. + * This work is licensed under the terms of the GNU GPL, version 2.
  5752. + */
  5753. +
  5754. +#include <linux/mm.h>
  5755. +#include <linux/mman.h>
  5756. +#include <linux/sysctl.h>
  5757. +#include <linux/swap.h>
  5758. +#include <linux/swapops.h>
  5759. +#include <linux/proc_fs.h>
  5760. +#include <linux/security.h>
  5761. +#include <linux/capability.h>
  5762. +#include <linux/module.h>
  5763. +#include <linux/uaccess.h>
  5764. +#include <linux/frontswap.h>
  5765. +#include <linux/swapfile.h>
  5766. +
  5767. +/*
  5768. + * frontswap_ops is set by frontswap_register_ops to contain the pointers
  5769. + * to the frontswap "backend" implementation functions.
  5770. + */
  5771. +static struct frontswap_ops frontswap_ops;
  5772. +
  5773. +/*
  5774. + * This global enablement flag reduces overhead on systems where frontswap_ops
  5775. + * has not been registered, so is preferred to the slower alternative: a
  5776. + * function call that checks a non-global.
  5777. + */
  5778. +int frontswap_enabled;
  5779. +EXPORT_SYMBOL(frontswap_enabled);
  5780. +
  5781. +/* useful stats available in /sys/kernel/mm/frontswap */
  5782. +static unsigned long frontswap_gets;
  5783. +static unsigned long frontswap_succ_puts;
  5784. +static unsigned long frontswap_failed_puts;
  5785. +static unsigned long frontswap_flushes;
  5786. +
  5787. +/*
  5788. + * register operations for frontswap, returning the previous ops to allow
  5789. + * detection of multiple backends and possible nesting
  5790. + */
  5791. +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
  5792. +{
  5793. + struct frontswap_ops old = frontswap_ops;
  5794. +
  5795. + frontswap_ops = *ops;
  5796. + frontswap_enabled = 1;
  5797. + return old;
  5798. +}
  5799. +EXPORT_SYMBOL(frontswap_register_ops);
  5800. +
  5801. +/* Called when a swap device is swapon'd */
  5802. +void frontswap_init(unsigned type)
  5803. +{
  5804. + if (frontswap_enabled)
  5805. + (*frontswap_ops.init)(type);
  5806. +}
  5807. +EXPORT_SYMBOL(frontswap_init);
  5808. +
  5809. +/*
  5810. + * "Put" data from a page to frontswap and associate it with the page's
  5811. + * swaptype and offset. Page must be locked and in the swap cache.
  5812. + * If frontswap already contains a page with matching swaptype and
  5813. + * offset, the frontswap implementation may either overwrite the data
  5814. + * and return success or flush the page from frontswap and return failure.
  5815. + */
  5816. +int __frontswap_put_page(struct page *page)
  5817. +{
  5818. + int ret = -1, dup = 0;
  5819. + swp_entry_t entry = { .val = page_private(page), };
  5820. + int type = swp_type(entry);
  5821. + struct swap_info_struct *sis = swap_info[type];
  5822. + pgoff_t offset = swp_offset(entry);
  5823. +
  5824. + BUG_ON(!PageLocked(page));
  5825. + if (frontswap_test(sis, offset))
  5826. + dup = 1;
  5827. + ret = (*frontswap_ops.put_page)(type, offset, page);
  5828. + if (ret == 0) {
  5829. + frontswap_set(sis, offset);
  5830. + frontswap_succ_puts++;
  5831. + if (!dup)
  5832. + sis->frontswap_pages++;
  5833. + } else if (dup) {
  5834. + /*
  5835. + * a failed dup always results in an automatic flush of
  5836. + * the (older) page from frontswap
  5837. + */
  5838. + frontswap_clear(sis, offset);
  5839. + sis->frontswap_pages--;
  5840. + frontswap_failed_puts++;
  5841. + } else
  5842. + frontswap_failed_puts++;
  5843. + return ret;
  5844. +}
  5845. +
  5846. +/*
  5847. + * "Get" data from frontswap associated with swaptype and offset that were
  5848. + * specified when the data was put to frontswap and use it to fill the
  5849. + * specified page with data. Page must be locked and in the swap cache.
  5850. + */
  5851. +int __frontswap_get_page(struct page *page)
  5852. +{
  5853. + int ret = -1;
  5854. + swp_entry_t entry = { .val = page_private(page), };
  5855. + int type = swp_type(entry);
  5856. + struct swap_info_struct *sis = swap_info[type];
  5857. + pgoff_t offset = swp_offset(entry);
  5858. +
  5859. + BUG_ON(!PageLocked(page));
  5860. + if (frontswap_test(sis, offset))
  5861. + ret = (*frontswap_ops.get_page)(type, offset, page);
  5862. + if (ret == 0)
  5863. + frontswap_gets++;
  5864. + return ret;
  5865. +}
  5866. +
  5867. +/*
  5868. + * Flush any data from frontswap associated with the specified swaptype
  5869. + * and offset so that a subsequent "get" will fail.
  5870. + */
  5871. +void __frontswap_flush_page(unsigned type, pgoff_t offset)
  5872. +{
  5873. + struct swap_info_struct *sis = swap_info[type];
  5874. +
  5875. + if (frontswap_test(sis, offset)) {
  5876. + (*frontswap_ops.flush_page)(type, offset);
  5877. + sis->frontswap_pages--;
  5878. + frontswap_clear(sis, offset);
  5879. + frontswap_flushes++;
  5880. + }
  5881. +}
  5882. +
  5883. +/*
  5884. + * Flush all data from frontswap associated with all offsets for the
  5885. + * specified swaptype.
  5886. + */
  5887. +void __frontswap_flush_area(unsigned type)
  5888. +{
  5889. + struct swap_info_struct *sis = swap_info[type];
  5890. +
  5891. + (*frontswap_ops.flush_area)(type);
  5892. + sis->frontswap_pages = 0;
  5893. + memset(sis->frontswap_map, 0, sis->max / sizeof(long));
  5894. +}
  5895. +
  5896. +/*
  5897. + * Frontswap, like a true swap device, may unnecessarily retain pages
  5898. + * under certain circumstances; "shrink" frontswap is essentially a
  5899. + * "partial swapoff" and works by calling try_to_unuse to attempt to
  5900. + * unuse enough frontswap pages to attempt to -- subject to memory
  5901. + * constraints -- reduce the number of pages in frontswap
  5902. + */
  5903. +void frontswap_shrink(unsigned long target_pages)
  5904. +{
  5905. + int wrapped = 0;
  5906. + bool locked = false;
  5907. +
  5908. + for (wrapped = 0; wrapped <= 3; wrapped++) {
  5909. +
  5910. + struct swap_info_struct *si = NULL;
  5911. + unsigned long total_pages = 0, total_pages_to_unuse;
  5912. + unsigned long pages = 0, unuse_pages = 0;
  5913. + int type;
  5914. +
  5915. + /*
  5916. + * we don't want to hold swap_lock while doing a very
  5917. + * lengthy try_to_unuse, but swap_list may change
  5918. + * so restart scan from swap_list.head each time
  5919. + */
  5920. + spin_lock(&swap_lock);
  5921. + locked = true;
  5922. + total_pages = 0;
  5923. + for (type = swap_list.head; type >= 0; type = si->next) {
  5924. + si = swap_info[type];
  5925. + total_pages += si->frontswap_pages;
  5926. + }
  5927. + if (total_pages <= target_pages)
  5928. + goto out;
  5929. + total_pages_to_unuse = total_pages - target_pages;
  5930. + for (type = swap_list.head; type >= 0; type = si->next) {
  5931. + si = swap_info[type];
  5932. + if (total_pages_to_unuse < si->frontswap_pages)
  5933. + pages = unuse_pages = total_pages_to_unuse;
  5934. + else {
  5935. + pages = si->frontswap_pages;
  5936. + unuse_pages = 0; /* unuse all */
  5937. + }
  5938. + if (security_vm_enough_memory_kern(pages))
  5939. + continue;
  5940. + vm_unacct_memory(pages);
  5941. + break;
  5942. + }
  5943. + if (type < 0)
  5944. + goto out;
  5945. + locked = false;
  5946. + spin_unlock(&swap_lock);
  5947. + try_to_unuse(type, true, unuse_pages);
  5948. + }
  5949. +
  5950. +out:
  5951. + if (locked)
  5952. + spin_unlock(&swap_lock);
  5953. + return;
  5954. +}
  5955. +EXPORT_SYMBOL(frontswap_shrink);
  5956. +
  5957. +/*
  5958. + * count and return the number of frontswap pages across all
  5959. + * swap devices. This is exported so that a kernel module can
  5960. + * determine current usage without reading sysfs.
  5961. + */
  5962. +unsigned long frontswap_curr_pages(void)
  5963. +{
  5964. + int type;
  5965. + unsigned long totalpages = 0;
  5966. + struct swap_info_struct *si = NULL;
  5967. +
  5968. + spin_lock(&swap_lock);
  5969. + for (type = swap_list.head; type >= 0; type = si->next) {
  5970. + si = swap_info[type];
  5971. + totalpages += si->frontswap_pages;
  5972. + }
  5973. + spin_unlock(&swap_lock);
  5974. + return totalpages;
  5975. +}
  5976. +EXPORT_SYMBOL(frontswap_curr_pages);
  5977. +
  5978. +#ifdef CONFIG_SYSFS
  5979. +
  5980. +/* see Documentation/ABI/testing/sysfs-kernel-mm-frontswap */
  5981. +
  5982. +#define FRONTSWAP_ATTR_RO(_name) \
  5983. + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  5984. +#define FRONTSWAP_ATTR(_name) \
  5985. + static struct kobj_attribute _name##_attr = \
  5986. + __ATTR(_name, 0644, _name##_show, _name##_store)
  5987. +
  5988. +static ssize_t curr_pages_show(struct kobject *kobj,
  5989. + struct kobj_attribute *attr, char *buf)
  5990. +{
  5991. + return sprintf(buf, "%lu\n", frontswap_curr_pages());
  5992. +}
  5993. +
  5994. +static ssize_t curr_pages_store(struct kobject *kobj,
  5995. + struct kobj_attribute *attr,
  5996. + const char *buf, size_t count)
  5997. +{
  5998. + unsigned long target_pages;
  5999. + int err;
  6000. +
  6001. + err = strict_strtoul(buf, 10, &target_pages);
  6002. + if (err)
  6003. + return -EINVAL;
  6004. +
  6005. + frontswap_shrink(target_pages);
  6006. +
  6007. + return count;
  6008. +}
  6009. +FRONTSWAP_ATTR(curr_pages);
  6010. +
  6011. +static ssize_t succ_puts_show(struct kobject *kobj,
  6012. + struct kobj_attribute *attr, char *buf)
  6013. +{
  6014. + return sprintf(buf, "%lu\n", frontswap_succ_puts);
  6015. +}
  6016. +FRONTSWAP_ATTR_RO(succ_puts);
  6017. +
  6018. +static ssize_t failed_puts_show(struct kobject *kobj,
  6019. + struct kobj_attribute *attr, char *buf)
  6020. +{
  6021. + return sprintf(buf, "%lu\n", frontswap_failed_puts);
  6022. +}
  6023. +FRONTSWAP_ATTR_RO(failed_puts);
  6024. +
  6025. +static ssize_t gets_show(struct kobject *kobj,
  6026. + struct kobj_attribute *attr, char *buf)
  6027. +{
  6028. + return sprintf(buf, "%lu\n", frontswap_gets);
  6029. +}
  6030. +FRONTSWAP_ATTR_RO(gets);
  6031. +
  6032. +static ssize_t flushes_show(struct kobject *kobj,
  6033. + struct kobj_attribute *attr, char *buf)
  6034. +{
  6035. + return sprintf(buf, "%lu\n", frontswap_flushes);
  6036. +}
  6037. +FRONTSWAP_ATTR_RO(flushes);
  6038. +
  6039. +static struct attribute *frontswap_attrs[] = {
  6040. + &curr_pages_attr.attr,
  6041. + &succ_puts_attr.attr,
  6042. + &failed_puts_attr.attr,
  6043. + &gets_attr.attr,
  6044. + &flushes_attr.attr,
  6045. + NULL,
  6046. +};
  6047. +
  6048. +static struct attribute_group frontswap_attr_group = {
  6049. + .attrs = frontswap_attrs,
  6050. + .name = "frontswap",
  6051. +};
  6052. +
  6053. +#endif /* CONFIG_SYSFS */
  6054. +
  6055. +static int __init init_frontswap(void)
  6056. +{
  6057. +#ifdef CONFIG_SYSFS
  6058. + int err;
  6059. +
  6060. + err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
  6061. +#endif /* CONFIG_SYSFS */
  6062. + return 0;
  6063. +}
  6064. +
  6065. +static void __exit exit_frontswap(void)
  6066. +{
  6067. + frontswap_shrink(0UL);
  6068. +}
  6069. +
  6070. +module_init(init_frontswap);
  6071. +module_exit(exit_frontswap);
  6072. diff -Nrupad linux-2.6.37//mm/Kconfig linux-2.6.37_vanilla//mm/Kconfig
  6073. --- linux-2.6.37//mm/Kconfig 2011-01-05 01:50:19.000000000 +0100
  6074. +++ linux-2.6.37_vanilla//mm/Kconfig 2011-02-14 01:21:43.172793144 +0100
  6075. @@ -309,3 +309,41 @@ config NEED_PER_CPU_KM
  6076. depends on !SMP
  6077. bool
  6078. default y
  6079. +
  6080. +config CLEANCACHE
  6081. + bool "Enable cleancache pseudo-RAM driver to cache clean pages"
  6082. + default y
  6083. + help
  6084. + Cleancache can be thought of as a page-granularity victim cache
  6085. + for clean pages that the kernel's pageframe replacement algorithm
  6086. + (PFRA) would like to keep around, but can't since there isn't enough
  6087. + memory. So when the PFRA "evicts" a page, it first attempts to put
  6088. + it into a synchronous concurrency-safe page-oriented pseudo-RAM
  6089. + device (such as Xen's Transcendent Memory, aka "tmem") which is not
  6090. + directly accessible or addressable by the kernel and is of unknown
  6091. + (and possibly time-varying) size. And when a cleancache-enabled
  6092. + filesystem wishes to access a page in a file on disk, it first
  6093. + checks cleancache to see if it already contains it; if it does,
  6094. + the page is copied into the kernel and a disk access is avoided.
  6095. + When a pseudo-RAM device is available, a significant I/O reduction
  6096. + may be achieved. When none is available, all cleancache calls
  6097. + are reduced to a single pointer-compare-against-NULL resulting
  6098. + in a negligible performance hit.
  6099. +
  6100. + If unsure, say Y to enable cleancache.
  6101. +
  6102. +config FRONTSWAP
  6103. + bool "Enable frontswap pseudo-RAM driver to cache swap pages"
  6104. + default y
  6105. + help
  6106. + Frontswap is so named because it can be thought of as the opposite of
  6107. + a "backing" store for a swap device. The storage is assumed to be
  6108. + a synchronous concurrency-safe page-oriented pseudo-RAM device (such
  6109. + as Xen's Transcendent Memory, aka "tmem") which is not directly
  6110. + accessible or addressable by the kernel and is of unknown (and
  6111. + possibly time-varying) size. When a pseudo-RAM device is available,
  6112. + a signficant swap I/O reduction may be achieved. When none is
  6113. + available, all frontswap calls are reduced to a single pointer-
  6114. + compare-against-NULL resulting in a negligible performance hit.
  6115. +
  6116. + If unsure, say Y to enable frontswap.
  6117. diff -Nrupad linux-2.6.37//mm/Makefile linux-2.6.37_vanilla//mm/Makefile
  6118. --- linux-2.6.37//mm/Makefile 2011-01-05 01:50:19.000000000 +0100
  6119. +++ linux-2.6.37_vanilla//mm/Makefile 2011-02-14 01:21:43.172793144 +0100
  6120. @@ -19,6 +19,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.
  6121.  
  6122. obj-$(CONFIG_BOUNCE) += bounce.o
  6123. obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
  6124. +obj-$(CONFIG_FRONTSWAP) += frontswap.o
  6125. obj-$(CONFIG_HAS_DMA) += dmapool.o
  6126. obj-$(CONFIG_HUGETLBFS) += hugetlb.o
  6127. obj-$(CONFIG_NUMA) += mempolicy.o
  6128. @@ -42,3 +43,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-f
  6129. obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
  6130. obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
  6131. obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
  6132. +obj-$(CONFIG_CLEANCACHE) += cleancache.o
  6134. diff -Nrupad linux-2.6.37//mm/page_io.c linux-2.6.37_vanilla//mm/page_io.c
  6135. --- linux-2.6.37//mm/page_io.c 2011-01-05 01:50:19.000000000 +0100
  6136. +++ linux-2.6.37_vanilla//mm/page_io.c 2011-02-14 01:21:43.172793144 +0100
  6137. @@ -18,6 +18,7 @@
  6138. #include <linux/bio.h>
  6139. #include <linux/swapops.h>
  6140. #include <linux/writeback.h>
  6141. +#include <linux/frontswap.h>
  6142. #include <asm/pgtable.h>
  6143.  
  6144. static struct bio *get_swap_bio(gfp_t gfp_flags,
  6145. @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, st
  6146. unlock_page(page);
  6147. goto out;
  6148. }
  6149. + if (frontswap_put_page(page) == 0) {
  6150. + set_page_writeback(page);
  6151. + unlock_page(page);
  6152. + end_page_writeback(page);
  6153. + goto out;
  6154. + }
  6155. bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
  6156. if (bio == NULL) {
  6157. set_page_dirty(page);
  6158. @@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
  6159.  
  6160. VM_BUG_ON(!PageLocked(page));
  6161. VM_BUG_ON(PageUptodate(page));
  6162. + if (frontswap_get_page(page) == 0) {
  6163. + SetPageUptodate(page);
  6164. + unlock_page(page);
  6165. + goto out;
  6166. + }
  6167. bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
  6168. if (bio == NULL) {
  6169. unlock_page(page);
  6170. diff -Nrupad linux-2.6.37//mm/swapfile.c linux-2.6.37_vanilla//mm/swapfile.c
  6171. --- linux-2.6.37//mm/swapfile.c 2011-01-05 01:50:19.000000000 +0100
  6172. +++ linux-2.6.37_vanilla//mm/swapfile.c 2011-02-14 01:21:43.173793142 +0100
  6173. @@ -31,6 +31,8 @@
  6174. #include <linux/syscalls.h>
  6175. #include <linux/memcontrol.h>
  6176. #include <linux/poll.h>
  6177. +#include <linux/frontswap.h>
  6178. +#include <linux/swapfile.h>
  6179.  
  6180. #include <asm/pgtable.h>
  6181. #include <asm/tlbflush.h>
  6182. @@ -42,7 +44,7 @@ static bool swap_count_continued(struct
  6183. static void free_swap_count_continuations(struct swap_info_struct *);
  6184. static sector_t map_swap_entry(swp_entry_t, struct block_device**);
  6185.  
  6186. -static DEFINE_SPINLOCK(swap_lock);
  6187. +DEFINE_SPINLOCK(swap_lock);
  6188. static unsigned int nr_swapfiles;
  6189. long nr_swap_pages;
  6190. long total_swap_pages;
  6191. @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unuse
  6192. static const char Bad_offset[] = "Bad swap offset entry ";
  6193. static const char Unused_offset[] = "Unused swap offset entry ";
  6194.  
  6195. -static struct swap_list_t swap_list = {-1, -1};
  6196. +struct swap_list_t swap_list = {-1, -1};
  6197.  
  6198. -static struct swap_info_struct *swap_info[MAX_SWAPFILES];
  6199. +struct swap_info_struct *swap_info[MAX_SWAPFILES];
  6200.  
  6201. static DEFINE_MUTEX(swapon_mutex);
  6202.  
  6203. @@ -589,6 +591,7 @@ static unsigned char swap_entry_free(str
  6204. swap_list.next = p->type;
  6205. nr_swap_pages++;
  6206. p->inuse_pages--;
  6207. + frontswap_flush_page(p->type, offset);
  6208. if ((p->flags & SWP_BLKDEV) &&
  6209. disk->fops->swap_slot_free_notify)
  6210. disk->fops->swap_slot_free_notify(p->bdev, offset);
  6211. @@ -1052,7 +1055,7 @@ static int unuse_mm(struct mm_struct *mm
  6212. * Recycle to start on reaching the end, returning 0 when empty.
  6213. */
  6214. static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  6215. - unsigned int prev)
  6216. + unsigned int prev, bool frontswap)
  6217. {
  6218. unsigned int max = si->max;
  6219. unsigned int i = prev;
  6220. @@ -1078,6 +1081,12 @@ static unsigned int find_next_to_unuse(s
  6221. prev = 0;
  6222. i = 1;
  6223. }
  6224. + if (frontswap) {
  6225. + if (frontswap_test(si, i))
  6226. + break;
  6227. + else
  6228. + continue;
  6229. + }
  6230. count = si->swap_map[i];
  6231. if (count && swap_count(count) != SWAP_MAP_BAD)
  6232. break;
  6233. @@ -1089,8 +1098,12 @@ static unsigned int find_next_to_unuse(s
  6234. * We completely avoid races by reading each swap page in advance,
  6235. * and then search for the process using it. All the necessary
  6236. * page table adjustments can then be made atomically.
  6237. + *
  6238. + * if the boolean frontswap is true, only unuse pages_to_unuse pages;
  6239. + * pages_to_unuse==0 means all pages
  6240. */
  6241. -static int try_to_unuse(unsigned int type)
  6242. +int try_to_unuse(unsigned int type, bool frontswap,
  6243. + unsigned long pages_to_unuse)
  6244. {
  6245. struct swap_info_struct *si = swap_info[type];
  6246. struct mm_struct *start_mm;
  6247. @@ -1123,7 +1136,7 @@ static int try_to_unuse(unsigned int typ
  6248. * one pass through swap_map is enough, but not necessarily:
  6249. * there are races when an instance of an entry might be missed.
  6250. */
  6251. - while ((i = find_next_to_unuse(si, i)) != 0) {
  6252. + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
  6253. if (signal_pending(current)) {
  6254. retval = -EINTR;
  6255. break;
  6256. @@ -1290,6 +1303,10 @@ static int try_to_unuse(unsigned int typ
  6257. * interactive performance.
  6258. */
  6259. cond_resched();
  6260. + if (frontswap && pages_to_unuse > 0) {
  6261. + if (!--pages_to_unuse)
  6262. + break;
  6263. + }
  6264. }
  6265.  
  6266. mmput(start_mm);
  6267. @@ -1615,7 +1632,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
  6268. spin_unlock(&swap_lock);
  6269.  
  6270. current->flags |= PF_OOM_ORIGIN;
  6271. - err = try_to_unuse(type);
  6272. + err = try_to_unuse(type, false, 0);
  6273. current->flags &= ~PF_OOM_ORIGIN;
  6274.  
  6275. if (err) {
  6276. @@ -1667,9 +1684,12 @@ SYSCALL_DEFINE1(swapoff, const char __us
  6277. swap_map = p->swap_map;
  6278. p->swap_map = NULL;
  6279. p->flags = 0;
  6280. + frontswap_flush_area(type);
  6281. spin_unlock(&swap_lock);
  6282. mutex_unlock(&swapon_mutex);
  6283. vfree(swap_map);
  6284. + if (p->frontswap_map)
  6285. + vfree(p->frontswap_map);
  6286. /* Destroy swap account informatin */
  6287. swap_cgroup_swapoff(type);
  6288.  
  6289. @@ -1864,6 +1884,7 @@ SYSCALL_DEFINE2(swapon, const char __use
  6290. unsigned long maxpages;
  6291. unsigned long swapfilepages;
  6292. unsigned char *swap_map = NULL;
  6293. + unsigned long *frontswap_map = NULL;
  6294. struct page *page = NULL;
  6295. struct inode *inode = NULL;
  6296. int did_down = 0;
  6297. @@ -2085,6 +2106,12 @@ SYSCALL_DEFINE2(swapon, const char __use
  6298. error = -EINVAL;
  6299. goto bad_swap;
  6300. }
  6301. + /* frontswap enabled? set up bit-per-page map for frontswap */
  6302. + if (frontswap_enabled) {
  6303. + frontswap_map = vmalloc(maxpages / sizeof(long));
  6304. + if (frontswap_map)
  6305. + memset(frontswap_map, 0, maxpages / sizeof(long));
  6306. + }
  6307.  
  6308. if (p->bdev) {
  6309. if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
  6310. @@ -2103,16 +2130,18 @@ SYSCALL_DEFINE2(swapon, const char __use
  6311. else
  6312. p->prio = --least_priority;
  6313. p->swap_map = swap_map;
  6314. + p->frontswap_map = frontswap_map;
  6315. p->flags |= SWP_WRITEOK;
  6316. nr_swap_pages += nr_good_pages;
  6317. total_swap_pages += nr_good_pages;
  6318.  
  6319. printk(KERN_INFO "Adding %uk swap on %s. "
  6320. - "Priority:%d extents:%d across:%lluk %s%s\n",
  6321. + "Priority:%d extents:%d across:%lluk %s%s%s\n",
  6322. nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
  6323. nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
  6324. (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
  6325. - (p->flags & SWP_DISCARDABLE) ? "D" : "");
  6326. + (p->flags & SWP_DISCARDABLE) ? "D" : "",
  6327. + (p->frontswap_map) ? "FS" : "");
  6328.  
  6329. /* insert swap space into swap_list: */
  6330. prev = -1;
  6331. @@ -2126,6 +2155,7 @@ SYSCALL_DEFINE2(swapon, const char __use
  6332. swap_list.head = swap_list.next = type;
  6333. else
  6334. swap_info[prev]->next = type;
  6335. + frontswap_init(type);
  6336. spin_unlock(&swap_lock);
  6337. mutex_unlock(&swapon_mutex);
  6338. atomic_inc(&proc_poll_event);
  6339. @@ -2313,6 +2343,10 @@ int valid_swaphandles(swp_entry_t entry,
  6340. base++;
  6341.  
  6342. spin_lock(&swap_lock);
  6343. + if (frontswap_test(si, target)) {
  6344. + spin_unlock(&swap_lock);
  6345. + return 0;
  6346. + }
  6347. if (end > si->max) /* don't go beyond end of map */
  6348. end = si->max;
  6349.  
  6350. @@ -2323,6 +2357,9 @@ int valid_swaphandles(swp_entry_t entry,
  6351. break;
  6352. if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
  6353. break;
  6354. + /* Don't read in frontswap pages */
  6355. + if (frontswap_test(si, toff))
  6356. + break;
  6357. }
  6358. /* Count contiguous allocated slots below our target */
  6359. for (toff = target; --toff >= base; nr_pages++) {
  6360. @@ -2331,6 +2368,9 @@ int valid_swaphandles(swp_entry_t entry,
  6361. break;
  6362. if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
  6363. break;
  6364. + /* Don't read in frontswap pages */
  6365. + if (frontswap_test(si, toff))
  6366. + break;
  6367. }
  6368. spin_unlock(&swap_lock);
  6369.  
  6370. diff -Nrupad linux-2.6.37//mm/truncate.c linux-2.6.37_vanilla//mm/truncate.c
  6371. --- linux-2.6.37//mm/truncate.c 2011-01-05 01:50:19.000000000 +0100
  6372. +++ linux-2.6.37_vanilla//mm/truncate.c 2011-02-14 01:21:43.174793140 +0100
  6373. @@ -19,6 +19,7 @@
  6374. #include <linux/task_io_accounting_ops.h>
  6375. #include <linux/buffer_head.h> /* grr. try_to_release_page,
  6376. do_invalidatepage */
  6377. +#include <linux/cleancache.h>
  6378. #include "internal.h"
  6379.  
  6380.  
  6381. @@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page
  6382. static inline void truncate_partial_page(struct page *page, unsigned partial)
  6383. {
  6384. zero_user_segment(page, partial, PAGE_CACHE_SIZE);
  6385. + cleancache_flush_page(page->mapping, page);
  6386. if (page_has_private(page))
  6387. do_invalidatepage(page, partial);
  6388. }
  6389. @@ -108,6 +110,10 @@ truncate_complete_page(struct address_sp
  6390. clear_page_mlock(page);
  6391. remove_from_page_cache(page);
  6392. ClearPageMappedToDisk(page);
  6393. + /* this must be after remove_from_page_cache, which
  6394. + * calls cleancache_put_page (and note page->mapping is now NULL)
  6395. + */
  6396. + cleancache_flush_page(mapping, page);
  6397. page_cache_release(page); /* pagecache ref */
  6398. return 0;
  6399. }
  6400. @@ -215,6 +221,7 @@ void truncate_inode_pages_range(struct a
  6401. pgoff_t next;
  6402. int i;
  6403.  
  6404. + cleancache_flush_inode(mapping);
  6405. if (mapping->nrpages == 0)
  6406. return;
  6407.  
  6408. @@ -290,6 +297,7 @@ void truncate_inode_pages_range(struct a
  6409. pagevec_release(&pvec);
  6410. mem_cgroup_uncharge_end();
  6411. }
  6412. + cleancache_flush_inode(mapping);
  6413. }
  6414. EXPORT_SYMBOL(truncate_inode_pages_range);
  6415.  
  6416. @@ -432,6 +440,7 @@ int invalidate_inode_pages2_range(struct
  6417. int did_range_unmap = 0;
  6418. int wrapped = 0;
  6419.  
  6420. + cleancache_flush_inode(mapping);
  6421. pagevec_init(&pvec, 0);
  6422. next = start;
  6423. while (next <= end && !wrapped &&
  6424. @@ -490,6 +499,7 @@ int invalidate_inode_pages2_range(struct
  6425. mem_cgroup_uncharge_end();
  6426. cond_resched();
  6427. }
  6428. + cleancache_flush_inode(mapping);
  6429. return ret;
  6430. }
  6431. EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);