Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- diff -Nrupad linux-2.6.37//arch/x86/kvm/vmx.c linux-2.6.37_vanilla//arch/x86/kvm/vmx.c
- --- linux-2.6.37//arch/x86/kvm/vmx.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//arch/x86/kvm/vmx.c 2011-02-14 01:20:15.814793213 +0100
- @@ -563,7 +563,7 @@ static inline void ept_sync_individual_a
- }
- }
- -static unsigned long vmcs_readl(unsigned long field)
- +static noinline unsigned long vmcs_readl(unsigned long field)
- {
- unsigned long value;
- diff -Nrupad linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-cleancache linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-cleancache
- --- linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-cleancache 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-cleancache 2011-02-14 01:21:43.156792902 +0100
- @@ -0,0 +1,11 @@
- +What: /sys/kernel/mm/cleancache/
- +Date: June 2010
- +Contact: Dan Magenheimer <dan.magenheimer@oracle.com>
- +Description:
- + /sys/kernel/mm/cleancache/ contains a number of files which
- + record a count of various cleancache operations
- + (sum across all filesystems):
- + succ_gets
- + failed_gets
- + puts
- + flushes
- diff -Nrupad linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-frontswap linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-frontswap
- --- linux-2.6.37//Documentation/ABI/testing/sysfs-kernel-mm-frontswap 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//Documentation/ABI/testing/sysfs-kernel-mm-frontswap 2011-02-14 01:21:43.156792902 +0100
- @@ -0,0 +1,16 @@
- +What: /sys/kernel/mm/frontswap/
- +Date: June 2010
- +Contact: Dan Magenheimer <dan.magenheimer@oracle.com>
- +Description:
- + /sys/kernel/mm/frontswap/ contains a number of files which
- + record a count of various frontswap operations (sum across
- + all swap devices):
- + succ_puts
- + failed_puts
- + gets
- + flushes
- + In addition, reading the curr_pages file shows how many
- + pages are currently contained in frontswap and writing this
- + file with an integer performs a "partial swapoff", reducing
- + the number of frontswap pages to that integer if memory
- + constraints permit.
- diff -Nrupad linux-2.6.37//Documentation/vm/cleancache.txt linux-2.6.37_vanilla//Documentation/vm/cleancache.txt
- --- linux-2.6.37//Documentation/vm/cleancache.txt 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//Documentation/vm/cleancache.txt 2011-02-14 01:21:43.157792932 +0100
- @@ -0,0 +1,267 @@
- +MOTIVATION
- +
- +Cleancache is a new optional feature provided by the VFS layer that
- +potentially dramatically increases page cache effectiveness for
- +many workloads in many environments at a negligible cost.
- +
- +Cleancache can be thought of as a page-granularity victim cache for clean
- +pages that the kernel's pageframe replacement algorithm (PFRA) would like
- +to keep around, but can't since there isn't enough memory. So when the
- +PFRA "evicts" a page, it first attempts to put it into a synchronous
- +concurrency-safe page-oriented "pseudo-RAM" device (such as Xen's
- +Transcendent Memory, aka "tmem", or in-kernel compressed memory, aka "zmem",
- +or other RAM-like devices) which is not directly accessible or addressable
- +by the kernel and is of unknown and possibly time-varying size. And when a
- +cleancache-enabled filesystem wishes to access a page in a file on disk,
- +it first checks cleancache to see if it already contains it; if it does,
- +the page is copied into the kernel and a disk access is avoided.
- +
- +FAQs are included below.
- +
- +IMPLEMENTATION OVERVIEW
- +
- +A cleancache "backend" that interfaces to this pseudo-RAM links itself
- +to the kernel's cleancache "frontend" by calling cleancache_register_ops,
- +passing a pointer to a cleancache_ops structure with funcs set appropriately.
- +Note that cleancache_register_ops returns the previous settings so that
- +chaining can be performed if desired. The functions provided must conform to
- +certain semantics as follows:
- +
- +Most important, cleancache is "ephemeral". Pages which are copied into
- +cleancache have an indefinite lifetime which is completely unknowable
- +by the kernel and so may or may not still be in cleancache at any later time.
- +Thus, as its name implies, cleancache is not suitable for dirty pages.
- +Cleancache has complete discretion over what pages to preserve and what
- +pages to discard and when.
- +
- +Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a
- +pool id which, if positive, must be saved in the filesystem's superblock;
- +a negative return value indicates failure. A "put_page" will copy a
- +(presumably about-to-be-evicted) page into cleancache and associate it with
- +the pool id, a file key, and a page index into the file. (The combination
- +of a pool id, a file key, and an index is sometimes called a "handle".)
- +A "get_page" will copy the page, if found, from cleancache into kernel memory.
- +A "flush_page" will ensure the page no longer is present in cleancache;
- +a "flush_inode" will flush all pages associated with the specified file;
- +and, when a filesystem is unmounted, a "flush_fs" will flush all pages in
- +all files specified by the given pool id and also surrender the pool id.
- +
- +An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache
- +to treat the pool as shared using a 128-bit UUID as a key. On systems
- +that may run multiple kernels (such as hard partitioned or virtualized
- +systems) that may share a clustered filesystem, and where cleancache
- +may be shared among those kernels, calls to init_shared_fs that specify the
- +same UUID will receive the same pool id, thus allowing the pages to
- +be shared. Note that any security requirements must be imposed outside
- +of the kernel (e.g. by "tools" that control cleancache). Or a
- +cleancache implementation can simply disable shared_init by always
- +returning a negative value.
- +
- +If a get_page is successful on a non-shared pool, the page is flushed (thus
- +making cleancache an "exclusive" cache). On a shared pool, the page
- +is NOT flushed on a successful get_page so that it remains accessible to
- +other sharers. The kernel is responsible for ensuring coherency between
- +cleancache (shared or not), the page cache, and the filesystem, using
- +cleancache flush operations as required.
- +
- +Note that cleancache must enforce put-put-get coherency and get-get
- +coherency. For the former, if two puts are made to the same handle but
- +with different data, say AAA by the first put and BBB by the second, a
- +subsequent get can never return the stale data (AAA). For get-get coherency,
- +if a get for a given handle fails, subsequent gets for that handle will
- +never succeed unless preceded by a successful put with that handle.
- +
- +Last, cleancache provides no SMP serialization guarantees; if two
- +different Linux threads are simultaneously putting and flushing a page
- +with the same handle, the results are indeterminate. Callers must
- +lock the page to ensure serial behavior.
- +
- +CLEANCACHE PERFORMANCE METRICS
- +
- +Cleancache monitoring is done by sysfs files in the
- +/sys/kernel/mm/cleancache directory. The effectiveness of cleancache
- +can be measured (across all filesystems) with:
- +
- +succ_gets - number of gets that were successful
- +failed_gets - number of gets that failed
- +puts - number of puts attempted (all "succeed")
- +flushes - number of flushes attempted
- +
- +A backend implementation may provide additional metrics.
- +
- +FAQ
- +
- +1) Where's the value? (Andrew Morton)
- +
- +Cleancache provides a significant performance benefit to many workloads
- +in many environments with negligible overhead by improving the
- +effectiveness of the pagecache. Clean pagecache pages are
- +saved in pseudo-RAM (RAM that is otherwise not directly addressable to
- +the kernel); fetching those pages later avoids "refaults" and thus
- +disk reads.
- +
- +Cleancache (and its sister code "frontswap") provide interfaces for
- +a new pseudo-RAM memory type that conceptually lies between fast
- +kernel-directly-addressable RAM and slower DMA/asynchronous devices.
- +Disallowing direct kernel or userland reads/writes to this pseudo-RAM
- +is ideal when data is transformed to a different form and size (such
- +as with compression) or secretly moved (as might be useful for write-
- +balancing for some RAM-like devices). Evicted page-cache pages (and
- +swap pages) are a great use for this kind of slower-than-RAM-but-much-
- +faster-than-disk pseudo-RAM and the cleancache (and frontswap)
- +"page-object-oriented" specification provides a nice way to read and
- +write -- and indirectly "name" -- the pages.
- +
- +In the virtual case, the whole point of virtualization is to statistically
- +multiplex physical resources across the varying demands of multiple
- +virtual machines. This is really hard to do with RAM and efforts to
- +do it well with no kernel change have essentially failed (except in some
- +well-publicized special-case workloads). Cleancache -- and frontswap --
- +with a fairly small impact on the kernel, provide a huge amount
- +of flexibility for more dynamic, flexible RAM multiplexing.
- +Specifically, the Xen Transcendent Memory backend allows otherwise
- +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
- +virtual machines, but the pages can be compressed and deduplicated to
- +optimize RAM utilization. And when guest OS's are induced to surrender
- +underutilized RAM (e.g. with "self-ballooning"), page cache pages
- +are the first to go, and cleancache allows those pages to be
- +saved and reclaimed if overall host system memory conditions allow.
- +
- +2) Why does cleancache have its sticky fingers so deep inside the
- + filesystems and VFS? (Andrew Morton and Christoph Hellwig)
- +
- +The core hooks for cleancache in VFS are in most cases a single line
- +and the minimum set are placed precisely where needed to maintain
- +coherency (via cleancache_flush operations) between cleancache,
- +the page cache, and disk. All hooks compile into nothingness if
- +cleancache is config'ed off and turn into a function-pointer-
- +compare-to-NULL if config'ed on but no backend claims the ops
- +functions, or to a compare-struct-element-to-negative if a
- +backend claims the ops functions but a filesystem doesn't enable
- +cleancache.
- +
- +Some filesystems are built entirely on top of VFS and the hooks
- +in VFS are sufficient, so don't require an "init_fs" hook; the
- +initial implementation of cleancache didn't provide this hook.
- +But for some filesystems (such as btrfs), the VFS hooks are
- +incomplete and one or more hooks in fs-specific code are required.
- +And for some other filesystems, such as tmpfs, cleancache may
- +be counterproductive. So it seemed prudent to require a filesystem
- +to "opt in" to use cleancache, which requires adding a hook in
- +each filesystem. Not all filesystems are supported by cleancache
- +only because they haven't been tested. The existing set should
- +be sufficient to validate the concept, the opt-in approach means
- +that untested filesystems are not affected, and the hooks in the
- +existing filesystems should make it very easy to add more
- +filesystems in the future.
- +
- +The total impact of the hooks to existing fs and mm files is 43
- +lines added (not counting comments and blank lines).
- +
- +3) Why not make cleancache asynchronous and batched so it can
- + more easily interface with real devices with DMA instead
- + of copying each individual page? (Minchan Kim)
- +
- +The one-page-at-a-time copy semantics simplifies the implementation
- +on both the frontend and backend and also allows the backend to
- +do fancy things on-the-fly like page compression and
- +page deduplication. And since the data is "gone" (copied into/out
- +of the pageframe) before the cleancache get/put call returns,
- +a great deal of race conditions and potential coherency issues
- +are avoided. While the interface seems odd for a "real device"
- +or for real kernel-addressable RAM, it makes perfect sense for
- +pseudo-RAM.
- +
- +4) Why is non-shared cleancache "exclusive"? And where is the
- + page "flushed" after a "get"? (Minchan Kim)
- +
- +The main reason is to free up memory in pseudo-RAM and to avoid
- +unnecessary cleancache_flush calls. If you want inclusive,
- +the page can be "put" immediately following the "get". If
- +put-after-get for inclusive becomes common, the interface could
- +be easily extended to add a "get_no_flush" call.
- +
- +The flush is done by the cleancache backend implementation.
- +
- +5) What's the performance impact?
- +
- +Performance analysis has been presented at OLS'09 and LCA'10.
- +Briefly, performance gains can be significant on most workloads,
- +especially when memory pressure is high (e.g. when RAM is
- +overcommitted in a virtual workload); and because the hooks are
- +invoked primarily in place of or in addition to a disk read/write,
- +overhead is negligible even in worst case workloads. Basically
- +cleancache replaces I/O with memory-copy-CPU-overhead; on older
- +single-core systems with slow memory-copy speeds, cleancache
- +has little value, but in newer multicore machines, especially
- +consolidated/virtualized machines, it has great value.
- +
- +6) How do I add cleancache support for filesystem X? (Boaz Harrosh)
- +
- +Filesystems that are well-behaved and conform to certain
- +restrictions can utilize cleancache simply by making a call to
- +cleancache_init_fs at mount time. Unusual, misbehaving, or
- +poorly layered filesystems must either add additional hooks
- +and/or undergo extensive additional testing... or should just
- +not enable the optional cleancache.
- +
- +Some points for a filesystem to consider:
- +
- +- The FS should be block-device-based (e.g. a ram-based FS such
- + as tmpfs should not enable cleancache)
- +- To ensure coherency/correctness, the FS must ensure that all
- + file removal or truncation operations either go through VFS or
- + add hooks to do the equivalent cleancache "flush" operations
- +- To ensure coherency/correctness, either inode numbers must
- + be unique across the lifetime of the on-disk file OR the
- + FS must provide an "encode_fh" function.
- +- The FS must call the VFS superblock alloc and deactivate routines
- + or add hooks to do the equivalent cleancache calls done there.
- +- To maximize performance, all pages fetched from the FS should
- + go through the do_mpage_readpage routine or the FS should add
- + hooks to do the equivalent (cf. btrfs)
- +- Currently, the FS blocksize must be the same as PAGESIZE. This
- + is not an architectural restriction, but no backends currently
- + support anything different.
- +- A clustered FS should invoke the "shared_init_fs" cleancache
- + hook to get best performance for some backends.
- +
- +7) Why not use the KVA of the inode as the key? (Christoph Hellwig)
- +
- +If cleancache would use the inode virtual address instead of
- +inode/filehandle, the pool id could be eliminated. But, this
- +won't work because cleancache retains pagecache data pages
- +persistently even when the inode has been pruned from the
- +inode unused list, and only flushes the data page if the file
- +gets removed/truncated. So if cleancache used the inode kva,
- +there would be potential coherency issues if/when the inode
- +kva is reused for a different file. Alternately, if cleancache
- +flushed the pages when the inode kva was freed, much of the value
- +of cleancache would be lost because the cache of pages in cleancache
- +is potentially much larger than the kernel pagecache and is most
- +useful if the pages survive inode cache removal.
- +
- +8) Why is a global variable required?
- +
- +The cleancache_enabled flag is checked in all of the frequently-used
- +cleancache hooks. The alternative is a function call to check a static
- +variable. Since cleancache is enabled dynamically at runtime, systems
- +that don't enable cleancache would suffer thousands (possibly
- +tens-of-thousands) of unnecessary function calls per second. So the
- +global variable allows cleancache to be enabled by default at compile
- +time, but have insignificant performance impact when cleancache remains
- +disabled at runtime.
- +
- +9) Does cleancache work with KVM?
- +
- +The memory model of KVM is sufficiently different that a cleancache
- +backend may have little value for KVM. This remains to be tested,
- +especially in an overcommitted system.
- +
- +10) Does cleancache work in userspace? It sounds useful for
- + memory hungry caches like web browsers. (Jamie Lokier)
- +
- +No plans yet, though we agree it sounds useful, at least for
- +apps that bypass the page cache (e.g. O_DIRECT).
- +
- +Last updated: Dan Magenheimer, September 2 2010
- diff -Nrupad linux-2.6.37//Documentation/vm/frontswap.txt linux-2.6.37_vanilla//Documentation/vm/frontswap.txt
- --- linux-2.6.37//Documentation/vm/frontswap.txt 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//Documentation/vm/frontswap.txt 2011-02-14 01:21:43.158792960 +0100
- @@ -0,0 +1,209 @@
- +Frontswap provides a page-accessible-memory (PAM) interface for swap pages.
- +In some environments, dramatic performance savings may be obtained because
- +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
- +
- +Frontswap is so named because it can be thought of as the opposite of
- +a "backing" store for a swap device. The storage is assumed to be
- +a synchronous concurrency-safe page-oriented pseudo-RAM device (such as
- +Xen's Transcendent Memory, aka "tmem", or in-kernel compressed memory,
- +aka "zmem", or other RAM-like devices) which is not directly accessible
- +or addressable by the kernel and is of unknown and possibly time-varying
- +size. This pseudo-RAM device links itself to frontswap by calling
- +frontswap_register_ops to set the frontswap_ops funcs appropriately and
- +the functions it provides must conform to certain policies as follows:
- +
- +An "init" prepares the pseudo-RAM to receive frontswap pages associated
- +with the specified swap device number (aka "type"). A "put_page" will
- +copy the page to pseudo-RAM and associate it with the type and offset
- +associated with the page. A "get_page" will copy the page, if found,
- +from pseudo-RAM into kernel memory, but will NOT remove the page from
- +pseudo-RAM. A "flush_page" will remove the page from pseudo-RAM and a
- +"flush_area" will remove ALL pages associated with the swap type
- +(e.g., like swapoff) and notify the pseudo-RAM device to refuse
- +further puts with that swap type.
- +
- +Once a page is successfully put, a matching get on the page will always
- +succeed. So when the kernel finds itself in a situation where it needs
- +to swap out a page, it first attempts to use frontswap. If the put returns
- +non-zero, the data has been successfully saved to pseudo-RAM and
- +a disk write and, if the data is later read back, a disk read are avoided.
- +If a put returns zero, pseudo-RAM has rejected the data, and the page can
- +be written to swap as usual.
- +
- +Note that if a page is put and the page already exists in pseudo-RAM
- +(a "duplicate" put), either the put succeeds and the data is overwritten,
- +or the put fails AND the page is flushed. This ensures stale data may
- +never be obtained from pseudo-RAM.
- +
- +Monitoring and control of frontswap is done by sysfs files in the
- +/sys/kernel/mm/frontswap directory. The effectiveness of frontswap can
- +be measured (across all swap devices) with:
- +
- +curr_pages - number of pages currently contained in frontswap
- +failed_puts - how many put attempts have failed
- +gets - how many gets were attempted (all should succeed)
- +succ_puts - how many put attempts have succeeded
- +flushes - how many flushes were attempted
- +
- +The number can be reduced by root by writing an integer target to curr_pages,
- +which results in a "partial swapoff", thus reducing the number of frontswap
- +pages to that target if memory constraints permit.
- +
- +FAQ
- +
- +1) Where's the value?
- +
- +When a workload starts swapping, performance falls through the floor.
- +Frontswap significantly increases performance in many such workloads by
- +providing a clean, dynamic interface to read and write swap pages to
- +pseudo-RAM -- RAM that is otherwise not directly addressable to the kernel.
- +This interface is ideal when data is transformed to a different form
- +and size (such as with compression) or secretly moved (as might be
- +useful for write-balancing for some RAM-like devices). Swap pages (and
- +evicted page-cache pages) are a great use for this kind of slower-than-RAM-
- +but-much-faster-than-disk pseudo-RAM and the frontswap (and cleancache)
- +"page-object-oriented" specification provides a nice way to read
- +and write -- and indirectly "name" -- the pages.
- +
- +In the virtual case, the whole point of virtualization is to statistically
- +multiplex physical resources across the varying demands of multiple
- +virtual machines. This is really hard to do with RAM and efforts to do
- +it well with no kernel changes have essentially failed (except in some
- +well-publicized special-case workloads). Frontswap -- and cleancache --
- +with a fairly small impact on the kernel, provides a huge amount
- +of flexibility for more dynamic, flexible RAM multiplexing.
- +Specifically, the Xen Transcendent Memory backend allows otherwise
- +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
- +virtual machines, but the pages can be compressed and deduplicated to
- +optimize RAM utilization. And when guest OS's are induced to surrender
- +underutilized RAM (e.g. with "self-ballooning"), sudden unexpected
- +memory pressure may result in swapping; frontswap allows those pages
- +to be swapped to and from hypervisor RAM if overall host system memory
- +conditions allow.
- +
- +2) Sure there may be performance advantages in some situations, but
- + what's the space/time overhead of frontswap?
- +
- +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
- +nothingness and the only overhead is a few extra bytes per swapon'ed
- +swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
- +registers, there is one extra global variable compared to zero for
- +every swap page read or written. If CONFIG_FRONTSWAP is enabled
- +AND a frontswap backend registers AND the backend fails every "put"
- +request (i.e. provides no memory despite claiming it might),
- +CPU overhead is still negligible -- and since every frontswap fail
- +precedes a swap page write-to-disk, the system is highly likely
- +to be I/O bound and using a small fraction of a percent of a CPU
- +will be irrelevant anyway.
- +
- +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
- +registers, one bit is allocated for every swap page for every swap
- +device that is swapon'd. This is added to the EIGHT bits (which
- +was sixteen until about 2.6.34) that the kernel already allocates
- +for every swap page for every swap device that is swapon'd. (Hugh
- +Dickins has observed that frontswap could probably steal one of
- +the existing eight bits, but let's worry about that minor optimization
- +later.) For very large swap disks (which are rare) on a standard
- +4K pagesize, this is 1MB per 32GB swap.
- +
- +3) OK, how about a quick overview of what this frontswap patch does
- + in terms that a kernel hacker can grok?
- +
- +Let's assume that a frontswap "backend" has registered during
- +kernel initialization; this registration indicates that this
- +frontswap backend has access to some "memory" that is not directly
- +accessible by the kernel. Exactly how much memory it provides is
- +entirely dynamic and random.
- +
- +Whenever a swap-device is swapon'd frontswap_init() is called,
- +passing the swap device number (aka "type") as a parameter.
- +This notifies frontswap to expect attempts to "put" swap pages
- +associated with that number.
- +
- +Whenever the swap subsystem is readying a page to write to a swap
- +device (cf. swap_writepage()), frontswap_put_page is called. Frontswap
- +consults with the frontswap backend and if the backend says
- +it does NOT have room, frontswap_put_page returns 0 and the page is
- +swapped as normal. Note that the response from the frontswap
- +backend is essentially random; it may choose to never accept a
- +page, it could accept every ninth page, or it might accept every
- +page. But if the backend does accept a page, the data from the page
- +has already been copied and associated with the type and offset,
- +and the backend guarantees the persistence of the data. In this case,
- +frontswap sets a bit in the "frontswap_map" for the swap device
- +corresponding to the page offset on the swap device to which it would
- +otherwise have written the data.
- +
- +When the swap subsystem needs to swap-in a page (swap_readpage()),
- +it first calls frontswap_get_page() which checks the frontswap_map to
- +see if the page was earlier accepted by the frontswap backend. If
- +it was, the page of data is filled from the frontswap backend and
- +the swap-in is complete. If not, the normal swap-in code is
- +executed to obtain the page of data from the real swap device.
- +
- +So every time the frontswap backend accepts a page, a swap device read
- +and (potentially) a swap device write are replaced by a "frontswap backend
- +put" and (possibly) a "frontswap backend get", which are presumably much
- +faster.
- +
- +4) Can't frontswap be configured as a "special" swap device that is
- + just higher priority than any real swap device (e.g. like zswap)?
- +
- +No. Recall that acceptance of any swap page by the frontswap
- +backend is entirely unpredictable. This is critical to the definition
- +of frontswap because it grants completely dynamic discretion to the
- +backend. But since any "put" might fail, there must always be a real
- +slot on a real swap device to swap the page. Thus frontswap must be
- +implemented as a "shadow" to every swapon'd device with the potential
- +capability of holding every page that the swap device might have held
- +and the possibility that it might hold no pages at all.
- +On the downside, this also means that frontswap cannot contain more
- +pages than the total of swapon'd swap devices. For example, if NO
- +swap device is configured on some installation, frontswap is useless.
- +
- +Further, frontswap is entirely synchronous whereas a real swap
- +device is, by definition, asynchronous and uses block I/O. The
- +block I/O layer is not only unnecessary, but may perform "optimizations"
- +that are inappropriate for a RAM-oriented device including delaying
- +the write of some pages for a significant amount of time.
- +Synchrony is required to ensure the dynamicity of the backend.
- +
- +In a virtualized environment, the dynamicity allows the hypervisor
- +(or host OS) to do "intelligent overcommit". For example, it can
- +choose to accept pages only until host-swapping might be imminent,
- +then force guests to do their own swapping.
- +
- +5) Why this weird definition about "duplicate puts"? If a page
- + has been previously successfully put, can't it always be
- + successfully overwritten?
- +
- +Nearly always it can, but no, sometimes it cannot. Consider an example
- +where data is compressed and the original 4K page has been compressed
- +to 1K. Now an attempt is made to overwrite the page with data that
- +is non-compressible and so would take the entire 4K. But the backend
- +has no more space. In this case, the put must be rejected. Whenever
- +frontswap rejects a put that would overwrite, it also must flush
- +the old data and ensure that it is no longer accessible. Since the
- +swap subsystem then writes the new data to the real swap device,
- +this is the correct course of action to ensure coherency.
- +
- +6) What is frontswap_shrink for?
- +
- +When the (non-frontswap) swap subsystem swaps out a page to a real
- +swap device, that page is only taking up low-value pre-allocated disk
- +space. But if frontswap has placed a page in pseudo-RAM, that
- +page may be taking up valuable real estate. The frontswap_shrink
- +routine allows a process outside of the swap subsystem (such as
- +a userland service via the sysfs interface, or a kernel thread)
- +to force pages out of the memory managed by frontswap and back into
- +kernel-addressable memory.
- +
- +7) Why does the frontswap patch create the new include file swapfile.h?
- +
- +The frontswap code depends on some swap-subsystem-internal data
- +structures that have, over the years, moved back and forth between
- +static and global. This seemed a reasonable compromise: Define
- +them as global but declare them in a new include file that isn't
- +included by the large number of source files that include swap.h.
- +
- +Dan Magenheimer, September 21 2010
- diff -Nrupad linux-2.6.37//drivers/media/radio/radio-aimslab.c linux-2.6.37_vanilla//drivers/media/radio/radio-aimslab.c
- --- linux-2.6.37//drivers/media/radio/radio-aimslab.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/media/radio/radio-aimslab.c 2011-02-14 01:20:15.814793213 +0100
- @@ -71,7 +71,7 @@ static struct rtrack rtrack_card;
- /* local things */
- -static void sleep_delay(long n)
- +static noinline void sleep_delay(long n)
- {
- /* Sleep nicely for 'n' uS */
- int d = n / msecs_to_jiffies(1000);
- diff -Nrupad linux-2.6.37//drivers/staging/Kconfig linux-2.6.37_vanilla//drivers/staging/Kconfig
- --- linux-2.6.37//drivers/staging/Kconfig 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/Kconfig 2011-02-14 01:21:43.158792960 +0100
- @@ -123,6 +123,8 @@ source "drivers/staging/iio/Kconfig"
- source "drivers/staging/zram/Kconfig"
- +source "drivers/staging/zcache/Kconfig"
- +
- source "drivers/staging/wlags49_h2/Kconfig"
- source "drivers/staging/wlags49_h25/Kconfig"
- diff -Nrupad linux-2.6.37//drivers/staging/Makefile linux-2.6.37_vanilla//drivers/staging/Makefile
- --- linux-2.6.37//drivers/staging/Makefile 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/Makefile 2011-02-14 01:21:43.158792960 +0100
- @@ -44,6 +44,7 @@ obj-$(CONFIG_VME_BUS) += vme/
- obj-$(CONFIG_MRST_RAR_HANDLER) += memrar/
- obj-$(CONFIG_IIO) += iio/
- obj-$(CONFIG_ZRAM) += zram/
- +obj-$(CONFIG_ZCACHE) += zcache/
- obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/
- obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/
- obj-$(CONFIG_BATMAN_ADV) += batman-adv/
- diff -Nrupad linux-2.6.37//drivers/staging/zcache/Kconfig linux-2.6.37_vanilla//drivers/staging/zcache/Kconfig
- --- linux-2.6.37//drivers/staging/zcache/Kconfig 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zcache/Kconfig 2011-02-14 01:21:43.158792960 +0100
- @@ -0,0 +1,13 @@
- +config ZCACHE
- + tristate "Dynamic compression of swap pages and clean pagecache pages"
- + depends on CLEANCACHE || FRONTSWAP
- + select XVMALLOC
- + select LZO_COMPRESS
- + select LZO_DECOMPRESS
- + default n
- + help
- + Zcache doubles RAM efficiency while providing a significant
- + performance boosts on many workloads. Zcache uses lzo1x
- + compression and an in-kernel implementation of transcendent
- + memory to store clean page cache pages and swap in RAM,
- + providing a noticeable reduction in disk I/O.
- diff -Nrupad linux-2.6.37//drivers/staging/zcache/Makefile linux-2.6.37_vanilla//drivers/staging/zcache/Makefile
- --- linux-2.6.37//drivers/staging/zcache/Makefile 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zcache/Makefile 2011-02-14 01:21:43.159792985 +0100
- @@ -0,0 +1 @@
- +obj-$(CONFIG_ZCACHE) += zcache.o tmem.o
- diff -Nrupad linux-2.6.37//drivers/staging/zcache/tmem.c linux-2.6.37_vanilla//drivers/staging/zcache/tmem.c
- --- linux-2.6.37//drivers/staging/zcache/tmem.c 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zcache/tmem.c 2011-02-14 01:21:43.160793007 +0100
- @@ -0,0 +1,710 @@
- +/*
- + * In-kernel transcendent memory (generic implementation)
- + *
- + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
- + *
- + * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
- + * "handles" (triples containing a pool id, and object id, and an index), to
- + * pages in a page-accessible memory (PAM). Tmem references the PAM pages via
- + * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
- + * set of functions (pamops). Each pampd contains some representation of
- + * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
- + * pages and must be able to insert, find, and delete these pages at a
- + * potential frequency of thousands per second concurrently across many CPUs,
- + * (and, if used with KVM, across many vcpus across many guests).
- + * Tmem is tracked with a hierarchy of data structures, organized by
- + * the elements in a handle-tuple: pool_id, object_id, and page index.
- + * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
- + * Each pool, contains a hash table of rb_trees of tmem_objs. Each
- + * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
- + * nodes called tmem_objnodes. Each leaf pointer in this tree points to
- + * a pampd, which is accessible only through a small set of callbacks
- + * registered by the PAM implementation (see tmem_register_pamops). Tmem
- + * does all memory allocation via a set of callbacks registered by the tmem
- + * host implementation (e.g. see tmem_register_hostops).
- + */
- +
- +#include <linux/list.h>
- +#include <linux/spinlock.h>
- +#include <linux/atomic.h>
- +
- +#include "tmem.h"
- +
- +/* data structure sentinels used for debugging... see tmem.h */
- +#define POOL_SENTINEL 0x87658765
- +#define OBJ_SENTINEL 0x12345678
- +#define OBJNODE_SENTINEL 0xfedcba09
- +
- +/*
- + * A tmem host implementation must use this function to register callbacks
- + * for memory allocation.
- + */
- +static struct tmem_hostops tmem_hostops;
- +
- +static void tmem_objnode_tree_init(void);
- +
- +void tmem_register_hostops(struct tmem_hostops *m)
- +{
- + tmem_objnode_tree_init();
- + tmem_hostops = *m;
- +}
- +
- +/*
- + * A tmem host implementation must use this function to register
- + * callbacks for a page-accessible memory (PAM) implementation
- + */
- +static struct tmem_pamops tmem_pamops;
- +
- +void tmem_register_pamops(struct tmem_pamops *m)
- +{
- + tmem_pamops = *m;
- +}
- +
- +/*
- + * Oid's are potentially very sparse and tmem_objs may have an indeterminately
- + * short life, being added and deleted at a relatively high frequency.
- + * So an rb_tree is an ideal data structure to manage tmem_objs. But because
- + * of the potentially huge number of tmem_objs, each pool manages a hashtable
- + * of rb_trees to reduce search, insert, delete, and rebalancing time.
- + * Each hashbucket also has a lock to manage concurrent access.
- + *
- + * The following routines manage tmem_objs. When any tmem_obj is accessed,
- + * the hashbucket lock must be held.
- + */
- +
- +/* searches for object==oid in pool, returns locked object if found */
- +static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
- + struct tmem_oid *oidp)
- +{
- + struct rb_node *rbnode;
- + struct tmem_obj *obj;
- +
- + rbnode = hb->obj_rb_root.rb_node;
- + while (rbnode) {
- + BUG_ON(RB_EMPTY_NODE(rbnode));
- + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
- + switch (tmem_oid_compare(oidp, &obj->oid)) {
- + case 0: /* equal */
- + goto out;
- + case -1:
- + rbnode = rbnode->rb_left;
- + break;
- + case 1:
- + rbnode = rbnode->rb_right;
- + break;
- + }
- + }
- + obj = NULL;
- +out:
- + return obj;
- +}
- +
- +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
- +
- +/* free an object that has no more pampds in it */
- +static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
- +{
- + struct tmem_pool *pool;
- +
- + BUG_ON(obj == NULL);
- + ASSERT_SENTINEL(obj, OBJ);
- + BUG_ON(obj->pampd_count > 0);
- + pool = obj->pool;
- + BUG_ON(pool == NULL);
- + if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
- + tmem_pampd_destroy_all_in_obj(obj);
- + BUG_ON(obj->objnode_tree_root != NULL);
- + BUG_ON((long)obj->objnode_count != 0);
- + atomic_dec(&pool->obj_count);
- + BUG_ON(atomic_read(&pool->obj_count) < 0);
- + INVERT_SENTINEL(obj, OBJ);
- + obj->pool = NULL;
- + tmem_oid_set_invalid(&obj->oid);
- + rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
- +}
- +
- +/*
- + * initialize, and insert a tmem_object_root (called only if find failed)
- + */
- +static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
- + struct tmem_pool *pool,
- + struct tmem_oid *oidp)
- +{
- + struct rb_root *root = &hb->obj_rb_root;
- + struct rb_node **new = &(root->rb_node), *parent = NULL;
- + struct tmem_obj *this;
- +
- + BUG_ON(pool == NULL);
- + atomic_inc(&pool->obj_count);
- + obj->objnode_tree_height = 0;
- + obj->objnode_tree_root = NULL;
- + obj->pool = pool;
- + obj->oid = *oidp;
- + obj->objnode_count = 0;
- + obj->pampd_count = 0;
- + SET_SENTINEL(obj, OBJ);
- + while (*new) {
- + BUG_ON(RB_EMPTY_NODE(*new));
- + this = rb_entry(*new, struct tmem_obj, rb_tree_node);
- + parent = *new;
- + switch (tmem_oid_compare(oidp, &this->oid)) {
- + case 0:
- + BUG(); /* already present; should never happen! */
- + break;
- + case -1:
- + new = &(*new)->rb_left;
- + break;
- + case 1:
- + new = &(*new)->rb_right;
- + break;
- + }
- + }
- + rb_link_node(&obj->rb_tree_node, parent, new);
- + rb_insert_color(&obj->rb_tree_node, root);
- +}
- +
- +/*
- + * Tmem is managed as a set of tmem_pools with certain attributes, such as
- + * "ephemeral" vs "persistent". These attributes apply to all tmem_objs
- + * and all pampds that belong to a tmem_pool. A tmem_pool is created
- + * or deleted relatively rarely (for example, when a filesystem is
- + * mounted or unmounted).
- + */
- +
- +/* flush all data from a pool and, optionally, free it */
- +static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
- +{
- + struct rb_node *rbnode;
- + struct tmem_obj *obj;
- + struct tmem_hashbucket *hb = &pool->hashbucket[0];
- + int i;
- +
- + BUG_ON(pool == NULL);
- + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
- + spin_lock(&hb->lock);
- + rbnode = rb_first(&hb->obj_rb_root);
- + while (rbnode != NULL) {
- + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
- + rbnode = rb_next(rbnode);
- + tmem_pampd_destroy_all_in_obj(obj);
- + tmem_obj_free(obj, hb);
- + (*tmem_hostops.obj_free)(obj, pool);
- + }
- + spin_unlock(&hb->lock);
- + }
- + if (destroy)
- + list_del(&pool->pool_list);
- +}
- +
- +/*
- + * A tmem_obj contains a radix-tree-like tree in which the intermediate
- + * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation
- + * is very specialized and tuned for specific uses and is not particularly
- + * suited for use from this code, though some code from the core algorithms has
- + * been reused, thus the copyright notices below). Each tmem_objnode contains
- + * a set of pointers which point to either a set of intermediate tmem_objnodes
- + * or a set of pampds.
- + *
- + * Portions Copyright (C) 2001 Momchil Velikov
- + * Portions Copyright (C) 2001 Christoph Hellwig
- + * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
- + */
- +
- +struct tmem_objnode_tree_path {
- + struct tmem_objnode *objnode;
- + int offset;
- +};
- +
- +/* objnode height_to_maxindex translation */
- +static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
- +
- +static void tmem_objnode_tree_init(void)
- +{
- + unsigned int ht, tmp;
- +
- + for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
- + tmp = ht * OBJNODE_TREE_MAP_SHIFT;
- + if (tmp >= OBJNODE_TREE_INDEX_BITS)
- + tmem_objnode_tree_h2max[ht] = ~0UL;
- + else
- + tmem_objnode_tree_h2max[ht] =
- + (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
- + }
- +}
- +
- +static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
- +{
- + struct tmem_objnode *objnode;
- +
- + ASSERT_SENTINEL(obj, OBJ);
- + BUG_ON(obj->pool == NULL);
- + ASSERT_SENTINEL(obj->pool, POOL);
- + objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
- + if (unlikely(objnode == NULL))
- + goto out;
- + objnode->obj = obj;
- + SET_SENTINEL(objnode, OBJNODE);
- + memset(&objnode->slots, 0, sizeof(objnode->slots));
- + objnode->slots_in_use = 0;
- + obj->objnode_count++;
- +out:
- + return objnode;
- +}
- +
- +static void tmem_objnode_free(struct tmem_objnode *objnode)
- +{
- + struct tmem_pool *pool;
- + int i;
- +
- + BUG_ON(objnode == NULL);
- + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
- + BUG_ON(objnode->slots[i] != NULL);
- + ASSERT_SENTINEL(objnode, OBJNODE);
- + INVERT_SENTINEL(objnode, OBJNODE);
- + BUG_ON(objnode->obj == NULL);
- + ASSERT_SENTINEL(objnode->obj, OBJ);
- + pool = objnode->obj->pool;
- + BUG_ON(pool == NULL);
- + ASSERT_SENTINEL(pool, POOL);
- + objnode->obj->objnode_count--;
- + objnode->obj = NULL;
- + (*tmem_hostops.objnode_free)(objnode, pool);
- +}
- +
- +/*
- + * lookup index in object and return associated pampd (or NULL if not found)
- + */
- +static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
- +{
- + unsigned int height, shift;
- + struct tmem_objnode **slot = NULL;
- +
- + BUG_ON(obj == NULL);
- + ASSERT_SENTINEL(obj, OBJ);
- + BUG_ON(obj->pool == NULL);
- + ASSERT_SENTINEL(obj->pool, POOL);
- +
- + height = obj->objnode_tree_height;
- + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
- + goto out;
- + if (height == 0 && obj->objnode_tree_root) {
- + slot = &obj->objnode_tree_root;
- + goto out;
- + }
- + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
- + slot = &obj->objnode_tree_root;
- + while (height > 0) {
- + if (*slot == NULL)
- + goto out;
- + slot = (struct tmem_objnode **)
- + ((*slot)->slots +
- + ((index >> shift) & OBJNODE_TREE_MAP_MASK));
- + shift -= OBJNODE_TREE_MAP_SHIFT;
- + height--;
- + }
- +out:
- + return slot != NULL ? *slot : NULL;
- +}
- +
- +static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
- + void *pampd)
- +{
- + int ret = 0;
- + struct tmem_objnode *objnode = NULL, *newnode, *slot;
- + unsigned int height, shift;
- + int offset = 0;
- +
- + /* if necessary, extend the tree to be higher */
- + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
- + height = obj->objnode_tree_height + 1;
- + if (index > tmem_objnode_tree_h2max[height])
- + while (index > tmem_objnode_tree_h2max[height])
- + height++;
- + if (obj->objnode_tree_root == NULL) {
- + obj->objnode_tree_height = height;
- + goto insert;
- + }
- + do {
- + newnode = tmem_objnode_alloc(obj);
- + if (!newnode) {
- + ret = -ENOMEM;
- + goto out;
- + }
- + newnode->slots[0] = obj->objnode_tree_root;
- + newnode->slots_in_use = 1;
- + obj->objnode_tree_root = newnode;
- + obj->objnode_tree_height++;
- + } while (height > obj->objnode_tree_height);
- + }
- +insert:
- + slot = obj->objnode_tree_root;
- + height = obj->objnode_tree_height;
- + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
- + while (height > 0) {
- + if (slot == NULL) {
- + /* add a child objnode. */
- + slot = tmem_objnode_alloc(obj);
- + if (!slot) {
- + ret = -ENOMEM;
- + goto out;
- + }
- + if (objnode) {
- +
- + objnode->slots[offset] = slot;
- + objnode->slots_in_use++;
- + } else
- + obj->objnode_tree_root = slot;
- + }
- + /* go down a level */
- + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
- + objnode = slot;
- + slot = objnode->slots[offset];
- + shift -= OBJNODE_TREE_MAP_SHIFT;
- + height--;
- + }
- + BUG_ON(slot != NULL);
- + if (objnode) {
- + objnode->slots_in_use++;
- + objnode->slots[offset] = pampd;
- + } else
- + obj->objnode_tree_root = pampd;
- + obj->pampd_count++;
- +out:
- + return ret;
- +}
- +
- +static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
- +{
- + struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
- + struct tmem_objnode_tree_path *pathp = path;
- + struct tmem_objnode *slot = NULL;
- + unsigned int height, shift;
- + int offset;
- +
- + BUG_ON(obj == NULL);
- + ASSERT_SENTINEL(obj, OBJ);
- + BUG_ON(obj->pool == NULL);
- + ASSERT_SENTINEL(obj->pool, POOL);
- + height = obj->objnode_tree_height;
- + if (index > tmem_objnode_tree_h2max[height])
- + goto out;
- + slot = obj->objnode_tree_root;
- + if (height == 0 && obj->objnode_tree_root) {
- + obj->objnode_tree_root = NULL;
- + goto out;
- + }
- + shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
- + pathp->objnode = NULL;
- + do {
- + if (slot == NULL)
- + goto out;
- + pathp++;
- + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
- + pathp->offset = offset;
- + pathp->objnode = slot;
- + slot = slot->slots[offset];
- + shift -= OBJNODE_TREE_MAP_SHIFT;
- + height--;
- + } while (height > 0);
- + if (slot == NULL)
- + goto out;
- + while (pathp->objnode) {
- + pathp->objnode->slots[pathp->offset] = NULL;
- + pathp->objnode->slots_in_use--;
- + if (pathp->objnode->slots_in_use) {
- + if (pathp->objnode == obj->objnode_tree_root) {
- + while (obj->objnode_tree_height > 0 &&
- + obj->objnode_tree_root->slots_in_use == 1 &&
- + obj->objnode_tree_root->slots[0]) {
- + struct tmem_objnode *to_free =
- + obj->objnode_tree_root;
- +
- + obj->objnode_tree_root =
- + to_free->slots[0];
- + obj->objnode_tree_height--;
- + to_free->slots[0] = NULL;
- + to_free->slots_in_use = 0;
- + tmem_objnode_free(to_free);
- + }
- + }
- + goto out;
- + }
- + tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
- + pathp--;
- + }
- + obj->objnode_tree_height = 0;
- + obj->objnode_tree_root = NULL;
- +
- +out:
- + if (slot != NULL)
- + obj->pampd_count--;
- + BUG_ON(obj->pampd_count < 0);
- + return slot;
- +}
- +
- +/* recursively walk the objnode_tree destroying pampds and objnodes */
- +static void tmem_objnode_node_destroy(struct tmem_obj *obj,
- + struct tmem_objnode *objnode,
- + unsigned int ht)
- +{
- + int i;
- +
- + if (ht == 0)
- + return;
- + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
- + if (objnode->slots[i]) {
- + if (ht == 1) {
- + obj->pampd_count--;
- + (*tmem_pamops.free)(objnode->slots[i],
- + obj->pool);
- + objnode->slots[i] = NULL;
- + continue;
- + }
- + tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
- + tmem_objnode_free(objnode->slots[i]);
- + objnode->slots[i] = NULL;
- + }
- + }
- +}
- +
- +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
- +{
- + if (obj->objnode_tree_root == NULL)
- + return;
- + if (obj->objnode_tree_height == 0) {
- + obj->pampd_count--;
- + (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool);
- + } else {
- + tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
- + obj->objnode_tree_height);
- + tmem_objnode_free(obj->objnode_tree_root);
- + obj->objnode_tree_height = 0;
- + }
- + obj->objnode_tree_root = NULL;
- +}
- +
- +/*
- + * Tmem is operated on by a set of well-defined actions:
- + * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
- + * (The tmem ABI allows for subpages and exchanges but these operations
- + * are not included in this implementation.)
- + *
- + * These "tmem core" operations are implemented in the following functions.
- + */
- +
- +/*
- + * "Put" a page, e.g. copy a page from the kernel into newly allocated
- + * PAM space (if such space is available). Tmem_put is complicated by
- + * a corner case: What if a page with matching handle already exists in
- + * tmem? To guarantee coherency, one of two actions is necessary: Either
- + * the data for the page must be overwritten, or the page must be
- + * "flushed" so that the data is not accessible to a subsequent "get".
- + * Since these "duplicate puts" are relatively rare, this implementation
- + * always flushes for simplicity.
- + */
- +int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
- + struct page *page)
- +{
- + struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
- + void *pampd = NULL, *pampd_del = NULL;
- + int ret = -ENOMEM;
- + bool ephemeral;
- + struct tmem_hashbucket *hb;
- +
- + ephemeral = is_ephemeral(pool);
- + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
- + spin_lock(&hb->lock);
- + obj = objfound = tmem_obj_find(hb, oidp);
- + if (obj != NULL) {
- + pampd = tmem_pampd_lookup_in_obj(objfound, index);
- + if (pampd != NULL) {
- + /* if found, is a dup put, flush the old one */
- + pampd_del = tmem_pampd_delete_from_obj(obj, index);
- + BUG_ON(pampd_del != pampd);
- + (*tmem_pamops.free)(pampd, pool);
- + if (obj->pampd_count == 0) {
- + objnew = obj;
- + objfound = NULL;
- + }
- + pampd = NULL;
- + }
- + } else {
- + obj = objnew = (*tmem_hostops.obj_alloc)(pool);
- + if (unlikely(obj == NULL)) {
- + ret = -ENOMEM;
- + goto out;
- + }
- + tmem_obj_init(obj, hb, pool, oidp);
- + }
- + BUG_ON(obj == NULL);
- + BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
- + pampd = (*tmem_pamops.create)(obj->pool, &obj->oid, index, page);
- + if (unlikely(pampd == NULL))
- + goto free;
- + ret = tmem_pampd_add_to_obj(obj, index, pampd);
- + if (unlikely(ret == -ENOMEM))
- + /* may have partially built objnode tree ("stump") */
- + goto delete_and_free;
- + goto out;
- +
- +delete_and_free:
- + (void)tmem_pampd_delete_from_obj(obj, index);
- +free:
- + if (pampd)
- + (*tmem_pamops.free)(pampd, pool);
- + if (objnew) {
- + tmem_obj_free(objnew, hb);
- + (*tmem_hostops.obj_free)(objnew, pool);
- + }
- +out:
- + spin_unlock(&hb->lock);
- + return ret;
- +}
- +
- +/*
- + * "Get" a page, e.g. if one can be found, copy the tmem page with the
- + * matching handle from PAM space to the kernel. By tmem definition,
- + * when a "get" is successful on an ephemeral page, the page is "flushed",
- + * and when a "get" is successful on a persistent page, the page is retained
- + * in tmem. Note that to preserve
- + * coherency, "get" can never be skipped if tmem contains the data.
- + * That is, if a get is done with a certain handle and fails, any
- + * subsequent "get" must also fail (unless of course there is a
- + * "put" done with the same handle).
- +
- + */
- +int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp,
- + uint32_t index, struct page *page)
- +{
- + struct tmem_obj *obj;
- + void *pampd;
- + bool ephemeral = is_ephemeral(pool);
- + uint32_t ret = -1;
- + struct tmem_hashbucket *hb;
- +
- + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
- + spin_lock(&hb->lock);
- + obj = tmem_obj_find(hb, oidp);
- + if (obj == NULL)
- + goto out;
- + ephemeral = is_ephemeral(pool);
- + if (ephemeral)
- + pampd = tmem_pampd_delete_from_obj(obj, index);
- + else
- + pampd = tmem_pampd_lookup_in_obj(obj, index);
- + if (pampd == NULL)
- + goto out;
- + ret = (*tmem_pamops.get_data)(page, pampd, pool);
- + if (ret < 0)
- + goto out;
- + if (ephemeral) {
- + (*tmem_pamops.free)(pampd, pool);
- + if (obj->pampd_count == 0) {
- + tmem_obj_free(obj, hb);
- + (*tmem_hostops.obj_free)(obj, pool);
- + obj = NULL;
- + }
- + }
- + ret = 0;
- +out:
- + spin_unlock(&hb->lock);
- + return ret;
- +}
- +
- +/*
- + * If a page in tmem matches the handle, "flush" this page from tmem such
- + * that any subsequent "get" does not succeed (unless, of course, there
- + * was another "put" with the same handle).
- + */
- +int tmem_flush_page(struct tmem_pool *pool,
- + struct tmem_oid *oidp, uint32_t index)
- +{
- + struct tmem_obj *obj;
- + void *pampd;
- + int ret = -1;
- + struct tmem_hashbucket *hb;
- +
- + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
- + spin_lock(&hb->lock);
- + obj = tmem_obj_find(hb, oidp);
- + if (obj == NULL)
- + goto out;
- + pampd = tmem_pampd_delete_from_obj(obj, index);
- + if (pampd == NULL)
- + goto out;
- + (*tmem_pamops.free)(pampd, pool);
- + if (obj->pampd_count == 0) {
- + tmem_obj_free(obj, hb);
- + (*tmem_hostops.obj_free)(obj, pool);
- + }
- + ret = 0;
- +
- +out:
- + spin_unlock(&hb->lock);
- + return ret;
- +}
- +
- +/*
- + * "Flush" all pages in tmem matching this oid.
- + */
- +int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
- +{
- + struct tmem_obj *obj;
- + struct tmem_hashbucket *hb;
- + int ret = -1;
- +
- + hb = &pool->hashbucket[tmem_oid_hash(oidp)];
- + spin_lock(&hb->lock);
- + obj = tmem_obj_find(hb, oidp);
- + if (obj == NULL)
- + goto out;
- + tmem_pampd_destroy_all_in_obj(obj);
- + tmem_obj_free(obj, hb);
- + (*tmem_hostops.obj_free)(obj, pool);
- + ret = 0;
- +
- +out:
- + spin_unlock(&hb->lock);
- + return ret;
- +}
- +
- +/*
- + * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
- + * all subsequent access to this tmem_pool.
- + */
- +int tmem_destroy_pool(struct tmem_pool *pool)
- +{
- + int ret = -1;
- +
- + if (pool == NULL)
- + goto out;
- + tmem_pool_flush(pool, 1);
- + ret = 0;
- +out:
- + return ret;
- +}
- +
- +static LIST_HEAD(tmem_global_pool_list);
- +
- +/*
- + * Create a new tmem_pool with the provided flag and return
- + * a pool id provided by the tmem host implementation.
- + */
- +void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
- +{
- + int persistent = flags & TMEM_POOL_PERSIST;
- + int shared = flags & TMEM_POOL_SHARED;
- + struct tmem_hashbucket *hb = &pool->hashbucket[0];
- + int i;
- +
- + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
- + hb->obj_rb_root = RB_ROOT;
- + spin_lock_init(&hb->lock);
- + }
- + INIT_LIST_HEAD(&pool->pool_list);
- + atomic_set(&pool->obj_count, 0);
- + SET_SENTINEL(pool, POOL);
- + list_add_tail(&pool->pool_list, &tmem_global_pool_list);
- + pool->persistent = persistent;
- + pool->shared = shared;
- +}
- diff -Nrupad linux-2.6.37//drivers/staging/zcache/tmem.h linux-2.6.37_vanilla//drivers/staging/zcache/tmem.h
- --- linux-2.6.37//drivers/staging/zcache/tmem.h 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zcache/tmem.h 2011-02-14 01:21:43.160793007 +0100
- @@ -0,0 +1,195 @@
- +/*
- + * tmem.h
- + *
- + * Transcendent memory
- + *
- + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
- + */
- +
- +#ifndef _TMEM_H_
- +#define _TMEM_H_
- +
- +#include <linux/types.h>
- +#include <linux/highmem.h>
- +#include <linux/hash.h>
- +#include <linux/atomic.h>
- +
- +/*
- + * These are pre-defined by the Xen<->Linux ABI
- + */
- +#define TMEM_PUT_PAGE 4
- +#define TMEM_GET_PAGE 5
- +#define TMEM_FLUSH_PAGE 6
- +#define TMEM_FLUSH_OBJECT 7
- +#define TMEM_POOL_PERSIST 1
- +#define TMEM_POOL_SHARED 2
- +#define TMEM_POOL_PRECOMPRESSED 4
- +#define TMEM_POOL_PAGESIZE_SHIFT 4
- +#define TMEM_POOL_PAGESIZE_MASK 0xf
- +#define TMEM_POOL_RESERVED_BITS 0x00ffff00
- +
- +/*
- + * sentinels have proven very useful for debugging but can be removed
- + * or disabled before final merge.
- + */
- +#define SENTINELS
- +#ifdef SENTINELS
- +#define DECL_SENTINEL uint32_t sentinel;
- +#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL)
- +#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL)
- +#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL)
- +#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL)
- +#else
- +#define DECL_SENTINEL
- +#define SET_SENTINEL(_x, _y) do { } while (0)
- +#define INVERT_SENTINEL(_x, _y) do { } while (0)
- +#define ASSERT_SENTINEL(_x, _y) do { } while (0)
- +#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0)
- +#endif
- +
- +#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l))
- +
- +/*
- + * A pool is the highest-level data structure managed by tmem and
- + * usually corresponds to a large independent set of pages such as
- + * a filesystem. Each pool has an id, and certain attributes and counters.
- + * It also contains a set of hash buckets, each of which contains an rbtree
- + * of objects and a lock to manage concurrency within the pool.
- + */
- +
- +#define TMEM_HASH_BUCKET_BITS 8
- +#define TMEM_HASH_BUCKETS (1<<TMEM_HASH_BUCKET_BITS)
- +
- +struct tmem_hashbucket {
- + struct rb_root obj_rb_root;
- + spinlock_t lock;
- +};
- +
- +struct tmem_pool {
- + void *client; /* "up" for some clients, avoids table lookup */
- + struct list_head pool_list;
- + uint32_t pool_id;
- + bool persistent;
- + bool shared;
- + atomic_t obj_count;
- + atomic_t refcount;
- + struct tmem_hashbucket hashbucket[TMEM_HASH_BUCKETS];
- + DECL_SENTINEL
- +};
- +
- +#define is_persistent(_p) (_p->persistent)
- +#define is_ephemeral(_p) (!(_p->persistent))
- +
- +/*
- + * An object id ("oid") is large: 192-bits (to ensure, for example, files
- + * in a modern filesystem can be uniquely identified).
- + */
- +
- +struct tmem_oid {
- + uint64_t oid[3];
- +};
- +
- +static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
- +{
- + oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
- +}
- +
- +static inline bool tmem_oid_valid(struct tmem_oid *oidp)
- +{
- + return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL ||
- + oidp->oid[2] != -1UL;
- +}
- +
- +static inline int tmem_oid_compare(struct tmem_oid *left,
- + struct tmem_oid *right)
- +{
- + int ret;
- +
- + if (left->oid[2] == right->oid[2]) {
- + if (left->oid[1] == right->oid[1]) {
- + if (left->oid[0] == right->oid[0])
- + ret = 0;
- + else if (left->oid[0] < right->oid[0])
- + ret = -1;
- + else
- + return 1;
- + } else if (left->oid[1] < right->oid[1])
- + ret = -1;
- + else
- + ret = 1;
- + } else if (left->oid[2] < right->oid[2])
- + ret = -1;
- + else
- + ret = 1;
- + return ret;
- +}
- +
- +static inline unsigned tmem_oid_hash(struct tmem_oid *oidp)
- +{
- + return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
- + TMEM_HASH_BUCKET_BITS);
- +}
- +
- +/*
- + * A tmem_obj contains an identifier (oid), pointers to the parent
- + * pool and the rb_tree to which it belongs, counters, and an ordered
- + * set of pampds, structured in a radix-tree-like tree. The intermediate
- + * nodes of the tree are called tmem_objnodes.
- + */
- +
- +struct tmem_objnode;
- +
- +struct tmem_obj {
- + struct tmem_oid oid;
- + struct tmem_pool *pool;
- + struct rb_node rb_tree_node;
- + struct tmem_objnode *objnode_tree_root;
- + unsigned int objnode_tree_height;
- + unsigned long objnode_count;
- + long pampd_count;
- + DECL_SENTINEL
- +};
- +
- +#define OBJNODE_TREE_MAP_SHIFT 6
- +#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT)
- +#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1)
- +#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
- +#define OBJNODE_TREE_MAX_PATH \
- + (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2)
- +
- +struct tmem_objnode {
- + struct tmem_obj *obj;
- + DECL_SENTINEL
- + void *slots[OBJNODE_TREE_MAP_SIZE];
- + unsigned int slots_in_use;
- +};
- +
- +/* pampd abstract datatype methods provided by the PAM implementation */
- +struct tmem_pamops {
- + void *(*create)(struct tmem_pool *, struct tmem_oid *, uint32_t,
- + struct page *);
- + int (*get_data)(struct page *, void *, struct tmem_pool *);
- + void (*free)(void *, struct tmem_pool *);
- +};
- +extern void tmem_register_pamops(struct tmem_pamops *m);
- +
- +/* memory allocation methods provided by the host implementation */
- +struct tmem_hostops {
- + struct tmem_obj *(*obj_alloc)(struct tmem_pool *);
- + void (*obj_free)(struct tmem_obj *, struct tmem_pool *);
- + struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *);
- + void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *);
- +};
- +extern void tmem_register_hostops(struct tmem_hostops *m);
- +
- +/* core tmem accessor functions */
- +extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
- + struct page *page);
- +extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
- + struct page *page);
- +extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
- + uint32_t index);
- +extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
- +extern int tmem_destroy_pool(struct tmem_pool *);
- +extern void tmem_new_pool(struct tmem_pool *, uint32_t);
- +#endif /* _TMEM_H_ */
- diff -Nrupad linux-2.6.37//drivers/staging/zcache/zcache.c linux-2.6.37_vanilla//drivers/staging/zcache/zcache.c
- --- linux-2.6.37//drivers/staging/zcache/zcache.c 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zcache/zcache.c 2011-02-14 01:22:00.636793117 +0100
- @@ -0,0 +1,1658 @@
- +/*
- + * zcache.c
- + *
- + * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
- + * Copyright (c) 2010,2011, Nitin Gupta
- + *
- + * Zcache provides an in-kernel "host implementation" for transcendent memory
- + * and, thus indirectly, for cleancache and frontswap. Zcache includes two
- + * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
- + * 1) "compression buddies" ("zbud") is used for ephemeral pages
- + * 2) xvmalloc is used for persistent pages.
- + * Xvmalloc (based on the TLSF allocator) has very low fragmentation
- + * so maximizes space efficiency, while zbud allows pairs (and potentially,
- + * in the future, more than a pair of) compressed pages to be closely linked
- + * so that reclaiming can be done via the kernel's physical-page-oriented
- + * "shrinker" interface.
- + *
- + * [1] For a definition of page-accessible memory (aka PAM), see:
- + * http://marc.info/?l=linux-mm&m=127811271605009
- + */
- +
- +#include <linux/cpu.h>
- +#include <linux/highmem.h>
- +#include <linux/list.h>
- +#include <linux/lzo.h>
- +#include <linux/slab.h>
- +#include <linux/spinlock.h>
- +#include <linux/types.h>
- +#include <linux/atomic.h>
- +#include "tmem.h"
- +
- +#include "../zram/xvmalloc.h" /* if built in drivers/staging */
- +
- +#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
- +#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
- +#endif
- +#ifdef CONFIG_CLEANCACHE
- +#include <linux/cleancache.h>
- +#endif
- +#ifdef CONFIG_FRONTSWAP
- +#include <linux/frontswap.h>
- +#endif
- +
- +#if 0
- +/* this is more aggressive but may cause other problems? */
- +#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
- +#else
- +#define ZCACHE_GFP_MASK \
- + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
- +#endif
- +
- +/**********
- + * Compression buddies ("zbud") provides for packing two (or, possibly
- + * in the future, more) compressed ephemeral pages into a single "raw"
- + * (physical) page and tracking them with data structures so that
- + * the raw pages can be easily reclaimed.
- + *
- + * A zbud page ("zbpg") is an aligned page containing a list_head,
- + * a lock, and two "zbud headers". The remainder of the physical
- + * page is divided up into aligned 64-byte "chunks" which contain
- + * the compressed data for zero, one, or two zbuds. Each zbpg
- + * resides on: (1) an "unused list" if it has no zbuds; (2) a
- + * "buddied" list if it is fully populated with two zbuds; or
- + * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
- + * the one unbuddied zbud uses. The data inside a zbpg cannot be
- + * read or written unless the zbpg's lock is held.
- + */
- +
- +#define ZBH_SENTINEL 0x43214321
- +#define ZBPG_SENTINEL 0xdeadbeef
- +
- +#define ZBUD_MAX_BUDS 2
- +
- +struct zbud_hdr {
- + uint32_t pool_id;
- + struct tmem_oid oid;
- + uint32_t index;
- + uint16_t size; /* compressed size in bytes, zero means unused */
- + DECL_SENTINEL
- +};
- +
- +struct zbud_page {
- + struct list_head bud_list;
- + spinlock_t lock;
- + struct zbud_hdr buddy[ZBUD_MAX_BUDS];
- + DECL_SENTINEL
- + /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
- +};
- +
- +#define CHUNK_SHIFT 6
- +#define CHUNK_SIZE (1 << CHUNK_SHIFT)
- +#define CHUNK_MASK (~(CHUNK_SIZE-1))
- +#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \
- + CHUNK_MASK) >> CHUNK_SHIFT)
- +#define MAX_CHUNK (NCHUNKS-1)
- +
- +static struct {
- + struct list_head list;
- + unsigned count;
- +} zbud_unbuddied[NCHUNKS];
- +/* list N contains pages with N chunks USED and NCHUNKS-N unused */
- +/* element 0 is never used but optimizing that isn't worth it */
- +static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
- +
- +struct list_head zbud_buddied_list;
- +static unsigned long zcache_zbud_buddied_count;
- +
- +/* protects the buddied list and all unbuddied lists */
- +static DEFINE_SPINLOCK(zbud_budlists_spinlock);
- +
- +static LIST_HEAD(zbpg_unused_list);
- +static unsigned long zcache_zbpg_unused_list_count;
- +
- +/* protects the unused page list */
- +static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
- +
- +static atomic_t zcache_zbud_curr_raw_pages;
- +static atomic_t zcache_zbud_curr_zpages;
- +static unsigned long zcache_zbud_curr_zbytes;
- +static unsigned long zcache_zbud_cumul_zpages;
- +static unsigned long zcache_zbud_cumul_zbytes;
- +static unsigned long zcache_compress_poor;
- +
- +/* forward references */
- +static void *zcache_get_free_page(void);
- +static void zcache_free_page(void *p);
- +
- +/*
- + * zbud helper functions
- + */
- +
- +static inline unsigned zbud_max_buddy_size(void)
- +{
- + return MAX_CHUNK << CHUNK_SHIFT;
- +}
- +
- +static inline unsigned zbud_size_to_chunks(unsigned size)
- +{
- + BUG_ON(size == 0 || size > zbud_max_buddy_size());
- + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
- +}
- +
- +static inline int zbud_budnum(struct zbud_hdr *zh)
- +{
- + unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
- + struct zbud_page *zbpg = NULL;
- + unsigned budnum = -1U;
- + int i;
- +
- + for (i = 0; i < ZBUD_MAX_BUDS; i++)
- + if (offset == offsetof(typeof(*zbpg), buddy[i])) {
- + budnum = i;
- + break;
- + }
- + BUG_ON(budnum == -1U);
- + return budnum;
- +}
- +
- +static char *zbud_data(struct zbud_hdr *zh, unsigned size)
- +{
- + struct zbud_page *zbpg;
- + char *p;
- + unsigned budnum;
- +
- + ASSERT_SENTINEL(zh, ZBH);
- + budnum = zbud_budnum(zh);
- + BUG_ON(size == 0 || size > zbud_max_buddy_size());
- + zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
- + ASSERT_SPINLOCK(&zbpg->lock);
- + p = (char *)zbpg;
- + if (budnum == 0)
- + p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
- + CHUNK_MASK);
- + else if (budnum == 1)
- + p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
- + return p;
- +}
- +
- +/*
- + * zbud raw page management
- + */
- +
- +static struct zbud_page *zbud_alloc_raw_page(void)
- +{
- + struct zbud_page *zbpg = NULL;
- + struct zbud_hdr *zh0, *zh1;
- + bool recycled = 0;
- +
- + /* if any pages on the zbpg list, use one */
- + spin_lock(&zbpg_unused_list_spinlock);
- + if (!list_empty(&zbpg_unused_list)) {
- + zbpg = list_first_entry(&zbpg_unused_list,
- + struct zbud_page, bud_list);
- + list_del_init(&zbpg->bud_list);
- + zcache_zbpg_unused_list_count--;
- + recycled = 1;
- + }
- + spin_unlock(&zbpg_unused_list_spinlock);
- + if (zbpg == NULL)
- + /* none on zbpg list, try to get a kernel page */
- + zbpg = zcache_get_free_page();
- + if (likely(zbpg != NULL)) {
- + INIT_LIST_HEAD(&zbpg->bud_list);
- + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
- + spin_lock_init(&zbpg->lock);
- + if (recycled) {
- + ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
- + SET_SENTINEL(zbpg, ZBPG);
- + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
- + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
- + } else {
- + atomic_inc(&zcache_zbud_curr_raw_pages);
- + INIT_LIST_HEAD(&zbpg->bud_list);
- + SET_SENTINEL(zbpg, ZBPG);
- + zh0->size = 0; zh1->size = 0;
- + tmem_oid_set_invalid(&zh0->oid);
- + tmem_oid_set_invalid(&zh1->oid);
- + }
- + }
- + return zbpg;
- +}
- +
- +static void zbud_free_raw_page(struct zbud_page *zbpg)
- +{
- + struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
- +
- + ASSERT_SENTINEL(zbpg, ZBPG);
- + BUG_ON(!list_empty(&zbpg->bud_list));
- + ASSERT_SPINLOCK(&zbpg->lock);
- + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
- + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
- + INVERT_SENTINEL(zbpg, ZBPG);
- + spin_unlock(&zbpg->lock);
- + spin_lock(&zbpg_unused_list_spinlock);
- + list_add(&zbpg->bud_list, &zbpg_unused_list);
- + zcache_zbpg_unused_list_count++;
- + spin_unlock(&zbpg_unused_list_spinlock);
- +}
- +
- +/*
- + * core zbud handling routines
- + */
- +
- +static unsigned zbud_free(struct zbud_hdr *zh)
- +{
- + unsigned size;
- +
- + ASSERT_SENTINEL(zh, ZBH);
- + BUG_ON(!tmem_oid_valid(&zh->oid));
- + size = zh->size;
- + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
- + zh->size = 0;
- + tmem_oid_set_invalid(&zh->oid);
- + INVERT_SENTINEL(zh, ZBH);
- + zcache_zbud_curr_zbytes -= size;
- + atomic_dec(&zcache_zbud_curr_zpages);
- + return size;
- +}
- +
- +static void zbud_free_and_delist(struct zbud_hdr *zh)
- +{
- + unsigned chunks;
- + struct zbud_hdr *zh_other;
- + unsigned budnum = zbud_budnum(zh), size;
- + struct zbud_page *zbpg =
- + container_of(zh, struct zbud_page, buddy[budnum]);
- +
- + spin_lock(&zbpg->lock);
- + if (list_empty(&zbpg->bud_list)) {
- + /* ignore zombie page... see zbud_evict_pages() */
- + spin_unlock(&zbpg->lock);
- + return;
- + }
- + size = zbud_free(zh);
- + ASSERT_SPINLOCK(&zbpg->lock);
- + zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
- + if (zh_other->size == 0) { /* was unbuddied: unlist and free */
- + chunks = zbud_size_to_chunks(size) ;
- + spin_lock(&zbud_budlists_spinlock);
- + BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
- + list_del_init(&zbpg->bud_list);
- + zbud_unbuddied[chunks].count--;
- + spin_unlock(&zbud_budlists_spinlock);
- + zbud_free_raw_page(zbpg);
- + } else { /* was buddied: move remaining buddy to unbuddied list */
- + chunks = zbud_size_to_chunks(zh_other->size) ;
- + spin_lock(&zbud_budlists_spinlock);
- + list_del_init(&zbpg->bud_list);
- + zcache_zbud_buddied_count--;
- + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
- + zbud_unbuddied[chunks].count++;
- + spin_unlock(&zbud_budlists_spinlock);
- + spin_unlock(&zbpg->lock);
- + }
- +}
- +
- +static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
- + uint32_t index, struct page *page,
- + void *cdata, unsigned size)
- +{
- + struct zbud_hdr *zh0, *zh1, *zh = NULL;
- + struct zbud_page *zbpg = NULL, *ztmp;
- + unsigned nchunks;
- + char *to;
- + int i, found_good_buddy = 0;
- +
- + nchunks = zbud_size_to_chunks(size) ;
- + for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
- + spin_lock(&zbud_budlists_spinlock);
- + if (!list_empty(&zbud_unbuddied[i].list)) {
- + list_for_each_entry_safe(zbpg, ztmp,
- + &zbud_unbuddied[i].list, bud_list) {
- + if (spin_trylock(&zbpg->lock)) {
- + found_good_buddy = i;
- + goto found_unbuddied;
- + }
- + }
- + }
- + spin_unlock(&zbud_budlists_spinlock);
- + }
- + /* didn't find a good buddy, try allocating a new page */
- + zbpg = zbud_alloc_raw_page();
- + if (unlikely(zbpg == NULL))
- + goto out;
- + /* ok, have a page, now compress the data before taking locks */
- + spin_lock(&zbpg->lock);
- + spin_lock(&zbud_budlists_spinlock);
- + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
- + zbud_unbuddied[nchunks].count++;
- + zh = &zbpg->buddy[0];
- + goto init_zh;
- +
- +found_unbuddied:
- + ASSERT_SPINLOCK(&zbpg->lock);
- + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
- + BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
- + if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
- + ASSERT_SENTINEL(zh0, ZBH);
- + zh = zh1;
- + } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
- + ASSERT_SENTINEL(zh1, ZBH);
- + zh = zh0;
- + } else
- + BUG();
- + list_del_init(&zbpg->bud_list);
- + zbud_unbuddied[found_good_buddy].count--;
- + list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
- + zcache_zbud_buddied_count++;
- +
- +init_zh:
- + SET_SENTINEL(zh, ZBH);
- + zh->size = size;
- + zh->index = index;
- + zh->oid = *oid;
- + zh->pool_id = pool_id;
- + /* can wait to copy the data until the list locks are dropped */
- + spin_unlock(&zbud_budlists_spinlock);
- +
- + to = zbud_data(zh, size);
- + memcpy(to, cdata, size);
- + spin_unlock(&zbpg->lock);
- + zbud_cumul_chunk_counts[nchunks]++;
- + atomic_inc(&zcache_zbud_curr_zpages);
- + zcache_zbud_cumul_zpages++;
- + zcache_zbud_curr_zbytes += size;
- + zcache_zbud_cumul_zbytes += size;
- +out:
- + return zh;
- +}
- +
- +static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
- +{
- + struct zbud_page *zbpg;
- + unsigned budnum = zbud_budnum(zh);
- + size_t out_len = PAGE_SIZE;
- + char *to_va, *from_va;
- + unsigned size;
- + int ret = 0;
- +
- + zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
- + spin_lock(&zbpg->lock);
- + if (list_empty(&zbpg->bud_list)) {
- + /* ignore zombie page... see zbud_evict_pages() */
- + ret = -EINVAL;
- + goto out;
- + }
- + ASSERT_SENTINEL(zh, ZBH);
- + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
- + to_va = kmap_atomic(page, KM_USER0);
- + size = zh->size;
- + from_va = zbud_data(zh, size);
- + ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
- + BUG_ON(ret != LZO_E_OK);
- + BUG_ON(out_len != PAGE_SIZE);
- + kunmap_atomic(to_va, KM_USER0);
- +out:
- + spin_unlock(&zbpg->lock);
- + return ret;
- +}
- +
- +/*
- + * The following routines handle shrinking of ephemeral pages by evicting
- + * pages "least valuable" first.
- + */
- +
- +static unsigned long zcache_evicted_raw_pages;
- +static unsigned long zcache_evicted_buddied_pages;
- +static unsigned long zcache_evicted_unbuddied_pages;
- +
- +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
- +static void zcache_put_pool(struct tmem_pool *pool);
- +
- +/*
- + * Flush and free all zbuds in a zbpg, then free the pageframe
- + */
- +static void zbud_evict_zbpg(struct zbud_page *zbpg)
- +{
- + struct zbud_hdr *zh;
- + int i, j;
- + uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
- + struct tmem_oid oid[ZBUD_MAX_BUDS];
- + struct tmem_pool *pool;
- +
- + ASSERT_SPINLOCK(&zbpg->lock);
- + BUG_ON(!list_empty(&zbpg->bud_list));
- + for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
- + zh = &zbpg->buddy[i];
- + if (zh->size) {
- + pool_id[j] = zh->pool_id;
- + oid[j] = zh->oid;
- + index[j] = zh->index;
- + j++;
- + zbud_free(zh);
- + }
- + }
- + spin_unlock(&zbpg->lock);
- + for (i = 0; i < j; i++) {
- + pool = zcache_get_pool_by_id(pool_id[i]);
- + if (pool != NULL) {
- + tmem_flush_page(pool, &oid[i], index[i]);
- + zcache_put_pool(pool);
- + }
- + }
- + ASSERT_SENTINEL(zbpg, ZBPG);
- + spin_lock(&zbpg->lock);
- + zbud_free_raw_page(zbpg);
- +}
- +
- +/*
- + * Free nr pages. This code is funky because we want to hold the locks
- + * protecting various lists for as short a time as possible, and in some
- + * circumstances the list may change asynchronously when the list lock is
- + * not held. In some cases we also trylock not only to avoid waiting on a
- + * page in use by another cpu, but also to avoid potential deadlock due to
- + * lock inversion.
- + */
- +static void zbud_evict_pages(int nr)
- +{
- + struct zbud_page *zbpg;
- + int i;
- +
- + /* first try freeing any pages on unused list */
- +retry_unused_list:
- + spin_lock_bh(&zbpg_unused_list_spinlock);
- + if (!list_empty(&zbpg_unused_list)) {
- + /* can't walk list here, since it may change when unlocked */
- + zbpg = list_first_entry(&zbpg_unused_list,
- + struct zbud_page, bud_list);
- + list_del_init(&zbpg->bud_list);
- + zcache_zbpg_unused_list_count--;
- + atomic_dec(&zcache_zbud_curr_raw_pages);
- + spin_unlock_bh(&zbpg_unused_list_spinlock);
- + zcache_free_page(zbpg);
- + zcache_evicted_raw_pages++;
- + if (--nr <= 0)
- + goto out;
- + goto retry_unused_list;
- + }
- + spin_unlock_bh(&zbpg_unused_list_spinlock);
- +
- + /* now try freeing unbuddied pages, starting with least space avail */
- + for (i = 0; i < MAX_CHUNK; i++) {
- +retry_unbud_list_i:
- + spin_lock_bh(&zbud_budlists_spinlock);
- + if (list_empty(&zbud_unbuddied[i].list)) {
- + spin_unlock_bh(&zbud_budlists_spinlock);
- + continue;
- + }
- + list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
- + if (unlikely(!spin_trylock(&zbpg->lock)))
- + continue;
- + list_del_init(&zbpg->bud_list);
- + zbud_unbuddied[i].count--;
- + spin_unlock(&zbud_budlists_spinlock);
- + zcache_evicted_unbuddied_pages++;
- + /* want budlists unlocked when doing zbpg eviction */
- + zbud_evict_zbpg(zbpg);
- + local_bh_enable();
- + if (--nr <= 0)
- + goto out;
- + goto retry_unbud_list_i;
- + }
- + spin_unlock_bh(&zbud_budlists_spinlock);
- + }
- +
- + /* as a last resort, free buddied pages */
- +retry_bud_list:
- + spin_lock_bh(&zbud_budlists_spinlock);
- + if (list_empty(&zbud_buddied_list)) {
- + spin_unlock_bh(&zbud_budlists_spinlock);
- + goto out;
- + }
- + list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
- + if (unlikely(!spin_trylock(&zbpg->lock)))
- + continue;
- + list_del_init(&zbpg->bud_list);
- + zcache_zbud_buddied_count--;
- + spin_unlock(&zbud_budlists_spinlock);
- + zcache_evicted_buddied_pages++;
- + /* want budlists unlocked when doing zbpg eviction */
- + zbud_evict_zbpg(zbpg);
- + local_bh_enable();
- + if (--nr <= 0)
- + goto out;
- + goto retry_bud_list;
- + }
- + spin_unlock_bh(&zbud_budlists_spinlock);
- +out:
- + return;
- +}
- +
- +static void zbud_init(void)
- +{
- + int i;
- +
- + INIT_LIST_HEAD(&zbud_buddied_list);
- + zcache_zbud_buddied_count = 0;
- + for (i = 0; i < NCHUNKS; i++) {
- + INIT_LIST_HEAD(&zbud_unbuddied[i].list);
- + zbud_unbuddied[i].count = 0;
- + }
- +}
- +
- +#ifdef CONFIG_SYSFS
- +/*
- + * These sysfs routines show a nice distribution of how many zbpg's are
- + * currently (and have ever been placed) in each unbuddied list. It's fun
- + * to watch but can probably go away before final merge.
- + */
- +static int zbud_show_unbuddied_list_counts(char *buf)
- +{
- + int i;
- + char *p = buf;
- +
- + for (i = 0; i < NCHUNKS - 1; i++)
- + p += sprintf(p, "%u ", zbud_unbuddied[i].count);
- + p += sprintf(p, "%d\n", zbud_unbuddied[i].count);
- + return p - buf;
- +}
- +
- +static int zbud_show_cumul_chunk_counts(char *buf)
- +{
- + unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
- + unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
- + unsigned long total_chunks_lte_42 = 0;
- + char *p = buf;
- +
- + for (i = 0; i < NCHUNKS; i++) {
- + p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
- + chunks += zbud_cumul_chunk_counts[i];
- + total_chunks += zbud_cumul_chunk_counts[i];
- + sum_total_chunks += i * zbud_cumul_chunk_counts[i];
- + if (i == 21)
- + total_chunks_lte_21 = total_chunks;
- + if (i == 32)
- + total_chunks_lte_32 = total_chunks;
- + if (i == 42)
- + total_chunks_lte_42 = total_chunks;
- + }
- + p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
- + total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
- + chunks == 0 ? 0 : sum_total_chunks / chunks);
- + return p - buf;
- +}
- +#endif
- +
- +/**********
- + * This "zv" PAM implementation combines the TLSF-based xvMalloc
- + * with lzo1x compression to maximize the amount of data that can
- + * be packed into a physical page.
- + *
- + * Zv represents a PAM page with the index and object (plus a "size" value
- + * necessary for decompression) immediately preceding the compressed data.
- + */
- +
- +#define ZVH_SENTINEL 0x43214321
- +
- +struct zv_hdr {
- + uint32_t pool_id;
- + struct tmem_oid oid;
- + uint32_t index;
- + DECL_SENTINEL
- +};
- +
- +static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
- +
- +static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
- + struct tmem_oid *oid, uint32_t index,
- + void *cdata, unsigned clen)
- +{
- + struct page *page;
- + struct zv_hdr *zv = NULL;
- + uint32_t offset;
- + int ret;
- +
- + BUG_ON(!irqs_disabled());
- + ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
- + &page, &offset, ZCACHE_GFP_MASK);
- + if (unlikely(ret))
- + goto out;
- + zv = kmap_atomic(page, KM_USER0) + offset;
- + zv->index = index;
- + zv->oid = *oid;
- + zv->pool_id = pool_id;
- + SET_SENTINEL(zv, ZVH);
- + memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
- + kunmap_atomic(zv, KM_USER0);
- +out:
- + return zv;
- +}
- +
- +static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
- +{
- + unsigned long flags;
- + struct page *page;
- + uint32_t offset;
- + uint16_t size;
- +
- + ASSERT_SENTINEL(zv, ZVH);
- + size = xv_get_object_size(zv) - sizeof(*zv);
- + BUG_ON(size == 0 || size > zv_max_page_size);
- + INVERT_SENTINEL(zv, ZVH);
- + page = virt_to_page(zv);
- + offset = (unsigned long)zv & ~PAGE_MASK;
- + local_irq_save(flags);
- + xv_free(xvpool, page, offset);
- + local_irq_restore(flags);
- +}
- +
- +static void zv_decompress(struct page *page, struct zv_hdr *zv)
- +{
- + size_t clen = PAGE_SIZE;
- + char *to_va;
- + unsigned size;
- + int ret;
- +
- + ASSERT_SENTINEL(zv, ZVH);
- + size = xv_get_object_size(zv) - sizeof(*zv);
- + BUG_ON(size == 0 || size > zv_max_page_size);
- + to_va = kmap_atomic(page, KM_USER0);
- + ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
- + size, to_va, &clen);
- + kunmap_atomic(to_va, KM_USER0);
- + BUG_ON(ret != LZO_E_OK);
- + BUG_ON(clen != PAGE_SIZE);
- +}
- +
- +/*
- + * zcache core code starts here
- + */
- +
- +/* useful stats not collected by cleancache or frontswap */
- +static unsigned long zcache_flush_total;
- +static unsigned long zcache_flush_found;
- +static unsigned long zcache_flobj_total;
- +static unsigned long zcache_flobj_found;
- +static unsigned long zcache_failed_eph_puts;
- +static unsigned long zcache_failed_pers_puts;
- +
- +#define MAX_POOLS_PER_CLIENT 16
- +
- +static struct {
- + struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
- + struct xv_pool *xvpool;
- +} zcache_client;
- +
- +/*
- + * Tmem operations assume the poolid implies the invoking client.
- + * Zcache only has one client (the kernel itself), so translate
- + * the poolid into the tmem_pool allocated for it. A KVM version
- + * of zcache would have one client per guest and each client might
- + * have a poolid==N.
- + */
- +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
- +{
- + struct tmem_pool *pool = NULL;
- +
- + if (poolid >= 0) {
- + pool = zcache_client.tmem_pools[poolid];
- + if (pool != NULL)
- + atomic_inc(&pool->refcount);
- + }
- + return pool;
- +}
- +
- +static void zcache_put_pool(struct tmem_pool *pool)
- +{
- + if (pool != NULL)
- + atomic_dec(&pool->refcount);
- +}
- +
- +/* counters for debugging */
- +static unsigned long zcache_failed_get_free_pages;
- +static unsigned long zcache_failed_alloc;
- +static unsigned long zcache_put_to_flush;
- +static unsigned long zcache_aborted_preload;
- +static unsigned long zcache_aborted_shrink;
- +
- +/*
- + * Ensure that memory allocation requests in zcache don't result
- + * in direct reclaim requests via the shrinker, which would cause
- + * an infinite loop. Maybe a GFP flag would be better?
- + */
- +static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);
- +
- +/*
- + * for now, used named slabs so can easily track usage; later can
- + * either just use kmalloc, or perhaps add a slab-like allocator
- + * to more carefully manage total memory utilization
- + */
- +static struct kmem_cache *zcache_objnode_cache;
- +static struct kmem_cache *zcache_obj_cache;
- +static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
- +static unsigned long zcache_curr_obj_count_max;
- +static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
- +static unsigned long zcache_curr_objnode_count_max;
- +
- +/*
- + * to avoid memory allocation recursion (e.g. due to direct reclaim), we
- + * preload all necessary data structures so the hostops callbacks never
- + * actually do a malloc
- + */
- +struct zcache_preload {
- + void *page;
- + struct tmem_obj *obj;
- + int nr;
- + struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
- +};
- +static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
- +
- +static int zcache_do_preload(struct tmem_pool *pool)
- +{
- + struct zcache_preload *kp;
- + struct tmem_objnode *objnode;
- + struct tmem_obj *obj;
- + void *page;
- + int ret = -ENOMEM;
- +
- + if (unlikely(zcache_objnode_cache == NULL))
- + goto out;
- + if (unlikely(zcache_obj_cache == NULL))
- + goto out;
- + if (!spin_trylock(&zcache_direct_reclaim_lock)) {
- + zcache_aborted_preload++;
- + goto out;
- + }
- + preempt_disable();
- + kp = &__get_cpu_var(zcache_preloads);
- + while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
- + preempt_enable_no_resched();
- + objnode = kmem_cache_alloc(zcache_objnode_cache,
- + ZCACHE_GFP_MASK);
- + if (unlikely(objnode == NULL)) {
- + zcache_failed_alloc++;
- + goto unlock_out;
- + }
- + preempt_disable();
- + kp = &__get_cpu_var(zcache_preloads);
- + if (kp->nr < ARRAY_SIZE(kp->objnodes))
- + kp->objnodes[kp->nr++] = objnode;
- + else
- + kmem_cache_free(zcache_objnode_cache, objnode);
- + }
- + preempt_enable_no_resched();
- + obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
- + if (unlikely(obj == NULL)) {
- + zcache_failed_alloc++;
- + goto unlock_out;
- + }
- + page = (void *)__get_free_page(ZCACHE_GFP_MASK);
- + if (unlikely(page == NULL)) {
- + zcache_failed_get_free_pages++;
- + kmem_cache_free(zcache_obj_cache, obj);
- + goto unlock_out;
- + }
- + preempt_disable();
- + kp = &__get_cpu_var(zcache_preloads);
- + if (kp->obj == NULL)
- + kp->obj = obj;
- + else
- + kmem_cache_free(zcache_obj_cache, obj);
- + if (kp->page == NULL)
- + kp->page = page;
- + else
- + free_page((unsigned long)page);
- + ret = 0;
- +unlock_out:
- + spin_unlock(&zcache_direct_reclaim_lock);
- +out:
- + return ret;
- +}
- +
- +static void *zcache_get_free_page(void)
- +{
- + struct zcache_preload *kp;
- + void *page;
- +
- + kp = &__get_cpu_var(zcache_preloads);
- + page = kp->page;
- + BUG_ON(page == NULL);
- + kp->page = NULL;
- + return page;
- +}
- +
- +static void zcache_free_page(void *p)
- +{
- + free_page((unsigned long)p);
- +}
- +
- +/*
- + * zcache implementation for tmem host ops
- + */
- +
- +static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
- +{
- + struct tmem_objnode *objnode = NULL;
- + unsigned long count;
- + struct zcache_preload *kp;
- +
- + kp = &__get_cpu_var(zcache_preloads);
- + if (kp->nr <= 0)
- + goto out;
- + objnode = kp->objnodes[kp->nr - 1];
- + BUG_ON(objnode == NULL);
- + kp->objnodes[kp->nr - 1] = NULL;
- + kp->nr--;
- + count = atomic_inc_return(&zcache_curr_objnode_count);
- + if (count > zcache_curr_objnode_count_max)
- + zcache_curr_objnode_count_max = count;
- +out:
- + return objnode;
- +}
- +
- +static void zcache_objnode_free(struct tmem_objnode *objnode,
- + struct tmem_pool *pool)
- +{
- + atomic_dec(&zcache_curr_objnode_count);
- + BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
- + kmem_cache_free(zcache_objnode_cache, objnode);
- +}
- +
- +static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
- +{
- + struct tmem_obj *obj = NULL;
- + unsigned long count;
- + struct zcache_preload *kp;
- +
- + kp = &__get_cpu_var(zcache_preloads);
- + obj = kp->obj;
- + BUG_ON(obj == NULL);
- + kp->obj = NULL;
- + count = atomic_inc_return(&zcache_curr_obj_count);
- + if (count > zcache_curr_obj_count_max)
- + zcache_curr_obj_count_max = count;
- + return obj;
- +}
- +
- +static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
- +{
- + atomic_dec(&zcache_curr_obj_count);
- + BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
- + kmem_cache_free(zcache_obj_cache, obj);
- +}
- +
- +static struct tmem_hostops zcache_hostops = {
- + .obj_alloc = zcache_obj_alloc,
- + .obj_free = zcache_obj_free,
- + .objnode_alloc = zcache_objnode_alloc,
- + .objnode_free = zcache_objnode_free,
- +};
- +
- +/*
- + * zcache implementations for PAM page descriptor ops
- + */
- +
- +static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
- +static unsigned long zcache_curr_eph_pampd_count_max;
- +static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
- +static unsigned long zcache_curr_pers_pampd_count_max;
- +
- +/* forward reference */
- +static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
- +
- +static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
- + uint32_t index, struct page *page)
- +{
- + void *pampd = NULL, *cdata;
- + size_t clen;
- + int ret;
- + bool ephemeral = is_ephemeral(pool);
- + unsigned long count;
- +
- + if (ephemeral) {
- + ret = zcache_compress(page, &cdata, &clen);
- + if (ret == 0)
- +
- + goto out;
- + if (clen == 0 || clen > zbud_max_buddy_size()) {
- + zcache_compress_poor++;
- + goto out;
- + }
- + pampd = (void *)zbud_create(pool->pool_id, oid, index,
- + page, cdata, clen);
- + if (pampd != NULL) {
- + count = atomic_inc_return(&zcache_curr_eph_pampd_count);
- + if (count > zcache_curr_eph_pampd_count_max)
- + zcache_curr_eph_pampd_count_max = count;
- + }
- + } else {
- + /*
- + * FIXME: This is all the "policy" there is for now.
- + * 3/4 totpages should allow ~37% of RAM to be filled with
- + * compressed frontswap pages
- + */
- + if (atomic_read(&zcache_curr_pers_pampd_count) >
- + 3 * totalram_pages / 4)
- + goto out;
- + ret = zcache_compress(page, &cdata, &clen);
- + if (ret == 0)
- + goto out;
- + if (clen > zv_max_page_size) {
- + zcache_compress_poor++;
- + goto out;
- + }
- + pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
- + oid, index, cdata, clen);
- + if (pampd == NULL)
- + goto out;
- + count = atomic_inc_return(&zcache_curr_pers_pampd_count);
- + if (count > zcache_curr_pers_pampd_count_max)
- + zcache_curr_pers_pampd_count_max = count;
- + }
- +out:
- + return pampd;
- +}
- +
- +/*
- + * fill the pageframe corresponding to the struct page with the data
- + * from the passed pampd
- + */
- +static int zcache_pampd_get_data(struct page *page, void *pampd,
- + struct tmem_pool *pool)
- +{
- + int ret = 0;
- +
- + if (is_ephemeral(pool))
- + ret = zbud_decompress(page, pampd);
- + else
- + zv_decompress(page, pampd);
- + return ret;
- +}
- +
- +/*
- + * free the pampd and remove it from any zcache lists
- + * pampd must no longer be pointed to from any tmem data structures!
- + */
- +static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
- +{
- + if (is_ephemeral(pool)) {
- + zbud_free_and_delist((struct zbud_hdr *)pampd);
- + atomic_dec(&zcache_curr_eph_pampd_count);
- + BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
- + } else {
- + zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
- + atomic_dec(&zcache_curr_pers_pampd_count);
- + BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
- + }
- +}
- +
- +static struct tmem_pamops zcache_pamops = {
- + .create = zcache_pampd_create,
- + .get_data = zcache_pampd_get_data,
- + .free = zcache_pampd_free,
- +};
- +
- +/*
- + * zcache compression/decompression and related per-cpu stuff
- + */
- +
- +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
- +#define LZO_DSTMEM_PAGE_ORDER 1
- +static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
- +static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
- +
- +static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
- +{
- + int ret = 0;
- + unsigned char *dmem = __get_cpu_var(zcache_dstmem);
- + unsigned char *wmem = __get_cpu_var(zcache_workmem);
- + char *from_va;
- +
- + BUG_ON(!irqs_disabled());
- + if (unlikely(dmem == NULL || wmem == NULL))
- + goto out; /* no buffer, so can't compress */
- + from_va = kmap_atomic(from, KM_USER0);
- + mb();
- + ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
- + BUG_ON(ret != LZO_E_OK);
- + *out_va = dmem;
- + kunmap_atomic(from_va, KM_USER0);
- + ret = 1;
- +out:
- + return ret;
- +}
- +
- +
- +static int zcache_cpu_notifier(struct notifier_block *nb,
- + unsigned long action, void *pcpu)
- +{
- + int cpu = (long)pcpu;
- + struct zcache_preload *kp;
- +
- + switch (action) {
- + case CPU_UP_PREPARE:
- + per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
- + GFP_KERNEL | __GFP_REPEAT,
- + LZO_DSTMEM_PAGE_ORDER),
- + per_cpu(zcache_workmem, cpu) =
- + kzalloc(LZO1X_MEM_COMPRESS,
- + GFP_KERNEL | __GFP_REPEAT);
- + break;
- + case CPU_DEAD:
- + case CPU_UP_CANCELED:
- + free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
- + LZO_DSTMEM_PAGE_ORDER);
- + per_cpu(zcache_dstmem, cpu) = NULL;
- + kfree(per_cpu(zcache_workmem, cpu));
- + per_cpu(zcache_workmem, cpu) = NULL;
- + kp = &per_cpu(zcache_preloads, cpu);
- + while (kp->nr) {
- + kmem_cache_free(zcache_objnode_cache,
- + kp->objnodes[kp->nr - 1]);
- + kp->objnodes[kp->nr - 1] = NULL;
- + kp->nr--;
- + }
- + kmem_cache_free(zcache_obj_cache, kp->obj);
- + free_page((unsigned long)kp->page);
- + break;
- + default:
- + break;
- + }
- + return NOTIFY_OK;
- +}
- +
- +static struct notifier_block zcache_cpu_notifier_block = {
- + .notifier_call = zcache_cpu_notifier
- +};
- +
- +#ifdef CONFIG_SYSFS
- +#define ZCACHE_SYSFS_RO(_name) \
- + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
- + struct kobj_attribute *attr, char *buf) \
- + { \
- + return sprintf(buf, "%lu\n", zcache_##_name); \
- + } \
- + static struct kobj_attribute zcache_##_name##_attr = { \
- + .attr = { .name = __stringify(_name), .mode = 0444 }, \
- + .show = zcache_##_name##_show, \
- + }
- +
- +#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
- + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
- + struct kobj_attribute *attr, char *buf) \
- + { \
- + return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
- + } \
- + static struct kobj_attribute zcache_##_name##_attr = { \
- + .attr = { .name = __stringify(_name), .mode = 0444 }, \
- + .show = zcache_##_name##_show, \
- + }
- +
- +#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
- + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
- + struct kobj_attribute *attr, char *buf) \
- + { \
- + return _func(buf); \
- + } \
- + static struct kobj_attribute zcache_##_name##_attr = { \
- + .attr = { .name = __stringify(_name), .mode = 0444 }, \
- + .show = zcache_##_name##_show, \
- + }
- +
- +ZCACHE_SYSFS_RO(curr_obj_count_max);
- +ZCACHE_SYSFS_RO(curr_objnode_count_max);
- +ZCACHE_SYSFS_RO(flush_total);
- +ZCACHE_SYSFS_RO(flush_found);
- +ZCACHE_SYSFS_RO(flobj_total);
- +ZCACHE_SYSFS_RO(flobj_found);
- +ZCACHE_SYSFS_RO(failed_eph_puts);
- +ZCACHE_SYSFS_RO(failed_pers_puts);
- +ZCACHE_SYSFS_RO(zbud_curr_zbytes);
- +ZCACHE_SYSFS_RO(zbud_cumul_zpages);
- +ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
- +ZCACHE_SYSFS_RO(zbud_buddied_count);
- +ZCACHE_SYSFS_RO(zbpg_unused_list_count);
- +ZCACHE_SYSFS_RO(evicted_raw_pages);
- +ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
- +ZCACHE_SYSFS_RO(evicted_buddied_pages);
- +ZCACHE_SYSFS_RO(failed_get_free_pages);
- +ZCACHE_SYSFS_RO(failed_alloc);
- +ZCACHE_SYSFS_RO(put_to_flush);
- +ZCACHE_SYSFS_RO(aborted_preload);
- +ZCACHE_SYSFS_RO(aborted_shrink);
- +ZCACHE_SYSFS_RO(compress_poor);
- +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
- +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
- +ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
- +ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
- +ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
- + zbud_show_unbuddied_list_counts);
- +ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
- + zbud_show_cumul_chunk_counts);
- +
- +static struct attribute *zcache_attrs[] = {
- + &zcache_curr_obj_count_attr.attr,
- + &zcache_curr_obj_count_max_attr.attr,
- + &zcache_curr_objnode_count_attr.attr,
- + &zcache_curr_objnode_count_max_attr.attr,
- + &zcache_flush_total_attr.attr,
- + &zcache_flobj_total_attr.attr,
- + &zcache_flush_found_attr.attr,
- + &zcache_flobj_found_attr.attr,
- + &zcache_failed_eph_puts_attr.attr,
- + &zcache_failed_pers_puts_attr.attr,
- + &zcache_compress_poor_attr.attr,
- + &zcache_zbud_curr_raw_pages_attr.attr,
- + &zcache_zbud_curr_zpages_attr.attr,
- + &zcache_zbud_curr_zbytes_attr.attr,
- + &zcache_zbud_cumul_zpages_attr.attr,
- + &zcache_zbud_cumul_zbytes_attr.attr,
- + &zcache_zbud_buddied_count_attr.attr,
- + &zcache_zbpg_unused_list_count_attr.attr,
- + &zcache_evicted_raw_pages_attr.attr,
- + &zcache_evicted_unbuddied_pages_attr.attr,
- + &zcache_evicted_buddied_pages_attr.attr,
- + &zcache_failed_get_free_pages_attr.attr,
- + &zcache_failed_alloc_attr.attr,
- + &zcache_put_to_flush_attr.attr,
- + &zcache_aborted_preload_attr.attr,
- + &zcache_aborted_shrink_attr.attr,
- + &zcache_zbud_unbuddied_list_counts_attr.attr,
- + &zcache_zbud_cumul_chunk_counts_attr.attr,
- + NULL,
- +};
- +
- +static struct attribute_group zcache_attr_group = {
- + .attrs = zcache_attrs,
- + .name = "zcache",
- +};
- +
- +#endif /* CONFIG_SYSFS */
- +/*
- + * When zcache is disabled ("frozen"), pools can be created and destroyed,
- + * but all puts (and thus all other operations that require memory allocation)
- + * must fail. If zcache is unfrozen, accepts puts, then frozen again,
- + * data consistency requires all puts while frozen to be converted into
- + * flushes.
- + */
- +static bool zcache_freeze;
- +
- +/*
- + * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
- + */
- +static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
- +{
- + int ret = -1;
- +
- + if (nr >= 0) {
- + if (!(gfp_mask & __GFP_FS))
- + /* does this case really need to be skipped? */
- + goto out;
- + if (spin_trylock(&zcache_direct_reclaim_lock)) {
- + zbud_evict_pages(nr);
- + spin_unlock(&zcache_direct_reclaim_lock);
- + } else
- + zcache_aborted_shrink++;
- + }
- + ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
- +out:
- + return ret;
- +}
- +
- +static struct shrinker zcache_shrinker = {
- + .shrink = shrink_zcache_memory,
- + .seeks = DEFAULT_SEEKS,
- +};
- +
- +/*
- + * zcache shims between cleancache/frontswap ops and tmem
- + */
- +
- +static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
- + uint32_t index, struct page *page)
- +{
- + struct tmem_pool *pool;
- + int ret = -1;
- +
- + BUG_ON(!irqs_disabled());
- + pool = zcache_get_pool_by_id(pool_id);
- + if (unlikely(pool == NULL))
- + goto out;
- + if (!zcache_freeze && zcache_do_preload(pool) == 0) {
- + /* preload does preempt_disable on success */
- + ret = tmem_put(pool, oidp, index, page);
- + if (ret < 0) {
- + if (is_ephemeral(pool))
- + zcache_failed_eph_puts++;
- + else
- + zcache_failed_pers_puts++;
- + }
- + zcache_put_pool(pool);
- + preempt_enable_no_resched();
- + } else {
- + zcache_put_to_flush++;
- + if (atomic_read(&pool->obj_count) > 0)
- + /* the put fails whether the flush succeeds or not */
- + (void)tmem_flush_page(pool, oidp, index);
- + zcache_put_pool(pool);
- + }
- +out:
- + return ret;
- +}
- +
- +static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
- + uint32_t index, struct page *page)
- +{
- + struct tmem_pool *pool;
- + int ret = -1;
- + unsigned long flags;
- +
- + local_irq_save(flags);
- + pool = zcache_get_pool_by_id(pool_id);
- + if (likely(pool != NULL)) {
- + if (atomic_read(&pool->obj_count) > 0)
- + ret = tmem_get(pool, oidp, index, page);
- + zcache_put_pool(pool);
- + }
- + local_irq_restore(flags);
- + return ret;
- +}
- +
- +static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
- +{
- + struct tmem_pool *pool;
- + int ret = -1;
- + unsigned long flags;
- +
- + local_irq_save(flags);
- + zcache_flush_total++;
- + pool = zcache_get_pool_by_id(pool_id);
- + if (likely(pool != NULL)) {
- + if (atomic_read(&pool->obj_count) > 0)
- + ret = tmem_flush_page(pool, oidp, index);
- + zcache_put_pool(pool);
- + }
- + if (ret >= 0)
- + zcache_flush_found++;
- + local_irq_restore(flags);
- + return ret;
- +}
- +
- +static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
- +{
- + struct tmem_pool *pool;
- + int ret = -1;
- + unsigned long flags;
- +
- + local_irq_save(flags);
- + zcache_flobj_total++;
- + pool = zcache_get_pool_by_id(pool_id);
- + if (likely(pool != NULL)) {
- + if (atomic_read(&pool->obj_count) > 0)
- + ret = tmem_flush_object(pool, oidp);
- + zcache_put_pool(pool);
- + }
- + if (ret >= 0)
- + zcache_flobj_found++;
- + local_irq_restore(flags);
- + return ret;
- +}
- +
- +static int zcache_destroy_pool(int pool_id)
- +{
- + struct tmem_pool *pool = NULL;
- + int ret = -1;
- +
- + if (pool_id < 0)
- + goto out;
- + pool = zcache_client.tmem_pools[pool_id];
- + if (pool == NULL)
- + goto out;
- + zcache_client.tmem_pools[pool_id] = NULL;
- + /* wait for pool activity on other cpus to quiesce */
- + while (atomic_read(&pool->refcount) != 0)
- + ;
- + local_bh_disable();
- + ret = tmem_destroy_pool(pool);
- + local_bh_enable();
- + kfree(pool);
- + pr_info("zcache: destroyed pool id=%d\n", pool_id);
- +out:
- + return ret;
- +}
- +
- +static int zcache_new_pool(uint32_t flags)
- +{
- + int poolid = -1;
- + struct tmem_pool *pool;
- +
- + pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
- + if (pool == NULL) {
- + pr_info("zcache: pool creation failed: out of memory\n");
- + goto out;
- + }
- +
- + for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
- + if (zcache_client.tmem_pools[poolid] == NULL)
- + break;
- + if (poolid >= MAX_POOLS_PER_CLIENT) {
- + pr_info("zcache: pool creation failed: max exceeded\n");
- + kfree(pool);
- + poolid = -1;
- + goto out;
- + }
- + atomic_set(&pool->refcount, 0);
- + pool->client = &zcache_client;
- + pool->pool_id = poolid;
- + tmem_new_pool(pool, flags);
- + zcache_client.tmem_pools[poolid] = pool;
- + pr_info("zcache: created %s tmem pool, id=%d\n",
- + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
- + poolid);
- +out:
- + return poolid;
- +}
- +
- +/**********
- + * Two kernel functionalities currently can be layered on top of tmem.
- + * These are "cleancache" which is used as a second-chance cache for clean
- + * page cache pages; and "frontswap" which is used for swap pages
- + * to avoid writes to disk. A generic "shim" is provided here for each
- + * to translate in-kernel semantics to zcache semantics.
- + */
- +
- +#ifdef CONFIG_CLEANCACHE
- +static void zcache_cleancache_put_page(int pool_id,
- + struct cleancache_filekey key,
- + pgoff_t index, struct page *page)
- +{
- + u32 ind = (u32) index;
- + struct tmem_oid oid = *(struct tmem_oid *)&key;
- +
- + if (likely(ind == index))
- + (void)zcache_put_page(pool_id, &oid, index, page);
- +}
- +
- +static int zcache_cleancache_get_page(int pool_id,
- + struct cleancache_filekey key,
- + pgoff_t index, struct page *page)
- +{
- + u32 ind = (u32) index;
- + struct tmem_oid oid = *(struct tmem_oid *)&key;
- + int ret = -1;
- +
- + if (likely(ind == index))
- + ret = zcache_get_page(pool_id, &oid, index, page);
- + return ret;
- +}
- +
- +static void zcache_cleancache_flush_page(int pool_id,
- + struct cleancache_filekey key,
- + pgoff_t index)
- +{
- + u32 ind = (u32) index;
- + struct tmem_oid oid = *(struct tmem_oid *)&key;
- +
- + if (likely(ind == index))
- + (void)zcache_flush_page(pool_id, &oid, ind);
- +}
- +
- +static void zcache_cleancache_flush_inode(int pool_id,
- + struct cleancache_filekey key)
- +{
- + struct tmem_oid oid = *(struct tmem_oid *)&key;
- +
- + (void)zcache_flush_object(pool_id, &oid);
- +}
- +
- +static void zcache_cleancache_flush_fs(int pool_id)
- +{
- + if (pool_id >= 0)
- + (void)zcache_destroy_pool(pool_id);
- +}
- +
- +static int zcache_cleancache_init_fs(size_t pagesize)
- +{
- + BUG_ON(sizeof(struct cleancache_filekey) !=
- + sizeof(struct tmem_oid));
- + BUG_ON(pagesize != PAGE_SIZE);
- + return zcache_new_pool(0);
- +}
- +
- +static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
- +{
- + /* shared pools are unsupported and map to private */
- + BUG_ON(sizeof(struct cleancache_filekey) !=
- + sizeof(struct tmem_oid));
- + BUG_ON(pagesize != PAGE_SIZE);
- + return zcache_new_pool(0);
- +}
- +
- +static struct cleancache_ops zcache_cleancache_ops = {
- + .put_page = zcache_cleancache_put_page,
- + .get_page = zcache_cleancache_get_page,
- + .flush_page = zcache_cleancache_flush_page,
- + .flush_inode = zcache_cleancache_flush_inode,
- + .flush_fs = zcache_cleancache_flush_fs,
- + .init_shared_fs = zcache_cleancache_init_shared_fs,
- + .init_fs = zcache_cleancache_init_fs
- +};
- +
- +struct cleancache_ops zcache_cleancache_register_ops(void)
- +{
- + struct cleancache_ops old_ops =
- + cleancache_register_ops(&zcache_cleancache_ops);
- +
- + return old_ops;
- +}
- +#endif
- +
- +#ifdef CONFIG_FRONTSWAP
- +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
- +static int zcache_frontswap_poolid = -1;
- +
- +/*
- + * Swizzling increases objects per swaptype, increasing tmem concurrency
- + * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
- + */
- +#define SWIZ_BITS 4
- +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
- +#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
- +#define iswiz(_ind) (_ind >> SWIZ_BITS)
- +
- +static inline struct tmem_oid oswiz(unsigned type, u32 ind)
- +{
- + struct tmem_oid oid = { .oid = { 0 } };
- + oid.oid[0] = _oswiz(type, ind);
- + return oid;
- +}
- +
- +static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
- + struct page *page)
- +{
- + u64 ind64 = (u64)offset;
- + u32 ind = (u32)offset;
- + struct tmem_oid oid = oswiz(type, ind);
- + int ret = -1;
- + unsigned long flags;
- +
- + BUG_ON(!PageLocked(page));
- + if (likely(ind64 == ind)) {
- + local_irq_save(flags);
- + ret = zcache_put_page(zcache_frontswap_poolid, &oid,
- + iswiz(ind), page);
- + local_irq_restore(flags);
- + }
- + return ret;
- +}
- +
- +/* returns 0 if the page was successfully gotten from frontswap, -1 if
- + * it was not present (should never happen!) */
- +static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
- + struct page *page)
- +{
- + u64 ind64 = (u64)offset;
- + u32 ind = (u32)offset;
- + struct tmem_oid oid = oswiz(type, ind);
- + int ret = -1;
- +
- + BUG_ON(!PageLocked(page));
- + if (likely(ind64 == ind))
- + ret = zcache_get_page(zcache_frontswap_poolid, &oid,
- + iswiz(ind), page);
- + return ret;
- +}
- +
- +/* flush a single page from frontswap */
- +static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
- +{
- + u64 ind64 = (u64)offset;
- + u32 ind = (u32)offset;
- + struct tmem_oid oid = oswiz(type, ind);
- +
- + if (likely(ind64 == ind))
- + (void)zcache_flush_page(zcache_frontswap_poolid, &oid,
- + iswiz(ind));
- +}
- +
- +/* flush all pages from the passed swaptype */
- +static void zcache_frontswap_flush_area(unsigned type)
- +{
- + struct tmem_oid oid;
- + int ind;
- +
- + for (ind = SWIZ_MASK; ind >= 0; ind--) {
- + oid = oswiz(type, ind);
- + (void)zcache_flush_object(zcache_frontswap_poolid, &oid);
- + }
- +}
- +
- +static void zcache_frontswap_init(unsigned ignored)
- +{
- + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
- + if (zcache_frontswap_poolid < 0)
- + zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
- +}
- +
- +static struct frontswap_ops zcache_frontswap_ops = {
- + .put_page = zcache_frontswap_put_page,
- + .get_page = zcache_frontswap_get_page,
- + .flush_page = zcache_frontswap_flush_page,
- + .flush_area = zcache_frontswap_flush_area,
- + .init = zcache_frontswap_init
- +};
- +
- +struct frontswap_ops zcache_frontswap_register_ops(void)
- +{
- + struct frontswap_ops old_ops =
- + frontswap_register_ops(&zcache_frontswap_ops);
- +
- + return old_ops;
- +}
- +#endif
- +
- +/*
- + * zcache initialization
- + * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
- + * NOTHING HAPPENS!
- + */
- +
- +static int zcache_enabled;
- +
- +static int __init enable_zcache(char *s)
- +{
- + zcache_enabled = 1;
- + return 1;
- +}
- +__setup("zcache", enable_zcache);
- +
- +/* allow independent dynamic disabling of cleancache and frontswap */
- +
- +static int use_cleancache = 1;
- +
- +static int __init no_cleancache(char *s)
- +{
- + use_cleancache = 0;
- + return 1;
- +}
- +
- +__setup("nocleancache", no_cleancache);
- +
- +static int use_frontswap = 1;
- +
- +static int __init no_frontswap(char *s)
- +{
- + use_frontswap = 0;
- + return 1;
- +}
- +
- +__setup("nofrontswap", no_frontswap);
- +
- +static int __init zcache_init(void)
- +{
- + int ret = 0;
- +
- +#ifdef CONFIG_SYSFS
- + ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
- + if (ret) {
- + pr_err("zcache: can't create sysfs\n");
- + goto out;
- + }
- +#endif /* CONFIG_SYSFS */
- +#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
- + if (zcache_enabled) {
- + unsigned int cpu;
- +
- + tmem_register_hostops(&zcache_hostops);
- + tmem_register_pamops(&zcache_pamops);
- + ret = register_cpu_notifier(&zcache_cpu_notifier_block);
- + if (ret) {
- + pr_err("zcache: can't register cpu notifier\n");
- + goto out;
- + }
- + for_each_online_cpu(cpu) {
- + void *pcpu = (void *)(long)cpu;
- + zcache_cpu_notifier(&zcache_cpu_notifier_block,
- + CPU_UP_PREPARE, pcpu);
- + }
- + }
- + zcache_objnode_cache = kmem_cache_create("zcache_objnode",
- + sizeof(struct tmem_objnode), 0, 0, NULL);
- + zcache_obj_cache = kmem_cache_create("zcache_obj",
- + sizeof(struct tmem_obj), 0, 0, NULL);
- +#endif
- +#ifdef CONFIG_CLEANCACHE
- + if (zcache_enabled && use_cleancache) {
- + struct cleancache_ops old_ops;
- +
- + zbud_init();
- + register_shrinker(&zcache_shrinker);
- + old_ops = zcache_cleancache_register_ops();
- + pr_info("zcache: cleancache enabled using kernel "
- + "transcendent memory and compression buddies\n");
- + if (old_ops.init_fs != NULL)
- + pr_warning("zcache: cleancache_ops overridden");
- + }
- +#endif
- +#ifdef CONFIG_FRONTSWAP
- + if (zcache_enabled && use_frontswap) {
- + struct frontswap_ops old_ops;
- +
- + zcache_client.xvpool = xv_create_pool();
- + if (zcache_client.xvpool == NULL) {
- + pr_err("zcache: can't create xvpool\n");
- + goto out;
- + }
- + old_ops = zcache_frontswap_register_ops();
- + pr_info("zcache: frontswap enabled using kernel "
- + "transcendent memory and xvmalloc\n");
- + if (old_ops.init != NULL)
- + pr_warning("ktmem: frontswap_ops overridden");
- + }
- +#endif
- +out:
- + return ret;
- +}
- +
- +module_init(zcache_init)
- diff -Nrupad linux-2.6.37//drivers/staging/zram/Kconfig linux-2.6.37_vanilla//drivers/staging/zram/Kconfig
- --- linux-2.6.37//drivers/staging/zram/Kconfig 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zram/Kconfig 2011-02-14 01:22:46.470793204 +0100
- @@ -15,3 +15,11 @@ config ZRAM
- See zram.txt for more information.
- Project home: http://compcache.googlecode.com/
- +
- +config ZRAM_DEBUG
- + bool "Compressed RAM block device debug support"
- + depends on ZRAM
- + default n
- + help
- + This option adds additional debugging code to the compressed
- + RAM block device driver.
- diff -Nrupad linux-2.6.37//drivers/staging/zram/xvmalloc.c linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc.c
- --- linux-2.6.37//drivers/staging/zram/xvmalloc.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc.c 2011-02-14 01:24:56.564792988 +0100
- @@ -10,6 +10,10 @@
- * Released under the terms of GNU General Public License Version 2.0
- */
- +#ifdef CONFIG_ZRAM_DEBUG
- +#define DEBUG
- +#endif
- +
- #include <linux/bitops.h>
- #include <linux/errno.h>
- #include <linux/highmem.h>
- @@ -187,7 +191,7 @@ static void insert_block(struct xv_pool
- slindex = get_index_for_insert(block->size);
- flindex = slindex / BITS_PER_LONG;
- - block->link.prev_page = 0;
- + block->link.prev_page = NULL;
- block->link.prev_offset = 0;
- block->link.next_page = pool->freelist[slindex].page;
- block->link.next_offset = pool->freelist[slindex].offset;
- @@ -200,6 +204,8 @@ static void insert_block(struct xv_pool
- nextblock->link.prev_page = page;
- nextblock->link.prev_offset = offset;
- put_ptr_atomic(nextblock, KM_USER1);
- + /* If there was a next page then the free bits are set. */
- + return;
- }
- __set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
- @@ -207,54 +213,14 @@ static void insert_block(struct xv_pool
- }
- /*
- - * Remove block from head of freelist. Index 'slindex' identifies the freelist.
- - */
- -static void remove_block_head(struct xv_pool *pool,
- - struct block_header *block, u32 slindex)
- -{
- - struct block_header *tmpblock;
- - u32 flindex = slindex / BITS_PER_LONG;
- -
- - pool->freelist[slindex].page = block->link.next_page;
- - pool->freelist[slindex].offset = block->link.next_offset;
- - block->link.prev_page = 0;
- - block->link.prev_offset = 0;
- -
- - if (!pool->freelist[slindex].page) {
- - __clear_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
- - if (!pool->slbitmap[flindex])
- - __clear_bit(flindex, &pool->flbitmap);
- - } else {
- - /*
- - * DEBUG ONLY: We need not reinitialize freelist head previous
- - * pointer to 0 - we never depend on its value. But just for
- - * sanity, lets do it.
- - */
- - tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
- - pool->freelist[slindex].offset, KM_USER1);
- - tmpblock->link.prev_page = 0;
- - tmpblock->link.prev_offset = 0;
- - put_ptr_atomic(tmpblock, KM_USER1);
- - }
- -}
- -
- -/*
- * Remove block from freelist. Index 'slindex' identifies the freelist.
- */
- static void remove_block(struct xv_pool *pool, struct page *page, u32 offset,
- struct block_header *block, u32 slindex)
- {
- - u32 flindex;
- + u32 flindex = slindex / BITS_PER_LONG;
- struct block_header *tmpblock;
- - if (pool->freelist[slindex].page == page
- - && pool->freelist[slindex].offset == offset) {
- - remove_block_head(pool, block, slindex);
- - return;
- - }
- -
- - flindex = slindex / BITS_PER_LONG;
- -
- if (block->link.prev_page) {
- tmpblock = get_ptr_atomic(block->link.prev_page,
- block->link.prev_offset, KM_USER1);
- @@ -270,6 +236,35 @@ static void remove_block(struct xv_pool
- tmpblock->link.prev_offset = block->link.prev_offset;
- put_ptr_atomic(tmpblock, KM_USER1);
- }
- +
- + /* Is this block at the head of the freelist? */
- + if (pool->freelist[slindex].page == page
- + && pool->freelist[slindex].offset == offset) {
- +
- + pool->freelist[slindex].page = block->link.next_page;
- + pool->freelist[slindex].offset = block->link.next_offset;
- +
- + if (pool->freelist[slindex].page) {
- + struct block_header *tmpblock;
- + tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
- + pool->freelist[slindex].offset,
- + KM_USER1);
- + tmpblock->link.prev_page = NULL;
- + tmpblock->link.prev_offset = 0;
- + put_ptr_atomic(tmpblock, KM_USER1);
- + } else {
- + /* This freelist bucket is empty */
- + __clear_bit(slindex % BITS_PER_LONG,
- + &pool->slbitmap[flindex]);
- + if (!pool->slbitmap[flindex])
- + __clear_bit(flindex, &pool->flbitmap);
- + }
- + }
- +
- + block->link.prev_page = NULL;
- + block->link.prev_offset = 0;
- + block->link.next_page = NULL;
- + block->link.next_offset = 0;
- }
- /*
- @@ -378,7 +373,7 @@ int xv_malloc(struct xv_pool *pool, u32
- block = get_ptr_atomic(*page, *offset, KM_USER0);
- - remove_block_head(pool, block, index);
- + remove_block(pool, *page, *offset, block, index);
- /* Split the block if required */
- tmpoffset = *offset + size + XV_ALIGN;
- diff -Nrupad linux-2.6.37//drivers/staging/zram/xvmalloc_int.h linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc_int.h
- --- linux-2.6.37//drivers/staging/zram/xvmalloc_int.h 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zram/xvmalloc_int.h 2011-02-14 01:22:50.990793071 +0100
- @@ -19,7 +19,11 @@
- /* User configurable params */
- /* Must be power of two */
- +#ifdef CONFIG_64BIT
- +#define XV_ALIGN_SHIFT 3
- +#else
- #define XV_ALIGN_SHIFT 2
- +#endif
- #define XV_ALIGN (1 << XV_ALIGN_SHIFT)
- #define XV_ALIGN_MASK (XV_ALIGN - 1)
- @@ -27,8 +31,16 @@
- #define XV_MIN_ALLOC_SIZE 32
- #define XV_MAX_ALLOC_SIZE (PAGE_SIZE - XV_ALIGN)
- -/* Free lists are separated by FL_DELTA bytes */
- -#define FL_DELTA_SHIFT 3
- +/*
- + * Free lists are separated by FL_DELTA bytes
- + * This value is 3 for 4k pages and 4 for 64k pages, for any
- + * other page size, a conservative (PAGE_SHIFT - 9) is used.
- + */
- +#if PAGE_SHIFT == 16
- +#define FL_DELTA_SHIFT 4
- +#else
- +#define FL_DELTA_SHIFT (PAGE_SHIFT - 9)
- +#endif
- #define FL_DELTA (1 << FL_DELTA_SHIFT)
- #define FL_DELTA_MASK (FL_DELTA - 1)
- #define NUM_FREE_LISTS ((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \
- @@ -75,12 +87,9 @@ struct block_header {
- struct xv_pool {
- ulong flbitmap;
- ulong slbitmap[MAX_FLI];
- - spinlock_t lock;
- -
- + u64 total_pages; /* stats */
- struct freelist_entry freelist[NUM_FREE_LISTS];
- -
- - /* stats */
- - u64 total_pages;
- + spinlock_t lock;
- };
- #endif
- diff -Nrupad linux-2.6.37//drivers/staging/zram/zram_drv.c linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.c
- --- linux-2.6.37//drivers/staging/zram/zram_drv.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.c 2011-02-14 01:24:29.924793006 +0100
- @@ -15,6 +15,10 @@
- #define KMSG_COMPONENT "zram"
- #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
- +#ifdef CONFIG_ZRAM_DEBUG
- +#define DEBUG
- +#endif
- +
- #include <linux/module.h>
- #include <linux/kernel.h>
- #include <linux/bio.h>
- @@ -227,6 +231,7 @@ static int zram_read(struct zram *zram,
- if (zram_test_flag(zram, index, ZRAM_ZERO)) {
- handle_zero_page(page);
- + index++;
- continue;
- }
- @@ -234,13 +239,15 @@ static int zram_read(struct zram *zram,
- if (unlikely(!zram->table[index].page)) {
- pr_debug("Read before write: sector=%lu, size=%u",
- (ulong)(bio->bi_sector), bio->bi_size);
- - /* Do nothing */
- + handle_zero_page(page);
- + index++;
- continue;
- }
- /* Page is stored uncompressed since it's incompressible */
- if (unlikely(zram_test_flag(zram, index, ZRAM_UNCOMPRESSED))) {
- handle_uncompressed_page(zram, page, index);
- + index++;
- continue;
- }
- @@ -320,6 +327,7 @@ static int zram_write(struct zram *zram,
- mutex_unlock(&zram->lock);
- zram_stat_inc(&zram->stats.pages_zero);
- zram_set_flag(zram, index, ZRAM_ZERO);
- + index++;
- continue;
- }
- @@ -621,7 +629,8 @@ static int create_device(struct zram *zr
- * and n*PAGE_SIZED sized I/O requests.
- */
- blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
- - blk_queue_logical_block_size(zram->disk->queue, PAGE_SIZE);
- + blk_queue_logical_block_size(zram->disk->queue,
- + ZRAM_LOGICAL_BLOCK_SIZE);
- blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
- blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
- diff -Nrupad linux-2.6.37//drivers/staging/zram/zram_drv.h linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.h
- --- linux-2.6.37//drivers/staging/zram/zram_drv.h 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//drivers/staging/zram/zram_drv.h 2011-02-14 01:22:38.055793098 +0100
- @@ -61,6 +61,7 @@ static const unsigned max_zpage_size = P
- #define SECTOR_SIZE (1 << SECTOR_SHIFT)
- #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
- #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT)
- +#define ZRAM_LOGICAL_BLOCK_SIZE 4096
- /* Flags for zram pages (table[page_no].flags) */
- enum zram_pageflags {
- diff -Nrupad linux-2.6.37//fs/btrfs/extent_io.c linux-2.6.37_vanilla//fs/btrfs/extent_io.c
- --- linux-2.6.37//fs/btrfs/extent_io.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/btrfs/extent_io.c 2011-02-14 01:21:43.164793068 +0100
- @@ -10,6 +10,7 @@
- #include <linux/swap.h>
- #include <linux/writeback.h>
- #include <linux/pagevec.h>
- +#include <linux/cleancache.h>
- #include "extent_io.h"
- #include "extent_map.h"
- #include "compat.h"
- @@ -1981,6 +1982,13 @@ static int __extent_read_full_page(struc
- set_page_extent_mapped(page);
- + if (!PageUptodate(page)) {
- + if (cleancache_get_page(page) == 0) {
- + BUG_ON(blocksize != PAGE_SIZE);
- + goto out;
- + }
- + }
- +
- end = page_end;
- while (1) {
- lock_extent(tree, start, end, GFP_NOFS);
- @@ -2105,6 +2113,7 @@ static int __extent_read_full_page(struc
- cur = cur + iosize;
- page_offset += iosize;
- }
- +out:
- if (!nr) {
- if (!PageError(page))
- SetPageUptodate(page);
- diff -Nrupad linux-2.6.37//fs/btrfs/super.c linux-2.6.37_vanilla//fs/btrfs/super.c
- --- linux-2.6.37//fs/btrfs/super.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/btrfs/super.c 2011-02-14 01:21:43.164793068 +0100
- @@ -39,6 +39,7 @@
- #include <linux/miscdevice.h>
- #include <linux/magic.h>
- #include <linux/slab.h>
- +#include <linux/cleancache.h>
- #include "compat.h"
- #include "ctree.h"
- #include "disk-io.h"
- @@ -494,6 +495,7 @@ static int btrfs_fill_super(struct super
- sb->s_root = root_dentry;
- save_mount_options(sb, data);
- + cleancache_init_fs(sb);
- return 0;
- fail_close:
- diff -Nrupad linux-2.6.37//fs/buffer.c linux-2.6.37_vanilla//fs/buffer.c
- --- linux-2.6.37//fs/buffer.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/buffer.c 2011-02-14 01:21:43.165793086 +0100
- @@ -41,6 +41,7 @@
- #include <linux/bitops.h>
- #include <linux/mpage.h>
- #include <linux/bit_spinlock.h>
- +#include <linux/cleancache.h>
- static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
- @@ -277,6 +278,10 @@ void invalidate_bdev(struct block_device
- invalidate_bh_lrus();
- lru_add_drain_all(); /* make sure all lru add caches are flushed */
- invalidate_mapping_pages(mapping, 0, -1);
- + /* 99% of the time, we don't need to flush the cleancache on the bdev.
- + * But, for the strange corners, let's be cautious
- + */
- + cleancache_flush_inode(mapping);
- }
- EXPORT_SYMBOL(invalidate_bdev);
- diff -Nrupad linux-2.6.37//fs/ext3/super.c linux-2.6.37_vanilla//fs/ext3/super.c
- --- linux-2.6.37//fs/ext3/super.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/ext3/super.c 2011-02-14 01:21:43.166793102 +0100
- @@ -36,6 +36,7 @@
- #include <linux/quotaops.h>
- #include <linux/seq_file.h>
- #include <linux/log2.h>
- +#include <linux/cleancache.h>
- #include <asm/uaccess.h>
- @@ -1343,6 +1344,7 @@ static int ext3_setup_super(struct super
- } else {
- ext3_msg(sb, KERN_INFO, "using internal journal");
- }
- + cleancache_init_fs(sb);
- return res;
- }
- diff -Nrupad linux-2.6.37//fs/ext4/super.c linux-2.6.37_vanilla//fs/ext4/super.c
- --- linux-2.6.37//fs/ext4/super.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/ext4/super.c 2011-02-14 01:21:43.168793127 +0100
- @@ -38,6 +38,7 @@
- #include <linux/ctype.h>
- #include <linux/log2.h>
- #include <linux/crc16.h>
- +#include <linux/cleancache.h>
- #include <asm/uaccess.h>
- #include <linux/kthread.h>
- @@ -1902,6 +1903,7 @@ static int ext4_setup_super(struct super
- EXT4_INODES_PER_GROUP(sb),
- sbi->s_mount_opt);
- + cleancache_init_fs(sb);
- return res;
- }
- diff -Nrupad linux-2.6.37//fs/mpage.c linux-2.6.37_vanilla//fs/mpage.c
- --- linux-2.6.37//fs/mpage.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/mpage.c 2011-02-14 01:21:43.168793127 +0100
- @@ -27,6 +27,7 @@
- #include <linux/writeback.h>
- #include <linux/backing-dev.h>
- #include <linux/pagevec.h>
- +#include <linux/cleancache.h>
- /*
- * I/O completion handler for multipage BIOs.
- @@ -286,6 +287,12 @@ do_mpage_readpage(struct bio *bio, struc
- SetPageMappedToDisk(page);
- }
- + if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
- + cleancache_get_page(page) == 0) {
- + SetPageUptodate(page);
- + goto confused;
- + }
- +
- /*
- * This page will go to BIO. Do we need to send this BIO off first?
- */
- diff -Nrupad linux-2.6.37//fs/ocfs2/super.c linux-2.6.37_vanilla//fs/ocfs2/super.c
- --- linux-2.6.37//fs/ocfs2/super.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/ocfs2/super.c 2011-02-14 01:21:43.169793144 +0100
- @@ -41,6 +41,7 @@
- #include <linux/mount.h>
- #include <linux/seq_file.h>
- #include <linux/quotaops.h>
- +#include <linux/cleancache.h>
- #define MLOG_MASK_PREFIX ML_SUPER
- #include <cluster/masklog.h>
- @@ -2366,6 +2367,7 @@ static int ocfs2_initialize_super(struct
- mlog_errno(status);
- goto bail;
- }
- + cleancache_init_shared_fs((char *)&uuid_net_key, sb);
- bail:
- mlog_exit(status);
- diff -Nrupad linux-2.6.37//fs/reiserfs/prints.c linux-2.6.37_vanilla//fs/reiserfs/prints.c
- --- linux-2.6.37//fs/reiserfs/prints.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/reiserfs/prints.c 2011-02-14 01:20:50.468793185 +0100
- @@ -586,13 +586,13 @@ void print_block(struct buffer_head *bh,
- va_list args;
- int mode, first, last;
- - va_start(args, bh);
- -
- if (!bh) {
- printk("print_block: buffer is NULL\n");
- return;
- }
- + va_start(args, bh);
- +
- mode = va_arg(args, int);
- first = va_arg(args, int);
- last = va_arg(args, int);
- diff -Nrupad linux-2.6.37//fs/reiserfs/super.c linux-2.6.37_vanilla//fs/reiserfs/super.c
- --- linux-2.6.37//fs/reiserfs/super.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/reiserfs/super.c 2011-02-14 01:21:07.821793171 +0100
- @@ -237,7 +237,7 @@ static int finish_unfinished(struct supe
- pathrelse(&path);
- inode = reiserfs_iget(s, &obj_key);
- - if (!inode) {
- + if (IS_ERR_OR_NULL(inode)) {
- /* the unlink almost completed, it just did not manage to remove
- "save" link and release objectid */
- reiserfs_warning(s, "vs-2180", "iget failed for %K",
- diff -Nrupad linux-2.6.37//fs/super.c linux-2.6.37_vanilla//fs/super.c
- --- linux-2.6.37//fs/super.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//fs/super.c 2011-02-14 01:21:43.169793144 +0100
- @@ -30,6 +30,7 @@
- #include <linux/idr.h>
- #include <linux/mutex.h>
- #include <linux/backing-dev.h>
- +#include <linux/cleancache.h>
- #include "internal.h"
- @@ -110,6 +111,7 @@ static struct super_block *alloc_super(s
- s->s_maxbytes = MAX_NON_LFS;
- s->s_op = &default_op;
- s->s_time_gran = 1000000000;
- + s->cleancache_poolid = -1;
- }
- out:
- return s;
- @@ -176,6 +178,7 @@ void deactivate_locked_super(struct supe
- struct file_system_type *fs = s->s_type;
- if (atomic_dec_and_test(&s->s_active)) {
- fs->kill_sb(s);
- + cleancache_flush_fs(s);
- put_filesystem(fs);
- put_super(s);
- } else {
- diff -Nrupad linux-2.6.37//include/linux/cleancache.h linux-2.6.37_vanilla//include/linux/cleancache.h
- --- linux-2.6.37//include/linux/cleancache.h 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//include/linux/cleancache.h 2011-02-14 01:21:43.169793144 +0100
- @@ -0,0 +1,118 @@
- +#ifndef _LINUX_CLEANCACHE_H
- +#define _LINUX_CLEANCACHE_H
- +
- +#include <linux/fs.h>
- +#include <linux/exportfs.h>
- +#include <linux/mm.h>
- +
- +#define CLEANCACHE_KEY_MAX 6
- +
- +/*
- + * cleancache requires every file with a page in cleancache to have a
- + * unique key unless/until the file is removed/truncated. For some
- + * filesystems, the inode number is unique, but for "modern" filesystems
- + * an exportable filehandle is required (see exportfs.h)
- + */
- +struct cleancache_filekey {
- + union {
- + ino_t ino;
- + __u32 fh[CLEANCACHE_KEY_MAX];
- + u32 key[CLEANCACHE_KEY_MAX];
- + } u;
- +};
- +
- +struct cleancache_ops {
- + int (*init_fs)(size_t);
- + int (*init_shared_fs)(char *uuid, size_t);
- + int (*get_page)(int, struct cleancache_filekey,
- + pgoff_t, struct page *);
- + void (*put_page)(int, struct cleancache_filekey,
- + pgoff_t, struct page *);
- + void (*flush_page)(int, struct cleancache_filekey, pgoff_t);
- + void (*flush_inode)(int, struct cleancache_filekey);
- + void (*flush_fs)(int);
- +};
- +
- +extern struct cleancache_ops
- + cleancache_register_ops(struct cleancache_ops *ops);
- +extern void __cleancache_init_fs(struct super_block *);
- +extern void __cleancache_init_shared_fs(char *, struct super_block *);
- +extern int __cleancache_get_page(struct page *);
- +extern void __cleancache_put_page(struct page *);
- +extern void __cleancache_flush_page(struct address_space *, struct page *);
- +extern void __cleancache_flush_inode(struct address_space *);
- +extern void __cleancache_flush_fs(struct super_block *);
- +extern int cleancache_enabled;
- +
- +#ifdef CONFIG_CLEANCACHE
- +#define cleancache_fs_enabled(_page) \
- + (_page->mapping->host->i_sb->cleancache_poolid >= 0)
- +#define cleancache_fs_enabled_mapping(_mapping) \
- + ((_mapping)->host->i_sb->cleancache_poolid >= 0)
- +#else
- +#define cleancache_enabled (0)
- +#define cleancache_fs_enabled(_page) (0)
- +#define cleancache_fs_enabled_mapping(_mapping) (0)
- +#endif
- +
- +/*
- + * The shim layer provided by these inline functions allows the compiler
- + * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
- + * is disabled, to a single global variable check if CONFIG_CLEANCACHE
- + * is enabled but no cleancache "backend" has dynamically enabled it,
- + * and, for the most frequent cleancache ops, to a single global variable
- + * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
- + * and a cleancache backend has dynamically enabled cleancache, but the
- + * filesystem referenced by that cleancache op has not enabled cleancache.
- + * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
- + * no measurable performance impact.
- + */
- +
- +static inline void cleancache_init_fs(struct super_block *sb)
- +{
- + if (cleancache_enabled)
- + __cleancache_init_fs(sb);
- +}
- +
- +static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
- +{
- + if (cleancache_enabled)
- + __cleancache_init_shared_fs(uuid, sb);
- +}
- +
- +static inline int cleancache_get_page(struct page *page)
- +{
- + int ret = -1;
- +
- + if (cleancache_enabled && cleancache_fs_enabled(page))
- + ret = __cleancache_get_page(page);
- + return ret;
- +}
- +
- +static inline void cleancache_put_page(struct page *page)
- +{
- + if (cleancache_enabled && cleancache_fs_enabled(page))
- + __cleancache_put_page(page);
- +}
- +
- +static inline void cleancache_flush_page(struct address_space *mapping,
- + struct page *page)
- +{
- + /* careful... page->mapping is NULL sometimes when this is called */
- + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
- + __cleancache_flush_page(mapping, page);
- +}
- +
- +static inline void cleancache_flush_inode(struct address_space *mapping)
- +{
- + if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
- + __cleancache_flush_inode(mapping);
- +}
- +
- +static inline void cleancache_flush_fs(struct super_block *sb)
- +{
- + if (cleancache_enabled)
- + __cleancache_flush_fs(sb);
- +}
- +
- +#endif /* _LINUX_CLEANCACHE_H */
- diff -Nrupad linux-2.6.37//include/linux/frontswap.h linux-2.6.37_vanilla//include/linux/frontswap.h
- --- linux-2.6.37//include/linux/frontswap.h 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//include/linux/frontswap.h 2011-02-14 01:21:43.169793144 +0100
- @@ -0,0 +1,86 @@
- +#ifndef _LINUX_FRONTSWAP_H
- +#define _LINUX_FRONTSWAP_H
- +
- +#include <linux/swap.h>
- +#include <linux/mm.h>
- +
- +struct frontswap_ops {
- + void (*init)(unsigned);
- + int (*put_page)(unsigned, pgoff_t, struct page *);
- + int (*get_page)(unsigned, pgoff_t, struct page *);
- + void (*flush_page)(unsigned, pgoff_t);
- + void (*flush_area)(unsigned);
- +};
- +
- +extern int frontswap_enabled;
- +extern struct frontswap_ops
- + frontswap_register_ops(struct frontswap_ops *ops);
- +extern void frontswap_shrink(unsigned long);
- +extern unsigned long frontswap_curr_pages(void);
- +
- +extern void frontswap_init(unsigned type);
- +extern int __frontswap_put_page(struct page *page);
- +extern int __frontswap_get_page(struct page *page);
- +extern void __frontswap_flush_page(unsigned, pgoff_t);
- +extern void __frontswap_flush_area(unsigned);
- +
- +#ifndef CONFIG_FRONTSWAP
- +/* all inline routines become no-ops and all externs are ignored */
- +#define frontswap_enabled (0)
- +#endif
- +
- +static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
- +{
- + int ret = 0;
- +
- + if (frontswap_enabled && sis->frontswap_map)
- + ret = test_bit(offset % BITS_PER_LONG,
- + &sis->frontswap_map[offset/BITS_PER_LONG]);
- + return ret;
- +}
- +
- +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
- +{
- + if (frontswap_enabled && sis->frontswap_map)
- + set_bit(offset % BITS_PER_LONG,
- + &sis->frontswap_map[offset/BITS_PER_LONG]);
- +}
- +
- +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
- +{
- + if (frontswap_enabled && sis->frontswap_map)
- + clear_bit(offset % BITS_PER_LONG,
- + &sis->frontswap_map[offset/BITS_PER_LONG]);
- +}
- +
- +static inline int frontswap_put_page(struct page *page)
- +{
- + int ret = -1;
- +
- + if (frontswap_enabled)
- + ret = __frontswap_put_page(page);
- + return ret;
- +}
- +
- +static inline int frontswap_get_page(struct page *page)
- +{
- + int ret = -1;
- +
- + if (frontswap_enabled)
- + ret = __frontswap_get_page(page);
- + return ret;
- +}
- +
- +static inline void frontswap_flush_page(unsigned type, pgoff_t offset)
- +{
- + if (frontswap_enabled)
- + __frontswap_flush_page(type, offset);
- +}
- +
- +static inline void frontswap_flush_area(unsigned type)
- +{
- + if (frontswap_enabled)
- + __frontswap_flush_area(type);
- +}
- +
- +#endif /* _LINUX_FRONTSWAP_H */
- diff -Nrupad linux-2.6.37//include/linux/fs.h linux-2.6.37_vanilla//include/linux/fs.h
- --- linux-2.6.37//include/linux/fs.h 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//include/linux/fs.h 2011-02-14 01:21:43.170793149 +0100
- @@ -1417,6 +1417,11 @@ struct super_block {
- * generic_show_options()
- */
- char __rcu *s_options;
- +
- + /*
- + * Saved pool identifier for cleancache (-1 means none)
- + */
- + int cleancache_poolid;
- };
- extern struct timespec current_fs_time(struct super_block *sb);
- diff -Nrupad linux-2.6.37//include/linux/swapfile.h linux-2.6.37_vanilla//include/linux/swapfile.h
- --- linux-2.6.37//include/linux/swapfile.h 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//include/linux/swapfile.h 2011-02-14 01:21:43.170793149 +0100
- @@ -0,0 +1,13 @@
- +#ifndef _LINUX_SWAPFILE_H
- +#define _LINUX_SWAPFILE_H
- +
- +/*
- + * these were static in swapfile.c but frontswap.c needs them and we don't
- + * want to expose them to the dozens of source files that include swap.h
- + */
- +extern spinlock_t swap_lock;
- +extern struct swap_list_t swap_list;
- +extern struct swap_info_struct *swap_info[];
- +extern int try_to_unuse(unsigned int, bool, unsigned long);
- +
- +#endif /* _LINUX_SWAPFILE_H */
- diff -Nrupad linux-2.6.37//include/linux/swap.h linux-2.6.37_vanilla//include/linux/swap.h
- --- linux-2.6.37//include/linux/swap.h 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//include/linux/swap.h 2011-02-14 01:21:43.171793147 +0100
- @@ -185,6 +185,8 @@ struct swap_info_struct {
- struct block_device *bdev; /* swap device or bdev of swap file */
- struct file *swap_file; /* seldom referenced */
- unsigned int old_block_size; /* seldom referenced */
- + unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
- + unsigned int frontswap_pages; /* frontswap pages in-use counter */
- };
- struct swap_list_t {
- diff -Nrupad linux-2.6.37//Makefile linux-2.6.37_vanilla//Makefile
- --- linux-2.6.37//Makefile 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//Makefile 2011-02-14 01:27:32.292792852 +0100
- @@ -1,7 +1,7 @@
- VERSION = 2
- PATCHLEVEL = 6
- SUBLEVEL = 37
- -EXTRAVERSION =
- +EXTRAVERSION = -zcache
- NAME = Flesh-Eating Bats with Fangs
- # *DOCUMENTATION*
- diff -Nrupad linux-2.6.37//Makefile~ linux-2.6.37_vanilla//Makefile~
- --- linux-2.6.37//Makefile~ 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//Makefile~ 2011-02-14 01:19:18.000000000 +0100
- @@ -0,0 +1,1533 @@
- +VERSION = 2
- +PATCHLEVEL = 6
- +SUBLEVEL = 37
- +EXTRAVERSION =
- +NAME = Flesh-Eating Bats with Fangs
- +
- +# *DOCUMENTATION*
- +# To see a list of typical targets execute "make help"
- +# More info can be located in ./README
- +# Comments in this file are targeted only to the developer, do not
- +# expect to learn how to build the kernel reading this file.
- +
- +# Do not:
- +# o use make's built-in rules and variables
- +# (this increases performance and avoids hard-to-debug behaviour);
- +# o print "Entering directory ...";
- +MAKEFLAGS += -rR --no-print-directory
- +
- +# Avoid funny character set dependencies
- +unexport LC_ALL
- +LC_COLLATE=C
- +LC_NUMERIC=C
- +export LC_COLLATE LC_NUMERIC
- +
- +# We are using a recursive build, so we need to do a little thinking
- +# to get the ordering right.
- +#
- +# Most importantly: sub-Makefiles should only ever modify files in
- +# their own directory. If in some directory we have a dependency on
- +# a file in another dir (which doesn't happen often, but it's often
- +# unavoidable when linking the built-in.o targets which finally
- +# turn into vmlinux), we will call a sub make in that other dir, and
- +# after that we are sure that everything which is in that other dir
- +# is now up to date.
- +#
- +# The only cases where we need to modify files which have global
- +# effects are thus separated out and done before the recursive
- +# descending is started. They are now explicitly listed as the
- +# prepare rule.
- +
- +# To put more focus on warnings, be less verbose as default
- +# Use 'make V=1' to see the full commands
- +
- +ifeq ("$(origin V)", "command line")
- + KBUILD_VERBOSE = $(V)
- +endif
- +ifndef KBUILD_VERBOSE
- + KBUILD_VERBOSE = 0
- +endif
- +
- +# Call a source code checker (by default, "sparse") as part of the
- +# C compilation.
- +#
- +# Use 'make C=1' to enable checking of only re-compiled files.
- +# Use 'make C=2' to enable checking of *all* source files, regardless
- +# of whether they are re-compiled or not.
- +#
- +# See the file "Documentation/sparse.txt" for more details, including
- +# where to get the "sparse" utility.
- +
- +ifeq ("$(origin C)", "command line")
- + KBUILD_CHECKSRC = $(C)
- +endif
- +ifndef KBUILD_CHECKSRC
- + KBUILD_CHECKSRC = 0
- +endif
- +
- +# Use make M=dir to specify directory of external module to build
- +# Old syntax make ... SUBDIRS=$PWD is still supported
- +# Setting the environment variable KBUILD_EXTMOD take precedence
- +ifdef SUBDIRS
- + KBUILD_EXTMOD ?= $(SUBDIRS)
- +endif
- +
- +ifeq ("$(origin M)", "command line")
- + KBUILD_EXTMOD := $(M)
- +endif
- +
- +# kbuild supports saving output files in a separate directory.
- +# To locate output files in a separate directory two syntaxes are supported.
- +# In both cases the working directory must be the root of the kernel src.
- +# 1) O=
- +# Use "make O=dir/to/store/output/files/"
- +#
- +# 2) Set KBUILD_OUTPUT
- +# Set the environment variable KBUILD_OUTPUT to point to the directory
- +# where the output files shall be placed.
- +# export KBUILD_OUTPUT=dir/to/store/output/files/
- +# make
- +#
- +# The O= assignment takes precedence over the KBUILD_OUTPUT environment
- +# variable.
- +
- +
- +# KBUILD_SRC is set on invocation of make in OBJ directory
- +# KBUILD_SRC is not intended to be used by the regular user (for now)
- +ifeq ($(KBUILD_SRC),)
- +
- +# OK, Make called in directory where kernel src resides
- +# Do we want to locate output files in a separate directory?
- +ifeq ("$(origin O)", "command line")
- + KBUILD_OUTPUT := $(O)
- +endif
- +
- +# That's our default target when none is given on the command line
- +PHONY := _all
- +_all:
- +
- +# Cancel implicit rules on top Makefile
- +$(CURDIR)/Makefile Makefile: ;
- +
- +ifneq ($(KBUILD_OUTPUT),)
- +# Invoke a second make in the output directory, passing relevant variables
- +# check that the output directory actually exists
- +saved-output := $(KBUILD_OUTPUT)
- +KBUILD_OUTPUT := $(shell cd $(KBUILD_OUTPUT) && /bin/pwd)
- +$(if $(KBUILD_OUTPUT),, \
- + $(error output directory "$(saved-output)" does not exist))
- +
- +PHONY += $(MAKECMDGOALS) sub-make
- +
- +$(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make
- + $(Q)@:
- +
- +sub-make: FORCE
- + $(if $(KBUILD_VERBOSE:1=),@)$(MAKE) -C $(KBUILD_OUTPUT) \
- + KBUILD_SRC=$(CURDIR) \
- + KBUILD_EXTMOD="$(KBUILD_EXTMOD)" -f $(CURDIR)/Makefile \
- + $(filter-out _all sub-make,$(MAKECMDGOALS))
- +
- +# Leave processing to above invocation of make
- +skip-makefile := 1
- +endif # ifneq ($(KBUILD_OUTPUT),)
- +endif # ifeq ($(KBUILD_SRC),)
- +
- +# We process the rest of the Makefile if this is the final invocation of make
- +ifeq ($(skip-makefile),)
- +
- +# If building an external module we do not care about the all: rule
- +# but instead _all depend on modules
- +PHONY += all
- +ifeq ($(KBUILD_EXTMOD),)
- +_all: all
- +else
- +_all: modules
- +endif
- +
- +srctree := $(if $(KBUILD_SRC),$(KBUILD_SRC),$(CURDIR))
- +objtree := $(CURDIR)
- +src := $(srctree)
- +obj := $(objtree)
- +
- +VPATH := $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
- +
- +export srctree objtree VPATH
- +
- +
- +# SUBARCH tells the usermode build what the underlying arch is. That is set
- +# first, and if a usermode build is happening, the "ARCH=um" on the command
- +# line overrides the setting of ARCH below. If a native build is happening,
- +# then ARCH is assigned, getting whatever value it gets normally, and
- +# SUBARCH is subsequently ignored.
- +
- +SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
- + -e s/arm.*/arm/ -e s/sa110/arm/ \
- + -e s/s390x/s390/ -e s/parisc64/parisc/ \
- + -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
- + -e s/sh[234].*/sh/ )
- +
- +# Cross compiling and selecting different set of gcc/bin-utils
- +# ---------------------------------------------------------------------------
- +#
- +# When performing cross compilation for other architectures ARCH shall be set
- +# to the target architecture. (See arch/* for the possibilities).
- +# ARCH can be set during invocation of make:
- +# make ARCH=ia64
- +# Another way is to have ARCH set in the environment.
- +# The default ARCH is the host where make is executed.
- +
- +# CROSS_COMPILE specify the prefix used for all executables used
- +# during compilation. Only gcc and related bin-utils executables
- +# are prefixed with $(CROSS_COMPILE).
- +# CROSS_COMPILE can be set on the command line
- +# make CROSS_COMPILE=ia64-linux-
- +# Alternatively CROSS_COMPILE can be set in the environment.
- +# A third alternative is to store a setting in .config so that plain
- +# "make" in the configured kernel build directory always uses that.
- +# Default value for CROSS_COMPILE is not to prefix executables
- +# Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile
- +export KBUILD_BUILDHOST := $(SUBARCH)
- +ARCH ?= $(SUBARCH)
- +CROSS_COMPILE ?= $(CONFIG_CROSS_COMPILE:"%"=%)
- +
- +# Architecture as present in compile.h
- +UTS_MACHINE := $(ARCH)
- +SRCARCH := $(ARCH)
- +
- +# Additional ARCH settings for x86
- +ifeq ($(ARCH),i386)
- + SRCARCH := x86
- +endif
- +ifeq ($(ARCH),x86_64)
- + SRCARCH := x86
- +endif
- +
- +# Additional ARCH settings for sparc
- +ifeq ($(ARCH),sparc32)
- + SRCARCH := sparc
- +endif
- +ifeq ($(ARCH),sparc64)
- + SRCARCH := sparc
- +endif
- +
- +# Additional ARCH settings for sh
- +ifeq ($(ARCH),sh64)
- + SRCARCH := sh
- +endif
- +
- +# Where to locate arch specific headers
- +hdr-arch := $(SRCARCH)
- +
- +ifeq ($(ARCH),m68knommu)
- + hdr-arch := m68k
- +endif
- +
- +KCONFIG_CONFIG ?= .config
- +
- +# SHELL used by kbuild
- +CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \
- + else if [ -x /bin/bash ]; then echo /bin/bash; \
- + else echo sh; fi ; fi)
- +
- +HOSTCC = gcc
- +HOSTCXX = g++
- +HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer
- +HOSTCXXFLAGS = -O2
- +
- +# Decide whether to build built-in, modular, or both.
- +# Normally, just do built-in.
- +
- +KBUILD_MODULES :=
- +KBUILD_BUILTIN := 1
- +
- +# If we have only "make modules", don't compile built-in objects.
- +# When we're building modules with modversions, we need to consider
- +# the built-in objects during the descend as well, in order to
- +# make sure the checksums are up to date before we record them.
- +
- +ifeq ($(MAKECMDGOALS),modules)
- + KBUILD_BUILTIN := $(if $(CONFIG_MODVERSIONS),1)
- +endif
- +
- +# If we have "make <whatever> modules", compile modules
- +# in addition to whatever we do anyway.
- +# Just "make" or "make all" shall build modules as well
- +
- +ifneq ($(filter all _all modules,$(MAKECMDGOALS)),)
- + KBUILD_MODULES := 1
- +endif
- +
- +ifeq ($(MAKECMDGOALS),)
- + KBUILD_MODULES := 1
- +endif
- +
- +export KBUILD_MODULES KBUILD_BUILTIN
- +export KBUILD_CHECKSRC KBUILD_SRC KBUILD_EXTMOD
- +
- +# Beautify output
- +# ---------------------------------------------------------------------------
- +#
- +# Normally, we echo the whole command before executing it. By making
- +# that echo $($(quiet)$(cmd)), we now have the possibility to set
- +# $(quiet) to choose other forms of output instead, e.g.
- +#
- +# quiet_cmd_cc_o_c = Compiling $(RELDIR)/$@
- +# cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
- +#
- +# If $(quiet) is empty, the whole command will be printed.
- +# If it is set to "quiet_", only the short version will be printed.
- +# If it is set to "silent_", nothing will be printed at all, since
- +# the variable $(silent_cmd_cc_o_c) doesn't exist.
- +#
- +# A simple variant is to prefix commands with $(Q) - that's useful
- +# for commands that shall be hidden in non-verbose mode.
- +#
- +# $(Q)ln $@ :<
- +#
- +# If KBUILD_VERBOSE equals 0 then the above command will be hidden.
- +# If KBUILD_VERBOSE equals 1 then the above command is displayed.
- +
- +ifeq ($(KBUILD_VERBOSE),1)
- + quiet =
- + Q =
- +else
- + quiet=quiet_
- + Q = @
- +endif
- +
- +# If the user is running make -s (silent mode), suppress echoing of
- +# commands
- +
- +ifneq ($(findstring s,$(MAKEFLAGS)),)
- + quiet=silent_
- +endif
- +
- +export quiet Q KBUILD_VERBOSE
- +
- +
- +# Look for make include files relative to root of kernel src
- +MAKEFLAGS += --include-dir=$(srctree)
- +
- +# We need some generic definitions (do not try to remake the file).
- +$(srctree)/scripts/Kbuild.include: ;
- +include $(srctree)/scripts/Kbuild.include
- +
- +# Make variables (CC, etc...)
- +
- +AS = $(CROSS_COMPILE)as
- +LD = $(CROSS_COMPILE)ld
- +CC = $(CROSS_COMPILE)gcc
- +CPP = $(CC) -E
- +AR = $(CROSS_COMPILE)ar
- +NM = $(CROSS_COMPILE)nm
- +STRIP = $(CROSS_COMPILE)strip
- +OBJCOPY = $(CROSS_COMPILE)objcopy
- +OBJDUMP = $(CROSS_COMPILE)objdump
- +AWK = awk
- +GENKSYMS = scripts/genksyms/genksyms
- +INSTALLKERNEL := installkernel
- +DEPMOD = /sbin/depmod
- +KALLSYMS = scripts/kallsyms
- +PERL = perl
- +CHECK = sparse
- +
- +CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
- + -Wbitwise -Wno-return-void $(CF)
- +CFLAGS_MODULE =
- +AFLAGS_MODULE =
- +LDFLAGS_MODULE =
- +CFLAGS_KERNEL =
- +AFLAGS_KERNEL =
- +CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
- +
- +
- +# Use LINUXINCLUDE when you must reference the include/ directory.
- +# Needed to be compatible with the O= option
- +LINUXINCLUDE := -I$(srctree)/arch/$(hdr-arch)/include -Iinclude \
- + $(if $(KBUILD_SRC), -I$(srctree)/include) \
- + -include include/generated/autoconf.h
- +
- +KBUILD_CPPFLAGS := -D__KERNEL__
- +
- +KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
- + -fno-strict-aliasing -fno-common \
- + -Werror-implicit-function-declaration \
- + -Wno-format-security \
- + -fno-delete-null-pointer-checks
- +KBUILD_AFLAGS_KERNEL :=
- +KBUILD_CFLAGS_KERNEL :=
- +KBUILD_AFLAGS := -D__ASSEMBLY__
- +KBUILD_AFLAGS_MODULE := -DMODULE
- +KBUILD_CFLAGS_MODULE := -DMODULE
- +KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
- +
- +# Read KERNELRELEASE from include/config/kernel.release (if it exists)
- +KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
- +KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
- +
- +export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
- +export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
- +export CPP AR NM STRIP OBJCOPY OBJDUMP
- +export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
- +export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
- +
- +export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
- +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
- +export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
- +export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
- +export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
- +
- +# When compiling out-of-tree modules, put MODVERDIR in the module
- +# tree rather than in the kernel tree. The kernel tree might
- +# even be read-only.
- +export MODVERDIR := $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/).tmp_versions
- +
- +# Files to ignore in find ... statements
- +
- +RCS_FIND_IGNORE := \( -name SCCS -o -name BitKeeper -o -name .svn -o -name CVS -o -name .pc -o -name .hg -o -name .git \) -prune -o
- +export RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn --exclude CVS --exclude .pc --exclude .hg --exclude .git
- +
- +# ===========================================================================
- +# Rules shared between *config targets and build targets
- +
- +# Basic helpers built in scripts/
- +PHONY += scripts_basic
- +scripts_basic:
- + $(Q)$(MAKE) $(build)=scripts/basic
- + $(Q)rm -f .tmp_quiet_recordmcount
- +
- +# To avoid any implicit rule to kick in, define an empty command.
- +scripts/basic/%: scripts_basic ;
- +
- +PHONY += outputmakefile
- +# outputmakefile generates a Makefile in the output directory, if using a
- +# separate output directory. This allows convenient use of make in the
- +# output directory.
- +outputmakefile:
- +ifneq ($(KBUILD_SRC),)
- + $(Q)ln -fsn $(srctree) source
- + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkmakefile \
- + $(srctree) $(objtree) $(VERSION) $(PATCHLEVEL)
- +endif
- +
- +# To make sure we do not include .config for any of the *config targets
- +# catch them early, and hand them over to scripts/kconfig/Makefile
- +# It is allowed to specify more targets when calling make, including
- +# mixing *config targets and build targets.
- +# For example 'make oldconfig all'.
- +# Detect when mixed targets is specified, and make a second invocation
- +# of make so .config is not included in this case either (for *config).
- +
- +no-dot-config-targets := clean mrproper distclean \
- + cscope TAGS tags help %docs check% coccicheck \
- + include/linux/version.h headers_% \
- + kernelversion %src-pkg
- +
- +config-targets := 0
- +mixed-targets := 0
- +dot-config := 1
- +
- +ifneq ($(filter $(no-dot-config-targets), $(MAKECMDGOALS)),)
- + ifeq ($(filter-out $(no-dot-config-targets), $(MAKECMDGOALS)),)
- + dot-config := 0
- + endif
- +endif
- +
- +ifeq ($(KBUILD_EXTMOD),)
- + ifneq ($(filter config %config,$(MAKECMDGOALS)),)
- + config-targets := 1
- + ifneq ($(filter-out config %config,$(MAKECMDGOALS)),)
- + mixed-targets := 1
- + endif
- + endif
- +endif
- +
- +ifeq ($(mixed-targets),1)
- +# ===========================================================================
- +# We're called with mixed targets (*config and build targets).
- +# Handle them one by one.
- +
- +%:: FORCE
- + $(Q)$(MAKE) -C $(srctree) KBUILD_SRC= $@
- +
- +else
- +ifeq ($(config-targets),1)
- +# ===========================================================================
- +# *config targets only - make sure prerequisites are updated, and descend
- +# in scripts/kconfig to make the *config target
- +
- +# Read arch specific Makefile to set KBUILD_DEFCONFIG as needed.
- +# KBUILD_DEFCONFIG may point out an alternative default configuration
- +# used for 'make defconfig'
- +include $(srctree)/arch/$(SRCARCH)/Makefile
- +export KBUILD_DEFCONFIG KBUILD_KCONFIG
- +
- +config: scripts_basic outputmakefile FORCE
- + $(Q)mkdir -p include/linux include/config
- + $(Q)$(MAKE) $(build)=scripts/kconfig $@
- +
- +%config: scripts_basic outputmakefile FORCE
- + $(Q)mkdir -p include/linux include/config
- + $(Q)$(MAKE) $(build)=scripts/kconfig $@
- +
- +else
- +# ===========================================================================
- +# Build targets only - this includes vmlinux, arch specific targets, clean
- +# targets and others. In general all targets except *config targets.
- +
- +ifeq ($(KBUILD_EXTMOD),)
- +# Additional helpers built in scripts/
- +# Carefully list dependencies so we do not try to build scripts twice
- +# in parallel
- +PHONY += scripts
- +scripts: scripts_basic include/config/auto.conf include/config/tristate.conf
- + $(Q)$(MAKE) $(build)=$(@)
- +
- +# Objects we will link into vmlinux / subdirs we need to visit
- +init-y := init/
- +drivers-y := drivers/ sound/ firmware/
- +net-y := net/
- +libs-y := lib/
- +core-y := usr/
- +endif # KBUILD_EXTMOD
- +
- +ifeq ($(dot-config),1)
- +# Read in config
- +-include include/config/auto.conf
- +
- +ifeq ($(KBUILD_EXTMOD),)
- +# Read in dependencies to all Kconfig* files, make sure to run
- +# oldconfig if changes are detected.
- +-include include/config/auto.conf.cmd
- +
- +# To avoid any implicit rule to kick in, define an empty command
- +$(KCONFIG_CONFIG) include/config/auto.conf.cmd: ;
- +
- +# If .config is newer than include/config/auto.conf, someone tinkered
- +# with it and forgot to run make oldconfig.
- +# if auto.conf.cmd is missing then we are probably in a cleaned tree so
- +# we execute the config step to be sure to catch updated Kconfig files
- +include/config/%.conf: $(KCONFIG_CONFIG) include/config/auto.conf.cmd
- + $(Q)$(MAKE) -f $(srctree)/Makefile silentoldconfig
- +else
- +# external modules needs include/generated/autoconf.h and include/config/auto.conf
- +# but do not care if they are up-to-date. Use auto.conf to trigger the test
- +PHONY += include/config/auto.conf
- +
- +include/config/auto.conf:
- + $(Q)test -e include/generated/autoconf.h -a -e $@ || ( \
- + echo; \
- + echo " ERROR: Kernel configuration is invalid."; \
- + echo " include/generated/autoconf.h or $@ are missing.";\
- + echo " Run 'make oldconfig && make prepare' on kernel src to fix it."; \
- + echo; \
- + /bin/false)
- +
- +endif # KBUILD_EXTMOD
- +
- +else
- +# Dummy target needed, because used as prerequisite
- +include/config/auto.conf: ;
- +endif # $(dot-config)
- +
- +# The all: target is the default when no target is given on the
- +# command line.
- +# This allow a user to issue only 'make' to build a kernel including modules
- +# Defaults to vmlinux, but the arch makefile usually adds further targets
- +all: vmlinux
- +
- +ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
- +KBUILD_CFLAGS += -Os
- +else
- +KBUILD_CFLAGS += -O2
- +endif
- +
- +include $(srctree)/arch/$(SRCARCH)/Makefile
- +
- +ifneq ($(CONFIG_FRAME_WARN),0)
- +KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN})
- +endif
- +
- +# Force gcc to behave correct even for buggy distributions
- +ifndef CONFIG_CC_STACKPROTECTOR
- +KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector)
- +endif
- +
- +ifdef CONFIG_FRAME_POINTER
- +KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
- +else
- +# Some targets (ARM with Thumb2, for example), can't be built with frame
- +# pointers. For those, we don't have FUNCTION_TRACER automatically
- +# select FRAME_POINTER. However, FUNCTION_TRACER adds -pg, and this is
- +# incompatible with -fomit-frame-pointer with current GCC, so we don't use
- +# -fomit-frame-pointer with FUNCTION_TRACER.
- +ifndef CONFIG_FUNCTION_TRACER
- +KBUILD_CFLAGS += -fomit-frame-pointer
- +endif
- +endif
- +
- +ifdef CONFIG_DEBUG_INFO
- +KBUILD_CFLAGS += -g
- +KBUILD_AFLAGS += -gdwarf-2
- +endif
- +
- +ifdef CONFIG_DEBUG_INFO_REDUCED
- +KBUILD_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly)
- +endif
- +
- +ifdef CONFIG_FUNCTION_TRACER
- +KBUILD_CFLAGS += -pg
- +ifdef CONFIG_DYNAMIC_FTRACE
- + ifdef CONFIG_HAVE_C_RECORDMCOUNT
- + BUILD_C_RECORDMCOUNT := y
- + export BUILD_C_RECORDMCOUNT
- + endif
- +endif
- +endif
- +
- +# We trigger additional mismatches with less inlining
- +ifdef CONFIG_DEBUG_SECTION_MISMATCH
- +KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
- +endif
- +
- +# arch Makefile may override CC so keep this after arch Makefile is included
- +NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
- +CHECKFLAGS += $(NOSTDINC_FLAGS)
- +
- +# warn about C99 declaration after statement
- +KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,)
- +
- +# disable pointer signed / unsigned warnings in gcc 4.0
- +KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
- +
- +# disable invalid "can't wrap" optimizations for signed / pointers
- +KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
- +
- +# conserve stack if available
- +KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
- +
- +# check for 'asm goto'
- +ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
- + KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
- +endif
- +
- +# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
- +# But warn user when we do so
- +warn-assign = \
- +$(warning "WARNING: Appending $$K$(1) ($(K$(1))) from $(origin K$(1)) to kernel $$$(1)")
- +
- +ifneq ($(KCPPFLAGS),)
- + $(call warn-assign,CPPFLAGS)
- + KBUILD_CPPFLAGS += $(KCPPFLAGS)
- +endif
- +ifneq ($(KAFLAGS),)
- + $(call warn-assign,AFLAGS)
- + KBUILD_AFLAGS += $(KAFLAGS)
- +endif
- +ifneq ($(KCFLAGS),)
- + $(call warn-assign,CFLAGS)
- + KBUILD_CFLAGS += $(KCFLAGS)
- +endif
- +
- +# Use --build-id when available.
- +LDFLAGS_BUILD_ID = $(patsubst -Wl$(comma)%,%,\
- + $(call cc-ldoption, -Wl$(comma)--build-id,))
- +KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID)
- +LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID)
- +
- +ifeq ($(CONFIG_STRIP_ASM_SYMS),y)
- +LDFLAGS_vmlinux += $(call ld-option, -X,)
- +endif
- +
- +# Default kernel image to build when no specific target is given.
- +# KBUILD_IMAGE may be overruled on the command line or
- +# set in the environment
- +# Also any assignments in arch/$(ARCH)/Makefile take precedence over
- +# this default value
- +export KBUILD_IMAGE ?= vmlinux
- +
- +#
- +# INSTALL_PATH specifies where to place the updated kernel and system map
- +# images. Default is /boot, but you can set it to other values
- +export INSTALL_PATH ?= /boot
- +
- +#
- +# INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory
- +# relocations required by build roots. This is not defined in the
- +# makefile but the argument can be passed to make if needed.
- +#
- +
- +MODLIB = $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE)
- +export MODLIB
- +
- +#
- +# INSTALL_MOD_STRIP, if defined, will cause modules to be
- +# stripped after they are installed. If INSTALL_MOD_STRIP is '1', then
- +# the default option --strip-debug will be used. Otherwise,
- +# INSTALL_MOD_STRIP will used as the options to the strip command.
- +
- +ifdef INSTALL_MOD_STRIP
- +ifeq ($(INSTALL_MOD_STRIP),1)
- +mod_strip_cmd = $(STRIP) --strip-debug
- +else
- +mod_strip_cmd = $(STRIP) $(INSTALL_MOD_STRIP)
- +endif # INSTALL_MOD_STRIP=1
- +else
- +mod_strip_cmd = true
- +endif # INSTALL_MOD_STRIP
- +export mod_strip_cmd
- +
- +
- +ifeq ($(KBUILD_EXTMOD),)
- +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
- +
- +vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
- + $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
- + $(net-y) $(net-m) $(libs-y) $(libs-m)))
- +
- +vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
- + $(init-n) $(init-) \
- + $(core-n) $(core-) $(drivers-n) $(drivers-) \
- + $(net-n) $(net-) $(libs-n) $(libs-))))
- +
- +init-y := $(patsubst %/, %/built-in.o, $(init-y))
- +core-y := $(patsubst %/, %/built-in.o, $(core-y))
- +drivers-y := $(patsubst %/, %/built-in.o, $(drivers-y))
- +net-y := $(patsubst %/, %/built-in.o, $(net-y))
- +libs-y1 := $(patsubst %/, %/lib.a, $(libs-y))
- +libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y))
- +libs-y := $(libs-y1) $(libs-y2)
- +
- +# Build vmlinux
- +# ---------------------------------------------------------------------------
- +# vmlinux is built from the objects selected by $(vmlinux-init) and
- +# $(vmlinux-main). Most are built-in.o files from top-level directories
- +# in the kernel tree, others are specified in arch/$(ARCH)/Makefile.
- +# Ordering when linking is important, and $(vmlinux-init) must be first.
- +#
- +# vmlinux
- +# ^
- +# |
- +# +-< $(vmlinux-init)
- +# | +--< init/version.o + more
- +# |
- +# +--< $(vmlinux-main)
- +# | +--< driver/built-in.o mm/built-in.o + more
- +# |
- +# +-< kallsyms.o (see description in CONFIG_KALLSYMS section)
- +#
- +# vmlinux version (uname -v) cannot be updated during normal
- +# descending-into-subdirs phase since we do not yet know if we need to
- +# update vmlinux.
- +# Therefore this step is delayed until just before final link of vmlinux -
- +# except in the kallsyms case where it is done just before adding the
- +# symbols to the kernel.
- +#
- +# System.map is generated to document addresses of all kernel symbols
- +
- +vmlinux-init := $(head-y) $(init-y)
- +vmlinux-main := $(core-y) $(libs-y) $(drivers-y) $(net-y)
- +vmlinux-all := $(vmlinux-init) $(vmlinux-main)
- +vmlinux-lds := arch/$(SRCARCH)/kernel/vmlinux.lds
- +export KBUILD_VMLINUX_OBJS := $(vmlinux-all)
- +
- +# Rule to link vmlinux - also used during CONFIG_KALLSYMS
- +# May be overridden by arch/$(ARCH)/Makefile
- +quiet_cmd_vmlinux__ ?= LD $@
- + cmd_vmlinux__ ?= $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) -o $@ \
- + -T $(vmlinux-lds) $(vmlinux-init) \
- + --start-group $(vmlinux-main) --end-group \
- + $(filter-out $(vmlinux-lds) $(vmlinux-init) $(vmlinux-main) vmlinux.o FORCE ,$^)
- +
- +# Generate new vmlinux version
- +quiet_cmd_vmlinux_version = GEN .version
- + cmd_vmlinux_version = set -e; \
- + if [ ! -r .version ]; then \
- + rm -f .version; \
- + echo 1 >.version; \
- + else \
- + mv .version .old_version; \
- + expr 0$$(cat .old_version) + 1 >.version; \
- + fi; \
- + $(MAKE) $(build)=init
- +
- +# Generate System.map
- +quiet_cmd_sysmap = SYSMAP
- + cmd_sysmap = $(CONFIG_SHELL) $(srctree)/scripts/mksysmap
- +
- +# Link of vmlinux
- +# If CONFIG_KALLSYMS is set .version is already updated
- +# Generate System.map and verify that the content is consistent
- +# Use + in front of the vmlinux_version rule to silence a warning with make -j2
- +# First command is ':' to allow us to use + in front of the rule
- +define rule_vmlinux__
- + :
- + $(if $(CONFIG_KALLSYMS),,+$(call cmd,vmlinux_version))
- +
- + $(call cmd,vmlinux__)
- + $(Q)echo 'cmd_$@ := $(cmd_vmlinux__)' > $(@D)/.$(@F).cmd
- +
- + $(Q)$(if $($(quiet)cmd_sysmap), \
- + echo ' $($(quiet)cmd_sysmap) System.map' &&) \
- + $(cmd_sysmap) $@ System.map; \
- + if [ $$? -ne 0 ]; then \
- + rm -f $@; \
- + /bin/false; \
- + fi;
- + $(verify_kallsyms)
- +endef
- +
- +
- +ifdef CONFIG_KALLSYMS
- +# Generate section listing all symbols and add it into vmlinux $(kallsyms.o)
- +# It's a three stage process:
- +# o .tmp_vmlinux1 has all symbols and sections, but __kallsyms is
- +# empty
- +# Running kallsyms on that gives us .tmp_kallsyms1.o with
- +# the right size - vmlinux version (uname -v) is updated during this step
- +# o .tmp_vmlinux2 now has a __kallsyms section of the right size,
- +# but due to the added section, some addresses have shifted.
- +# From here, we generate a correct .tmp_kallsyms2.o
- +# o The correct .tmp_kallsyms2.o is linked into the final vmlinux.
- +# o Verify that the System.map from vmlinux matches the map from
- +# .tmp_vmlinux2, just in case we did not generate kallsyms correctly.
- +# o If CONFIG_KALLSYMS_EXTRA_PASS is set, do an extra pass using
- +# .tmp_vmlinux3 and .tmp_kallsyms3.o. This is only meant as a
- +# temporary bypass to allow the kernel to be built while the
- +# maintainers work out what went wrong with kallsyms.
- +
- +ifdef CONFIG_KALLSYMS_EXTRA_PASS
- +last_kallsyms := 3
- +else
- +last_kallsyms := 2
- +endif
- +
- +kallsyms.o := .tmp_kallsyms$(last_kallsyms).o
- +
- +define verify_kallsyms
- + $(Q)$(if $($(quiet)cmd_sysmap), \
- + echo ' $($(quiet)cmd_sysmap) .tmp_System.map' &&) \
- + $(cmd_sysmap) .tmp_vmlinux$(last_kallsyms) .tmp_System.map
- + $(Q)cmp -s System.map .tmp_System.map || \
- + (echo Inconsistent kallsyms data; \
- + echo Try setting CONFIG_KALLSYMS_EXTRA_PASS; \
- + rm .tmp_kallsyms* ; /bin/false )
- +endef
- +
- +# Update vmlinux version before link
- +# Use + in front of this rule to silence a warning about make -j1
- +# First command is ':' to allow us to use + in front of this rule
- +cmd_ksym_ld = $(cmd_vmlinux__)
- +define rule_ksym_ld
- + :
- + +$(call cmd,vmlinux_version)
- + $(call cmd,vmlinux__)
- + $(Q)echo 'cmd_$@ := $(cmd_vmlinux__)' > $(@D)/.$(@F).cmd
- +endef
- +
- +# Generate .S file with all kernel symbols
- +quiet_cmd_kallsyms = KSYM $@
- + cmd_kallsyms = $(NM) -n $< | $(KALLSYMS) \
- + $(if $(CONFIG_KALLSYMS_ALL),--all-symbols) > $@
- +
- +.tmp_kallsyms1.o .tmp_kallsyms2.o .tmp_kallsyms3.o: %.o: %.S scripts FORCE
- + $(call if_changed_dep,as_o_S)
- +
- +.tmp_kallsyms%.S: .tmp_vmlinux% $(KALLSYMS)
- + $(call cmd,kallsyms)
- +
- +# .tmp_vmlinux1 must be complete except kallsyms, so update vmlinux version
- +.tmp_vmlinux1: $(vmlinux-lds) $(vmlinux-all) FORCE
- + $(call if_changed_rule,ksym_ld)
- +
- +.tmp_vmlinux2: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms1.o FORCE
- + $(call if_changed,vmlinux__)
- +
- +.tmp_vmlinux3: $(vmlinux-lds) $(vmlinux-all) .tmp_kallsyms2.o FORCE
- + $(call if_changed,vmlinux__)
- +
- +# Needs to visit scripts/ before $(KALLSYMS) can be used.
- +$(KALLSYMS): scripts ;
- +
- +# Generate some data for debugging strange kallsyms problems
- +debug_kallsyms: .tmp_map$(last_kallsyms)
- +
- +.tmp_map%: .tmp_vmlinux% FORCE
- + ($(OBJDUMP) -h $< | $(AWK) '/^ +[0-9]/{print $$4 " 0 " $$2}'; $(NM) $<) | sort > $@
- +
- +.tmp_map3: .tmp_map2
- +
- +.tmp_map2: .tmp_map1
- +
- +endif # ifdef CONFIG_KALLSYMS
- +
- +# Do modpost on a prelinked vmlinux. The finally linked vmlinux has
- +# relevant sections renamed as per the linker script.
- +quiet_cmd_vmlinux-modpost = LD $@
- + cmd_vmlinux-modpost = $(LD) $(LDFLAGS) -r -o $@ \
- + $(vmlinux-init) --start-group $(vmlinux-main) --end-group \
- + $(filter-out $(vmlinux-init) $(vmlinux-main) FORCE ,$^)
- +define rule_vmlinux-modpost
- + :
- + +$(call cmd,vmlinux-modpost)
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost $@
- + $(Q)echo 'cmd_$@ := $(cmd_vmlinux-modpost)' > $(dot-target).cmd
- +endef
- +
- +# vmlinux image - including updated kernel symbols
- +vmlinux: $(vmlinux-lds) $(vmlinux-init) $(vmlinux-main) vmlinux.o $(kallsyms.o) FORCE
- +ifdef CONFIG_HEADERS_CHECK
- + $(Q)$(MAKE) -f $(srctree)/Makefile headers_check
- +endif
- +ifdef CONFIG_SAMPLES
- + $(Q)$(MAKE) $(build)=samples
- +endif
- +ifdef CONFIG_BUILD_DOCSRC
- + $(Q)$(MAKE) $(build)=Documentation
- +endif
- + $(call vmlinux-modpost)
- + $(call if_changed_rule,vmlinux__)
- + $(Q)rm -f .old_version
- +
- +# build vmlinux.o first to catch section mismatch errors early
- +ifdef CONFIG_KALLSYMS
- +.tmp_vmlinux1: vmlinux.o
- +endif
- +
- +modpost-init := $(filter-out init/built-in.o, $(vmlinux-init))
- +vmlinux.o: $(modpost-init) $(vmlinux-main) FORCE
- + $(call if_changed_rule,vmlinux-modpost)
- +
- +# The actual objects are generated when descending,
- +# make sure no implicit rule kicks in
- +$(sort $(vmlinux-init) $(vmlinux-main)) $(vmlinux-lds): $(vmlinux-dirs) ;
- +
- +# Handle descending into subdirectories listed in $(vmlinux-dirs)
- +# Preset locale variables to speed up the build process. Limit locale
- +# tweaks to this spot to avoid wrong language settings when running
- +# make menuconfig etc.
- +# Error messages still appear in the original language
- +
- +PHONY += $(vmlinux-dirs)
- +$(vmlinux-dirs): prepare scripts
- + $(Q)$(MAKE) $(build)=$@
- +
- +# Store (new) KERNELRELEASE string in include/config/kernel.release
- +include/config/kernel.release: include/config/auto.conf FORCE
- + $(Q)rm -f $@
- + $(Q)echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))" > $@
- +
- +
- +# Things we need to do before we recursively start building the kernel
- +# or the modules are listed in "prepare".
- +# A multi level approach is used. prepareN is processed before prepareN-1.
- +# archprepare is used in arch Makefiles and when processed asm symlink,
- +# version.h and scripts_basic is processed / created.
- +
- +# Listed in dependency order
- +PHONY += prepare archprepare prepare0 prepare1 prepare2 prepare3
- +
- +# prepare3 is used to check if we are building in a separate output directory,
- +# and if so do:
- +# 1) Check that make has not been executed in the kernel src $(srctree)
- +prepare3: include/config/kernel.release
- +ifneq ($(KBUILD_SRC),)
- + @$(kecho) ' Using $(srctree) as source for kernel'
- + $(Q)if [ -f $(srctree)/.config -o -d $(srctree)/include/config ]; then \
- + echo " $(srctree) is not clean, please run 'make mrproper'";\
- + echo " in the '$(srctree)' directory.";\
- + /bin/false; \
- + fi;
- +endif
- +
- +# prepare2 creates a makefile if using a separate output directory
- +prepare2: prepare3 outputmakefile
- +
- +prepare1: prepare2 include/linux/version.h include/generated/utsrelease.h \
- + include/config/auto.conf
- + $(cmd_crmodverdir)
- +
- +archprepare: prepare1 scripts_basic
- +
- +prepare0: archprepare FORCE
- + $(Q)$(MAKE) $(build)=.
- + $(Q)$(MAKE) $(build)=. missing-syscalls
- +
- +# All the preparing..
- +prepare: prepare0
- +
- +# Generate some files
- +# ---------------------------------------------------------------------------
- +
- +# KERNELRELEASE can change from a few different places, meaning version.h
- +# needs to be updated, so this check is forced on all builds
- +
- +uts_len := 64
- +define filechk_utsrelease.h
- + if [ `echo -n "$(KERNELRELEASE)" | wc -c ` -gt $(uts_len) ]; then \
- + echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \
- + exit 1; \
- + fi; \
- + (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
- +endef
- +
- +define filechk_version.h
- + (echo \#define LINUX_VERSION_CODE $(shell \
- + expr $(VERSION) \* 65536 + $(PATCHLEVEL) \* 256 + $(SUBLEVEL)); \
- + echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))';)
- +endef
- +
- +include/linux/version.h: $(srctree)/Makefile FORCE
- + $(call filechk,version.h)
- +
- +include/generated/utsrelease.h: include/config/kernel.release FORCE
- + $(call filechk,utsrelease.h)
- +
- +PHONY += headerdep
- +headerdep:
- + $(Q)find include/ -name '*.h' | xargs --max-args 1 scripts/headerdep.pl
- +
- +# ---------------------------------------------------------------------------
- +
- +PHONY += depend dep
- +depend dep:
- + @echo '*** Warning: make $@ is unnecessary now.'
- +
- +# ---------------------------------------------------------------------------
- +# Firmware install
- +INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware
- +export INSTALL_FW_PATH
- +
- +PHONY += firmware_install
- +firmware_install: FORCE
- + @mkdir -p $(objtree)/firmware
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install
- +
- +# ---------------------------------------------------------------------------
- +# Kernel headers
- +
- +#Default location for installed headers
- +export INSTALL_HDR_PATH = $(objtree)/usr
- +
- +hdr-inst := -rR -f $(srctree)/scripts/Makefile.headersinst obj
- +
- +# If we do an all arch process set dst to asm-$(hdr-arch)
- +hdr-dst = $(if $(KBUILD_HEADERS), dst=include/asm-$(hdr-arch), dst=include/asm)
- +
- +PHONY += __headers
- +__headers: include/linux/version.h scripts_basic FORCE
- + $(Q)$(MAKE) $(build)=scripts scripts/unifdef
- +
- +PHONY += headers_install_all
- +headers_install_all:
- + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/headers.sh install
- +
- +PHONY += headers_install
- +headers_install: __headers
- + $(if $(wildcard $(srctree)/arch/$(hdr-arch)/include/asm/Kbuild),, \
- + $(error Headers not exportable for the $(SRCARCH) architecture))
- + $(Q)$(MAKE) $(hdr-inst)=include
- + $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/asm $(hdr-dst)
- +
- +PHONY += headers_check_all
- +headers_check_all: headers_install_all
- + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/headers.sh check
- +
- +PHONY += headers_check
- +headers_check: headers_install
- + $(Q)$(MAKE) $(hdr-inst)=include HDRCHECK=1
- + $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/asm $(hdr-dst) HDRCHECK=1
- +
- +# ---------------------------------------------------------------------------
- +# Modules
- +
- +ifdef CONFIG_MODULES
- +
- +# By default, build modules as well
- +
- +all: modules
- +
- +# Build modules
- +#
- +# A module can be listed more than once in obj-m resulting in
- +# duplicate lines in modules.order files. Those are removed
- +# using awk while concatenating to the final file.
- +
- +PHONY += modules
- +modules: $(vmlinux-dirs) $(if $(KBUILD_BUILTIN),vmlinux) modules.builtin
- + $(Q)$(AWK) '!x[$$0]++' $(vmlinux-dirs:%=$(objtree)/%/modules.order) > $(objtree)/modules.order
- + @$(kecho) ' Building modules, stage 2.';
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_modbuild
- +
- +modules.builtin: $(vmlinux-dirs:%=%/modules.builtin)
- + $(Q)$(AWK) '!x[$$0]++' $^ > $(objtree)/modules.builtin
- +
- +%/modules.builtin: include/config/auto.conf
- + $(Q)$(MAKE) $(modbuiltin)=$*
- +
- +
- +# Target to prepare building external modules
- +PHONY += modules_prepare
- +modules_prepare: prepare scripts
- +
- +# Target to install modules
- +PHONY += modules_install
- +modules_install: _modinst_ _modinst_post
- +
- +PHONY += _modinst_
- +_modinst_:
- + @if [ -z "`$(DEPMOD) -V 2>/dev/null | grep module-init-tools`" ]; then \
- + echo "Warning: you may need to install module-init-tools"; \
- + echo "See http://www.codemonkey.org.uk/docs/post-halloween-2.6.txt";\
- + sleep 1; \
- + fi
- + @rm -rf $(MODLIB)/kernel
- + @rm -f $(MODLIB)/source
- + @mkdir -p $(MODLIB)/kernel
- + @ln -s $(srctree) $(MODLIB)/source
- + @if [ ! $(objtree) -ef $(MODLIB)/build ]; then \
- + rm -f $(MODLIB)/build ; \
- + ln -s $(objtree) $(MODLIB)/build ; \
- + fi
- + @cp -f $(objtree)/modules.order $(MODLIB)/
- + @cp -f $(objtree)/modules.builtin $(MODLIB)/
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst
- +
- +# This depmod is only for convenience to give the initial
- +# boot a modules.dep even before / is mounted read-write. However the
- +# boot script depmod is the master version.
- +PHONY += _modinst_post
- +_modinst_post: _modinst_
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_modinst
- + $(call cmd,depmod)
- +
- +else # CONFIG_MODULES
- +
- +# Modules not configured
- +# ---------------------------------------------------------------------------
- +
- +modules modules_install: FORCE
- + @echo
- + @echo "The present kernel configuration has modules disabled."
- + @echo "Type 'make config' and enable loadable module support."
- + @echo "Then build a kernel with module support enabled."
- + @echo
- + @exit 1
- +
- +endif # CONFIG_MODULES
- +
- +###
- +# Cleaning is done on three levels.
- +# make clean Delete most generated files
- +# Leave enough to build external modules
- +# make mrproper Delete the current configuration, and all generated files
- +# make distclean Remove editor backup files, patch leftover files and the like
- +
- +# Directories & files removed with 'make clean'
- +CLEAN_DIRS += $(MODVERDIR)
- +CLEAN_FILES += vmlinux System.map \
- + .tmp_kallsyms* .tmp_version .tmp_vmlinux* .tmp_System.map
- +
- +# Directories & files removed with 'make mrproper'
- +MRPROPER_DIRS += include/config usr/include include/generated
- +MRPROPER_FILES += .config .config.old .version .old_version \
- + include/linux/version.h \
- + Module.symvers tags TAGS cscope*
- +
- +# clean - Delete most, but leave enough to build external modules
- +#
- +clean: rm-dirs := $(CLEAN_DIRS)
- +clean: rm-files := $(CLEAN_FILES)
- +clean-dirs := $(addprefix _clean_, . $(vmlinux-alldirs) Documentation)
- +
- +PHONY += $(clean-dirs) clean archclean
- +$(clean-dirs):
- + $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@)
- +
- +clean: archclean
- +
- +# mrproper - Delete all generated files, including .config
- +#
- +mrproper: rm-dirs := $(wildcard $(MRPROPER_DIRS))
- +mrproper: rm-files := $(wildcard $(MRPROPER_FILES))
- +mrproper-dirs := $(addprefix _mrproper_,Documentation/DocBook scripts)
- +
- +PHONY += $(mrproper-dirs) mrproper archmrproper
- +$(mrproper-dirs):
- + $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@)
- +
- +mrproper: clean archmrproper $(mrproper-dirs)
- + $(call cmd,rmdirs)
- + $(call cmd,rmfiles)
- +
- +# distclean
- +#
- +PHONY += distclean
- +
- +distclean: mrproper
- + @find $(srctree) $(RCS_FIND_IGNORE) \
- + \( -name '*.orig' -o -name '*.rej' -o -name '*~' \
- + -o -name '*.bak' -o -name '#*#' -o -name '.*.orig' \
- + -o -name '.*.rej' -o -size 0 \
- + -o -name '*%' -o -name '.*.cmd' -o -name 'core' \) \
- + -type f -print | xargs rm -f
- +
- +
- +# Packaging of the kernel to various formats
- +# ---------------------------------------------------------------------------
- +# rpm target kept for backward compatibility
- +package-dir := $(srctree)/scripts/package
- +
- +%src-pkg: FORCE
- + $(Q)$(MAKE) $(build)=$(package-dir) $@
- +%pkg: include/config/kernel.release FORCE
- + $(Q)$(MAKE) $(build)=$(package-dir) $@
- +rpm: include/config/kernel.release FORCE
- + $(Q)$(MAKE) $(build)=$(package-dir) $@
- +
- +
- +# Brief documentation of the typical targets used
- +# ---------------------------------------------------------------------------
- +
- +boards := $(wildcard $(srctree)/arch/$(SRCARCH)/configs/*_defconfig)
- +boards := $(notdir $(boards))
- +board-dirs := $(dir $(wildcard $(srctree)/arch/$(SRCARCH)/configs/*/*_defconfig))
- +board-dirs := $(sort $(notdir $(board-dirs:/=)))
- +
- +help:
- + @echo 'Cleaning targets:'
- + @echo ' clean - Remove most generated files but keep the config and'
- + @echo ' enough build support to build external modules'
- + @echo ' mrproper - Remove all generated files + config + various backup files'
- + @echo ' distclean - mrproper + remove editor backup and patch files'
- + @echo ''
- + @echo 'Configuration targets:'
- + @$(MAKE) -f $(srctree)/scripts/kconfig/Makefile help
- + @echo ''
- + @echo 'Other generic targets:'
- + @echo ' all - Build all targets marked with [*]'
- + @echo '* vmlinux - Build the bare kernel'
- + @echo '* modules - Build all modules'
- + @echo ' modules_install - Install all modules to INSTALL_MOD_PATH (default: /)'
- + @echo ' firmware_install- Install all firmware to INSTALL_FW_PATH'
- + @echo ' (default: $$(INSTALL_MOD_PATH)/lib/firmware)'
- + @echo ' dir/ - Build all files in dir and below'
- + @echo ' dir/file.[oisS] - Build specified target only'
- + @echo ' dir/file.lst - Build specified mixed source/assembly target only'
- + @echo ' (requires a recent binutils and recent build (System.map))'
- + @echo ' dir/file.ko - Build module including final link'
- + @echo ' modules_prepare - Set up for building external modules'
- + @echo ' tags/TAGS - Generate tags file for editors'
- + @echo ' cscope - Generate cscope index'
- + @echo ' kernelrelease - Output the release version string'
- + @echo ' kernelversion - Output the version stored in Makefile'
- + @echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'; \
- + echo ' (default: $(INSTALL_HDR_PATH))'; \
- + echo ''
- + @echo 'Static analysers'
- + @echo ' checkstack - Generate a list of stack hogs'
- + @echo ' namespacecheck - Name space analysis on compiled kernel'
- + @echo ' versioncheck - Sanity check on version.h usage'
- + @echo ' includecheck - Check for duplicate included header files'
- + @echo ' export_report - List the usages of all exported symbols'
- + @echo ' headers_check - Sanity check on exported headers'
- + @echo ' headerdep - Detect inclusion cycles in headers'
- + @$(MAKE) -f $(srctree)/scripts/Makefile.help checker-help
- + @echo ''
- + @echo 'Kernel packaging:'
- + @$(MAKE) $(build)=$(package-dir) help
- + @echo ''
- + @echo 'Documentation targets:'
- + @$(MAKE) -f $(srctree)/Documentation/DocBook/Makefile dochelp
- + @echo ''
- + @echo 'Architecture specific targets ($(SRCARCH)):'
- + @$(if $(archhelp),$(archhelp),\
- + echo ' No architecture specific help defined for $(SRCARCH)')
- + @echo ''
- + @$(if $(boards), \
- + $(foreach b, $(boards), \
- + printf " %-24s - Build for %s\\n" $(b) $(subst _defconfig,,$(b));) \
- + echo '')
- + @$(if $(board-dirs), \
- + $(foreach b, $(board-dirs), \
- + printf " %-16s - Show %s-specific targets\\n" help-$(b) $(b);) \
- + printf " %-16s - Show all of the above\\n" help-boards; \
- + echo '')
- +
- + @echo ' make V=0|1 [targets] 0 => quiet build (default), 1 => verbose build'
- + @echo ' make V=2 [targets] 2 => give reason for rebuild of target'
- + @echo ' make O=dir [targets] Locate all output files in "dir", including .config'
- + @echo ' make C=1 [targets] Check all c source with $$CHECK (sparse by default)'
- + @echo ' make C=2 [targets] Force check of all c source with $$CHECK'
- + @echo ''
- + @echo 'Execute "make" or "make all" to build all targets marked with [*] '
- + @echo 'For further info see the ./README file'
- +
- +
- +help-board-dirs := $(addprefix help-,$(board-dirs))
- +
- +help-boards: $(help-board-dirs)
- +
- +boards-per-dir = $(notdir $(wildcard $(srctree)/arch/$(SRCARCH)/configs/$*/*_defconfig))
- +
- +$(help-board-dirs): help-%:
- + @echo 'Architecture specific targets ($(SRCARCH) $*):'
- + @$(if $(boards-per-dir), \
- + $(foreach b, $(boards-per-dir), \
- + printf " %-24s - Build for %s\\n" $*/$(b) $(subst _defconfig,,$(b));) \
- + echo '')
- +
- +
- +# Documentation targets
- +# ---------------------------------------------------------------------------
- +%docs: scripts_basic FORCE
- + $(Q)$(MAKE) $(build)=Documentation/DocBook $@
- +
- +else # KBUILD_EXTMOD
- +
- +###
- +# External module support.
- +# When building external modules the kernel used as basis is considered
- +# read-only, and no consistency checks are made and the make
- +# system is not used on the basis kernel. If updates are required
- +# in the basis kernel ordinary make commands (without M=...) must
- +# be used.
- +#
- +# The following are the only valid targets when building external
- +# modules.
- +# make M=dir clean Delete all automatically generated files
- +# make M=dir modules Make all modules in specified dir
- +# make M=dir Same as 'make M=dir modules'
- +# make M=dir modules_install
- +# Install the modules built in the module directory
- +# Assumes install directory is already created
- +
- +# We are always building modules
- +KBUILD_MODULES := 1
- +PHONY += crmodverdir
- +crmodverdir:
- + $(cmd_crmodverdir)
- +
- +PHONY += $(objtree)/Module.symvers
- +$(objtree)/Module.symvers:
- + @test -e $(objtree)/Module.symvers || ( \
- + echo; \
- + echo " WARNING: Symbol version dump $(objtree)/Module.symvers"; \
- + echo " is missing; modules will have no dependencies and modversions."; \
- + echo )
- +
- +module-dirs := $(addprefix _module_,$(KBUILD_EXTMOD))
- +PHONY += $(module-dirs) modules
- +$(module-dirs): crmodverdir $(objtree)/Module.symvers
- + $(Q)$(MAKE) $(build)=$(patsubst _module_%,%,$@)
- +
- +modules: $(module-dirs)
- + @$(kecho) ' Building modules, stage 2.';
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
- +
- +PHONY += modules_install
- +modules_install: _emodinst_ _emodinst_post
- +
- +install-dir := $(if $(INSTALL_MOD_DIR),$(INSTALL_MOD_DIR),extra)
- +PHONY += _emodinst_
- +_emodinst_:
- + $(Q)mkdir -p $(MODLIB)/$(install-dir)
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst
- +
- +PHONY += _emodinst_post
- +_emodinst_post: _emodinst_
- + $(call cmd,depmod)
- +
- +clean-dirs := $(addprefix _clean_,$(KBUILD_EXTMOD))
- +
- +PHONY += $(clean-dirs) clean
- +$(clean-dirs):
- + $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@)
- +
- +clean: rm-dirs := $(MODVERDIR)
- +clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers
- +
- +help:
- + @echo ' Building external modules.'
- + @echo ' Syntax: make -C path/to/kernel/src M=$$PWD target'
- + @echo ''
- + @echo ' modules - default target, build the module(s)'
- + @echo ' modules_install - install the module'
- + @echo ' clean - remove generated files in module directory only'
- + @echo ''
- +
- +# Dummies...
- +PHONY += prepare scripts
- +prepare: ;
- +scripts: ;
- +endif # KBUILD_EXTMOD
- +
- +clean: $(clean-dirs)
- + $(call cmd,rmdirs)
- + $(call cmd,rmfiles)
- + @find $(or $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \
- + \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \
- + -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
- + -o -name '*.symtypes' -o -name 'modules.order' \
- + -o -name modules.builtin -o -name '.tmp_*.o.*' \
- + -o -name '*.gcno' \) -type f -print | xargs rm -f
- +
- +# Generate tags for editors
- +# ---------------------------------------------------------------------------
- +quiet_cmd_tags = GEN $@
- + cmd_tags = $(CONFIG_SHELL) $(srctree)/scripts/tags.sh $@
- +
- +tags TAGS cscope: FORCE
- + $(call cmd,tags)
- +
- +# Scripts to check various things for consistency
- +# ---------------------------------------------------------------------------
- +
- +includecheck:
- + find * $(RCS_FIND_IGNORE) \
- + -name '*.[hcS]' -type f -print | sort \
- + | xargs $(PERL) -w $(srctree)/scripts/checkincludes.pl
- +
- +versioncheck:
- + find * $(RCS_FIND_IGNORE) \
- + -name '*.[hcS]' -type f -print | sort \
- + | xargs $(PERL) -w $(srctree)/scripts/checkversion.pl
- +
- +coccicheck:
- + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/$@
- +
- +namespacecheck:
- + $(PERL) $(srctree)/scripts/namespace.pl
- +
- +export_report:
- + $(PERL) $(srctree)/scripts/export_report.pl
- +
- +endif #ifeq ($(config-targets),1)
- +endif #ifeq ($(mixed-targets),1)
- +
- +PHONY += checkstack kernelrelease kernelversion
- +
- +# UML needs a little special treatment here. It wants to use the host
- +# toolchain, so needs $(SUBARCH) passed to checkstack.pl. Everyone
- +# else wants $(ARCH), including people doing cross-builds, which means
- +# that $(SUBARCH) doesn't work here.
- +ifeq ($(ARCH), um)
- +CHECKSTACK_ARCH := $(SUBARCH)
- +else
- +CHECKSTACK_ARCH := $(ARCH)
- +endif
- +checkstack:
- + $(OBJDUMP) -d vmlinux $$(find . -name '*.ko') | \
- + $(PERL) $(src)/scripts/checkstack.pl $(CHECKSTACK_ARCH)
- +
- +kernelrelease:
- + @echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))"
- +
- +kernelversion:
- + @echo $(KERNELVERSION)
- +
- +# Single targets
- +# ---------------------------------------------------------------------------
- +# Single targets are compatible with:
- +# - build with mixed source and output
- +# - build with separate output dir 'make O=...'
- +# - external modules
- +#
- +# target-dir => where to store outputfile
- +# build-dir => directory in kernel source tree to use
- +
- +ifeq ($(KBUILD_EXTMOD),)
- + build-dir = $(patsubst %/,%,$(dir $@))
- + target-dir = $(dir $@)
- +else
- + zap-slash=$(filter-out .,$(patsubst %/,%,$(dir $@)))
- + build-dir = $(KBUILD_EXTMOD)$(if $(zap-slash),/$(zap-slash))
- + target-dir = $(if $(KBUILD_EXTMOD),$(dir $<),$(dir $@))
- +endif
- +
- +%.s: %.c prepare scripts FORCE
- + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
- +%.i: %.c prepare scripts FORCE
- + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
- +%.o: %.c prepare scripts FORCE
- + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
- +%.lst: %.c prepare scripts FORCE
- + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
- +%.s: %.S prepare scripts FORCE
- + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
- +%.o: %.S prepare scripts FORCE
- + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
- +%.symtypes: %.c prepare scripts FORCE
- + $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@)
- +
- +# Modules
- +/: prepare scripts FORCE
- + $(cmd_crmodverdir)
- + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
- + $(build)=$(build-dir)
- +%/: prepare scripts FORCE
- + $(cmd_crmodverdir)
- + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
- + $(build)=$(build-dir)
- +%.ko: prepare scripts FORCE
- + $(cmd_crmodverdir)
- + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
- + $(build)=$(build-dir) $(@:.ko=.o)
- + $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
- +
- +# FIXME Should go into a make.lib or something
- +# ===========================================================================
- +
- +quiet_cmd_rmdirs = $(if $(wildcard $(rm-dirs)),CLEAN $(wildcard $(rm-dirs)))
- + cmd_rmdirs = rm -rf $(rm-dirs)
- +
- +quiet_cmd_rmfiles = $(if $(wildcard $(rm-files)),CLEAN $(wildcard $(rm-files)))
- + cmd_rmfiles = rm -f $(rm-files)
- +
- +# Run depmod only if we have System.map and depmod is executable
- +quiet_cmd_depmod = DEPMOD $(KERNELRELEASE)
- + cmd_depmod = \
- + if [ -r System.map -a -x $(DEPMOD) ]; then \
- + $(DEPMOD) -ae -F System.map \
- + $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) ) \
- + $(KERNELRELEASE); \
- + fi
- +
- +# Create temporary dir for module support files
- +# clean it up only when building all modules
- +cmd_crmodverdir = $(Q)mkdir -p $(MODVERDIR) \
- + $(if $(KBUILD_MODULES),; rm -f $(MODVERDIR)/*)
- +
- +a_flags = -Wp,-MD,$(depfile) $(KBUILD_AFLAGS) $(AFLAGS_KERNEL) \
- + $(KBUILD_AFLAGS_KERNEL) \
- + $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(KBUILD_CPPFLAGS) \
- + $(modkern_aflags) $(EXTRA_AFLAGS) $(AFLAGS_$(basetarget).o)
- +
- +quiet_cmd_as_o_S = AS $@
- +cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $<
- +
- +# read all saved command lines
- +
- +targets := $(wildcard $(sort $(targets)))
- +cmd_files := $(wildcard .*.cmd $(foreach f,$(targets),$(dir $(f)).$(notdir $(f)).cmd))
- +
- +ifneq ($(cmd_files),)
- + $(cmd_files): ; # Do not try to update included dependency files
- + include $(cmd_files)
- +endif
- +
- +# Shorthand for $(Q)$(MAKE) -f scripts/Makefile.clean obj=dir
- +# Usage:
- +# $(Q)$(MAKE) $(clean)=dir
- +clean := -f $(if $(KBUILD_SRC),$(srctree)/)scripts/Makefile.clean obj
- +
- +endif # skip-makefile
- +
- +PHONY += FORCE
- +FORCE:
- +
- +# Declare the contents of the .PHONY variable as phony. We keep that
- +# information in a variable so we can use it in if_changed and friends.
- +.PHONY: $(PHONY)
- diff -Nrupad linux-2.6.37//mm/cleancache.c linux-2.6.37_vanilla//mm/cleancache.c
- --- linux-2.6.37//mm/cleancache.c 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//mm/cleancache.c 2011-02-14 01:21:43.171793147 +0100
- @@ -0,0 +1,258 @@
- +/*
- + * Cleancache frontend
- + *
- + * This code provides the generic "frontend" layer to call a matching
- + * "backend" driver implementation of cleancache. See
- + * Documentation/vm/cleancache.txt for more information.
- + *
- + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
- + * Author: Dan Magenheimer
- + *
- + * This work is licensed under the terms of the GNU GPL, version 2.
- + */
- +
- +#include <linux/module.h>
- +#include <linux/fs.h>
- +#include <linux/exportfs.h>
- +#include <linux/mm.h>
- +#include <linux/cleancache.h>
- +
- +/*
- + * This global enablement flag may be read thousands of times per second
- + * by cleancache_get/put/flush even on systems where cleancache_ops
- + * is not claimed (e.g. cleancache is config'ed on but remains
- + * disabled), so is preferred to the slower alternative: a function
- + * call that checks a non-global.
- + */
- +int cleancache_enabled;
- +EXPORT_SYMBOL(cleancache_enabled);
- +
- +/*
- + * cleancache_ops is set by cleancache_ops_register to contain the pointers
- + * to the cleancache "backend" implementation functions.
- + */
- +static struct cleancache_ops cleancache_ops;
- +
- +/* useful stats available in /sys/kernel/mm/cleancache */
- +static unsigned long cleancache_succ_gets;
- +static unsigned long cleancache_failed_gets;
- +static unsigned long cleancache_puts;
- +static unsigned long cleancache_flushes;
- +
- +/*
- + * register operations for cleancache, returning previous thus allowing
- + * detection of multiple backends and possible nesting
- + */
- +struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
- +{
- + struct cleancache_ops old = cleancache_ops;
- +
- + cleancache_ops = *ops;
- + cleancache_enabled = 1;
- + return old;
- +}
- +EXPORT_SYMBOL(cleancache_register_ops);
- +
- +/* Called by a cleancache-enabled filesystem at time of mount */
- +void __cleancache_init_fs(struct super_block *sb)
- +{
- + sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
- +}
- +EXPORT_SYMBOL(__cleancache_init_fs);
- +
- +/* Called by a cleancache-enabled clustered filesystem at time of mount */
- +void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
- +{
- + sb->cleancache_poolid =
- + (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
- +}
- +EXPORT_SYMBOL(__cleancache_init_shared_fs);
- +
- +/*
- + * If the filesystem uses exportable filehandles, use the filehandle as
- + * the key, else use the inode number.
- + */
- +static int cleancache_get_key(struct inode *inode,
- + struct cleancache_filekey *key)
- +{
- + int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
- + int maxlen = CLEANCACHE_KEY_MAX;
- + struct super_block *sb = inode->i_sb;
- + struct dentry *d;
- +
- + key->u.ino = inode->i_ino;
- + if (sb->s_export_op != NULL) {
- + fhfn = sb->s_export_op->encode_fh;
- + if (fhfn) {
- + d = list_first_entry(&inode->i_dentry,
- + struct dentry, d_alias);
- + (void)(*fhfn)(d, &key->u.fh[0], &maxlen, 0);
- + if (maxlen > CLEANCACHE_KEY_MAX)
- + return -1;
- + }
- + }
- + return 0;
- +}
- +
- +/*
- + * "Get" data from cleancache associated with the poolid/inode/index
- + * that were specified when the data was put to cleancache and, if
- + * successful, use it to fill the specified page with data and return 0.
- + * The pageframe is unchanged and -1 is returned if the get fails.
- + * Page must be locked by caller.
- + */
- +int __cleancache_get_page(struct page *page)
- +{
- + int ret = -1;
- + int pool_id;
- + struct cleancache_filekey key = { .u.key = { 0 } };
- +
- + VM_BUG_ON(!PageLocked(page));
- + pool_id = page->mapping->host->i_sb->cleancache_poolid;
- + if (pool_id < 0)
- + goto out;
- +
- + if (cleancache_get_key(page->mapping->host, &key) < 0)
- + goto out;
- +
- + ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
- + if (ret == 0)
- + cleancache_succ_gets++;
- + else
- + cleancache_failed_gets++;
- +out:
- + return ret;
- +}
- +EXPORT_SYMBOL(__cleancache_get_page);
- +
- +/*
- + * "Put" data from a page to cleancache and associate it with the
- + * (previously-obtained per-filesystem) poolid and the page's
- + * inode and page index. Page must be locked. Note that a put_page
- + * always "succeeds", though a subsequent get_page may succeed or fail.
- + */
- +void __cleancache_put_page(struct page *page)
- +{
- + int pool_id;
- + struct cleancache_filekey key = { .u.key = { 0 } };
- +
- + VM_BUG_ON(!PageLocked(page));
- + pool_id = page->mapping->host->i_sb->cleancache_poolid;
- + if (pool_id >= 0 &&
- + cleancache_get_key(page->mapping->host, &key) >= 0) {
- + (*cleancache_ops.put_page)(pool_id, key, page->index, page);
- + cleancache_puts++;
- + }
- +}
- +EXPORT_SYMBOL(__cleancache_put_page);
- +
- +/*
- + * Flush any data from cleancache associated with the poolid and the
- + * page's inode and page index so that a subsequent "get" will fail.
- + */
- +void __cleancache_flush_page(struct address_space *mapping, struct page *page)
- +{
- + /* careful... page->mapping is NULL sometimes when this is called */
- + int pool_id = mapping->host->i_sb->cleancache_poolid;
- + struct cleancache_filekey key = { .u.key = { 0 } };
- +
- + if (pool_id >= 0) {
- + VM_BUG_ON(!PageLocked(page));
- + if (cleancache_get_key(mapping->host, &key) >= 0) {
- + (*cleancache_ops.flush_page)(pool_id, key, page->index);
- + cleancache_flushes++;
- + }
- + }
- +}
- +EXPORT_SYMBOL(__cleancache_flush_page);
- +
- +/*
- + * Flush all data from cleancache associated with the poolid and the
- + * mapping's inode so that all subsequent gets to this poolid/inode
- + * will fail.
- + */
- +void __cleancache_flush_inode(struct address_space *mapping)
- +{
- + int pool_id = mapping->host->i_sb->cleancache_poolid;
- + struct cleancache_filekey key = { .u.key = { 0 } };
- +
- + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
- + (*cleancache_ops.flush_inode)(pool_id, key);
- +}
- +EXPORT_SYMBOL(__cleancache_flush_inode);
- +
- +/*
- + * Called by any cleancache-enabled filesystem at time of unmount;
- + * note that pool_id is surrendered and may be returned by a subsequent
- + * cleancache_init_fs or cleancache_init_shared_fs
- + */
- +void __cleancache_flush_fs(struct super_block *sb)
- +{
- + if (sb->cleancache_poolid >= 0) {
- + int old_poolid = sb->cleancache_poolid;
- + sb->cleancache_poolid = -1;
- + (*cleancache_ops.flush_fs)(old_poolid);
- + }
- +}
- +EXPORT_SYMBOL(__cleancache_flush_fs);
- +
- +#ifdef CONFIG_SYSFS
- +
- +/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
- +
- +#define CLEANCACHE_ATTR_RO(_name) \
- + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
- +
- +static ssize_t cleancache_succ_gets_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", cleancache_succ_gets);
- +}
- +CLEANCACHE_ATTR_RO(cleancache_succ_gets);
- +
- +static ssize_t cleancache_failed_gets_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", cleancache_failed_gets);
- +}
- +CLEANCACHE_ATTR_RO(cleancache_failed_gets);
- +
- +static ssize_t cleancache_puts_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", cleancache_puts);
- +}
- +CLEANCACHE_ATTR_RO(cleancache_puts);
- +
- +static ssize_t cleancache_flushes_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", cleancache_flushes);
- +}
- +CLEANCACHE_ATTR_RO(cleancache_flushes);
- +
- +static struct attribute *cleancache_attrs[] = {
- + &cleancache_succ_gets_attr.attr,
- + &cleancache_failed_gets_attr.attr,
- + &cleancache_puts_attr.attr,
- + &cleancache_flushes_attr.attr,
- + NULL,
- +};
- +
- +static struct attribute_group cleancache_attr_group = {
- + .attrs = cleancache_attrs,
- + .name = "cleancache",
- +};
- +
- +#endif /* CONFIG_SYSFS */
- +
- +static int __init init_cleancache(void)
- +{
- +#ifdef CONFIG_SYSFS
- + int err;
- +
- + err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
- +#endif /* CONFIG_SYSFS */
- + return 0;
- +}
- +module_init(init_cleancache)
- diff -Nrupad linux-2.6.37//mm/filemap.c linux-2.6.37_vanilla//mm/filemap.c
- --- linux-2.6.37//mm/filemap.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//mm/filemap.c 2011-02-14 01:21:43.172793144 +0100
- @@ -34,6 +34,7 @@
- #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
- #include <linux/memcontrol.h>
- #include <linux/mm_inline.h> /* for page_is_file_cache() */
- +#include <linux/cleancache.h>
- #include "internal.h"
- /*
- @@ -119,6 +120,16 @@ void __remove_from_page_cache(struct pag
- {
- struct address_space *mapping = page->mapping;
- + /*
- + * if we're uptodate, flush out into the cleancache, otherwise
- + * invalidate any existing cleancache entries. We can't leave
- + * stale data around in the cleancache once our page is gone
- + */
- + if (PageUptodate(page))
- + cleancache_put_page(page);
- + else
- + cleancache_flush_page(mapping, page);
- +
- radix_tree_delete(&mapping->page_tree, page->index);
- page->mapping = NULL;
- mapping->nrpages--;
- diff -Nrupad linux-2.6.37//mm/frontswap.c linux-2.6.37_vanilla//mm/frontswap.c
- --- linux-2.6.37//mm/frontswap.c 1970-01-01 01:00:00.000000000 +0100
- +++ linux-2.6.37_vanilla//mm/frontswap.c 2011-02-14 01:21:43.172793144 +0100
- @@ -0,0 +1,331 @@
- +/*
- + * Frontswap frontend
- + *
- + * This code provides the generic "frontend" layer to call a matching
- + * "backend" driver implementation of frontswap. See
- + * Documentation/vm/frontswap.txt for more information.
- + *
- + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
- + * Author: Dan Magenheimer
- + *
- + * This work is licensed under the terms of the GNU GPL, version 2.
- + */
- +
- +#include <linux/mm.h>
- +#include <linux/mman.h>
- +#include <linux/sysctl.h>
- +#include <linux/swap.h>
- +#include <linux/swapops.h>
- +#include <linux/proc_fs.h>
- +#include <linux/security.h>
- +#include <linux/capability.h>
- +#include <linux/module.h>
- +#include <linux/uaccess.h>
- +#include <linux/frontswap.h>
- +#include <linux/swapfile.h>
- +
- +/*
- + * frontswap_ops is set by frontswap_register_ops to contain the pointers
- + * to the frontswap "backend" implementation functions.
- + */
- +static struct frontswap_ops frontswap_ops;
- +
- +/*
- + * This global enablement flag reduces overhead on systems where frontswap_ops
- + * has not been registered, so is preferred to the slower alternative: a
- + * function call that checks a non-global.
- + */
- +int frontswap_enabled;
- +EXPORT_SYMBOL(frontswap_enabled);
- +
- +/* useful stats available in /sys/kernel/mm/frontswap */
- +static unsigned long frontswap_gets;
- +static unsigned long frontswap_succ_puts;
- +static unsigned long frontswap_failed_puts;
- +static unsigned long frontswap_flushes;
- +
- +/*
- + * register operations for frontswap, returning previous thus allowing
- + * detection of multiple backends and possible nesting
- + */
- +struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
- +{
- + struct frontswap_ops old = frontswap_ops;
- +
- + frontswap_ops = *ops;
- + frontswap_enabled = 1;
- + return old;
- +}
- +EXPORT_SYMBOL(frontswap_register_ops);
- +
- +/* Called when a swap device is swapon'd */
- +void frontswap_init(unsigned type)
- +{
- + if (frontswap_enabled)
- + (*frontswap_ops.init)(type);
- +}
- +EXPORT_SYMBOL(frontswap_init);
- +
- +/*
- + * "Put" data from a page to frontswap and associate it with the page's
- + * swaptype and offset. Page must be locked and in the swap cache.
- + * If frontswap already contains a page with matching swaptype and
- + * offset, the frontswap implementation may either overwrite the data
- + * and return success or flush the page from frontswap and return failure
- + */
- +int __frontswap_put_page(struct page *page)
- +{
- + int ret = -1, dup = 0;
- + swp_entry_t entry = { .val = page_private(page), };
- + int type = swp_type(entry);
- + struct swap_info_struct *sis = swap_info[type];
- + pgoff_t offset = swp_offset(entry);
- +
- + BUG_ON(!PageLocked(page));
- + if (frontswap_test(sis, offset))
- + dup = 1;
- + ret = (*frontswap_ops.put_page)(type, offset, page);
- + if (ret == 0) {
- + frontswap_set(sis, offset);
- + frontswap_succ_puts++;
- + if (!dup)
- + sis->frontswap_pages++;
- + } else if (dup) {
- + /*
- + failed dup always results in automatic flush of
- + the (older) page from frontswap
- + */
- + frontswap_clear(sis, offset);
- + sis->frontswap_pages--;
- + frontswap_failed_puts++;
- + } else
- + frontswap_failed_puts++;
- + return ret;
- +}
- +
- +/*
- + * "Get" data from frontswap associated with swaptype and offset that were
- + * specified when the data was put to frontswap and use it to fill the
- + * specified page with data. Page must be locked and in the swap cache
- + */
- +int __frontswap_get_page(struct page *page)
- +{
- + int ret = -1;
- + swp_entry_t entry = { .val = page_private(page), };
- + int type = swp_type(entry);
- + struct swap_info_struct *sis = swap_info[type];
- + pgoff_t offset = swp_offset(entry);
- +
- + BUG_ON(!PageLocked(page));
- + if (frontswap_test(sis, offset))
- + ret = (*frontswap_ops.get_page)(type, offset, page);
- + if (ret == 0)
- + frontswap_gets++;
- + return ret;
- +}
- +
- +/*
- + * Flush any data from frontswap associated with the specified swaptype
- + * and offset so that a subsequent "get" will fail.
- + */
- +void __frontswap_flush_page(unsigned type, pgoff_t offset)
- +{
- + struct swap_info_struct *sis = swap_info[type];
- +
- + if (frontswap_test(sis, offset)) {
- + (*frontswap_ops.flush_page)(type, offset);
- + sis->frontswap_pages--;
- + frontswap_clear(sis, offset);
- + frontswap_flushes++;
- + }
- +}
- +
- +/*
- + * Flush all data from frontswap associated with all offsets for the
- + * specified swaptype.
- + */
- +void __frontswap_flush_area(unsigned type)
- +{
- + struct swap_info_struct *sis = swap_info[type];
- +
- + (*frontswap_ops.flush_area)(type);
- + sis->frontswap_pages = 0;
- + memset(sis->frontswap_map, 0, sis->max / sizeof(long));
- +}
- +
- +/*
- + * Frontswap, like a true swap device, may unnecessarily retain pages
- + * under certain circumstances; "shrink" frontswap is essentially a
- + * "partial swapoff" and works by calling try_to_unuse to attempt to
- + * unuse enough frontswap pages to attempt to -- subject to memory
- + * constraints -- reduce the number of pages in frontswap
- + */
- +void frontswap_shrink(unsigned long target_pages)
- +{
- + int wrapped = 0;
- + bool locked = false;
- +
- + for (wrapped = 0; wrapped <= 3; wrapped++) {
- +
- + struct swap_info_struct *si = NULL;
- + unsigned long total_pages = 0, total_pages_to_unuse;
- + unsigned long pages = 0, unuse_pages = 0;
- + int type;
- +
- + /*
- + * we don't want to hold swap_lock while doing a very
- + * lengthy try_to_unuse, but swap_list may change
- + * so restart scan from swap_list.head each time
- + */
- + spin_lock(&swap_lock);
- + locked = true;
- + total_pages = 0;
- + for (type = swap_list.head; type >= 0; type = si->next) {
- + si = swap_info[type];
- + total_pages += si->frontswap_pages;
- + }
- + if (total_pages <= target_pages)
- + goto out;
- + total_pages_to_unuse = total_pages - target_pages;
- + for (type = swap_list.head; type >= 0; type = si->next) {
- + si = swap_info[type];
- + if (total_pages_to_unuse < si->frontswap_pages)
- + pages = unuse_pages = total_pages_to_unuse;
- + else {
- + pages = si->frontswap_pages;
- + unuse_pages = 0; /* unuse all */
- + }
- + if (security_vm_enough_memory_kern(pages))
- + continue;
- + vm_unacct_memory(pages);
- + break;
- + }
- + if (type < 0)
- + goto out;
- + locked = false;
- + spin_unlock(&swap_lock);
- + try_to_unuse(type, true, unuse_pages);
- + }
- +
- +out:
- + if (locked)
- + spin_unlock(&swap_lock);
- + return;
- +}
- +EXPORT_SYMBOL(frontswap_shrink);
- +
- +/*
- + * count and return the number of frontswap pages across all
- + * swap devices. This is exported so that a kernel module can
- + * determine current usage without reading sysfs.
- + */
- +unsigned long frontswap_curr_pages(void)
- +{
- + int type;
- + unsigned long totalpages = 0;
- + struct swap_info_struct *si = NULL;
- +
- + spin_lock(&swap_lock);
- + for (type = swap_list.head; type >= 0; type = si->next) {
- + si = swap_info[type];
- + totalpages += si->frontswap_pages;
- + }
- + spin_unlock(&swap_lock);
- + return totalpages;
- +}
- +EXPORT_SYMBOL(frontswap_curr_pages);
- +
- +#ifdef CONFIG_SYSFS
- +
- +/* see Documentation/ABI/xxx/sysfs-kernel-mm-frontswap */
- +
- +#define FRONTSWAP_ATTR_RO(_name) \
- + static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
- +#define FRONTSWAP_ATTR(_name) \
- + static struct kobj_attribute _name##_attr = \
- + __ATTR(_name, 0644, _name##_show, _name##_store)
- +
- +static ssize_t curr_pages_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", frontswap_curr_pages());
- +}
- +
- +static ssize_t curr_pages_store(struct kobject *kobj,
- + struct kobj_attribute *attr,
- + const char *buf, size_t count)
- +{
- + unsigned long target_pages;
- + int err;
- +
- + err = strict_strtoul(buf, 10, &target_pages);
- + if (err)
- + return -EINVAL;
- +
- + frontswap_shrink(target_pages);
- +
- + return count;
- +}
- +FRONTSWAP_ATTR(curr_pages);
- +
- +static ssize_t succ_puts_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", frontswap_succ_puts);
- +}
- +FRONTSWAP_ATTR_RO(succ_puts);
- +
- +static ssize_t failed_puts_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", frontswap_failed_puts);
- +}
- +FRONTSWAP_ATTR_RO(failed_puts);
- +
- +static ssize_t gets_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", frontswap_gets);
- +}
- +FRONTSWAP_ATTR_RO(gets);
- +
- +static ssize_t flushes_show(struct kobject *kobj,
- + struct kobj_attribute *attr, char *buf)
- +{
- + return sprintf(buf, "%lu\n", frontswap_flushes);
- +}
- +FRONTSWAP_ATTR_RO(flushes);
- +
- +static struct attribute *frontswap_attrs[] = {
- + &curr_pages_attr.attr,
- + &succ_puts_attr.attr,
- + &failed_puts_attr.attr,
- + &gets_attr.attr,
- + &flushes_attr.attr,
- + NULL,
- +};
- +
- +static struct attribute_group frontswap_attr_group = {
- + .attrs = frontswap_attrs,
- + .name = "frontswap",
- +};
- +
- +#endif /* CONFIG_SYSFS */
- +
- +static int __init init_frontswap(void)
- +{
- +#ifdef CONFIG_SYSFS
- + int err;
- +
- + err = sysfs_create_group(mm_kobj, &frontswap_attr_group);
- +#endif /* CONFIG_SYSFS */
- + return 0;
- +}
- +
- +static void __exit exit_frontswap(void)
- +{
- + frontswap_shrink(0UL);
- +}
- +
- +module_init(init_frontswap);
- +module_exit(exit_frontswap);
- diff -Nrupad linux-2.6.37//mm/Kconfig linux-2.6.37_vanilla//mm/Kconfig
- --- linux-2.6.37//mm/Kconfig 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//mm/Kconfig 2011-02-14 01:21:43.172793144 +0100
- @@ -309,3 +309,41 @@ config NEED_PER_CPU_KM
- depends on !SMP
- bool
- default y
- +
- +config CLEANCACHE
- + bool "Enable cleancache pseudo-RAM driver to cache clean pages"
- + default y
- + help
- + Cleancache can be thought of as a page-granularity victim cache
- + for clean pages that the kernel's pageframe replacement algorithm
- + (PFRA) would like to keep around, but can't since there isn't enough
- + memory. So when the PFRA "evicts" a page, it first attempts to put
- + it into a synchronous concurrency-safe page-oriented pseudo-RAM
- + device (such as Xen's Transcendent Memory, aka "tmem") which is not
- + directly accessible or addressable by the kernel and is of unknown
- + (and possibly time-varying) size. And when a cleancache-enabled
- + filesystem wishes to access a page in a file on disk, it first
- + checks cleancache to see if it already contains it; if it does,
- + the page is copied into the kernel and a disk access is avoided.
- + When a pseudo-RAM device is available, a significant I/O reduction
- + may be achieved. When none is available, all cleancache calls
- + are reduced to a single pointer-compare-against-NULL resulting
- + in a negligible performance hit.
- +
- + If unsure, say Y to enable cleancache
- +
- +config FRONTSWAP
- + bool "Enable frontswap pseudo-RAM driver to cache swap pages"
- + default y
- + help
- + Frontswap is so named because it can be thought of as the opposite of
- + a "backing" store for a swap device. The storage is assumed to be
- + a synchronous concurrency-safe page-oriented pseudo-RAM device (such
- + as Xen's Transcendent Memory, aka "tmem") which is not directly
- + accessible or addressable by the kernel and is of unknown (and
- + possibly time-varying) size. When a pseudo-RAM device is available,
- + a significant swap I/O reduction may be achieved. When none is
- + available, all frontswap calls are reduced to a single pointer-
- + compare-against-NULL resulting in a negligible performance hit.
- +
- + If unsure, say Y to enable frontswap.
- diff -Nrupad linux-2.6.37//mm/Makefile linux-2.6.37_vanilla//mm/Makefile
- --- linux-2.6.37//mm/Makefile 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//mm/Makefile 2011-02-14 01:21:43.172793144 +0100
- @@ -19,6 +19,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.
- obj-$(CONFIG_BOUNCE) += bounce.o
- obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
- +obj-$(CONFIG_FRONTSWAP) += frontswap.o
- obj-$(CONFIG_HAS_DMA) += dmapool.o
- obj-$(CONFIG_HUGETLBFS) += hugetlb.o
- obj-$(CONFIG_NUMA) += mempolicy.o
- @@ -42,3 +43,5 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-f
- obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
- obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
- obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
- +obj-$(CONFIG_CLEANCACHE) += cleancache.o
- +obj-$(CONFIG_FRONTSWAP) += frontswap.o
- diff -Nrupad linux-2.6.37//mm/page_io.c linux-2.6.37_vanilla//mm/page_io.c
- --- linux-2.6.37//mm/page_io.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//mm/page_io.c 2011-02-14 01:21:43.172793144 +0100
- @@ -18,6 +18,7 @@
- #include <linux/bio.h>
- #include <linux/swapops.h>
- #include <linux/writeback.h>
- +#include <linux/frontswap.h>
- #include <asm/pgtable.h>
- static struct bio *get_swap_bio(gfp_t gfp_flags,
- @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, st
- unlock_page(page);
- goto out;
- }
- + if (frontswap_put_page(page) == 0) {
- + set_page_writeback(page);
- + unlock_page(page);
- + end_page_writeback(page);
- + goto out;
- + }
- bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
- if (bio == NULL) {
- set_page_dirty(page);
- @@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageUptodate(page));
- + if (frontswap_get_page(page) == 0) {
- + SetPageUptodate(page);
- + unlock_page(page);
- + goto out;
- + }
- bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
- if (bio == NULL) {
- unlock_page(page);
- diff -Nrupad linux-2.6.37//mm/swapfile.c linux-2.6.37_vanilla//mm/swapfile.c
- --- linux-2.6.37//mm/swapfile.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//mm/swapfile.c 2011-02-14 01:21:43.173793142 +0100
- @@ -31,6 +31,8 @@
- #include <linux/syscalls.h>
- #include <linux/memcontrol.h>
- #include <linux/poll.h>
- +#include <linux/frontswap.h>
- +#include <linux/swapfile.h>
- #include <asm/pgtable.h>
- #include <asm/tlbflush.h>
- @@ -42,7 +44,7 @@ static bool swap_count_continued(struct
- static void free_swap_count_continuations(struct swap_info_struct *);
- static sector_t map_swap_entry(swp_entry_t, struct block_device**);
- -static DEFINE_SPINLOCK(swap_lock);
- +DEFINE_SPINLOCK(swap_lock);
- static unsigned int nr_swapfiles;
- long nr_swap_pages;
- long total_swap_pages;
- @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unuse
- static const char Bad_offset[] = "Bad swap offset entry ";
- static const char Unused_offset[] = "Unused swap offset entry ";
- -static struct swap_list_t swap_list = {-1, -1};
- +struct swap_list_t swap_list = {-1, -1};
- -static struct swap_info_struct *swap_info[MAX_SWAPFILES];
- +struct swap_info_struct *swap_info[MAX_SWAPFILES];
- static DEFINE_MUTEX(swapon_mutex);
- @@ -589,6 +591,7 @@ static unsigned char swap_entry_free(str
- swap_list.next = p->type;
- nr_swap_pages++;
- p->inuse_pages--;
- + frontswap_flush_page(p->type, offset);
- if ((p->flags & SWP_BLKDEV) &&
- disk->fops->swap_slot_free_notify)
- disk->fops->swap_slot_free_notify(p->bdev, offset);
- @@ -1052,7 +1055,7 @@ static int unuse_mm(struct mm_struct *mm
- * Recycle to start on reaching the end, returning 0 when empty.
- */
- static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- - unsigned int prev)
- + unsigned int prev, bool frontswap)
- {
- unsigned int max = si->max;
- unsigned int i = prev;
- @@ -1078,6 +1081,12 @@ static unsigned int find_next_to_unuse(s
- prev = 0;
- i = 1;
- }
- + if (frontswap) {
- + if (frontswap_test(si, i))
- + break;
- + else
- + continue;
- + }
- count = si->swap_map[i];
- if (count && swap_count(count) != SWAP_MAP_BAD)
- break;
- @@ -1089,8 +1098,12 @@ static unsigned int find_next_to_unuse(s
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it. All the necessary
- * page table adjustments can then be made atomically.
- + *
- + * if the boolean frontswap is true, only unuse pages_to_unuse pages;
- + * pages_to_unuse==0 means all pages
- */
- -static int try_to_unuse(unsigned int type)
- +int try_to_unuse(unsigned int type, bool frontswap,
- + unsigned long pages_to_unuse)
- {
- struct swap_info_struct *si = swap_info[type];
- struct mm_struct *start_mm;
- @@ -1123,7 +1136,7 @@ static int try_to_unuse(unsigned int typ
- * one pass through swap_map is enough, but not necessarily:
- * there are races when an instance of an entry might be missed.
- */
- - while ((i = find_next_to_unuse(si, i)) != 0) {
- + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
- if (signal_pending(current)) {
- retval = -EINTR;
- break;
- @@ -1290,6 +1303,10 @@ static int try_to_unuse(unsigned int typ
- * interactive performance.
- */
- cond_resched();
- + if (frontswap && pages_to_unuse > 0) {
- + if (!--pages_to_unuse)
- + break;
- + }
- }
- mmput(start_mm);
- @@ -1615,7 +1632,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
- spin_unlock(&swap_lock);
- current->flags |= PF_OOM_ORIGIN;
- - err = try_to_unuse(type);
- + err = try_to_unuse(type, false, 0);
- current->flags &= ~PF_OOM_ORIGIN;
- if (err) {
- @@ -1667,9 +1684,12 @@ SYSCALL_DEFINE1(swapoff, const char __us
- swap_map = p->swap_map;
- p->swap_map = NULL;
- p->flags = 0;
- + frontswap_flush_area(type);
- spin_unlock(&swap_lock);
- mutex_unlock(&swapon_mutex);
- vfree(swap_map);
- + if (p->frontswap_map)
- + vfree(p->frontswap_map);
- /* Destroy swap account informatin */
- swap_cgroup_swapoff(type);
- @@ -1864,6 +1884,7 @@ SYSCALL_DEFINE2(swapon, const char __use
- unsigned long maxpages;
- unsigned long swapfilepages;
- unsigned char *swap_map = NULL;
- + unsigned long *frontswap_map = NULL;
- struct page *page = NULL;
- struct inode *inode = NULL;
- int did_down = 0;
- @@ -2085,6 +2106,12 @@ SYSCALL_DEFINE2(swapon, const char __use
- error = -EINVAL;
- goto bad_swap;
- }
- + /* frontswap enabled? set up bit-per-page map for frontswap */
- + if (frontswap_enabled) {
- + frontswap_map = vmalloc(maxpages / sizeof(long));
- + if (frontswap_map)
- + memset(frontswap_map, 0, maxpages / sizeof(long));
- + }
- if (p->bdev) {
- if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
- @@ -2103,16 +2130,18 @@ SYSCALL_DEFINE2(swapon, const char __use
- else
- p->prio = --least_priority;
- p->swap_map = swap_map;
- + p->frontswap_map = frontswap_map;
- p->flags |= SWP_WRITEOK;
- nr_swap_pages += nr_good_pages;
- total_swap_pages += nr_good_pages;
- printk(KERN_INFO "Adding %uk swap on %s. "
- - "Priority:%d extents:%d across:%lluk %s%s\n",
- + "Priority:%d extents:%d across:%lluk %s%s%s\n",
- nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
- nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
- (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
- - (p->flags & SWP_DISCARDABLE) ? "D" : "");
- + (p->flags & SWP_DISCARDABLE) ? "D" : "",
- + (p->frontswap_map) ? "FS" : "");
- /* insert swap space into swap_list: */
- prev = -1;
- @@ -2126,6 +2155,7 @@ SYSCALL_DEFINE2(swapon, const char __use
- swap_list.head = swap_list.next = type;
- else
- swap_info[prev]->next = type;
- + frontswap_init(type);
- spin_unlock(&swap_lock);
- mutex_unlock(&swapon_mutex);
- atomic_inc(&proc_poll_event);
- @@ -2313,6 +2343,10 @@ int valid_swaphandles(swp_entry_t entry,
- base++;
- spin_lock(&swap_lock);
- + if (frontswap_test(si, target)) {
- + spin_unlock(&swap_lock);
- + return 0;
- + }
- if (end > si->max) /* don't go beyond end of map */
- end = si->max;
- @@ -2323,6 +2357,9 @@ int valid_swaphandles(swp_entry_t entry,
- break;
- if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
- break;
- + /* Don't read in frontswap pages */
- + if (frontswap_test(si, toff))
- + break;
- }
- /* Count contiguous allocated slots below our target */
- for (toff = target; --toff >= base; nr_pages++) {
- @@ -2331,6 +2368,9 @@ int valid_swaphandles(swp_entry_t entry,
- break;
- if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
- break;
- + /* Don't read in frontswap pages */
- + if (frontswap_test(si, toff))
- + break;
- }
- spin_unlock(&swap_lock);
- diff -Nrupad linux-2.6.37//mm/truncate.c linux-2.6.37_vanilla//mm/truncate.c
- --- linux-2.6.37//mm/truncate.c 2011-01-05 01:50:19.000000000 +0100
- +++ linux-2.6.37_vanilla//mm/truncate.c 2011-02-14 01:21:43.174793140 +0100
- @@ -19,6 +19,7 @@
- #include <linux/task_io_accounting_ops.h>
- #include <linux/buffer_head.h> /* grr. try_to_release_page,
- do_invalidatepage */
- +#include <linux/cleancache.h>
- #include "internal.h"
- @@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page
- static inline void truncate_partial_page(struct page *page, unsigned partial)
- {
- zero_user_segment(page, partial, PAGE_CACHE_SIZE);
- + cleancache_flush_page(page->mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, partial);
- }
- @@ -108,6 +110,10 @@ truncate_complete_page(struct address_sp
- clear_page_mlock(page);
- remove_from_page_cache(page);
- ClearPageMappedToDisk(page);
- + /* this must be after the remove_from_page_cache which
- + * calls cleancache_put_page (and note page->mapping is now NULL)
- + */
- + cleancache_flush_page(mapping, page);
- page_cache_release(page); /* pagecache ref */
- return 0;
- }
- @@ -215,6 +221,7 @@ void truncate_inode_pages_range(struct a
- pgoff_t next;
- int i;
- + cleancache_flush_inode(mapping);
- if (mapping->nrpages == 0)
- return;
- @@ -290,6 +297,7 @@ void truncate_inode_pages_range(struct a
- pagevec_release(&pvec);
- mem_cgroup_uncharge_end();
- }
- + cleancache_flush_inode(mapping);
- }
- EXPORT_SYMBOL(truncate_inode_pages_range);
- @@ -432,6 +440,7 @@ int invalidate_inode_pages2_range(struct
- int did_range_unmap = 0;
- int wrapped = 0;
- + cleancache_flush_inode(mapping);
- pagevec_init(&pvec, 0);
- next = start;
- while (next <= end && !wrapped &&
- @@ -490,6 +499,7 @@ int invalidate_inode_pages2_range(struct
- mem_cgroup_uncharge_end();
- cond_resched();
- }
- + cleancache_flush_inode(mapping);
- return ret;
- }
- EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
Add Comment
Please, Sign In to add comment