uksm-0.1.1.1-for-v3.4.0.patch

  1. diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
  2. index 5481c8b..7141876 100644
  3. --- a/Documentation/vm/00-INDEX
  4. +++ b/Documentation/vm/00-INDEX
  5. @@ -14,6 +14,8 @@ hwpoison.txt
  6.     - explains what hwpoison is
  7.  ksm.txt
  8.     - how to use the Kernel Samepage Merging feature.
  9. +uksm.txt
  10. +   - Introduction to Ultra KSM
  11.  locking
  12.     - info on how locking and synchronization is done in the Linux vm code.
  13.  map_hugetlb.c
  14. diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
  15. new file mode 100644
  16. index 0000000..e18f6d8
  17. --- /dev/null
  18. +++ b/Documentation/vm/uksm.txt
  19. @@ -0,0 +1,50 @@
  20. +The Ultra Kernel Samepage Merging feature
  21. +----------------------------------------------
  22. +/*
  23. + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
  24. + *
  25. + * This is an improvement upon KSM. Some basic data structures and routines
  26. + * are borrowed from ksm.c .
  27. + *
  28. + * Its new features:
  29. + * 1. Full system scan:
  30. + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
  31. + *      interaction to submit a memory area to KSM is no longer needed.
  32. + *
  33. + * 2. Rich area detection:
  34. + *      It automatically detects rich areas containing abundant duplicated
  35. + *      pages. Rich areas are scanned at full speed; poor areas are
  36. + *      sampled at a reduced speed with very low CPU consumption.
  37. + *
  38. + * 3. Ultra Per-page scan speed improvement:
  39. + *      A new hash algorithm is proposed. As a result, on a machine with
  40. + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
  41. + *      can scan memory areas that do not contain duplicated pages at a speed of
  42. + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
  43. + *      477MB/sec ~ 923MB/sec.
  44. + *
  45. + * 4. Thrashing area avoidance:
  46. + *      A thrashing area (a VMA with frequent KSM page break-outs) can be
  47. + *      filtered out. Benchmarks show this is more efficient than KSM's per-page
  48. + *      hash-value-based volatile page detection.
  49. + *
  50. + *
  51. + * 5. Misc changes upon KSM:
  52. + *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
  53. + *        comparison. It's much faster than the default C version on x86.
  54. + *      * rmap_item now has a struct page *page member to loosely cache an
  55. + *        address->page mapping, which avoids many costly calls to
  56. + *        follow_page().
  57. + *      * The VMA creation/exit procedures are hooked so Ultra KSM is notified.
  58. + *      * try_to_merge_two_pages() can now revert a pte if it fails. No break_
  59. + *        ksm is needed in this case.
  60. + *
  61. + * 6. Full zero page consideration (contributed by Figo Zhang)
  62. + *    uksmd now treats full zero pages as special pages and merges them into a
  63. + *    special unswappable uksm zero page.
  64. + */
  65. +
  66. +ChangeLog:
  67. +
  68. +2012-05-05 Creation of this document
  69. +2012-05-08 UKSM 0.1.1.1: libc crash bug fix, API cleanup, doc cleanup.
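
As a rough userspace illustration of point 1 in the documentation above (this sketch is not part of the patch; the only kernel interface it uses is madvise(2) with MADV_MERGEABLE, the opt-in that legacy KSM requires and that UKSM makes unnecessary):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 64UL << 20;    /* 64 MB of anonymous memory */
    void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Legacy KSM: the region must be explicitly marked mergeable. */
    if (madvise(buf, len, MADV_MERGEABLE))
        perror("madvise(MADV_MERGEABLE)");

    /* UKSM: nothing to do -- uksmd finds this anonymous VMA by itself. */
    return 0;
}
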
  70. diff --git a/fs/exec.c b/fs/exec.c
  71. index ae42277..c1c65bc 100644
  72. --- a/fs/exec.c
  73. +++ b/fs/exec.c
  74. @@ -55,6 +55,7 @@
  75.  #include <linux/pipe_fs_i.h>
  76.  #include <linux/oom.h>
  77.  #include <linux/compat.h>
  78. +#include <linux/ksm.h>
  79.  
  80.  #include <asm/uaccess.h>
  81.  #include <asm/mmu_context.h>
  82. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
  83. index 80e4645..33f9e9b 100644
  84. --- a/fs/proc/meminfo.c
  85. +++ b/fs/proc/meminfo.c
  86. @@ -87,6 +87,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
  87.         "SUnreclaim:     %8lu kB\n"
  88.         "KernelStack:    %8lu kB\n"
  89.         "PageTables:     %8lu kB\n"
  90. +#ifdef CONFIG_UKSM
  91. +       "KsmZeroPages:   %8lu kB\n"
  92. +#endif
  93.  #ifdef CONFIG_QUICKLIST
  94.         "Quicklists:     %8lu kB\n"
  95.  #endif
  96. @@ -146,6 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
  97.         K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
  98.         global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
  99.         K(global_page_state(NR_PAGETABLE)),
  100. +#ifdef CONFIG_UKSM
  101. +       K(global_page_state(NR_UKSM_ZERO_PAGES)),
  102. +#endif
  103.  #ifdef CONFIG_QUICKLIST
  104.         K(quicklist_total_size()),
  105.  #endif
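
For reference, a minimal userspace sketch (not from the patch) that reads back the KsmZeroPages counter the hunk above adds to /proc/meminfo when CONFIG_UKSM is enabled:

#include <stdio.h>
#include <string.h>

int main(void)
{
    char line[256];
    FILE *f = fopen("/proc/meminfo", "r");

    if (!f) {
        perror("fopen");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        if (!strncmp(line, "KsmZeroPages:", 13)) {
            fputs(line, stdout);    /* e.g. "KsmZeroPages:     1234 kB" */
            break;
        }
    }
    fclose(f);
    return 0;
}
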
  106. diff --git a/include/linux/ksm.h b/include/linux/ksm.h
  107. index 3319a69..f4edf33 100644
  108. --- a/include/linux/ksm.h
  109. +++ b/include/linux/ksm.h
  110. @@ -22,21 +22,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
  111.  #ifdef CONFIG_KSM
  112.  int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
  113.         unsigned long end, int advice, unsigned long *vm_flags);
  114. -int __ksm_enter(struct mm_struct *mm);
  115. -void __ksm_exit(struct mm_struct *mm);
  116. -
  117. -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
  118. -{
  119. -   if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
  120. -       return __ksm_enter(mm);
  121. -   return 0;
  122. -}
  123. -
  124. -static inline void ksm_exit(struct mm_struct *mm)
  125. -{
  126. -   if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
  127. -       __ksm_exit(mm);
  128. -}
  129.  
  130.  /*
  131.   * A KSM page is one of those write-protected "shared pages" or "merged pages"
  132. @@ -90,6 +75,33 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
  133.           struct vm_area_struct *, unsigned long, void *), void *arg);
  134.  void ksm_migrate_page(struct page *newpage, struct page *oldpage);
  135.  
  136. +#ifdef CONFIG_KSM_LEGACY
  137. +int __ksm_enter(struct mm_struct *mm);
  138. +void __ksm_exit(struct mm_struct *mm);
  139. +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
  140. +{
  141. +   if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
  142. +       return __ksm_enter(mm);
  143. +   return 0;
  144. +}
  145. +
  146. +static inline void ksm_exit(struct mm_struct *mm)
  147. +{
  148. +   if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
  149. +       __ksm_exit(mm);
  150. +}
  151. +
  152. +#elif defined(CONFIG_UKSM)
  153. +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
  154. +{
  155. +   return 0;
  156. +}
  157. +
  158. +static inline void ksm_exit(struct mm_struct *mm)
  159. +{
  160. +}
  161. +#endif /* !CONFIG_UKSM */
  162. +
  163.  #else  /* !CONFIG_KSM */
  164.  
  165.  static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
  166. @@ -142,4 +154,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
  167.  #endif /* CONFIG_MMU */
  168.  #endif /* !CONFIG_KSM */
  169.  
  170. +#include <linux/uksm.h>
  171. +
  172.  #endif /* __LINUX_KSM_H */
  173. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
  174. index 3cc3062..9d8642d7 100644
  175. --- a/include/linux/mm_types.h
  176. +++ b/include/linux/mm_types.h
  177. @@ -252,6 +252,9 @@ struct vm_area_struct {
  178.  #ifdef CONFIG_NUMA
  179.     struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
  180.  #endif
  181. +#ifdef CONFIG_UKSM
  182. +   struct vma_slot *uksm_vma_slot;
  183. +#endif
  184.  };
  185.  
  186.  struct core_thread {
  187. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
  188. index 650ba2f..2d1475f 100644
  189. --- a/include/linux/mmzone.h
  190. +++ b/include/linux/mmzone.h
  191. @@ -116,6 +116,9 @@ enum zone_stat_item {
  192.     NUMA_OTHER,     /* allocation from other node */
  193.  #endif
  194.     NR_ANON_TRANSPARENT_HUGEPAGES,
  195. +#ifdef CONFIG_UKSM
  196. +   NR_UKSM_ZERO_PAGES,
  197. +#endif
  198.     NR_VM_ZONE_STAT_ITEMS };
  199.  
  200.  /*
  201. diff --git a/include/linux/uksm.h b/include/linux/uksm.h
  202. new file mode 100644
  203. index 0000000..27b80bf
  204. --- /dev/null
  205. +++ b/include/linux/uksm.h
  206. @@ -0,0 +1,115 @@
  207. +#ifndef __LINUX_UKSM_H
  208. +#define __LINUX_UKSM_H
  209. +/*
  210. + * Memory merging support.
  211. + *
  212. + * This code enables dynamic sharing of identical pages found in different
  213. + * memory areas, even if they are not shared by fork().
  214. + */
  215. +
  216. +/* if !CONFIG_UKSM this file should not be compiled at all. */
  217. +#ifdef CONFIG_UKSM
  218. +
  219. +#include <linux/bitops.h>
  220. +#include <linux/mm.h>
  221. +#include <linux/pagemap.h>
  222. +#include <linux/rmap.h>
  223. +#include <linux/sched.h>
  224. +
  225. +extern unsigned long zero_pfn __read_mostly;
  226. +extern unsigned long uksm_zero_pfn __read_mostly;
  227. +extern struct page *empty_uksm_zero_page;
  228. +
  229. +/* must be done before linked to mm */
  230. +extern void uksm_vma_add_new(struct vm_area_struct *vma);
  231. +extern void uksm_remove_vma(struct vm_area_struct *vma);
  232. +
  233. +struct vma_slot {
  234. +   struct list_head uksm_list;
  235. +   struct list_head slot_list;
  236. +   unsigned long dedup_ratio;
  237. +   unsigned long dedup_num;
  238. +   int uksm_index; /* -1 if vma is not in inter-table,
  239. +               positive otherwise */
  240. +   unsigned long pages_scanned;
  241. +   unsigned long last_scanned;
  242. +   unsigned long pages_to_scan;
  243. +   struct scan_rung *rung;
  244. +   struct page **rmap_list_pool;
  245. +   unsigned long *pool_counts;
  246. +   unsigned long pool_size;
  247. +   struct vm_area_struct *vma;
  248. +   struct mm_struct *mm;
  249. +   unsigned long ctime_j;
  250. +   unsigned long pages;
  251. +   unsigned long flags;
  252. +   unsigned long pages_cowed; /* pages cowed this round */
  253. +   unsigned long pages_merged; /* pages merged this round */
  254. +
  255. +   /* used for dup vma pair */
  256. +   struct radix_tree_root dup_tree;
  257. +};
  258. +
  259. +static inline void uksm_unmap_zero_page(pte_t pte)
  260. +{
  261. +   if (pte_pfn(pte) == uksm_zero_pfn)
  262. +       __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
  263. +}
  264. +
  265. +static inline void uksm_map_zero_page(pte_t pte)
  266. +{
  267. +   if (pte_pfn(pte) == uksm_zero_pfn)
  268. +       __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
  269. +}
  270. +
  271. +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
  272. +{
  273. +   if (vma->uksm_vma_slot && PageKsm(page))
  274. +       vma->uksm_vma_slot->pages_cowed++;
  275. +}
  276. +
  277. +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
  278. +{
  279. +   if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
  280. +       vma->uksm_vma_slot->pages_cowed++;
  281. +}
  282. +
  283. +
  284. +/*
  285. + * A wrapper around BUG_ON() for places where the uksm zero page must not
  286. + * appear. TODO: remove this once the uksm zero page patch is stable enough.
  287. + */
  288. +static inline void uksm_bugon_zeropage(pte_t pte)
  289. +{
  290. +   BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
  291. +}
  292. +#else
  293. +static inline void uksm_vma_add_new(struct vm_area_struct *vma)
  294. +{
  295. +}
  296. +
  297. +static inline void uksm_remove_vma(struct vm_area_struct *vma)
  298. +{
  299. +}
  300. +
  301. +static inline void uksm_unmap_zero_page(pte_t pte)
  302. +{
  303. +}
  304. +
  305. +static inline void uksm_map_zero_page(pte_t pte)
  306. +{
  307. +}
  308. +
  309. +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
  310. +{
  311. +}
  312. +
  313. +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
  314. +{
  315. +}
  316. +
  317. +static inline void uksm_bugon_zeropage(pte_t pte)
  318. +{
  319. +}
  320. +#endif /* !CONFIG_UKSM */
  321. +#endif /* __LINUX_UKSM_H */
  322. diff --git a/kernel/fork.c b/kernel/fork.c
  323. index 26a7a67..c23317e 100644
  324. --- a/kernel/fork.c
  325. +++ b/kernel/fork.c
  326. @@ -361,7 +361,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
  327.                 goto fail_nomem;
  328.             charge = len;
  329.         }
  330. -       tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
  331. +       tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
  332.         if (!tmp)
  333.             goto fail_nomem;
  334.         *tmp = *mpnt;
  335. @@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
  336.         __vma_link_rb(mm, tmp, rb_link, rb_parent);
  337.         rb_link = &tmp->vm_rb.rb_right;
  338.         rb_parent = &tmp->vm_rb;
  339. -
  340. +       uksm_vma_add_new(tmp);
  341.         mm->map_count++;
  342.         retval = copy_page_range(mm, oldmm, mpnt);
  343.  
  344. diff --git a/mm/Kconfig b/mm/Kconfig
  345. index e338407..8df1b4f 100644
  346. --- a/mm/Kconfig
  347. +++ b/mm/Kconfig
  348. @@ -245,6 +245,32 @@ config KSM
  349.       See Documentation/vm/ksm.txt for more information: KSM is inactive
  350.       until a program has madvised that an area is MADV_MERGEABLE, and
  351.       root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
  352. +choice
  353. +   prompt "Choose UKSM/KSM strategy"
  354. +   default UKSM
  355. +   depends on KSM
  356. +   help
  357. +     This option allows you to select a UKSM/KSM strategy.
  358. +
  359. +config UKSM
  360. +   bool "Ultra-KSM for page merging"
  361. +   depends on KSM
  362. +   help
  363. +   UKSM is inspired by the Linux kernel's KSM (Kernel Samepage
  364. +   Merging), but has a fundamentally rewritten core algorithm. With
  365. +   this advanced algorithm, UKSM can transparently scan all anonymously
  366. +   mapped user space memory with significantly improved scan speed
  367. +   and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit
  368. +   from UKSM. UKSM now has its first stable release and its first
  369. +   real-world enterprise user. For more information, please visit its
  370. +   project page: www.kerneldedup.org
  371. +
  372. +config KSM_LEGACY
  373. +   bool "Legacy KSM implementation"
  374. +   depends on KSM
  375. +   help
  376. +   The legacy KSM implementation from Red Hat.
  377. +endchoice
  378.  
  379.  config DEFAULT_MMAP_MIN_ADDR
  380.          int "Low address space to protect from user allocation"
  381. diff --git a/mm/Makefile b/mm/Makefile
  382. index 50ec00e..c551bae 100644
  383. --- a/mm/Makefile
  384. +++ b/mm/Makefile
  385. @@ -34,7 +34,8 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
  386.  obj-$(CONFIG_SLOB) += slob.o
  387.  obj-$(CONFIG_COMPACTION) += compaction.o
  388.  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
  389. -obj-$(CONFIG_KSM) += ksm.o
  390. +obj-$(CONFIG_KSM_LEGACY) += ksm.o
  391. +obj-$(CONFIG_UKSM) += uksm.o
  392.  obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
  393.  obj-$(CONFIG_SLAB) += slab.o
  394.  obj-$(CONFIG_SLUB) += slub.o
  395. diff --git a/mm/memory.c b/mm/memory.c
  396. index 10b4dda..be73fff 100644
  397. --- a/mm/memory.c
  398. +++ b/mm/memory.c
  399. @@ -112,6 +112,37 @@ __setup("norandmaps", disable_randmaps);
  400.  unsigned long zero_pfn __read_mostly;
  401.  unsigned long highest_memmap_pfn __read_mostly;
  402.  
  403. +#ifdef CONFIG_UKSM
  404. +unsigned long uksm_zero_pfn __read_mostly;
  405. +struct page *empty_uksm_zero_page;
  406. +
  407. +static int __init setup_uksm_zero_page(void)
  408. +{
  409. +   unsigned long addr;
  410. +   addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
  411. +   if (!addr)
  412. +       panic("Oh boy, that early out of memory?");
  413. +
  414. +   empty_uksm_zero_page = virt_to_page((void *) addr);
  415. +   SetPageReserved(empty_uksm_zero_page);
  416. +
  417. +   uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
  418. +
  419. +   return 0;
  420. +}
  421. +core_initcall(setup_uksm_zero_page);
  422. +
  423. +static inline int is_uksm_zero_pfn(unsigned long pfn)
  424. +{
  425. +   return pfn == uksm_zero_pfn;
  426. +}
  427. +#else
  428. +static inline int is_uksm_zero_pfn(unsigned long pfn)
  429. +{
  430. +   return 0;
  431. +}
  432. +#endif
  433. +
  434.  /*
  435.   * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
  436.   */
  437. @@ -716,8 +716,10 @@ static inline int is_cow_mapping(vm_flags_t flags)
  438.  #ifndef is_zero_pfn
  439.  static inline int is_zero_pfn(unsigned long pfn)
  440.  {
  441. -   return pfn == zero_pfn;
  442. +   return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
  443.  }
  444. +#else
  445. +#define is_zero_pfn(pfn)   (is_zero_pfn(pfn) || is_uksm_zero_pfn(pfn))
  446.  #endif
  447.  
  448.  #ifndef my_zero_pfn
  449. @@ -903,6 +903,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  450.             rss[MM_ANONPAGES]++;
  451.         else
  452.             rss[MM_FILEPAGES]++;
  453. +
  454. +       /* Should return NULL in vm_normal_page() */
  455. +       uksm_bugon_zeropage(pte);
  456. +   } else {
  457. +       uksm_map_zero_page(pte);
  458.     }
  459.  
  460.  out_set_pte:
  461. @@ -1138,8 +1176,10 @@ again:
  462.             ptent = ptep_get_and_clear_full(mm, addr, pte,
  463.                             tlb->fullmm);
  464.             tlb_remove_tlb_entry(tlb, pte, addr);
  465. -           if (unlikely(!page))
  466. +           if (unlikely(!page)) {
  467. +               uksm_unmap_zero_page(ptent);
  468.                 continue;
  469. +           }
  470.             if (unlikely(details) && details->nonlinear_vma
  471.                 && linear_page_index(details->nonlinear_vma,
  472.                         addr) != page->index)
  473. @@ -1712,7 +1752,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  474.                 page = vm_normal_page(vma, start, *pte);
  475.                 if (!page) {
  476.                     if (!(gup_flags & FOLL_DUMP) &&
  477. -                        is_zero_pfn(pte_pfn(*pte)))
  478. +                       (is_zero_pfn(pte_pfn(*pte))))
  479.                         page = pte_page(*pte);
  480.                     else {
  481.                         pte_unmap(pte);
  482. @@ -2472,8 +2512,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
  483.             clear_page(kaddr);
  484.         kunmap_atomic(kaddr);
  485.         flush_dcache_page(dst);
  486. -   } else
  487. +   } else {
  488.         copy_user_highpage(dst, src, va, vma);
  489. +       uksm_cow_page(vma, src);
  490. +   }
  491.  }
  492.  
  493.  /*
  494. @@ -2671,6 +2713,7 @@ gotten:
  495.         new_page = alloc_zeroed_user_highpage_movable(vma, address);
  496.         if (!new_page)
  497.             goto oom;
  498. +       uksm_cow_pte(vma, orig_pte);
  499.     } else {
  500.         new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  501.         if (!new_page)
  502. @@ -2692,8 +2735,11 @@ gotten:
  503.                 dec_mm_counter_fast(mm, MM_FILEPAGES);
  504.                 inc_mm_counter_fast(mm, MM_ANONPAGES);
  505.             }
  506. -       } else
  507. +           uksm_bugon_zeropage(orig_pte);
  508. +       } else {
  509. +           uksm_unmap_zero_page(orig_pte);
  510.             inc_mm_counter_fast(mm, MM_ANONPAGES);
  511. +       }
  512.         flush_cache_page(vma, address, pte_pfn(orig_pte));
  513.         entry = mk_pte(new_page, vma->vm_page_prot);
  514.         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
  515. diff --git a/mm/mmap.c b/mm/mmap.c
  516. index da15a79..482c503 100644
  517. --- a/mm/mmap.c
  518. +++ b/mm/mmap.c
  519. @@ -30,6 +30,7 @@
  520.  #include <linux/perf_event.h>
  521.  #include <linux/audit.h>
  522.  #include <linux/khugepaged.h>
  523. +#include <linux/ksm.h>
  524.  
  525.  #include <asm/uaccess.h>
  526.  #include <asm/cacheflush.h>
  527. @@ -236,6 +237,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
  528.             removed_exe_file_vma(vma->vm_mm);
  529.     }
  530.     mpol_put(vma_policy(vma));
  531. +   uksm_remove_vma(vma);
  532.     kmem_cache_free(vm_area_cachep, vma);
  533.     return next;
  534.  }
  535. @@ -501,9 +501,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
  536.     long adjust_next = 0;
  537.     int remove_next = 0;
  538.  
  539. +/*
  540. + * to avoid deadlock, uksm_remove_vma() must be done before any spin_lock is
  541. + * acquired
  542. + */
  543. +   uksm_remove_vma(vma);
  544. +
  545.     if (next && !insert) {
  546.         struct vm_area_struct *exporter = NULL;
  547.  
  548. +       uksm_remove_vma(next);
  549.         if (end >= next->vm_end) {
  550.             /*
  551.              * vma expands, overlapping all the next, and
  552. @@ -635,10 +635,15 @@ again:            remove_next = 1 + (end > next->vm_end);
  553.          */
  554.         if (remove_next == 2) {
  555.             next = vma->vm_next;
  556. +           uksm_remove_vma(next);
  557.             goto again;
  558.         }
  559. +   } else {
  560. +       if (next && !insert)
  561. +           uksm_vma_add_new(next);
  562.     }
  563.  
  564. +   uksm_vma_add_new(vma);
  565.     validate_mm(mm);
  566.  
  567.     return 0;
  568. @@ -1357,6 +1357,7 @@ munmap_back:
  569.  
  570.     vma_link(mm, vma, prev, rb_link, rb_parent);
  571.     file = vma->vm_file;
  572. +   uksm_vma_add_new(vma);
  573.  
  574.     /* Once vma denies write, undo our temporary denial count */
  575.     if (correct_wcount)
  576. @@ -1383,6 +1383,7 @@ unmap_and_free_vma:
  577.     unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
  578.     charged = 0;
  579.  free_vma:
  580. +   uksm_remove_vma(vma);
  581.     kmem_cache_free(vm_area_cachep, vma);
  582.  unacct_error:
  583.     if (charged)
  584. @@ -2012,6 +2012,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
  585.     else
  586.         err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
  587.  
  588. +   uksm_vma_add_new(new);
  589. +
  590.     /* Success. */
  591.     if (!err)
  592.         return 0;
  593. @@ -2253,6 +2253,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
  594.     vma->vm_flags = flags;
  595.     vma->vm_page_prot = vm_get_page_prot(flags);
  596.     vma_link(mm, vma, prev, rb_link, rb_parent);
  597. +   uksm_vma_add_new(vma);
  598.  out:
  599.     perf_event_mmap(vma);
  600.     mm->total_vm += len >> PAGE_SHIFT;
  601. @@ -2285,6 +2285,12 @@ void exit_mmap(struct mm_struct *mm)
  602.     /* mm's last user has gone, and its about to be pulled down */
  603.     mmu_notifier_release(mm);
  604.  
  605. +   /*
  606. +    * Taking write lock on mmap_sem does not harm others,
  607. +    * but it's crucial for uksm to avoid races.
  608. +    */
  609. +   down_write(&mm->mmap_sem);
  610. +
  611.     if (mm->locked_vm) {
  612.         vma = mm->mmap;
  613.         while (vma) {
  614. @@ -2318,6 +2318,11 @@ void exit_mmap(struct mm_struct *mm)
  615.     while (vma)
  616.         vma = remove_vma(vma);
  617.  
  618. +   mm->mmap = NULL;
  619. +   mm->mm_rb = RB_ROOT;
  620. +   mm->mmap_cache = NULL;
  621. +   up_write(&mm->mmap_sem);
  622. +
  623.     BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
  624.  }
  625.  
  626. @@ -2427,6 +2427,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
  627.             if (new_vma->vm_ops && new_vma->vm_ops->open)
  628.                 new_vma->vm_ops->open(new_vma);
  629.             vma_link(mm, new_vma, prev, rb_link, rb_parent);
  630. +           uksm_vma_add_new(new_vma);
  631.         }
  632.     }
  633.     return new_vma;
  634. @@ -2532,10 +2532,10 @@ int install_special_mapping(struct mm_struct *mm,
  635.     ret = insert_vm_struct(mm, vma);
  636.     if (ret)
  637.         goto out;
  638. -
  639.     mm->total_vm += len >> PAGE_SHIFT;
  640.  
  641.     perf_event_mmap(vma);
  642. +   uksm_vma_add_new(vma);
  643.  
  644.     return 0;
  645.  
  646. diff --git a/mm/uksm.c b/mm/uksm.c
  647. new file mode 100644
  648. index 0000000..c22787b
  649. --- /dev/null
  650. +++ b/mm/uksm.c
  651. @@ -0,0 +1,5344 @@
  652. +/*
  653. + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
  654. + *
  655. + * This is an improvement upon KSM. Some basic data structures and routines
  656. + * are borrowed from ksm.c .
  657. + *
  658. + * Its new features:
  659. + * 1. Full system scan:
  660. + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
  661. + *      interaction to submit a memory area to KSM is no longer needed.
  662. + *
  663. + * 2. Rich area detection:
  664. + *      It automatically detects rich areas containing abundant duplicated
  665. + *      pages. Rich areas are scanned at full speed; poor areas are
  666. + *      sampled at a reduced speed with very low CPU consumption.
  667. + *
  668. + * 3. Ultra Per-page scan speed improvement:
  669. + *      A new hash algorithm is proposed. As a result, on a machine with
  670. + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
  671. + *      can scan memory areas that do not contain duplicated pages at a speed of
  672. + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
  673. + *      477MB/sec ~ 923MB/sec.
  674. + *
  675. + * 4. Thrashing area avoidance:
  676. + *      A thrashing area (a VMA with frequent KSM page break-outs) can be
  677. + *      filtered out. Benchmarks show this is more efficient than KSM's per-page
  678. + *      hash-value-based volatile page detection.
  679. + *
  680. + *
  681. + * 5. Misc changes upon KSM:
  682. + *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
  683. + *        comparison. It's much faster than the default C version on x86.
  684. + *      * rmap_item now has a struct page *page member to loosely cache an
  685. + *        address->page mapping, which avoids many costly calls to
  686. + *        follow_page().
  687. + *      * The VMA creation/exit procedures are hooked so Ultra KSM is notified.
  688. + *      * try_to_merge_two_pages() can now revert a pte if it fails. No break_
  689. + *        ksm is needed in this case.
  690. + *
  691. + * 6. Full zero page consideration (contributed by Figo Zhang)
  692. + *    uksmd now treats full zero pages as special pages and merges them into a
  693. + *    special unswappable uksm zero page.
  694. + */
  695. +
  696. +#include <linux/errno.h>
  697. +#include <linux/mm.h>
  698. +#include <linux/fs.h>
  699. +#include <linux/mman.h>
  700. +#include <linux/sched.h>
  701. +#include <linux/rwsem.h>
  702. +#include <linux/pagemap.h>
  703. +#include <linux/rmap.h>
  704. +#include <linux/spinlock.h>
  705. +#include <linux/jhash.h>
  706. +#include <linux/delay.h>
  707. +#include <linux/kthread.h>
  708. +#include <linux/wait.h>
  709. +#include <linux/slab.h>
  710. +#include <linux/rbtree.h>
  711. +#include <linux/memory.h>
  712. +#include <linux/mmu_notifier.h>
  713. +#include <linux/swap.h>
  714. +#include <linux/ksm.h>
  715. +#include <linux/crypto.h>
  716. +#include <linux/scatterlist.h>
  717. +#include <crypto/hash.h>
  718. +#include <linux/random.h>
  719. +#include <linux/math64.h>
  720. +#include <linux/gcd.h>
  721. +#include <linux/freezer.h>
  722. +#include <linux/in.h>
  723. +#include <linux/ip.h>
  724. +#include <linux/udp.h>
  725. +
  726. +
  727. +#include <asm/tlbflush.h>
  728. +#include "internal.h"
  729. +
  730. +#ifdef CONFIG_X86
  731. +#undef memcmp
  732. +
  733. +#ifdef CONFIG_X86_32
  734. +#define memcmp memcmpx86_32
  735. +/*
  736. + * Compare 4-byte-aligned address s1 and s2, with length n
  737. + */
  738. +int memcmpx86_32(void *s1, void *s2, size_t n)
  739. +{
  740. +   size_t num = n / 4;
  741. +   register int res;
  742. +
  743. +   __asm__ __volatile__
  744. +   (
  745. +    "testl %3,%3\n\t"
  746. +    "repe; cmpsd\n\t"
  747. +    "je        1f\n\t"
  748. +    "sbbl      %0,%0\n\t"
  749. +    "orl       $1,%0\n"
  750. +    "1:"
  751. +    : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
  752. +    : "0" (0)
  753. +    : "cc");
  754. +
  755. +   return res;
  756. +}
  757. +
  758. +/*
  759. + * Check whether the page is all zeros.
  760. + */
  761. +static int is_full_zero(const void *s1, size_t len)
  762. +{
  763. +   unsigned char same;
  764. +
  765. +   len /= 4;
  766. +
  767. +   __asm__ __volatile__
  768. +   ("repe; scasl;"
  769. +    "sete %0"
  770. +    : "=qm" (same), "+D" (s1), "+c" (len)
  771. +    : "a" (0)
  772. +    : "cc");
  773. +
  774. +   return same;
  775. +}
  776. +
  777. +
  778. +#elif defined(CONFIG_X86_64)
  779. +#define memcmp memcmpx86_64
  780. +/*
  781. + * Compare 8-byte-aligned address s1 and s2, with length n
  782. + */
  783. +int memcmpx86_64(void *s1, void *s2, size_t n)
  784. +{
  785. +   size_t num = n / 8;
  786. +   register int res;
  787. +
  788. +   __asm__ __volatile__
  789. +   (
  790. +    "testq %q3,%q3\n\t"
  791. +    "repe; cmpsq\n\t"
  792. +    "je        1f\n\t"
  793. +    "sbbq      %q0,%q0\n\t"
  794. +    "orq       $1,%q0\n"
  795. +    "1:"
  796. +    : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
  797. +    : "0" (0)
  798. +    : "cc");
  799. +
  800. +   return res;
  801. +}
  802. +
  803. +static int is_full_zero(const void *s1, size_t len)
  804. +{
  805. +   unsigned char same;
  806. +
  807. +   len /= 8;
  808. +
  809. +   __asm__ __volatile__
  810. +   ("repe; scasq;"
  811. +    "sete %0"
  812. +    : "=qm" (same), "+D" (s1), "+c" (len)
  813. +    : "a" (0)
  814. +    : "cc");
  815. +
  816. +   return same;
  817. +}
  818. +
  819. +#endif
  820. +#else
  821. +static int is_full_zero(const void *s1, size_t len)
  822. +{
  823. +   const unsigned long *src = s1;
  824. +   size_t i;
  825. +
  826. +   len /= sizeof(*src);
  827. +
  828. +   for (i = 0; i < len; i++) {
  829. +       if (src[i])
  830. +           return 0;
  831. +   }
  832. +
  833. +   return 1;
  834. +}
  835. +#endif
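
To make the fallback semantics concrete, here is a minimal userspace sketch (not from the patch) exercising the same word-by-word logic as the generic is_full_zero() above; the x86 variants implement the identical check with rep-prefixed string instructions. PAGE_SIZE is hard-coded here only for the sketch.

#include <assert.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

/* Same word-by-word logic as the generic fallback above. */
static int is_full_zero(const void *s1, size_t len)
{
    const unsigned long *src = s1;
    size_t i;

    len /= sizeof(*src);
    for (i = 0; i < len; i++)
        if (src[i])
            return 0;
    return 1;
}

int main(void)
{
    unsigned char *page = calloc(1, PAGE_SIZE);

    assert(page && is_full_zero(page, PAGE_SIZE) == 1);
    page[PAGE_SIZE - 1] = 0xff;             /* dirty a single byte */
    assert(is_full_zero(page, PAGE_SIZE) == 0);
    free(page);
    return 0;
}
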
  836. +
  837. +#define U64_MAX        (~((u64)0))
  838. +
  839. +/* Basic data structure definition below */
  840. +#define UKSM_RUNG_ROUND_FINISHED  (1<<0)
  841. +#define UKSM_RUNG_BUSY_SEARCHED   (1<<1)
  842. +
  843. +/* Each rung of this ladder is a list of VMAs having a same scan ratio */
  844. +struct scan_rung {
  845. +   struct list_head vma_list;
  846. +   struct list_head *current_scan;
  847. +   unsigned long flags;
  848. +   unsigned int pages_to_scan;
  849. +   unsigned long fully_scanned_slots;
  850. +   unsigned long scan_ratio;
  851. +   unsigned long vma_num;
  852. +};
  853. +
  854. +#define UKSM_SLOT_NEED_SORT (1<<0)
  855. +#define UKSM_SLOT_NEED_RERAND (1<<1)
  856. +#define UKSM_SLOT_SCANNED     (1<<2) /* It's scanned in this round */
  857. +#define UKSM_SLOT_FUL_SCANNED (1<<3)
  858. +
  859. +/**
  860. + * node of either the stable or unstable rbtree
  861. + *
  862. + */
  863. +struct tree_node {
  864. +   struct rb_node node; /* link in the main (un)stable rbtree */
  865. +   struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
  866. +   u32 hash;
  867. +   unsigned long count; /* how many sublevel tree nodes */
  868. +   struct list_head all_list; /* all tree nodes in stable/unstable tree */
  869. +};
  870. +
  871. +/**
  872. + * struct stable_node - node of the stable rbtree
  873. + * @node: rb node of this ksm page in the stable tree
  874. + * @hlist: hlist head of rmap_items using this ksm page
  875. + * @kpfn: page frame number of this ksm page
  876. + */
  877. +struct stable_node {
  878. +   struct rb_node node; /* link in sub-rbtree */
  879. +   struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
  880. +   struct hlist_head hlist;
  881. +   unsigned long kpfn;
  882. +   u32 hash_max; /* if == 0 then it has not been calculated yet */
  883. +   //struct vm_area_struct *old_vma;
  884. +   struct list_head all_list; /* in a list for all stable nodes */
  885. +};
  886. +
  887. +/**
  888. + * struct node_vma - group rmap_items linked in a same stable
  889. + * node together.
  890. + */
  891. +struct node_vma {
  892. +   union {
  893. +       struct vma_slot *slot;
  894. +       unsigned long key;  /* slot is used as key sorted on hlist */
  895. +   };
  896. +   struct hlist_node hlist;
  897. +   struct hlist_head rmap_hlist;
  898. +   struct stable_node *head;
  899. +   unsigned long last_update;
  900. +};
  901. +
  902. +/**
  903. + * struct rmap_item - reverse mapping item for virtual addresses
  904. + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
  905. + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
  906. + * @mm: the memory structure this rmap_item is pointing into
  907. + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
  908. + * @node: rb node of this rmap_item in the unstable tree
  909. + * @head: pointer to stable_node heading this list in the stable tree
  910. + * @hlist: link into hlist of rmap_items hanging off that stable_node
  911. + */
  912. +struct rmap_item {
  913. +   struct vma_slot *slot;
  914. +   struct page *page;
  915. +   unsigned long address;  /* + low bits used for flags below */
  916. +   /* Appended to the (un)stable tree in which scan round */
  917. +   unsigned long append_round;
  918. +
  919. +   /* Which rung scan turn it was last scanned */
  920. +   //unsigned long last_scan;
  921. +   unsigned long entry_index;
  922. +   union {
  923. +       struct {/* when in unstable tree */
  924. +           struct rb_node node;
  925. +           struct tree_node *tree_node;
  926. +           u32 hash_max;
  927. +       };
  928. +       struct { /* when in stable tree */
  929. +           struct node_vma *head;
  930. +           struct hlist_node hlist;
  931. +           struct anon_vma *anon_vma;
  932. +       };
  933. +   };
  934. +} __attribute__((aligned(4)));
  935. +
  936. +struct rmap_list_entry {
  937. +   union {
  938. +       struct rmap_item *item;
  939. +       unsigned long addr;
  940. +   };
  941. +   // lowest bit is used as the is_addr tag
  942. +   //unsigned char is_addr;
  943. +} __attribute__((aligned(4))); // 4-byte aligned to fit into pages
  944. +
  945. +
  946. +/* Basic data structure definition ends */
  947. +#define USR_SPT_MAGIC_LEN 16
  948. +#define USR_SPT_STR_LEN 32
  949. +#define UKSM_USR_SPT_PORT 52013
  950. +#define UKSM_USR_SPT_INTVL_MSEC 120000
  951. +
  952. +#define  UKSM_USR_SPT_NO_FROM  1
  953. +
  954. +struct uksm_usr_spt_s {
  955. +   char magic[16];
  956. +   u32 randid;
  957. +   unsigned char flags;
  958. +   char usr_str[32];
  959. +} __attribute__ ((packed));
  960. +
  961. +static struct uksm_usr_spt_s uksm_usr_spt = {
  962. +   .magic = "UKSM_USR_SPT_OK1",
  963. +   .usr_str = "none",
  964. +};
  965. +/*static struct sockaddr_in uksm_usr_spt_addr;
  966. +static struct socket *uksm_usr_spt_sock = NULL;
  967. +static unsigned long uksm_usr_spt_last;*/
  968. +static int uksm_usr_spt_enabled = 0;
  969. +
  970. +/*
  971. + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
  972. + * The flags use the low bits of rmap_item.address
  973. + */
  974. +#define UNSTABLE_FLAG  0x1
  975. +#define STABLE_FLAG    0x2
  976. +#define get_rmap_addr(x)   ((x)->address & PAGE_MASK)
  977. +
  978. +/*
  979. + * rmap_list_entry helpers
  980. + */
  981. +#define IS_ADDR_FLAG   1
  982. +#define is_addr(ptr)       ((unsigned long)(ptr) & IS_ADDR_FLAG)
  983. +#define set_is_addr(ptr)   ((ptr) |= IS_ADDR_FLAG)
  984. +#define get_clean_addr(ptr)    (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
  985. +
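
The rmap_list_entry union and the IS_ADDR_FLAG macros above implement pointer tagging: since struct rmap_item is at least 4-byte aligned, the lowest bit of a real pointer is always zero and can be reused to mark entries that hold a plain address instead. A standalone sketch of the same trick (illustrative only, not from the patch; the union name here is made up):

#include <assert.h>
#include <stdio.h>

#define IS_ADDR_FLAG 1UL

struct rmap_item;                        /* opaque for this sketch */

union rmap_entry {
    struct rmap_item *item;              /* aligned(4), so the low bit is 0 */
    unsigned long addr;                  /* page-aligned address, tagged    */
};

int main(void)
{
    union rmap_entry e;

    e.addr = 0x10002000UL;               /* pretend page-aligned address */
    e.addr |= IS_ADDR_FLAG;              /* set_is_addr()                */

    assert(e.addr & IS_ADDR_FLAG);       /* is_addr() -> true            */
    printf("clean addr: %#lx\n",
           e.addr & ~IS_ADDR_FLAG);      /* get_clean_addr()             */
    return 0;
}
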
  986. +
  987. +/*
  988. + * High speed caches for frequently allocated and freed structs
  989. + */
  990. +static struct kmem_cache *rmap_item_cache;
  991. +static struct kmem_cache *stable_node_cache;
  992. +static struct kmem_cache *node_vma_cache;
  993. +static struct kmem_cache *vma_slot_cache;
  994. +static struct kmem_cache *tree_node_cache;
  995. +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
  996. +       sizeof(struct __struct), __alignof__(struct __struct),\
  997. +       (__flags), NULL)
  998. +
  999. +/* The scan rounds ksmd is currently in */
  1000. +static unsigned long long uksm_scan_round = 1;
  1001. +
  1002. +/* The number of pages that have been scanned since startup */
  1003. +static u64 uksm_pages_scanned;
  1004. +
  1005. +/* The number of pages that had been scanned when the last scan round finished */
  1006. +static u64 uksm_pages_scanned_last;
  1007. +
  1008. +/* If the scanned number is too large, we encode it here */
  1009. +static u64 pages_scanned_stored;
  1010. +static unsigned long pages_scanned_base;
  1011. +
  1012. +/* The number of nodes in the stable tree */
  1013. +static unsigned long uksm_pages_shared;
  1014. +
  1015. +/* The number of page slots additionally sharing those nodes */
  1016. +static unsigned long uksm_pages_sharing;
  1017. +
  1018. +/* The number of nodes in the unstable tree */
  1019. +static unsigned long uksm_pages_unshared;
  1020. +
  1021. +/*
  1022. + * Number of pages ksmd should scan in one batch. This is the top speed for
  1023. + * richly duplicated areas.
  1024. + */
  1025. +static unsigned long uksm_scan_batch_pages = 60000;
  1026. +
  1027. +/* Milliseconds ksmd should sleep between batches */
  1028. +static unsigned int uksm_sleep_jiffies;
  1029. +
  1030. +/*
  1031. + * The threshold used to filter out thrashing areas,
  1032. + * If it == 0, filtering is disabled; otherwise it is the percentage upper bound
  1033. + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
  1034. + * will be considered as having a zero duplication ratio.
  1035. + */
  1036. +static unsigned int uksm_thrash_threshold = 50;
  1037. +
  1038. +/* To avoid floating point arithmetic, this is the scale factor for a
  1039. + * deduplication ratio number.
  1040. + */
  1041. +#define UKSM_DEDUP_RATIO_SCALE 100
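
A hypothetical illustration of this fixed-point convention (not the exact formula used later in this file): with UKSM_DEDUP_RATIO_SCALE == 100, a deduplication ratio is kept as a plain scaled integer, so no floating point is needed in kernel code. The helper name below is made up for the sketch.

#include <stdio.h>

#define UKSM_DEDUP_RATIO_SCALE 100

/* merged pages per scanned pages, as a scaled integer (37 means 37%) */
static unsigned long dedup_ratio(unsigned long merged, unsigned long scanned)
{
    if (!scanned)
        return 0;
    return merged * UKSM_DEDUP_RATIO_SCALE / scanned;
}

int main(void)
{
    printf("%lu\n", dedup_ratio(370, 1000));    /* prints 37 */
    return 0;
}
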
  1042. +
  1043. +
  1044. +#define UKSM_SCAN_RATIO_MAX    125
  1045. +
  1046. +/* minimum scan ratio for a vma, in unit of 1/UKSM_SCAN_RATIO_MAX */
  1047. +static unsigned int uksm_min_scan_ratio = 1;
  1048. +
  1049. +/*
  1050. + * After each scan round, the scan ratio of an area with a big deduplication
  1051. + * ratio is upgraded by *=uksm_scan_ratio_delta
  1052. + */
  1053. +static unsigned int uksm_scan_ratio_delta = 5;
  1054. +
  1055. +#define INDIRECT_OFFSET        1
  1056. +
  1057. +/*
  1058. + * For mapping of vma_slot and its index in inter-vma duplication number
  1059. + * table
  1060. + */
  1061. +static struct radix_tree_root uksm_vma_tree;
  1062. +static unsigned long uksm_vma_tree_num;
  1063. +static unsigned long uksm_vma_tree_index_end;
  1064. +
  1065. +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
  1066. +static struct scan_rung *uksm_scan_ladder;
  1067. +static unsigned int uksm_scan_ladder_size;
  1068. +
  1069. +/* The number of VMAs we are keeping track of */
  1070. +static unsigned long uksm_vma_slot_num;
  1071. +
  1072. +/* How many times the ksmd has slept since startup */
  1073. +static u64 uksm_sleep_times;
  1074. +
  1075. +#define UKSM_RUN_STOP  0
  1076. +#define UKSM_RUN_MERGE 1
  1077. +static unsigned int uksm_run = UKSM_RUN_MERGE;
  1078. +
  1079. +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
  1080. +static DEFINE_MUTEX(uksm_thread_mutex);
  1081. +
  1082. +/*
  1083. + * List vma_slot_new is for newly created vma_slot waiting to be added by
  1084. + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
  1085. + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
  1086. + * VMA has been removed/freed.
  1087. + */
  1088. +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
  1089. +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
  1090. +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
  1091. +static DEFINE_SPINLOCK(vma_slot_list_lock);
  1092. +
  1093. +/* The unstable tree heads */
  1094. +static struct rb_root root_unstable_tree = RB_ROOT;
  1095. +
  1096. +/*
  1097. + * All tree_nodes are in a list to be freed at once when unstable tree is
  1098. + * freed after each scan round.
  1099. + */
  1100. +static struct list_head unstable_tree_node_list =
  1101. +               LIST_HEAD_INIT(unstable_tree_node_list);
  1102. +
  1103. +/* List contains all stable nodes */
  1104. +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
  1105. +
  1106. +/*
  1107. + * When the hash strength is changed, the stable tree must be delta_hashed and
  1108. + * re-structured. We use two sets of the structs below to speed up the
  1109. + * re-structuring of the stable tree.
  1110. + */
  1111. +static struct list_head
  1112. +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
  1113. +               LIST_HEAD_INIT(stable_tree_node_list[1])};
  1114. +
  1115. +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
  1116. +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
  1117. +static struct rb_root *root_stable_treep = &root_stable_tree[0];
  1118. +static unsigned long stable_tree_index;
  1119. +
  1120. +/* The hash strength needed to hash a full page */
  1121. +#define HASH_STRENGTH_FULL     (PAGE_SIZE / sizeof(u32))
  1122. +
  1123. +/* The hash strength needed for loop-back hashing */
  1124. +#define HASH_STRENGTH_MAX      (HASH_STRENGTH_FULL + 10)
  1125. +
  1126. +/* The random offsets in a page */
  1127. +static u32 *random_nums;
  1128. +
  1129. +/* The hash strength */
  1130. +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
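
A sketch of the sampling idea behind hash_strength (illustrative only; the mixing step below is made up and is not the hash routine used in this file, and PAGE_WORDS/sample_hash are names invented for the sketch): instead of hashing all PAGE_SIZE/4 32-bit words of a page, only hash_strength words are read at precomputed offsets from random_nums[], so a low strength is cheap and HASH_STRENGTH_FULL degenerates to hashing the whole page.

#include <stdint.h>
#include <stdio.h>

#define PAGE_WORDS 1024              /* PAGE_SIZE / sizeof(u32) on 4K pages */

static uint32_t sample_hash(const uint32_t *page_words,
                            const uint32_t *random_nums,
                            unsigned long hash_strength)
{
    uint32_t hash = 0;
    unsigned long i;

    for (i = 0; i < hash_strength; i++) {
        /* random_nums[i] is a word index in [0, PAGE_WORDS) */
        uint32_t w = page_words[random_nums[i]];

        /* simple mix for illustration only */
        hash ^= w + 0x9e3779b9u + (hash << 6) + (hash >> 2);
    }
    return hash;
}

int main(void)
{
    static uint32_t page[PAGE_WORDS];
    static uint32_t offsets[PAGE_WORDS];
    unsigned long i;

    for (i = 0; i < PAGE_WORDS; i++) {
        page[i] = (uint32_t)(i * 2654435761u);          /* arbitrary content */
        offsets[i] = (uint32_t)((i * 7) % PAGE_WORDS);  /* stand-in for random_nums */
    }

    /* A weak (cheap) strength samples few words; full strength reads them all. */
    printf("strength 64:   %08x\n", sample_hash(page, offsets, 64));
    printf("full strength: %08x\n", sample_hash(page, offsets, PAGE_WORDS));
    return 0;
}
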
  1131. +
  1132. +/* The delta value each time the hash strength increases or decreases */
  1133. +static unsigned long hash_strength_delta;
  1134. +#define HASH_STRENGTH_DELTA_MAX    5
  1135. +
  1136. +/* The time we have saved due to random_sample_hash */
  1137. +static u64 rshash_pos;
  1138. +
  1139. +/* The time we have wasted due to hash collision */
  1140. +static u64 rshash_neg;
  1141. +
  1142. +struct uksm_benefit {
  1143. +   u64 pos;
  1144. +   u64 neg;
  1145. +   u64 scanned;
  1146. +   unsigned long base;
  1147. +} benefit;
  1148. +
  1149. +/*
  1150. + * The relative cost of memcmp, compared to 1 time unit of random sample
  1151. + * hash, this value is tested when ksm module is initialized
  1152. + */
  1153. +static unsigned long memcmp_cost;
  1154. +
  1155. +static unsigned long  rshash_neg_cont_zero;
  1156. +static unsigned long  rshash_cont_obscure;
  1157. +
  1158. +/* The possible states of hash strength adjustment heuristic */
  1159. +enum rshash_states {
  1160. +       RSHASH_STILL,
  1161. +       RSHASH_TRYUP,
  1162. +       RSHASH_TRYDOWN,
  1163. +       RSHASH_NEW,
  1164. +       RSHASH_PRE_STILL,
  1165. +};
  1166. +
  1167. +/* The possible direction we are about to adjust hash strength */
  1168. +enum rshash_direct {
  1169. +   GO_UP,
  1170. +   GO_DOWN,
  1171. +   OBSCURE,
  1172. +   STILL,
  1173. +};
  1174. +
  1175. +/* random sampling hash state machine */
  1176. +static struct {
  1177. +   enum rshash_states state;
  1178. +   enum rshash_direct pre_direct;
  1179. +   u8 below_count;
  1180. +   /* Keep a lookup window of size 5; if above_count/below_count > 3
  1181. +    * in this window, we stop trying.
  1182. +    */
  1183. +   u8 lookup_window_index;
  1184. +   u64 stable_benefit;
  1185. +   unsigned long turn_point_down;
  1186. +   unsigned long turn_benefit_down;
  1187. +   unsigned long turn_point_up;
  1188. +   unsigned long turn_benefit_up;
  1189. +   unsigned long stable_point;
  1190. +} rshash_state;
  1191. +
  1192. +/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
  1193. +static u32 *zero_hash_table;
  1194. +
  1195. +static inline struct node_vma *alloc_node_vma(void)
  1196. +{
  1197. +   struct node_vma *node_vma;
  1198. +   node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
  1199. +   if (node_vma) {
  1200. +       INIT_HLIST_HEAD(&node_vma->rmap_hlist);
  1201. +       INIT_HLIST_NODE(&node_vma->hlist);
  1202. +       node_vma->last_update = 0;
  1203. +   }
  1204. +   return node_vma;
  1205. +}
  1206. +
  1207. +static inline void free_node_vma(struct node_vma *node_vma)
  1208. +{
  1209. +   kmem_cache_free(node_vma_cache, node_vma);
  1210. +}
  1211. +
  1212. +
  1213. +static inline struct vma_slot *alloc_vma_slot(void)
  1214. +{
  1215. +   struct vma_slot *slot;
  1216. +
  1217. +   /*
  1218. +    * In case uksm has not been initialized yet.
  1219. +    * TODO: reconsider the call site of uksm_init() in the future.
  1220. +    */
  1221. +   if (!vma_slot_cache)
  1222. +       return NULL;
  1223. +
  1224. +   slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
  1225. +   if (slot) {
  1226. +       INIT_LIST_HEAD(&slot->uksm_list);
  1227. +       INIT_LIST_HEAD(&slot->slot_list);
  1228. +       INIT_RADIX_TREE(&slot->dup_tree, GFP_KERNEL);
  1229. +       slot->uksm_index = -1;
  1230. +       slot->flags |= UKSM_SLOT_NEED_RERAND;
  1231. +   }
  1232. +   return slot;
  1233. +}
  1234. +
  1235. +static inline void free_vma_slot(struct vma_slot *vma_slot)
  1236. +{
  1237. +   kmem_cache_free(vma_slot_cache, vma_slot);
  1238. +}
  1239. +
  1240. +
  1241. +
  1242. +static inline struct rmap_item *alloc_rmap_item(void)
  1243. +{
  1244. +   struct rmap_item *rmap_item;
  1245. +
  1246. +   rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
  1247. +   if (rmap_item) {
  1248. +       /* bug on lowest bit is not clear for flag use */
  1249. +       BUG_ON(is_addr(rmap_item));
  1250. +   }
  1251. +   return rmap_item;
  1252. +}
  1253. +
  1254. +static inline void free_rmap_item(struct rmap_item *rmap_item)
  1255. +{
  1256. +   rmap_item->slot = NULL; /* debug safety */
  1257. +   kmem_cache_free(rmap_item_cache, rmap_item);
  1258. +}
  1259. +
  1260. +static inline struct stable_node *alloc_stable_node(void)
  1261. +{
  1262. +   struct stable_node *node;
  1263. +   node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
  1264. +   if (!node)
  1265. +       return NULL;
  1266. +
  1267. +   INIT_HLIST_HEAD(&node->hlist);
  1268. +   list_add(&node->all_list, &stable_node_list);
  1269. +   return node;
  1270. +}
  1271. +
  1272. +static inline void free_stable_node(struct stable_node *stable_node)
  1273. +{
  1274. +   list_del(&stable_node->all_list);
  1275. +   kmem_cache_free(stable_node_cache, stable_node);
  1276. +}
  1277. +
  1278. +static inline struct tree_node *alloc_tree_node(struct list_head *list)
  1279. +{
  1280. +   struct tree_node *node;
  1281. +   node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
  1282. +   if (!node)
  1283. +       return NULL;
  1284. +
  1285. +   list_add(&node->all_list, list);
  1286. +   return node;
  1287. +}
  1288. +
  1289. +static inline void free_tree_node(struct tree_node *node)
  1290. +{
  1291. +   list_del(&node->all_list);
  1292. +   kmem_cache_free(tree_node_cache, node);
  1293. +}
  1294. +
  1295. +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
  1296. +{
  1297. +   struct anon_vma *anon_vma = rmap_item->anon_vma;
  1298. +
  1299. +   put_anon_vma(anon_vma);
  1300. +}
  1301. +
  1302. +
  1303. +/**
  1304. + * Remove a stable node from the stable tree. It may be unlinked from its
  1305. + * tree_node, and the parent tree_node may be removed if it becomes empty.
  1306. + *
  1307. + * @stable_node       The node to be removed
  1308. + * @unlink_rb         Will this node be unlinked from the rbtree?
  1309. + * @remove_tree_node  Will its tree_node be removed if empty?
  1310. + */
  1311. +static void remove_node_from_stable_tree(struct stable_node *stable_node,
  1312. +                    int unlink_rb,  int remove_tree_node)
  1313. +{
  1314. +   struct node_vma *node_vma;
  1315. +   struct rmap_item *rmap_item;
  1316. +   struct hlist_node *hlist, *rmap_hlist, *n;
  1317. +
  1318. +   if (!hlist_empty(&stable_node->hlist)) {
  1319. +       hlist_for_each_entry_safe(node_vma, hlist, n,
  1320. +                     &stable_node->hlist, hlist) {
  1321. +           hlist_for_each_entry(rmap_item, rmap_hlist,
  1322. +                        &node_vma->rmap_hlist, hlist) {
  1323. +               uksm_pages_sharing--;
  1324. +
  1325. +               uksm_drop_anon_vma(rmap_item);
  1326. +               rmap_item->address &= PAGE_MASK;
  1327. +           }
  1328. +           free_node_vma(node_vma);
  1329. +           cond_resched();
  1330. +       }
  1331. +
  1332. +       /* the last one is counted as shared */
  1333. +       uksm_pages_shared--;
  1334. +       uksm_pages_sharing++;
  1335. +   }
  1336. +
  1337. +   if (stable_node->tree_node && unlink_rb) {
  1338. +       rb_erase(&stable_node->node,
  1339. +            &stable_node->tree_node->sub_root);
  1340. +
  1341. +       if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
  1342. +           remove_tree_node) {
  1343. +           rb_erase(&stable_node->tree_node->node,
  1344. +                root_stable_treep);
  1345. +           free_tree_node(stable_node->tree_node);
  1346. +       } else {
  1347. +           stable_node->tree_node->count--;
  1348. +       }
  1349. +   }
  1350. +
  1351. +   free_stable_node(stable_node);
  1352. +}
  1353. +
  1354. +
  1355. +/*
  1356. + * get_uksm_page: checks if the page indicated by the stable node
  1357. + * is still its ksm page, despite having held no reference to it.
  1358. + * In which case we can trust the content of the page, and it
  1359. + * returns the gotten page; but if the page has now been zapped,
  1360. + * remove the stale node from the stable tree and return NULL.
  1361. + *
  1362. + * You would expect the stable_node to hold a reference to the ksm page.
  1363. + * But if it increments the page's count, swapping out has to wait for
  1364. + * ksmd to come around again before it can free the page, which may take
  1365. + * seconds or even minutes: much too unresponsive.  So instead we use a
  1366. + * "keyhole reference": access to the ksm page from the stable node peeps
  1367. + * out through its keyhole to see if that page still holds the right key,
  1368. + * pointing back to this stable node.  This relies on freeing a PageAnon
  1369. + * page to reset its page->mapping to NULL, and relies on no other use of
  1370. + * a page to put something that might look like our key in page->mapping.
  1371. + *
  1372. + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
  1373. + * but this is different - made simpler by uksm_thread_mutex being held, but
  1374. + * interesting for assuming that no other use of the struct page could ever
  1375. + * put our expected_mapping into page->mapping (or a field of the union which
  1376. + * coincides with page->mapping).  The RCU calls are not for KSM at all, but
  1377. + * to keep the page_count protocol described with page_cache_get_speculative.
  1378. + *
  1379. + * Note: it is possible that get_uksm_page() will return NULL one moment,
  1380. + * then page the next, if the page is in between page_freeze_refs() and
  1381. + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
  1382. + * is on its way to being freed; but it is an anomaly to bear in mind.
  1383. + *
  1384. + * @unlink_rb:         if the removal of this node will firstly unlink from
  1385. + * its rbtree. stable_node_reinsert will prevent this when restructuring the
  1386. + * node from its old tree.
  1387. + *
  1388. + * @remove_tree_node:  if this is the last one of its tree_node, will the
  1389. + * tree_node be freed ? If we are inserting stable node, this tree_node may
  1390. + * be reused, so don't free it.
  1391. + */
  1392. +static struct page *get_uksm_page(struct stable_node *stable_node,
  1393. +                int unlink_rb, int remove_tree_node)
  1394. +{
  1395. +   struct page *page;
  1396. +   void *expected_mapping;
  1397. +
  1398. +   page = pfn_to_page(stable_node->kpfn);
  1399. +   expected_mapping = (void *)stable_node +
  1400. +               (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
  1401. +   rcu_read_lock();
  1402. +   if (page->mapping != expected_mapping)
  1403. +       goto stale;
  1404. +   if (!get_page_unless_zero(page))
  1405. +       goto stale;
  1406. +   if (page->mapping != expected_mapping) {
  1407. +       put_page(page);
  1408. +       goto stale;
  1409. +   }
  1410. +   rcu_read_unlock();
  1411. +   return page;
  1412. +stale:
  1413. +   rcu_read_unlock();
  1414. +   remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
  1415. +
  1416. +   return NULL;
  1417. +}
  1418. +
  1419. +/*
  1420. + * Removing rmap_item from stable or unstable tree.
  1421. + * This function will clean the information from the stable/unstable tree.
  1422. + */
  1423. +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
  1424. +{
  1425. +   if (rmap_item->address & STABLE_FLAG) {
  1426. +       struct stable_node *stable_node;
  1427. +       struct node_vma *node_vma;
  1428. +       struct page *page;
  1429. +
  1430. +       node_vma = rmap_item->head;
  1431. +       stable_node = node_vma->head;
  1432. +       page = get_uksm_page(stable_node, 1, 1);
  1433. +       if (!page)
  1434. +           goto out;
  1435. +
  1436. +       /*
  1437. +        * page lock is needed because it's racing with
  1438. +        * try_to_unmap_ksm(), etc.
  1439. +        */
  1440. +       lock_page(page);
  1441. +       hlist_del(&rmap_item->hlist);
  1442. +
  1443. +       if (hlist_empty(&node_vma->rmap_hlist)) {
  1444. +           hlist_del(&node_vma->hlist);
  1445. +           free_node_vma(node_vma);
  1446. +       }
  1447. +       unlock_page(page);
  1448. +
  1449. +       put_page(page);
  1450. +       if (hlist_empty(&stable_node->hlist)) {
  1451. +           /* do NOT call remove_node_from_stable_tree() here,
  1452. +            * it's possible for a forked rmap_item to not be in
  1453. +            * the stable tree while the in-tree rmap_items have
  1454. +            * already been deleted.
  1455. +            */
  1456. +           uksm_pages_shared--;
  1457. +       } else
  1458. +           uksm_pages_sharing--;
  1459. +
  1460. +
  1461. +       uksm_drop_anon_vma(rmap_item);
  1462. +   } else if (rmap_item->address & UNSTABLE_FLAG) {
  1463. +       /*
  1464. +        * Usually ksmd can and must skip the rb_erase, because
  1465. +        * root_unstable_tree was already reset to RB_ROOT.
  1466. +        * But be careful when an mm is exiting: do the rb_erase
  1467. +        * if this rmap_item was inserted by this scan, rather
  1468. +        * than left over from before.
  1469. +        */
  1470. +       if (rmap_item->append_round == uksm_scan_round) {
  1471. +           rb_erase(&rmap_item->node,
  1472. +                &rmap_item->tree_node->sub_root);
  1473. +           if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
  1474. +               rb_erase(&rmap_item->tree_node->node,
  1475. +                    &root_unstable_tree);
  1476. +
  1477. +               free_tree_node(rmap_item->tree_node);
  1478. +           } else
  1479. +               rmap_item->tree_node->count--;
  1480. +       }
  1481. +       uksm_pages_unshared--;
  1482. +   }
  1483. +
  1484. +   rmap_item->address &= PAGE_MASK;
  1485. +   rmap_item->hash_max = 0;
  1486. +
  1487. +out:
  1488. +   cond_resched();     /* we're called from many long loops */
  1489. +}
  1490. +
  1491. +/**
  1492. + * Need to do two things:
  1493. + * 1. check if the slot was moved to the del list
  1494. + * 2. make sure the mmap_sem is manipulated under a valid vma.
  1495. + *
  1496. + * One concern here is that in some cases this may cause
  1497. + * vma_slot_list_lock() waiters to be serialized further by some
  1498. + * sem->wait_lock; can this really be expensive?
  1499. + *
  1500. + *
  1501. + * @return
  1502. + * 0: if successfully locked mmap_sem
  1503. + * -ENOENT: this slot was moved to del list
  1504. + * -EBUSY: vma lock failed
  1505. + */
  1506. +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
  1507. +{
  1508. +   struct vm_area_struct *vma;
  1509. +   struct mm_struct *mm;
  1510. +   struct rw_semaphore *sem;
  1511. +
  1512. +   spin_lock(&vma_slot_list_lock);
  1513. +
   1514. +   /* slot_list was removed from the new list and re-inited when the slot
   1515. +    * entered uksm_list. If it is not empty now, it must have been moved
   1516. +    * to the del list. */
  1517. +   if (!list_empty(&slot->slot_list)) {
  1518. +       spin_unlock(&vma_slot_list_lock);
  1519. +       return -ENOENT;
  1520. +   }
  1521. +
  1522. +   BUG_ON(slot->pages != vma_pages(slot->vma));
  1523. +   /* Ok, vma still valid */
  1524. +   vma = slot->vma;
  1525. +   mm = vma->vm_mm;
  1526. +   sem = &mm->mmap_sem;
  1527. +   if (down_read_trylock(sem)) {
  1528. +       spin_unlock(&vma_slot_list_lock);
  1529. +       return 0;
  1530. +   }
  1531. +
  1532. +   spin_unlock(&vma_slot_list_lock);
  1533. +   return -EBUSY;
  1534. +}
  1535. +
  1536. +static inline unsigned long
  1537. +vma_page_address(struct page *page, struct vm_area_struct *vma)
  1538. +{
  1539. +   pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  1540. +   unsigned long address;
  1541. +
  1542. +   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  1543. +   if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
  1544. +       /* page should be within @vma mapping range */
  1545. +       return -EFAULT;
  1546. +   }
  1547. +   return address;
  1548. +}
  1549. +
  1550. +/*
  1551. + * Test if the mm is exiting
  1552. + */
  1553. +static inline bool uksm_test_exit(struct mm_struct *mm)
  1554. +{
  1555. +   return atomic_read(&mm->mm_users) == 0;
  1556. +}
  1557. +
  1558. +/* return 0 on success with the item's mmap_sem locked */
  1559. +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
  1560. +{
  1561. +   struct mm_struct *mm;
  1562. +   struct vma_slot *slot = item->slot;
  1563. +   int err = -EINVAL;
  1564. +
  1565. +   struct page *page;
  1566. +
  1567. +   BUG_ON(!item->slot);
  1568. +   /*
  1569. +    * try_down_read_slot_mmap_sem() returns non-zero if the slot
  1570. +    * has been removed by uksm_remove_vma().
  1571. +    */
  1572. +   if (try_down_read_slot_mmap_sem(slot))
  1573. +       return -EBUSY;
  1574. +
  1575. +   mm = slot->vma->vm_mm;
  1576. +
  1577. +   if (uksm_test_exit(mm))
  1578. +       goto failout_up;
  1579. +
  1580. +   page = item->page;
  1581. +   rcu_read_lock();
  1582. +   if (!get_page_unless_zero(page)) {
  1583. +       rcu_read_unlock();
  1584. +       goto failout_up;
  1585. +   }
  1586. +
  1587. +   /* No need to consider huge page here. */
  1588. +   if (item->slot->vma->anon_vma != page_anon_vma(page) ||
  1589. +       vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
  1590. +       /*
  1591. +        * TODO:
   1592. +        * should we release this item because of its stale page
  1593. +        * mapping?
  1594. +        */
  1595. +       put_page(page);
  1596. +       rcu_read_unlock();
  1597. +       goto failout_up;
  1598. +   }
  1599. +   rcu_read_unlock();
  1600. +   return 0;
  1601. +
  1602. +failout_up:
  1603. +   up_read(&mm->mmap_sem);
  1604. +   return err;
  1605. +}
  1606. +
  1607. +/*
   1608. + * What kind of VMA is considered?
  1609. + */
  1610. +static inline int vma_can_enter(struct vm_area_struct *vma)
  1611. +{
  1612. +   return !(vma->vm_flags & (VM_PFNMAP | VM_IO  | VM_DONTEXPAND |
  1613. +                 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
  1614. +                 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO |
  1615. +                 VM_SHARED  | VM_MAYSHARE | VM_GROWSUP
  1616. +                 | VM_GROWSDOWN));
  1617. +}
  1618. +
  1619. +/*
   1620. + * Called whenever a fresh new vma is created. A new vma_slot
   1621. + * is created and inserted into a global list. Must be called
   1622. + * after the vma is inserted into its mm.
  1623. + */
  1624. +void uksm_vma_add_new(struct vm_area_struct *vma)
  1625. +{
  1626. +   struct vma_slot *slot;
  1627. +
  1628. +   if (!vma_can_enter(vma)) {
  1629. +       vma->uksm_vma_slot = NULL;
  1630. +       return;
  1631. +   }
  1632. +
  1633. +   slot = alloc_vma_slot();
  1634. +   if (!slot) {
  1635. +       vma->uksm_vma_slot = NULL;
  1636. +       return;
  1637. +   }
  1638. +
  1639. +   vma->uksm_vma_slot = slot;
  1640. +   vma->vm_flags |= VM_MERGEABLE;
  1641. +   slot->vma = vma;
  1642. +   slot->mm = vma->vm_mm;
  1643. +   slot->ctime_j = jiffies;
  1644. +   slot->pages = vma_pages(vma);
  1645. +   spin_lock(&vma_slot_list_lock);
  1646. +   list_add_tail(&slot->slot_list, &vma_slot_new);
  1647. +   spin_unlock(&vma_slot_list_lock);
  1648. +}
  1649. +
  1650. +/*
  1651. + * Called after vma is unlinked from its mm
  1652. + */
  1653. +void uksm_remove_vma(struct vm_area_struct *vma)
  1654. +{
  1655. +   struct vma_slot *slot;
  1656. +
  1657. +   if (!vma->uksm_vma_slot)
  1658. +       return;
  1659. +
  1660. +   slot = vma->uksm_vma_slot;
  1661. +   spin_lock(&vma_slot_list_lock);
  1662. +   if (list_empty(&slot->slot_list)) {
  1663. +       /**
  1664. +        * This slot has been added by ksmd, so move to the del list
   1665. +        * waiting for ksmd to free it.
  1666. +        */
  1667. +       list_add_tail(&slot->slot_list, &vma_slot_del);
  1668. +   } else {
  1669. +       /**
   1670. +        * It's still on the new list. It's OK to free the slot directly.
  1671. +        */
  1672. +       list_del(&slot->slot_list);
  1673. +       free_vma_slot(slot);
  1674. +   }
  1675. +   spin_unlock(&vma_slot_list_lock);
  1676. +   vma->uksm_vma_slot = NULL;
  1677. +}
  1678. +
  1679. +/*   32/3 < they < 32/2 */
  1680. +#define shiftl 8
  1681. +#define shiftr 12
  1682. +
  1683. +#define HASH_FROM_TO(from, to)                 \
  1684. +for (index = from; index < to; index++) {      \
  1685. +   pos = random_nums[index];           \
  1686. +   hash += key[pos];               \
  1687. +   hash += (hash << shiftl);           \
  1688. +   hash ^= (hash >> shiftr);           \
  1689. +}
  1690. +
  1691. +
  1692. +#define HASH_FROM_DOWN_TO(from, to)            \
  1693. +for (index = from - 1; index >= to; index--) {     \
  1694. +   hash ^= (hash >> shiftr);           \
  1695. +   hash ^= (hash >> (shiftr*2));           \
  1696. +   hash -= (hash << shiftl);           \
  1697. +   hash += (hash << (shiftl*2));           \
  1698. +   pos = random_nums[index];           \
  1699. +   hash -= key[pos];               \
  1700. +}
  1701. +
  1702. +/*
  1703. + * The main random sample hash function.
  1704. + */
  1705. +static u32 random_sample_hash(void *addr, u32 hash_strength)
  1706. +{
  1707. +   u32 hash = 0xdeadbeef;
  1708. +   int index, pos, loop = hash_strength;
  1709. +   u32 *key = (u32 *)addr;
  1710. +
  1711. +   if (loop > HASH_STRENGTH_FULL)
  1712. +       loop = HASH_STRENGTH_FULL;
  1713. +
  1714. +   HASH_FROM_TO(0, loop);
  1715. +
  1716. +   if (hash_strength > HASH_STRENGTH_FULL) {
  1717. +       loop = hash_strength - HASH_STRENGTH_FULL;
  1718. +       HASH_FROM_TO(0, loop);
  1719. +   }
  1720. +
  1721. +   return hash;
  1722. +}
  1723. +
  1724. +
  1725. +/**
  1726. + * It's used when hash strength is adjusted
  1727. + *
   1728. + * @addr The page's virtual address
   1729. + * @from The original hash strength
   1730. + * @to   The hash strength to change to
   1731. + * @hash The hash value calculated with the "from" hash strength
   1732. + *
   1733. + * return the new hash value calculated at the "to" hash strength
  1734. + */
  1735. +static u32 delta_hash(void *addr, int from, int to, u32 hash)
  1736. +{
  1737. +   u32 *key = (u32 *)addr;
  1738. +   int index, pos; /* make sure they are int type */
  1739. +
  1740. +   if (to > from) {
  1741. +       if (from >= HASH_STRENGTH_FULL) {
  1742. +           from -= HASH_STRENGTH_FULL;
  1743. +           to -= HASH_STRENGTH_FULL;
  1744. +           HASH_FROM_TO(from, to);
  1745. +       } else if (to <= HASH_STRENGTH_FULL) {
  1746. +           HASH_FROM_TO(from, to);
  1747. +       } else {
  1748. +           HASH_FROM_TO(from, HASH_STRENGTH_FULL);
  1749. +           HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
  1750. +       }
  1751. +   } else {
  1752. +       if (from <= HASH_STRENGTH_FULL) {
  1753. +           HASH_FROM_DOWN_TO(from, to);
  1754. +       } else if (to >= HASH_STRENGTH_FULL) {
  1755. +           from -= HASH_STRENGTH_FULL;
  1756. +           to -= HASH_STRENGTH_FULL;
  1757. +           HASH_FROM_DOWN_TO(from, to);
  1758. +       } else {
  1759. +           HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
  1760. +           HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
  1761. +       }
  1762. +   }
  1763. +
  1764. +   return hash;
  1765. +}
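
The two routines above are the core of the sampling hash: random_sample_hash() mixes a configurable number of randomly chosen 32-bit words of the page into the hash value, and delta_hash() extends (or unwinds) an existing hash value when the sampling strength changes instead of rehashing from scratch. Below is a minimal user-space sketch of the forward direction only, with its own hypothetical sample table and the same shift-style mixing; it is not the kernel code, it just demonstrates the invariant that extending a hash from strength "from" to "to" equals hashing at strength "to" directly.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define WORDS_PER_PAGE 1024   /* a 4096-byte page seen as 32-bit words */
#define SHIFTL 8
#define SHIFTR 12

/* hypothetical random permutation of word indices (stands in for random_nums) */
static int sample_pos[WORDS_PER_PAGE];

static void init_sample_pos(void)
{
	int i, j, tmp;

	for (i = 0; i < WORDS_PER_PAGE; i++)
		sample_pos[i] = i;
	srand(42);
	for (i = WORDS_PER_PAGE - 1; i > 0; i--) {
		j = rand() % (i + 1);
		tmp = sample_pos[i];
		sample_pos[i] = sample_pos[j];
		sample_pos[j] = tmp;
	}
}

/* forward mixing loop, same shape as the HASH_FROM_TO macro */
static uint32_t sample_hash(const uint32_t *key, int strength)
{
	uint32_t hash = 0xdeadbeef;
	int i;

	for (i = 0; i < strength; i++) {
		hash += key[sample_pos[i]];
		hash += hash << SHIFTL;
		hash ^= hash >> SHIFTR;
	}
	return hash;
}

/* continue an existing hash from strength 'from' up to a larger 'to' */
static uint32_t extend_hash(const uint32_t *key, int from, int to, uint32_t hash)
{
	int i;

	for (i = from; i < to; i++) {
		hash += key[sample_pos[i]];
		hash += hash << SHIFTL;
		hash ^= hash >> SHIFTR;
	}
	return hash;
}

int main(void)
{
	static uint32_t page[WORDS_PER_PAGE];
	uint32_t weak, full;
	int i;

	init_sample_pos();
	for (i = 0; i < WORDS_PER_PAGE; i++)
		page[i] = (uint32_t)rand();

	weak = sample_hash(page, 128);            /* a sampled, cheap hash */
	full = sample_hash(page, WORDS_PER_PAGE); /* the full-strength hash */

	/* strengthening the weak hash incrementally matches hashing at full strength */
	assert(extend_hash(page, 128, WORDS_PER_PAGE, weak) == full);
	printf("weak=%08x full=%08x\n", (unsigned)weak, (unsigned)full);
	return 0;
}

The matching HASH_FROM_DOWN_TO macro can unwind these steps because each mixing operation (the add of key[pos], the multiply implied by hash += hash << shiftl, and the xor-shift) is invertible modulo 2^32.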
  1766. +
  1767. +
  1768. +
  1769. +
  1770. +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
  1771. +
  1772. +/**
  1773. + *
  1774. + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
  1775. + * has finished.
  1776. + *
  1777. + */
  1778. +static inline void encode_benefit(void)
  1779. +{
  1780. +   u64 scanned_delta, pos_delta, neg_delta;
  1781. +   unsigned long base = benefit.base;
  1782. +
  1783. +   scanned_delta = (uksm_pages_scanned - uksm_pages_scanned_last) >> base;
  1784. +   pos_delta = rshash_pos >> base;
  1785. +   neg_delta = rshash_neg >> base;
  1786. +
  1787. +   if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
  1788. +       CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
  1789. +       CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
  1790. +       benefit.scanned >>= 1;
  1791. +       benefit.neg >>= 1;
  1792. +       benefit.pos >>= 1;
  1793. +       benefit.base++;
  1794. +       scanned_delta >>= 1;
  1795. +       pos_delta >>= 1;
  1796. +       neg_delta >>= 1;
  1797. +   }
  1798. +
  1799. +   benefit.pos += pos_delta;
  1800. +   benefit.neg += neg_delta;
  1801. +   benefit.scanned += scanned_delta;
  1802. +
  1803. +   BUG_ON(!benefit.scanned);
  1804. +
  1805. +   rshash_pos = rshash_neg = 0;
  1806. +
  1807. +   /* -1 to make rshash_adjust() work */
  1808. +   uksm_pages_scanned_last = uksm_pages_scanned - 1;
  1809. +}
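
encode_benefit() above folds the raw rshash_pos/rshash_neg/scanned counters into one set of accumulators that share a common "base" shift: whenever any of them is about to overflow, all of them (including the incoming deltas) are halved and base is bumped, so the ratios between the three values are preserved. Below is a small user-space sketch of that scaled-accumulator pattern, with hypothetical field names; it is a model of the idea, not the kernel code.

#include <stdint.h>
#include <stdio.h>

struct scaled_benefit {
	uint64_t pos, neg, scanned;
	unsigned int base;   /* common right-shift applied to incoming deltas */
};

static int would_overflow(uint64_t cur, uint64_t delta)
{
	return UINT64_MAX - cur < delta;
}

static void accumulate(struct scaled_benefit *b,
		       uint64_t pos, uint64_t neg, uint64_t scanned)
{
	pos >>= b->base;
	neg >>= b->base;
	scanned >>= b->base;

	if (would_overflow(b->pos, pos) || would_overflow(b->neg, neg) ||
	    would_overflow(b->scanned, scanned)) {
		/* halve everything and remember it via 'base': the ratios survive */
		b->pos >>= 1;
		b->neg >>= 1;
		b->scanned >>= 1;
		b->base++;
		pos >>= 1;
		neg >>= 1;
		scanned >>= 1;
	}

	b->pos += pos;
	b->neg += neg;
	b->scanned += scanned;
}

int main(void)
{
	struct scaled_benefit b = { 0, 0, 0, 0 };

	accumulate(&b, UINT64_MAX / 2, UINT64_MAX / 4, 1000);
	accumulate(&b, UINT64_MAX, UINT64_MAX / 2, 1000); /* large enough to force a rescale */
	printf("pos=%llu neg=%llu scanned=%llu base=%u\n",
	       (unsigned long long)b.pos, (unsigned long long)b.neg,
	       (unsigned long long)b.scanned, b.base);
	return 0;
}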
  1810. +
  1811. +static inline void reset_benefit(void)
  1812. +{
  1813. +   benefit.pos = 0;
  1814. +   benefit.neg = 0;
  1815. +   benefit.base = 0;
  1816. +   benefit.scanned = 0;
  1817. +}
  1818. +
  1819. +static inline void inc_rshash_pos(unsigned long delta)
  1820. +{
  1821. +   if (CAN_OVERFLOW_U64(rshash_pos, delta))
  1822. +       encode_benefit();
  1823. +
  1824. +   rshash_pos += delta;
  1825. +}
  1826. +
  1827. +static inline void inc_rshash_neg(unsigned long delta)
  1828. +{
  1829. +   if (CAN_OVERFLOW_U64(rshash_neg, delta))
  1830. +       encode_benefit();
  1831. +
  1832. +   rshash_neg += delta;
  1833. +}
  1834. +
  1835. +
  1836. +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
  1837. +               int cost_accounting)
  1838. +{
  1839. +   u32 val;
  1840. +   unsigned long delta;
  1841. +
  1842. +   void *addr = kmap_atomic(page, KM_USER0);
  1843. +
  1844. +   val = random_sample_hash(addr, hash_strength);
  1845. +   kunmap_atomic(addr, KM_USER0);
  1846. +
  1847. +   if (cost_accounting) {
  1848. +       if (HASH_STRENGTH_FULL > hash_strength)
  1849. +           delta = HASH_STRENGTH_FULL - hash_strength;
  1850. +       else
  1851. +           delta = 0;
  1852. +
  1853. +       inc_rshash_pos(delta);
  1854. +   }
  1855. +
  1856. +   return val;
  1857. +}
  1858. +
  1859. +static int memcmp_pages(struct page *page1, struct page *page2,
  1860. +           int cost_accounting)
  1861. +{
  1862. +   char *addr1, *addr2;
  1863. +   int ret;
  1864. +
  1865. +   addr1 = kmap_atomic(page1, KM_USER0);
  1866. +   addr2 = kmap_atomic(page2, KM_USER1);
  1867. +   ret = memcmp(addr1, addr2, PAGE_SIZE);
  1868. +   kunmap_atomic(addr2, KM_USER1);
  1869. +   kunmap_atomic(addr1, KM_USER0);
  1870. +
  1871. +   if (cost_accounting)
  1872. +       inc_rshash_neg(memcmp_cost);
  1873. +
  1874. +   return ret;
  1875. +}
  1876. +
  1877. +static inline int pages_identical(struct page *page1, struct page *page2)
  1878. +{
  1879. +   return !memcmp_pages(page1, page2, 0);
  1880. +}
  1881. +
  1882. +static inline int is_page_full_zero(struct page *page)
  1883. +{
  1884. +   char *addr;
  1885. +   int ret;
  1886. +
  1887. +   addr = kmap_atomic(page, KM_USER0);
  1888. +   ret = is_full_zero(addr, PAGE_SIZE);
  1889. +   kunmap_atomic(addr, KM_USER0);
  1890. +
  1891. +   return ret;
  1892. +}
  1893. +
  1894. +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
  1895. +                 pte_t *orig_pte, pte_t *old_pte)
  1896. +{
  1897. +   struct mm_struct *mm = vma->vm_mm;
  1898. +   unsigned long addr;
  1899. +   pte_t *ptep;
  1900. +   spinlock_t *ptl;
  1901. +   int swapped;
  1902. +   int err = -EFAULT;
  1903. +
  1904. +   addr = page_address_in_vma(page, vma);
  1905. +   if (addr == -EFAULT)
  1906. +       goto out;
  1907. +
  1908. +   BUG_ON(PageTransCompound(page));
  1909. +   ptep = page_check_address(page, mm, addr, &ptl, 0);
  1910. +   if (!ptep)
  1911. +       goto out;
  1912. +
  1913. +   if (old_pte)
  1914. +       *old_pte = *ptep;
  1915. +
  1916. +   if (pte_write(*ptep) || pte_dirty(*ptep)) {
  1917. +       pte_t entry;
  1918. +
  1919. +       swapped = PageSwapCache(page);
  1920. +       flush_cache_page(vma, addr, page_to_pfn(page));
  1921. +       /*
   1922. +        * Ok, this is tricky: when get_user_pages_fast() runs, it doesn't
   1923. +        * take any lock, therefore the check that we are going to make
   1924. +        * with the page count against the map count is racy and
   1925. +        * O_DIRECT can happen right after the check.
   1926. +        * So we clear the pte and flush the tlb before the check;
   1927. +        * this assures us that no O_DIRECT can happen after the check
   1928. +        * or in the middle of the check.
  1929. +        */
  1930. +       entry = ptep_clear_flush(vma, addr, ptep);
  1931. +       /*
  1932. +        * Check that no O_DIRECT or similar I/O is in progress on the
  1933. +        * page
  1934. +        */
  1935. +       if (page_mapcount(page) + 1 + swapped != page_count(page)) {
  1936. +           set_pte_at(mm, addr, ptep, entry);
  1937. +           goto out_unlock;
  1938. +       }
  1939. +       if (pte_dirty(entry))
  1940. +           set_page_dirty(page);
  1941. +       entry = pte_mkclean(pte_wrprotect(entry));
  1942. +       set_pte_at_notify(mm, addr, ptep, entry);
  1943. +   }
  1944. +   *orig_pte = *ptep;
  1945. +   err = 0;
  1946. +
  1947. +out_unlock:
  1948. +   pte_unmap_unlock(ptep, ptl);
  1949. +out:
  1950. +   return err;
  1951. +}
  1952. +
   1953. +#define MERGE_ERR_PGERR        1 /* the page is invalid, cannot continue */
  1954. +#define MERGE_ERR_COLLI        2 /* there is a collision */
  1955. +#define MERGE_ERR_CHANGED  3 /* the page has changed since last hash */
  1956. +
  1957. +
  1958. +/**
  1959. + * replace_page - replace page in vma by new ksm page
  1960. + * @vma:      vma that holds the pte pointing to page
  1961. + * @page:     the page we are replacing by kpage
  1962. + * @kpage:    the ksm page we replace page by
  1963. + * @orig_pte: the original value of the pte
  1964. + *
  1965. + * Returns 0 on success, MERGE_ERR_PGERR on failure.
  1966. + */
  1967. +static int replace_page(struct vm_area_struct *vma, struct page *page,
  1968. +           struct page *kpage, pte_t orig_pte)
  1969. +{
  1970. +   struct mm_struct *mm = vma->vm_mm;
  1971. +   pgd_t *pgd;
  1972. +   pud_t *pud;
  1973. +   pmd_t *pmd;
  1974. +   pte_t *ptep;
  1975. +   spinlock_t *ptl;
  1976. +   pte_t entry;
  1977. +
  1978. +   unsigned long addr;
  1979. +   int err = MERGE_ERR_PGERR;
  1980. +
  1981. +   addr = page_address_in_vma(page, vma);
  1982. +   if (addr == -EFAULT)
  1983. +       goto out;
  1984. +
  1985. +   pgd = pgd_offset(mm, addr);
  1986. +   if (!pgd_present(*pgd))
  1987. +       goto out;
  1988. +
  1989. +   pud = pud_offset(pgd, addr);
  1990. +   if (!pud_present(*pud))
  1991. +       goto out;
  1992. +
  1993. +   pmd = pmd_offset(pud, addr);
  1994. +   BUG_ON(pmd_trans_huge(*pmd));
  1995. +   if (!pmd_present(*pmd))
  1996. +       goto out;
  1997. +
  1998. +   ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
  1999. +   if (!pte_same(*ptep, orig_pte)) {
  2000. +       pte_unmap_unlock(ptep, ptl);
  2001. +       goto out;
  2002. +   }
  2003. +
  2004. +   flush_cache_page(vma, addr, pte_pfn(*ptep));
  2005. +   ptep_clear_flush(vma, addr, ptep);
  2006. +   entry = mk_pte(kpage, vma->vm_page_prot);
  2007. +
  2008. +   /* special treatment is needed for zero_page */
  2009. +   if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
  2010. +               (page_to_pfn(kpage) == zero_pfn))
  2011. +       entry = pte_mkspecial(entry);
  2012. +   else {
  2013. +       get_page(kpage);
  2014. +       page_add_anon_rmap(kpage, vma, addr);
  2015. +   }
  2016. +
  2017. +   set_pte_at_notify(mm, addr, ptep, entry);
  2018. +
  2019. +   page_remove_rmap(page);
  2020. +   if (!page_mapped(page))
  2021. +       try_to_free_swap(page);
  2022. +   put_page(page);
  2023. +
  2024. +   pte_unmap_unlock(ptep, ptl);
  2025. +   err = 0;
  2026. +out:
  2027. +   return err;
  2028. +}
  2029. +
  2030. +
  2031. +/**
   2032. + *  Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
   2033. + *  A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
   2034. + *  hash_max member has not been calculated yet.
   2035. + *
   2036. + * @page The page that needs to be hashed
  2037. + * @hash_old The hash value calculated with current hash strength
  2038. + *
  2039. + * return the new hash value calculated at HASH_STRENGTH_MAX
  2040. + */
  2041. +static inline u32 page_hash_max(struct page *page, u32 hash_old)
  2042. +{
  2043. +   u32 hash_max = 0;
  2044. +   void *addr;
  2045. +
  2046. +   addr = kmap_atomic(page, KM_USER0);
  2047. +   hash_max = delta_hash(addr, hash_strength,
  2048. +                 HASH_STRENGTH_MAX, hash_old);
  2049. +
  2050. +   kunmap_atomic(addr, KM_USER0);
  2051. +
  2052. +   if (!hash_max)
  2053. +       hash_max = 1;
  2054. +
  2055. +   inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
  2056. +   return hash_max;
  2057. +}
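
page_hash_max() reserves the value 0 as the "not calculated yet" marker for the cached hash_max fields, so a full-strength hash that genuinely comes out as 0 is nudged to 1 before being stored (the same convention is used by rmap_item_hash_max() and stable_node_hash_max() further down). A tiny user-space sketch of that lazy-cache convention follows; the stand-in hash is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for the full-strength page hash */
static uint32_t toy_full_hash(const char *page)
{
	(void)page;
	return 0;   /* pretend the real hash happened to be exactly zero */
}

/* 0 in *cache means "not computed yet", so a real 0 is never stored */
static uint32_t cached_hash_max(const char *page, uint32_t *cache)
{
	if (!*cache) {
		uint32_t h = toy_full_hash(page);

		*cache = h ? h : 1;
	}
	return *cache;
}

int main(void)
{
	uint32_t cache = 0;
	char page[64] = { 0 };

	printf("%u\n", (unsigned)cached_hash_max(page, &cache)); /* 1, not 0 */
	printf("%u\n", (unsigned)cached_hash_max(page, &cache)); /* cached, still 1 */
	return 0;
}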
  2058. +
  2059. +/*
  2060. + * We compare the hash again, to ensure that it is really a hash collision
  2061. + * instead of being caused by page write.
   2062. + * instead of being caused by a page write.
  2063. +static inline int check_collision(struct rmap_item *rmap_item,
  2064. +                 u32 hash)
  2065. +{
  2066. +   int err;
  2067. +   struct page *page = rmap_item->page;
  2068. +
   2069. +   /* if this rmap_item has already been hash_maxed, then the collision
   2070. +    * must have appeared in the second-level rbtree search. In this case we
   2071. +    * check whether its hash_max value has changed. Otherwise, the collision
   2072. +    * happened in the first-level rbtree search, so we check against its
   2073. +    * current hash value.
   2074. +    */
  2075. +   if (rmap_item->hash_max) {
  2076. +       inc_rshash_neg(memcmp_cost);
  2077. +       inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
  2078. +
  2079. +       if (rmap_item->hash_max == page_hash_max(page, hash))
  2080. +           err = MERGE_ERR_COLLI;
  2081. +       else
  2082. +           err = MERGE_ERR_CHANGED;
  2083. +   } else {
  2084. +       inc_rshash_neg(memcmp_cost + hash_strength);
  2085. +
  2086. +       if (page_hash(page, hash_strength, 0) == hash)
  2087. +           err = MERGE_ERR_COLLI;
  2088. +       else
  2089. +           err = MERGE_ERR_CHANGED;
  2090. +   }
  2091. +
  2092. +   return err;
  2093. +}
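
check_collision() distinguishes "the two pages really hash the same but differ" from "the page was written after it was hashed" simply by re-hashing the page at the strength it was hashed with before and comparing against the recorded value. Below is a minimal user-space sketch of that decision, using an ordinary 32-bit hash (FNV-1a, purely for illustration) and hypothetical names.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ERR_COLLI   2  /* a genuine hash collision */
#define ERR_CHANGED 3  /* the page changed after it was hashed */

/* any 32-bit hash works for the illustration; FNV-1a keeps it short */
static uint32_t hash32(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint32_t h = 0x811c9dc5u;

	while (len--) {
		h ^= *p++;
		h *= 0x01000193u;
	}
	return h;
}

/*
 * When a page that hashed equal to another fails the byte compare,
 * re-hash it: an unchanged hash means the earlier match really was a
 * collision, a changed hash means the page was written to meanwhile.
 */
static int classify_mismatch(const void *page, size_t len, uint32_t old_hash)
{
	return hash32(page, len) == old_hash ? ERR_COLLI : ERR_CHANGED;
}

int main(void)
{
	char page[4096] = "some page contents";
	uint32_t h = hash32(page, sizeof(page));

	printf("%d\n", classify_mismatch(page, sizeof(page), h)); /* 2: collision */
	page[0] = 'X';                       /* the page changed after hashing */
	printf("%d\n", classify_mismatch(page, sizeof(page), h)); /* 3: changed */
	return 0;
}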
  2094. +
  2095. +static struct page *page_trans_compound_anon(struct page *page)
  2096. +{
  2097. +   if (PageTransCompound(page)) {
  2098. +       struct page *head = compound_trans_head(page);
  2099. +       /*
   2100. +        * head may actually be split and freed from under
   2101. +        * us, but that's ok here.
  2102. +        */
  2103. +       if (PageAnon(head))
  2104. +           return head;
  2105. +   }
  2106. +   return NULL;
  2107. +}
  2108. +
  2109. +static int page_trans_compound_anon_split(struct page *page)
  2110. +{
  2111. +   int ret = 0;
  2112. +   struct page *transhuge_head = page_trans_compound_anon(page);
  2113. +   if (transhuge_head) {
  2114. +       /* Get the reference on the head to split it. */
  2115. +       if (get_page_unless_zero(transhuge_head)) {
  2116. +           /*
  2117. +            * Recheck we got the reference while the head
  2118. +            * was still anonymous.
  2119. +            */
  2120. +           if (PageAnon(transhuge_head))
  2121. +               ret = split_huge_page(transhuge_head);
  2122. +           else
  2123. +               /*
  2124. +                * Retry later if split_huge_page run
   2125. +                * Retry later if split_huge_page ran
  2126. +                */
  2127. +               ret = 1;
  2128. +           put_page(transhuge_head);
  2129. +       } else
   2130. +           /* Retry later if split_huge_page ran from under us. */
  2131. +           ret = 1;
  2132. +   }
  2133. +   return ret;
  2134. +}
  2135. +
  2136. +/**
   2137. + * Try to merge rmap_item->page with a kpage in a stable node. kpage must
   2138. + * already be a ksm page.
  2139. + *
  2140. + * @return 0 if the pages were merged, -EFAULT otherwise.
  2141. + */
  2142. +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
  2143. +                     struct page *kpage, u32 hash)
  2144. +{
  2145. +   struct vm_area_struct *vma = rmap_item->slot->vma;
  2146. +   struct mm_struct *mm = vma->vm_mm;
  2147. +   pte_t orig_pte = __pte(0);
  2148. +   int err = MERGE_ERR_PGERR;
  2149. +   struct page *page;
  2150. +
  2151. +   if (uksm_test_exit(mm))
  2152. +       goto out;
  2153. +
  2154. +   page = rmap_item->page;
  2155. +
  2156. +   if (page == kpage) { /* ksm page forked */
  2157. +       err = 0;
  2158. +       goto out;
  2159. +   }
  2160. +
  2161. +   if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  2162. +       goto out;
  2163. +   BUG_ON(PageTransCompound(page));
  2164. +
  2165. +   if (!PageAnon(page) || !PageKsm(kpage))
  2166. +       goto out;
  2167. +
  2168. +   /*
  2169. +    * We need the page lock to read a stable PageSwapCache in
  2170. +    * write_protect_page().  We use trylock_page() instead of
  2171. +    * lock_page() because we don't want to wait here - we
  2172. +    * prefer to continue scanning and merging different pages,
  2173. +    * then come back to this page when it is unlocked.
  2174. +    */
  2175. +   if (!trylock_page(page))
  2176. +       goto out;
  2177. +   /*
  2178. +    * If this anonymous page is mapped only here, its pte may need
  2179. +    * to be write-protected.  If it's mapped elsewhere, all of its
  2180. +    * ptes are necessarily already write-protected.  But in either
  2181. +    * case, we need to lock and check page_count is not raised.
  2182. +    */
  2183. +   if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
  2184. +       if (pages_identical(page, kpage))
  2185. +           err = replace_page(vma, page, kpage, orig_pte);
  2186. +       else
  2187. +           err = check_collision(rmap_item, hash);
  2188. +   }
  2189. +
  2190. +   if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
  2191. +       munlock_vma_page(page);
  2192. +       if (!PageMlocked(kpage)) {
  2193. +           unlock_page(page);
  2194. +           lock_page(kpage);
  2195. +           mlock_vma_page(kpage);
  2196. +           page = kpage;       /* for final unlock */
  2197. +       }
  2198. +   }
  2199. +
  2200. +   unlock_page(page);
  2201. +out:
  2202. +   return err;
  2203. +}
  2204. +
  2205. +
  2206. +
  2207. +/**
  2208. + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
  2209. + * to restore a page mapping that has been changed in try_to_merge_two_pages.
  2210. + *
  2211. + * @return 0 on success.
  2212. + */
  2213. +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
  2214. +                pte_t orig_pte, pte_t wprt_pte)
  2215. +{
  2216. +   struct mm_struct *mm = vma->vm_mm;
  2217. +   pgd_t *pgd;
  2218. +   pud_t *pud;
  2219. +   pmd_t *pmd;
  2220. +   pte_t *ptep;
  2221. +   spinlock_t *ptl;
  2222. +
  2223. +   int err = -EFAULT;
  2224. +
  2225. +   pgd = pgd_offset(mm, addr);
  2226. +   if (!pgd_present(*pgd))
  2227. +       goto out;
  2228. +
  2229. +   pud = pud_offset(pgd, addr);
  2230. +   if (!pud_present(*pud))
  2231. +       goto out;
  2232. +
  2233. +   pmd = pmd_offset(pud, addr);
  2234. +   if (!pmd_present(*pmd))
  2235. +       goto out;
  2236. +
  2237. +   ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
  2238. +   if (!pte_same(*ptep, wprt_pte)) {
  2239. +       /* already copied, let it be */
  2240. +       pte_unmap_unlock(ptep, ptl);
  2241. +       goto out;
  2242. +   }
  2243. +
  2244. +   /*
   2245. +    * Good, still here. As long as we still hold the ksm page, it cannot
   2246. +    * return to the free page pool, so there is no way a pte could have been
   2247. +    * changed to another page and back to this one. Also remember that ksm
   2248. +    * pages are not reused in do_wp_page(). So it's safe to restore the
   2249. +    * original pte.
  2250. +    */
  2251. +   flush_cache_page(vma, addr, pte_pfn(*ptep));
  2252. +   ptep_clear_flush(vma, addr, ptep);
  2253. +   set_pte_at_notify(mm, addr, ptep, orig_pte);
  2254. +
  2255. +   pte_unmap_unlock(ptep, ptl);
  2256. +   err = 0;
  2257. +out:
  2258. +   return err;
  2259. +}
  2260. +
  2261. +/**
   2262. + * try_to_merge_two_pages() - take two identical pages and prepare
   2263. + * them to be merged into one page (rmap_item->page)
   2264. + *
   2265. + * @return 0 if we successfully merged two identical pages into
   2266. + *         one ksm page. MERGE_ERR_COLLI if it was only a hash collision
   2267. + *         found in the rbtree search. MERGE_ERR_CHANGED if rmap_item has
   2268. + *         been changed since it was hashed. MERGE_ERR_PGERR otherwise.
  2269. + *
  2270. + */
  2271. +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
  2272. +                 struct rmap_item *tree_rmap_item,
  2273. +                 u32 hash)
  2274. +{
  2275. +   pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
  2276. +   pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
  2277. +   struct vm_area_struct *vma1 = rmap_item->slot->vma;
  2278. +   struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
  2279. +   struct page *page = rmap_item->page;
  2280. +   struct page *tree_page = tree_rmap_item->page;
  2281. +   int err = MERGE_ERR_PGERR;
  2282. +   struct address_space *saved_mapping;
  2283. +
  2284. +
  2285. +   if (rmap_item->page == tree_rmap_item->page)
  2286. +       goto out;
  2287. +
  2288. +   if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  2289. +       goto out;
  2290. +   BUG_ON(PageTransCompound(page));
  2291. +
  2292. +   if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
  2293. +       goto out;
  2294. +   BUG_ON(PageTransCompound(tree_page));
  2295. +
  2296. +   if (!PageAnon(page) || !PageAnon(tree_page))
  2297. +       goto out;
  2298. +
  2299. +   if (!trylock_page(page))
  2300. +       goto out;
  2301. +
  2302. +
  2303. +   if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
  2304. +       unlock_page(page);
  2305. +       goto out;
  2306. +   }
  2307. +
  2308. +   /*
  2309. +    * While we hold page lock, upgrade page from
  2310. +    * PageAnon+anon_vma to PageKsm+NULL stable_node:
  2311. +    * stable_tree_insert() will update stable_node.
  2312. +    */
  2313. +   saved_mapping = page->mapping;
  2314. +   set_page_stable_node(page, NULL);
  2315. +   mark_page_accessed(page);
  2316. +   unlock_page(page);
  2317. +
  2318. +   if (!trylock_page(tree_page))
  2319. +       goto restore_out;
  2320. +
  2321. +   if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
  2322. +       unlock_page(tree_page);
  2323. +       goto restore_out;
  2324. +   }
  2325. +
  2326. +   if (pages_identical(page, tree_page)) {
  2327. +       err = replace_page(vma2, tree_page, page, wprt_pte2);
  2328. +       if (err) {
  2329. +           unlock_page(tree_page);
  2330. +           goto restore_out;
  2331. +       }
  2332. +
  2333. +       if ((vma2->vm_flags & VM_LOCKED)) {
  2334. +           munlock_vma_page(tree_page);
  2335. +           if (!PageMlocked(page)) {
  2336. +               unlock_page(tree_page);
  2337. +               lock_page(page);
  2338. +               mlock_vma_page(page);
  2339. +               tree_page = page; /* for final unlock */
  2340. +           }
  2341. +       }
  2342. +
  2343. +       unlock_page(tree_page);
  2344. +
  2345. +       goto out; /* success */
  2346. +
  2347. +   } else {
  2348. +       if (page_hash(page, hash_strength, 0) ==
  2349. +           page_hash(tree_page, hash_strength, 0)) {
  2350. +           inc_rshash_neg(memcmp_cost + hash_strength * 2);
  2351. +           err = MERGE_ERR_COLLI;
  2352. +       } else
  2353. +           err = MERGE_ERR_CHANGED;
  2354. +
  2355. +       unlock_page(tree_page);
  2356. +   }
  2357. +
  2358. +restore_out:
  2359. +   lock_page(page);
  2360. +   if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
  2361. +                 orig_pte1, wprt_pte1))
  2362. +       page->mapping = saved_mapping;
  2363. +
  2364. +   unlock_page(page);
  2365. +out:
  2366. +   return err;
  2367. +}
  2368. +
  2369. +static inline int hash_cmp(u32 new_val, u32 node_val)
  2370. +{
  2371. +   if (new_val > node_val)
  2372. +       return 1;
  2373. +   else if (new_val < node_val)
  2374. +       return -1;
  2375. +   else
  2376. +       return 0;
  2377. +}
  2378. +
  2379. +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
  2380. +{
  2381. +   u32 hash_max = item->hash_max;
  2382. +
  2383. +   if (!hash_max) {
  2384. +       hash_max = page_hash_max(item->page, hash);
  2385. +
  2386. +       item->hash_max = hash_max;
  2387. +   }
  2388. +
  2389. +   return hash_max;
  2390. +}
  2391. +
  2392. +
  2393. +
  2394. +/**
  2395. + * stable_tree_search() - search the stable tree for a page
  2396. + *
  2397. + * @item:  the rmap_item we are comparing with
  2398. + * @hash:  the hash value of this item->page already calculated
  2399. + *
   2400. + * @return     the page we have found, NULL otherwise. A reference on the
   2401. + *             returned page has already been taken.
  2402. + */
  2403. +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
  2404. +{
  2405. +   struct rb_node *node = root_stable_treep->rb_node;
  2406. +   struct tree_node *tree_node;
  2407. +   unsigned long hash_max;
  2408. +   struct page *page = item->page;
  2409. +   struct stable_node *stable_node;
  2410. +
  2411. +   stable_node = page_stable_node(page);
  2412. +   if (stable_node) {
  2413. +       /* ksm page forked, that is
  2414. +        * if (PageKsm(page) && !in_stable_tree(rmap_item))
   2415. +        * its reference was actually taken once outside.
  2416. +        */
  2417. +       get_page(page);
  2418. +       return page;
  2419. +   }
  2420. +
  2421. +   while (node) {
  2422. +       int cmp;
  2423. +
  2424. +       tree_node = rb_entry(node, struct tree_node, node);
  2425. +
  2426. +       cmp = hash_cmp(hash, tree_node->hash);
  2427. +
  2428. +       if (cmp < 0)
  2429. +           node = node->rb_left;
  2430. +       else if (cmp > 0)
  2431. +           node = node->rb_right;
  2432. +       else
  2433. +           break;
  2434. +   }
  2435. +
  2436. +   if (!node)
  2437. +       return NULL;
  2438. +
  2439. +   if (tree_node->count == 1) {
  2440. +       stable_node = rb_entry(tree_node->sub_root.rb_node,
  2441. +                      struct stable_node, node);
  2442. +       BUG_ON(!stable_node);
  2443. +
  2444. +       goto get_page_out;
  2445. +   }
  2446. +
  2447. +   /*
  2448. +    * ok, we have to search the second
   2449. +    * level subtree, hash the page to
   2450. +    * full strength.
  2451. +    */
  2452. +   node = tree_node->sub_root.rb_node;
  2453. +   BUG_ON(!node);
  2454. +   hash_max = rmap_item_hash_max(item, hash);
  2455. +
  2456. +   while (node) {
  2457. +       int cmp;
  2458. +
  2459. +       stable_node = rb_entry(node, struct stable_node, node);
  2460. +
  2461. +       cmp = hash_cmp(hash_max, stable_node->hash_max);
  2462. +
  2463. +       if (cmp < 0)
  2464. +           node = node->rb_left;
  2465. +       else if (cmp > 0)
  2466. +           node = node->rb_right;
  2467. +       else
  2468. +           goto get_page_out;
  2469. +   }
  2470. +
  2471. +   return NULL;
  2472. +
  2473. +get_page_out:
  2474. +   page = get_uksm_page(stable_node, 1, 1);
  2475. +   return page;
  2476. +}
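
stable_tree_search() is a two-level lookup: the first-level rbtree is keyed by the cheap sampled hash, and only when a first-level tree_node holds more than one stable page does it pay for the full-strength hash_max and descend into the second-level subtree. The sketch below models the same laziness in user space with flat arrays instead of rbtrees; the types and the toy hash are hypothetical, not the kernel data structures.

#include <stdint.h>
#include <stdio.h>

struct leaf {                 /* second level: one stable page */
	uint32_t hash_max;
	const char *page;
};

struct bucket {               /* first level: all pages sharing one sampled hash */
	uint32_t hash;
	int count;
	struct leaf leaves[4];
};

static const char *two_level_lookup(struct bucket *buckets, int nbuckets,
				    uint32_t hash,
				    uint32_t (*calc_hash_max)(const char *),
				    const char *page)
{
	int i, j;

	for (i = 0; i < nbuckets; i++) {
		struct bucket *b = &buckets[i];
		uint32_t hmax;

		if (b->hash != hash)
			continue;

		if (b->count == 1)          /* single occupant: cheap hash suffices */
			return b->leaves[0].page;

		hmax = calc_hash_max(page); /* collision bucket: pay for the strong hash */
		for (j = 0; j < b->count; j++)
			if (b->leaves[j].hash_max == hmax)
				return b->leaves[j].page;
		return NULL;
	}
	return NULL;
}

/* hypothetical stand-in for page_hash_max(): just the first byte */
static uint32_t toy_hash_max(const char *page)
{
	return (uint32_t)(unsigned char)page[0];
}

int main(void)
{
	struct bucket tree[] = {
		{ 10, 1, { { 0, "only-page" } } },
		{ 20, 2, { { 'a', "page-a" }, { 'b', "page-b" } } },
	};

	printf("%s\n", two_level_lookup(tree, 2, 10, toy_hash_max, "x"));
	printf("%s\n", two_level_lookup(tree, 2, 20, toy_hash_max, "b-like"));
	return 0;
}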
  2477. +
  2478. +static int try_merge_rmap_item(struct rmap_item *item,
  2479. +                  struct page *kpage,
  2480. +                  struct page *tree_page)
  2481. +{
  2482. +   spinlock_t *ptl;
  2483. +   pte_t *ptep;
  2484. +   unsigned long addr;
  2485. +   struct vm_area_struct *vma = item->slot->vma;
  2486. +
  2487. +   addr = get_rmap_addr(item);
  2488. +   ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
  2489. +   if (!ptep)
  2490. +       return 0;
  2491. +
  2492. +   if (pte_write(*ptep)) {
  2493. +       /* has changed, abort! */
  2494. +       pte_unmap_unlock(ptep, ptl);
  2495. +       return 0;
  2496. +   }
  2497. +
  2498. +   get_page(tree_page);
  2499. +   page_add_anon_rmap(tree_page, vma, addr);
  2500. +
  2501. +   flush_cache_page(vma, addr, pte_pfn(*ptep));
  2502. +   ptep_clear_flush(vma, addr, ptep);
  2503. +   set_pte_at_notify(vma->vm_mm, addr, ptep,
  2504. +             mk_pte(tree_page, vma->vm_page_prot));
  2505. +
  2506. +   page_remove_rmap(kpage);
  2507. +   put_page(kpage);
  2508. +
  2509. +   pte_unmap_unlock(ptep, ptl);
  2510. +
  2511. +   return 1;
  2512. +}
  2513. +
  2514. +/**
   2515. + * try_merge_with_stable() - when two rmap_items need to be inserted
   2516. + * into the stable tree and the page was found to be identical to a stable
   2517. + * ksm page, this is the last chance we can merge them into one.
  2518. + *
  2519. + * @item1: the rmap_item holding the page which we wanted to insert
  2520. + *         into stable tree.
  2521. + * @item2: the other rmap_item we found when unstable tree search
   2522. + * @kpage: the page currently mapped by the two rmap_items
  2523. + * @tree_page:     the page we found identical in stable tree node
  2524. + * @success1:  return if item1 is successfully merged
  2525. + * @success2:  return if item2 is successfully merged
  2526. + */
  2527. +static void try_merge_with_stable(struct rmap_item *item1,
  2528. +                 struct rmap_item *item2,
  2529. +                 struct page **kpage,
  2530. +                 struct page *tree_page,
  2531. +                 int *success1, int *success2)
  2532. +{
  2533. +   struct vm_area_struct *vma1 = item1->slot->vma;
  2534. +   struct vm_area_struct *vma2 = item2->slot->vma;
  2535. +   *success1 = 0;
  2536. +   *success2 = 0;
  2537. +
  2538. +   if (unlikely(*kpage == tree_page)) {
  2539. +       /* I don't think this can really happen */
  2540. +       printk(KERN_WARNING "UKSM: unexpected condition detected in "
  2541. +           "try_merge_with_stable() -- *kpage == tree_page !\n");
  2542. +       *success1 = 1;
  2543. +       *success2 = 1;
  2544. +       return;
  2545. +   }
  2546. +
  2547. +   if (!PageAnon(*kpage) || !PageKsm(*kpage))
  2548. +       goto failed;
  2549. +
  2550. +   if (!trylock_page(tree_page))
  2551. +       goto failed;
  2552. +
  2553. +   /* If the oldpage is still ksm and still pointed
  2554. +    * to in the right place, and still write protected,
   2555. +    * we are confident it has not changed; no need to
   2556. +    * memcmp anymore.
   2557. +    * Beware, we cannot take nested pte locks,
  2558. +    * deadlock risk.
  2559. +    */
  2560. +   if (!try_merge_rmap_item(item1, *kpage, tree_page))
  2561. +       goto unlock_failed;
  2562. +
   2563. +   /* ok, then vma2; remember that pte1 is already set */
  2564. +   if (!try_merge_rmap_item(item2, *kpage, tree_page))
  2565. +       goto success_1;
  2566. +
  2567. +   *success2 = 1;
  2568. +success_1:
  2569. +   *success1 = 1;
  2570. +
  2571. +
  2572. +   if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
  2573. +       (*success2 && vma2->vm_flags & VM_LOCKED)) {
  2574. +       munlock_vma_page(*kpage);
  2575. +       if (!PageMlocked(tree_page))
  2576. +           mlock_vma_page(tree_page);
  2577. +   }
  2578. +
  2579. +   /*
   2580. +    * We do not need oldpage any more in the caller, so we can release the
   2581. +    * lock now.
  2582. +    */
  2583. +   unlock_page(*kpage);
  2584. +   *kpage = tree_page; /* Get unlocked outside. */
  2585. +   return;
  2586. +
  2587. +unlock_failed:
  2588. +   unlock_page(tree_page);
  2589. +failed:
  2590. +   return;
  2591. +}
  2592. +
  2593. +static inline void stable_node_hash_max(struct stable_node *node,
  2594. +                    struct page *page, u32 hash)
  2595. +{
  2596. +   u32 hash_max = node->hash_max;
  2597. +
  2598. +   if (!hash_max) {
  2599. +       hash_max = page_hash_max(page, hash);
  2600. +       node->hash_max = hash_max;
  2601. +   }
  2602. +}
  2603. +
  2604. +static inline
  2605. +struct stable_node *new_stable_node(struct tree_node *tree_node,
  2606. +                   struct page *kpage, u32 hash_max)
  2607. +{
  2608. +   struct stable_node *new_stable_node;
  2609. +
  2610. +   new_stable_node = alloc_stable_node();
  2611. +   if (!new_stable_node)
  2612. +       return NULL;
  2613. +
  2614. +   new_stable_node->kpfn = page_to_pfn(kpage);
  2615. +   new_stable_node->hash_max = hash_max;
  2616. +   new_stable_node->tree_node = tree_node;
  2617. +   set_page_stable_node(kpage, new_stable_node);
  2618. +
  2619. +   return new_stable_node;
  2620. +}
  2621. +
  2622. +static inline
  2623. +struct stable_node *first_level_insert(struct tree_node *tree_node,
  2624. +                      struct rmap_item *rmap_item,
  2625. +                      struct rmap_item *tree_rmap_item,
  2626. +                      struct page **kpage, u32 hash,
  2627. +                      int *success1, int *success2)
  2628. +{
  2629. +   int cmp;
  2630. +   struct page *tree_page;
  2631. +   u32 hash_max = 0;
  2632. +   struct stable_node *stable_node, *new_snode;
  2633. +   struct rb_node *parent = NULL, **new;
  2634. +
  2635. +   /* this tree node contains no sub-tree yet */
  2636. +   stable_node = rb_entry(tree_node->sub_root.rb_node,
  2637. +                  struct stable_node, node);
  2638. +
  2639. +   tree_page = get_uksm_page(stable_node, 1, 0);
  2640. +   if (tree_page) {
  2641. +       cmp = memcmp_pages(*kpage, tree_page, 1);
  2642. +       if (!cmp) {
  2643. +           try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
  2644. +                         tree_page, success1, success2);
  2645. +           put_page(tree_page);
  2646. +           if (!*success1 && !*success2)
  2647. +               goto failed;
  2648. +
  2649. +           return stable_node;
  2650. +
  2651. +       } else {
  2652. +           /*
   2653. +            * collision in the first level; try to create a subtree.
   2654. +            * A new node needs to be created.
  2655. +            */
  2656. +           put_page(tree_page);
  2657. +
  2658. +           stable_node_hash_max(stable_node, tree_page,
  2659. +                        tree_node->hash);
  2660. +           hash_max = rmap_item_hash_max(rmap_item, hash);
  2661. +           cmp = hash_cmp(hash_max, stable_node->hash_max);
  2662. +
  2663. +           parent = &stable_node->node;
  2664. +           if (cmp < 0) {
  2665. +               new = &parent->rb_left;
  2666. +           } else if (cmp > 0) {
  2667. +               new = &parent->rb_right;
  2668. +           } else {
  2669. +               goto failed;
  2670. +           }
  2671. +       }
  2672. +
  2673. +   } else {
   2674. +       /* the only stable_node was deleted, we reuse its tree_node.
  2675. +        */
  2676. +       parent = NULL;
  2677. +       new = &tree_node->sub_root.rb_node;
  2678. +   }
  2679. +
  2680. +   new_snode = new_stable_node(tree_node, *kpage, hash_max);
  2681. +   if (!new_snode)
  2682. +       goto failed;
  2683. +
  2684. +   rb_link_node(&new_snode->node, parent, new);
  2685. +   rb_insert_color(&new_snode->node, &tree_node->sub_root);
  2686. +   tree_node->count++;
  2687. +   *success1 = *success2 = 1;
  2688. +
  2689. +   return new_snode;
  2690. +
  2691. +failed:
  2692. +   return NULL;
  2693. +}
  2694. +
  2695. +static inline
  2696. +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
  2697. +                     struct rmap_item *rmap_item,
  2698. +                     struct rmap_item *tree_rmap_item,
  2699. +                     struct page **kpage, u32 hash,
  2700. +                     int *success1, int *success2)
  2701. +{
  2702. +   struct page *tree_page;
  2703. +   u32 hash_max;
  2704. +   struct stable_node *stable_node, *new_snode;
  2705. +   struct rb_node *parent, **new;
  2706. +
  2707. +research:
  2708. +   parent = NULL;
  2709. +   new = &tree_node->sub_root.rb_node;
  2710. +   BUG_ON(!*new);
  2711. +   hash_max = rmap_item_hash_max(rmap_item, hash);
  2712. +   while (*new) {
  2713. +       int cmp;
  2714. +
  2715. +       stable_node = rb_entry(*new, struct stable_node, node);
  2716. +
  2717. +       cmp = hash_cmp(hash_max, stable_node->hash_max);
  2718. +
  2719. +       if (cmp < 0) {
  2720. +           parent = *new;
  2721. +           new = &parent->rb_left;
  2722. +       } else if (cmp > 0) {
  2723. +           parent = *new;
  2724. +           new = &parent->rb_right;
  2725. +       } else {
  2726. +           tree_page = get_uksm_page(stable_node, 1, 0);
  2727. +           if (tree_page) {
  2728. +               cmp = memcmp_pages(*kpage, tree_page, 1);
  2729. +               if (!cmp) {
  2730. +                   try_merge_with_stable(rmap_item,
  2731. +                       tree_rmap_item, kpage,
  2732. +                       tree_page, success1, success2);
  2733. +
  2734. +                   put_page(tree_page);
  2735. +                   if (!*success1 && !*success2)
  2736. +                       goto failed;
  2737. +                   /*
  2738. +                    * successfully merged with a stable
  2739. +                    * node
  2740. +                    */
  2741. +                   return stable_node;
  2742. +               } else {
  2743. +                   put_page(tree_page);
  2744. +                   goto failed;
  2745. +               }
  2746. +           } else {
  2747. +               /*
   2748. +                * the stable node may be deleted,
   2749. +                * and the subtree may be
   2750. +                * restructured; cannot
   2751. +                * continue, re-search it.
  2752. +                */
  2753. +               if (tree_node->count) {
  2754. +                   goto research;
  2755. +               } else {
   2756. +               /* reuse the tree node */
  2757. +                   parent = NULL;
  2758. +                   new = &tree_node->sub_root.rb_node;
  2759. +               }
  2760. +           }
  2761. +       }
  2762. +   }
  2763. +
  2764. +   new_snode = new_stable_node(tree_node, *kpage, hash_max);
  2765. +   if (!new_snode)
  2766. +       goto failed;
  2767. +
  2768. +   rb_link_node(&new_snode->node, parent, new);
  2769. +   rb_insert_color(&new_snode->node, &tree_node->sub_root);
  2770. +   tree_node->count++;
  2771. +   *success1 = *success2 = 1;
  2772. +
  2773. +   return new_snode;
  2774. +
  2775. +failed:
  2776. +   return NULL;
  2777. +}
  2778. +
  2779. +
  2780. +/**
  2781. + * stable_tree_insert() - try to insert a merged page in unstable tree to
  2782. + * the stable tree
  2783. + *
   2784. + * @kpage:     the page that needs to be inserted
  2785. + * @hash:      the current hash of this page
  2786. + * @rmap_item:     the rmap_item being scanned
  2787. + * @tree_rmap_item:    the rmap_item found on unstable tree
  2788. + * @success1:      return if rmap_item is merged
  2789. + * @success2:      return if tree_rmap_item is merged
  2790. + *
  2791. + * @return         the stable_node on stable tree if at least one
  2792. + *             rmap_item is inserted into stable tree, NULL
  2793. + *             otherwise.
  2794. + */
  2795. +static struct stable_node *
  2796. +stable_tree_insert(struct page **kpage, u32 hash,
  2797. +          struct rmap_item *rmap_item,
  2798. +          struct rmap_item *tree_rmap_item,
  2799. +          int *success1, int *success2)
  2800. +{
  2801. +   struct rb_node **new = &root_stable_treep->rb_node;
  2802. +   struct rb_node *parent = NULL;
  2803. +   struct stable_node *stable_node;
  2804. +   struct tree_node *tree_node;
  2805. +   u32 hash_max = 0;
  2806. +
  2807. +   *success1 = *success2 = 0;
  2808. +
  2809. +   while (*new) {
  2810. +       int cmp;
  2811. +
  2812. +       tree_node = rb_entry(*new, struct tree_node, node);
  2813. +
  2814. +       cmp = hash_cmp(hash, tree_node->hash);
  2815. +
  2816. +       if (cmp < 0) {
  2817. +           parent = *new;
  2818. +           new = &parent->rb_left;
  2819. +       } else if (cmp > 0) {
  2820. +           parent = *new;
  2821. +           new = &parent->rb_right;
  2822. +       } else
  2823. +           break;
  2824. +   }
  2825. +
  2826. +   if (*new) {
  2827. +       if (tree_node->count == 1) {
  2828. +           stable_node = first_level_insert(tree_node, rmap_item,
  2829. +                       tree_rmap_item, kpage,
  2830. +                       hash, success1, success2);
  2831. +       } else {
  2832. +           stable_node = stable_subtree_insert(tree_node,
  2833. +                   rmap_item, tree_rmap_item, kpage,
  2834. +                   hash, success1, success2);
  2835. +       }
  2836. +   } else {
  2837. +
  2838. +       /* no tree node found */
  2839. +       tree_node = alloc_tree_node(stable_tree_node_listp);
  2840. +       if (!tree_node) {
  2841. +           stable_node = NULL;
  2842. +           goto out;
  2843. +       }
  2844. +
  2845. +       stable_node = new_stable_node(tree_node, *kpage, hash_max);
  2846. +       if (!stable_node) {
  2847. +           free_tree_node(tree_node);
  2848. +           goto out;
  2849. +       }
  2850. +
  2851. +       tree_node->hash = hash;
  2852. +       rb_link_node(&tree_node->node, parent, new);
  2853. +       rb_insert_color(&tree_node->node, root_stable_treep);
  2854. +       parent = NULL;
  2855. +       new = &tree_node->sub_root.rb_node;
  2856. +
  2857. +       rb_link_node(&stable_node->node, parent, new);
  2858. +       rb_insert_color(&stable_node->node, &tree_node->sub_root);
  2859. +       tree_node->count++;
  2860. +       *success1 = *success2 = 1;
  2861. +   }
  2862. +
  2863. +out:
  2864. +   return stable_node;
  2865. +}
  2866. +
  2867. +
  2868. +/**
  2869. + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
  2870. + *
  2871. + * @return     0 on success, -EBUSY if unable to lock the mmap_sem,
  2872. + *             -EINVAL if the page mapping has been changed.
  2873. + */
  2874. +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
  2875. +{
  2876. +   int err;
  2877. +
  2878. +   err = get_mergeable_page_lock_mmap(tree_rmap_item);
  2879. +
  2880. +   if (err == -EINVAL) {
   2881. +       /* its page mapping has been changed, remove it */
  2882. +       remove_rmap_item_from_tree(tree_rmap_item);
  2883. +   }
  2884. +
   2885. +   /* On success, the page reference is taken and mmap_sem is locked. */
  2886. +   return err;
  2887. +}
  2888. +
  2889. +
  2890. +/**
   2891. + * unstable_tree_search_insert() - search the unstable tree for an rmap_item
   2892. + * with the same hash value. Get its page and trylock its mmap_sem.
  2893. + */
  2894. +static inline
  2895. +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
  2896. +                         u32 hash)
  2897. +
  2898. +{
  2899. +   struct rb_node **new = &root_unstable_tree.rb_node;
  2900. +   struct rb_node *parent = NULL;
  2901. +   struct tree_node *tree_node;
  2902. +   u32 hash_max;
  2903. +   struct rmap_item *tree_rmap_item;
  2904. +
  2905. +   while (*new) {
  2906. +       int cmp;
  2907. +
  2908. +       tree_node = rb_entry(*new, struct tree_node, node);
  2909. +
  2910. +       cmp = hash_cmp(hash, tree_node->hash);
  2911. +
  2912. +       if (cmp < 0) {
  2913. +           parent = *new;
  2914. +           new = &parent->rb_left;
  2915. +       } else if (cmp > 0) {
  2916. +           parent = *new;
  2917. +           new = &parent->rb_right;
  2918. +       } else
  2919. +           break;
  2920. +   }
  2921. +
  2922. +   if (*new) {
  2923. +       /* got the tree_node */
  2924. +       if (tree_node->count == 1) {
  2925. +           tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
  2926. +                         struct rmap_item, node);
  2927. +           BUG_ON(!tree_rmap_item);
  2928. +
  2929. +           goto get_page_out;
  2930. +       }
  2931. +
  2932. +       /* well, search the collision subtree */
  2933. +       new = &tree_node->sub_root.rb_node;
  2934. +       BUG_ON(!*new);
  2935. +       hash_max = rmap_item_hash_max(rmap_item, hash);
  2936. +
  2937. +       while (*new) {
  2938. +           int cmp;
  2939. +
  2940. +           tree_rmap_item = rb_entry(*new, struct rmap_item,
  2941. +                         node);
  2942. +
  2943. +           cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
  2944. +           parent = *new;
  2945. +           if (cmp < 0)
  2946. +               new = &parent->rb_left;
  2947. +           else if (cmp > 0)
  2948. +               new = &parent->rb_right;
  2949. +           else
  2950. +               goto get_page_out;
  2951. +       }
  2952. +   } else {
  2953. +       /* alloc a new tree_node */
  2954. +       tree_node = alloc_tree_node(&unstable_tree_node_list);
  2955. +       if (!tree_node)
  2956. +           return NULL;
  2957. +
  2958. +       tree_node->hash = hash;
  2959. +       rb_link_node(&tree_node->node, parent, new);
  2960. +       rb_insert_color(&tree_node->node, &root_unstable_tree);
  2961. +       parent = NULL;
  2962. +       new = &tree_node->sub_root.rb_node;
  2963. +   }
  2964. +
   2965. +   /* not found even in the sub-tree */
  2966. +   rmap_item->tree_node = tree_node;
  2967. +   rmap_item->address |= UNSTABLE_FLAG;
  2968. +   rmap_item->append_round = uksm_scan_round;
  2969. +   rb_link_node(&rmap_item->node, parent, new);
  2970. +   rb_insert_color(&rmap_item->node, &tree_node->sub_root);
  2971. +
  2972. +   uksm_pages_unshared++;
  2973. +   return NULL;
  2974. +
  2975. +get_page_out:
  2976. +   if (tree_rmap_item->page == rmap_item->page)
  2977. +       return NULL;
  2978. +
  2979. +   if (get_tree_rmap_item_page(tree_rmap_item))
  2980. +       return NULL;
  2981. +
  2982. +   return tree_rmap_item;
  2983. +}
  2984. +
  2985. +static void enter_vma_tree(struct vma_slot *slot)
  2986. +{
  2987. +   unsigned long i;
  2988. +   int ret;
  2989. +
  2990. +   i = uksm_vma_tree_index_end;
  2991. +
  2992. +   ret = radix_tree_insert(&uksm_vma_tree, i, slot);
  2993. +   BUG_ON(ret);
  2994. +
  2995. +   slot->uksm_index = i;
  2996. +   uksm_vma_tree_num++;
  2997. +   uksm_vma_tree_index_end++;
  2998. +}
  2999. +
  3000. +static inline void get_sub_dup_vma(struct vma_slot **slot,
  3001. +                  struct vma_slot **sub_slot)
  3002. +{
  3003. +   struct vma_slot *tmp;
  3004. +
  3005. +   if ((*slot)->uksm_index > (*sub_slot)->uksm_index) {
  3006. +       tmp = *slot;
  3007. +       *slot = *sub_slot;
  3008. +       *sub_slot = tmp;
  3009. +   }
  3010. +}
  3011. +
  3012. +/*
   3013. + * Increment or decrement the dup-page count stored in a slot; return the
   3014. + * count after the operation.
  3015. + */
  3016. +static inline unsigned long dup_pages_mod(void **slot, int inc)
  3017. +{
  3018. +   unsigned long item, ret;
  3019. +
  3020. +   item = (unsigned long)(*slot) >> INDIRECT_OFFSET;
  3021. +   if (inc) {
  3022. +       item++;
  3023. +       BUG_ON(!item);
  3024. +   } else {
  3025. +       BUG_ON(!item);
  3026. +       item--;
  3027. +   }
  3028. +   ret = item;
  3029. +   item <<= INDIRECT_OFFSET;
  3030. +   *slot = (void *)item;
  3031. +
  3032. +   return ret;
  3033. +}
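
dup_pages_mod() stores a plain counter directly in a radix-tree slot instead of a pointer; the counter is kept shifted left by INDIRECT_OFFSET so the stored value can never be mistaken for an indirect (internal-node) pointer, whose tag lives in the low bit of the slot value. Below is a user-space sketch of that encoding; the INDIRECT_OFFSET value of 1 is only an assumption for the example.

#include <assert.h>
#include <stdio.h>

#define INDIRECT_OFFSET 1   /* assumed value, only for the illustration */

/* bump the counter packed into a slot, return the new count */
static unsigned long counter_inc(void **slot)
{
	unsigned long count = (unsigned long)(*slot) >> INDIRECT_OFFSET;

	count++;
	*slot = (void *)(count << INDIRECT_OFFSET);
	return count;
}

/* decrement the packed counter, return the new count */
static unsigned long counter_dec(void **slot)
{
	unsigned long count = (unsigned long)(*slot) >> INDIRECT_OFFSET;

	assert(count > 0);
	count--;
	*slot = (void *)(count << INDIRECT_OFFSET);
	return count;
}

int main(void)
{
	void *slot = (void *)(1UL << INDIRECT_OFFSET);  /* counter starts at 1 */

	printf("%lu\n", counter_inc(&slot));  /* 2 */
	printf("%lu\n", counter_dec(&slot));  /* 1 */
	printf("%lu\n", counter_dec(&slot));  /* 0: the caller would delete the entry */
	return 0;
}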
  3034. +
  3035. +static void inc_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
  3036. +{
  3037. +   void **dup_slot;
  3038. +   unsigned long dup_pages;
  3039. +   int ret;
  3040. +
  3041. +   if (slot->uksm_index == -1)
  3042. +       enter_vma_tree(slot);
  3043. +
  3044. +   if (sub_slot->uksm_index == -1)
  3045. +       enter_vma_tree(sub_slot);
  3046. +
  3047. +   get_sub_dup_vma(&slot, &sub_slot);
  3048. +
  3049. +   dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->uksm_index);
  3050. +   if (dup_slot)
  3051. +       goto found;
  3052. +
  3053. +   /*
  3054. +    * In order to store dup_pages in radix tree, we must make
  3055. +    * radix_tree_is_indirect_ptr() happy.
  3056. +    */
  3057. +   dup_pages = 1 << INDIRECT_OFFSET;
  3058. +
  3059. +   /* no such entry yet, insert one */
  3060. +   ret = radix_tree_insert(&slot->dup_tree, sub_slot->uksm_index,
  3061. +               (void *)dup_pages);
  3062. +   BUG_ON(ret);
  3063. +
  3064. +   return;
  3065. +
  3066. +found:
  3067. +   dup_pages_mod(dup_slot, 1);
  3068. +}
  3069. +
  3070. +static void dec_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
  3071. +{
  3072. +   void **dup_slot;
  3073. +   unsigned long dup_pages;
  3074. +
  3075. +   BUG_ON(slot->uksm_index == -1 || sub_slot->uksm_index == -1);
  3076. +
  3077. +   get_sub_dup_vma(&slot, &sub_slot);
  3078. +
  3079. +   dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->uksm_index);
  3080. +   BUG_ON(!dup_slot);
  3081. +
  3082. +   dup_pages = dup_pages_mod(dup_slot, 0);
  3083. +
  3084. +   /* dup_pages == 0, we need to kick it out */
  3085. +   if (!dup_pages)
  3086. +       radix_tree_delete(&slot->dup_tree, sub_slot->uksm_index);
  3087. +}
  3088. +
  3089. +static void hold_anon_vma(struct rmap_item *rmap_item,
  3090. +             struct anon_vma *anon_vma)
  3091. +{
  3092. +   rmap_item->anon_vma = anon_vma;
  3093. +   get_anon_vma(anon_vma);
  3094. +}
  3095. +
  3096. +
  3097. +/**
  3098. + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
   3099. + * ratio statistics are updated in this function.
  3100. + *
  3101. + */
  3102. +static void stable_tree_append(struct rmap_item *rmap_item,
  3103. +                  struct stable_node *stable_node)
  3104. +{
  3105. +   struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_iter = NULL;
  3106. +   struct hlist_node *hlist, *cont_p = NULL;
  3107. +   unsigned long key = (unsigned long)rmap_item->slot;
  3108. +
  3109. +   BUG_ON(!stable_node);
  3110. +   rmap_item->address |= STABLE_FLAG;
  3111. +   rmap_item->append_round = uksm_scan_round;
  3112. +
  3113. +   if (hlist_empty(&stable_node->hlist)) {
  3114. +       uksm_pages_shared++;
  3115. +       goto node_vma_new;
  3116. +   } else {
  3117. +       uksm_pages_sharing++;
  3118. +   }
  3119. +
  3120. +   hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  3121. +       if (node_vma->last_update == uksm_scan_round)
  3122. +           inc_dup_vma(rmap_item->slot, node_vma->slot);
  3123. +
  3124. +       if (node_vma->key >= key)
  3125. +           break;
  3126. +   }
  3127. +
  3128. +   cont_p = hlist;
  3129. +
  3130. +   if (node_vma && node_vma->key == key) {
  3131. +       if (node_vma->last_update == uksm_scan_round) {
  3132. +           /*
  3133. +            * We consider this page an inner duplicate, so cancel
  3134. +            * the other updates.
  3135. +            */
  3136. +           hlist_for_each_entry(node_vma_iter, hlist,
  3137. +                        &stable_node->hlist, hlist) {
  3138. +               if (node_vma_iter->key == key)
  3139. +                   break;
  3140. +
  3141. +               /* only need to increase the same vma */
  3142. +               if (node_vma_iter->last_update ==
  3143. +                   uksm_scan_round) {
  3144. +                   dec_dup_vma(rmap_item->slot,
  3145. +                           node_vma_iter->slot);
  3146. +               }
  3147. +           }
  3148. +       } else {
  3149. +           /*
  3150. +            * Although it's the same vma, it contains no duplicate for
  3151. +            * this round. Continue scanning the other vmas.
  3152. +            */
  3153. +           hlist_for_each_entry_continue(node_vma_iter,
  3154. +                             hlist, hlist) {
  3155. +               if (node_vma_iter->last_update ==
  3156. +                   uksm_scan_round) {
  3157. +                   inc_dup_vma(rmap_item->slot,
  3158. +                           node_vma_iter->slot);
  3159. +               }
  3160. +           }
  3161. +
  3162. +       }
  3163. +
  3164. +       goto node_vma_ok;
  3165. +   }
  3166. +
  3167. +node_vma_new:
  3168. +   /* no node_vma for the same vma yet, alloc a new node_vma */
  3169. +   new_node_vma = alloc_node_vma();
  3170. +   BUG_ON(!new_node_vma);
  3171. +   new_node_vma->head = stable_node;
  3172. +   new_node_vma->slot = rmap_item->slot;
  3173. +
  3174. +   if (!node_vma) {
  3175. +       hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
  3176. +   } else if (node_vma->key != key) {
  3177. +       if (node_vma->key < key)
  3178. +           hlist_add_after(&node_vma->hlist, &new_node_vma->hlist);
  3179. +       else {
  3180. +           hlist_for_each_entry_continue(node_vma_iter, cont_p,
  3181. +                             hlist) {
  3182. +               if (node_vma_iter->last_update ==
  3183. +                   uksm_scan_round) {
  3184. +                   inc_dup_vma(rmap_item->slot,
  3185. +                           node_vma_iter->slot);
  3186. +               }
  3187. +           }
  3188. +           hlist_add_before(&new_node_vma->hlist,
  3189. +                    &node_vma->hlist);
  3190. +       }
  3191. +
  3192. +   }
  3193. +   node_vma = new_node_vma;
  3194. +
  3195. +node_vma_ok: /* ok, ready to add to the list */
  3196. +   rmap_item->head = node_vma;
  3197. +   hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
  3198. +   node_vma->last_update = uksm_scan_round;
  3199. +   hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
  3200. +   rmap_item->slot->pages_merged++;
  3201. +}
  3202. +
  3203. +/*
  3204. + * We use break_ksm to break COW on a ksm page: it's a stripped down
  3205. + *
  3206. + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
  3207. + *     put_page(page);
  3208. + *
  3209. + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
  3210. + * in case the application has unmapped and remapped mm,addr meanwhile.
  3211. + * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
  3212. + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
  3213. + */
  3214. +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
  3215. +{
  3216. +   struct page *page;
  3217. +   int ret = 0;
  3218. +
  3219. +   do {
  3220. +       cond_resched();
  3221. +       page = follow_page(vma, addr, FOLL_GET);
  3222. +       if (IS_ERR_OR_NULL(page))
  3223. +           break;
  3224. +       if (PageKsm(page)) {
  3225. +           ret = handle_mm_fault(vma->vm_mm, vma, addr,
  3226. +                         FAULT_FLAG_WRITE);
  3227. +       } else
  3228. +           ret = VM_FAULT_WRITE;
  3229. +       put_page(page);
  3230. +   } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
  3231. +   /*
  3232. +    * We must loop because handle_mm_fault() may back out if there's
  3233. +    * any difficulty e.g. if pte accessed bit gets updated concurrently.
  3234. +    *
  3235. +    * VM_FAULT_WRITE is what we have been hoping for: it indicates that
  3236. +    * COW has been broken, even if the vma does not permit VM_WRITE;
  3237. +    * but note that a concurrent fault might break PageKsm for us.
  3238. +    *
  3239. +    * VM_FAULT_SIGBUS could occur if we race with truncation of the
  3240. +    * backing file, which also invalidates anonymous pages: that's
  3241. +    * okay, that truncation will have unmapped the PageKsm for us.
  3242. +    *
  3243. +    * VM_FAULT_OOM: at the time of writing (late July 2009), setting
  3244. +    * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
  3245. +    * current task has TIF_MEMDIE set, and will be OOM killed on return
  3246. +    * to user; and ksmd, having no mm, would never be chosen for that.
  3247. +    *
  3248. +    * But if the mm is in a limited mem_cgroup, then the fault may fail
  3249. +    * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
  3250. +    * even ksmd can fail in this way - though it's usually breaking ksm
  3251. +    * just to undo a merge it made a moment before, so unlikely to oom.
  3252. +    *
  3253. +    * That's a pity: we might therefore have more kernel pages allocated
  3254. +    * than we're counting as nodes in the stable tree; but uksm_do_scan
  3255. +    * will retry to break_cow on each pass, so should recover the page
  3256. +    * in due course.  The important thing is to not let VM_MERGEABLE
  3257. +    * be cleared while any such pages might remain in the area.
  3258. +    */
  3259. +   return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
  3260. +}
  3261. +
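+/*
+ * break_cow() - Break COW on the ksm page mapped at this rmap_item's address,
+ * unless the mm is already exiting.
+ */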
  3262. +static void break_cow(struct rmap_item *rmap_item)
  3263. +{
  3264. +   struct vm_area_struct *vma = rmap_item->slot->vma;
  3265. +   struct mm_struct *mm = vma->vm_mm;
  3266. +   unsigned long addr = get_rmap_addr(rmap_item);
  3267. +
  3268. +   if (uksm_test_exit(mm))
  3269. +       goto out;
  3270. +
  3271. +   break_ksm(vma, addr);
  3272. +out:
  3273. +   return;
  3274. +}
  3275. +
  3276. +/*
  3277. + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
  3278. + * than check every pte of a given vma, the locking doesn't quite work for
  3279. + * that - an rmap_item is assigned to the stable tree after inserting ksm
  3280. + * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
  3281. + * rmap_items from parent to child at fork time (so as not to waste time
  3282. + * if exit comes before the next scan reaches it).
  3283. + *
  3284. + * Similarly, although we'd like to remove rmap_items (so updating counts
  3285. + * and freeing memory) when unmerging an area, it's easier to leave that
  3286. + * to the next pass of ksmd - consider, for example, how ksmd might be
  3287. + * in cmp_and_merge_page on one of the rmap_items we would be removing.
  3288. + */
  3289. +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
  3290. +             unsigned long start, unsigned long end)
  3291. +{
  3292. +   unsigned long addr;
  3293. +   int err = 0;
  3294. +
  3295. +   for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
  3296. +       if (uksm_test_exit(vma->vm_mm))
  3297. +           break;
  3298. +       if (signal_pending(current))
  3299. +           err = -ERESTARTSYS;
  3300. +       else
  3301. +           err = break_ksm(vma, addr);
  3302. +   }
  3303. +   return err;
  3304. +}
  3305. +
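+/*
+ * inc_uksm_pages_scanned() - Bump uksm_pages_scanned. When the counter
+ * reaches U64_MAX, the accumulated value is folded into pages_scanned_stored
+ * (scaled by pages_scanned_base) and counting restarts from zero.
+ */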
  3306. +static inline void inc_uksm_pages_scanned(void)
  3307. +{
  3308. +   u64 delta;
  3309. +
  3310. +
  3311. +   if (uksm_pages_scanned == U64_MAX) {
  3312. +       encode_benefit();
  3313. +
  3314. +       delta = uksm_pages_scanned >> pages_scanned_base;
  3315. +
  3316. +       if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
  3317. +           pages_scanned_stored >>= 1;
  3318. +           delta >>= 1;
  3319. +           pages_scanned_base++;
  3320. +       }
  3321. +
  3322. +       pages_scanned_stored += delta;
  3323. +
  3324. +       uksm_pages_scanned = uksm_pages_scanned_last = 0;
  3325. +   }
  3326. +
  3327. +   uksm_pages_scanned++;
  3328. +}
  3329. +
  3330. +static inline int find_zero_page_hash(int strength, u32 hash)
  3331. +{
  3332. +   return (zero_hash_table[strength] == hash);
  3333. +}
  3334. +
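+/*
+ * cmp_and_merge_zero_page() - Try to merge an anonymous page whose content is
+ * all zero with the unswappable uksm zero page: write-protect it and, if it
+ * is still full of zeros, replace its pte with a mapping to the zero page.
+ */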
  3335. +static
  3336. +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
  3337. +{
  3338. +   struct page *zero_page = empty_uksm_zero_page;
  3339. +   struct mm_struct *mm = vma->vm_mm;
  3340. +   pte_t orig_pte = __pte(0);
  3341. +   int err = -EFAULT;
  3342. +
  3343. +   if (uksm_test_exit(mm))
  3344. +       goto out;
  3345. +
  3346. +   if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  3347. +       goto out;
  3348. +   BUG_ON(PageTransCompound(page));
  3349. +
  3350. +   if (!PageAnon(page))
  3351. +       goto out;
  3352. +
  3353. +   if (!trylock_page(page))
  3354. +       goto out;
  3355. +
  3356. +   if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
  3357. +       if (is_page_full_zero(page))
  3358. +           err = replace_page(vma, page, zero_page, orig_pte);
  3359. +   }
  3360. +
  3361. +   unlock_page(page);
  3362. +out:
  3363. +   return err;
  3364. +}
  3365. +
  3366. +/*
  3367. + * cmp_and_merge_page() - first see if page can be merged into the stable
  3368. + * tree; if not, compare hash to previous and if it's the same, see if page
  3369. + * can be inserted into the unstable tree, or merged with a page already there
  3370. + * and both transferred to the stable tree.
  3371. + *
  3372. + * @rmap_item: the reverse mapping item of the page (rmap_item->page) for
  3373. + * which we are searching an identical page.
  3374. + */
  3375. +static void cmp_and_merge_page(struct rmap_item *rmap_item)
  3376. +{
  3377. +   struct rmap_item *tree_rmap_item;
  3378. +   struct page *page;
  3379. +   struct page *kpage = NULL;
  3380. +   u32 hash, hash_max;
  3381. +   int err;
  3382. +   unsigned int success1, success2;
  3383. +   struct stable_node *snode;
  3384. +   int cmp;
  3385. +   struct rb_node *parent = NULL, **new;
  3386. +
  3387. +   remove_rmap_item_from_tree(rmap_item);
  3388. +   inc_uksm_pages_scanned();
  3389. +
  3390. +   page = rmap_item->page;
  3391. +
  3392. +   hash = page_hash(page, hash_strength, 1);
  3393. +
  3394. +   /* if the page content is all zero, re-map it to the zero page */
  3395. +   if (find_zero_page_hash(hash_strength, hash)) {
  3396. +       if (!cmp_and_merge_zero_page(rmap_item->slot->vma, page)) {
  3397. +           inc_dup_vma(rmap_item->slot, rmap_item->slot);
  3398. +           rmap_item->slot->pages_merged++;
  3399. +           __inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
  3400. +           return;
  3401. +       } else {
  3402. +           inc_rshash_neg(memcmp_cost / 2);
  3403. +       }
  3404. +   }
  3405. +   //uksm_pages_scanned++;
  3406. +
  3407. +   /* We first start with searching the page inside the stable tree */
  3408. +   kpage = stable_tree_search(rmap_item, hash);
  3409. +   if (kpage) {
  3410. +       err = try_to_merge_with_uksm_page(rmap_item, kpage,
  3411. +                        hash);
  3412. +       if (!err) {
  3413. +           /*
  3414. +            * The page was successfully merged, add
  3415. +            * its rmap_item to the stable tree.
  3416. +            * page lock is needed because it's
  3417. +            * racing with try_to_unmap_ksm(), etc.
  3418. +            */
  3419. +           lock_page(kpage);
  3420. +           stable_tree_append(rmap_item, page_stable_node(kpage));
  3421. +           unlock_page(kpage);
  3422. +           put_page(kpage);
  3423. +           return; /* success */
  3424. +       }
  3425. +       put_page(kpage);
  3426. +
  3427. +       /*
  3428. +        * If it's a collision and it has already been searched in the
  3429. +        * sub-rbtree (hash_max != 0), we want to abort, because if it is
  3430. +        * successfully merged in the unstable tree, the collision tends
  3431. +        * to happen again.
  3432. +        */
  3433. +       if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
  3434. +           return;
  3435. +   }
  3436. +
  3437. +   tree_rmap_item =
  3438. +       unstable_tree_search_insert(rmap_item, hash);
  3439. +   if (tree_rmap_item) {
  3440. +       err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
  3441. +       /*
  3442. +        * As soon as we merge this page, we want to remove the
  3443. +        * rmap_item of the page we have merged with from the unstable
  3444. +        * tree, and insert it instead as new node in the stable tree.
  3445. +        */
  3446. +       if (!err) {
  3447. +           kpage = page;
  3448. +           remove_rmap_item_from_tree(tree_rmap_item);
  3449. +           lock_page(kpage);
  3450. +           snode = stable_tree_insert(&kpage, hash,
  3451. +                          rmap_item, tree_rmap_item,
  3452. +                          &success1, &success2);
  3453. +
  3454. +           if (success1)
  3455. +               stable_tree_append(rmap_item, snode);
  3456. +           if (success2)
  3457. +               stable_tree_append(tree_rmap_item, snode);
  3458. +
  3459. +           /*
  3460. +            * The original kpage may be unlocked inside
  3461. +            * stable_tree_insert() already. This page
  3462. +            * should be unlocked before doing
  3463. +            * break_cow().
  3464. +            */
  3465. +           unlock_page(kpage);
  3466. +
  3467. +           if (!success1)
  3468. +               break_cow(rmap_item);
  3469. +
  3470. +           if (!success2)
  3471. +               break_cow(tree_rmap_item);
  3472. +
  3473. +       } else if (err == MERGE_ERR_COLLI) {
  3474. +           if (tree_rmap_item->tree_node->count == 1) {
  3475. +               rmap_item_hash_max(tree_rmap_item,
  3476. +               tree_rmap_item->tree_node->hash);
  3477. +           } else
  3478. +               BUG_ON(!(tree_rmap_item->hash_max));
  3479. +
  3480. +           hash_max = rmap_item_hash_max(rmap_item, hash);
  3481. +           cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
  3482. +           parent = &tree_rmap_item->node;
  3483. +           if (cmp < 0)
  3484. +               new = &parent->rb_left;
  3485. +           else if (cmp > 0)
  3486. +               new = &parent->rb_right;
  3487. +           else
  3488. +               goto put_up_out;
  3489. +
  3490. +           rmap_item->tree_node = tree_rmap_item->tree_node;
  3491. +           rmap_item->address |= UNSTABLE_FLAG;
  3492. +           rmap_item->append_round = uksm_scan_round;
  3493. +           rb_link_node(&rmap_item->node, parent, new);
  3494. +           rb_insert_color(&rmap_item->node,
  3495. +                   &tree_rmap_item->tree_node->sub_root);
  3496. +           rmap_item->tree_node->count++;
  3497. +       }
  3498. +put_up_out:
  3499. +       put_page(tree_rmap_item->page);
  3500. +       up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
  3501. +   }
  3502. +}
  3503. +
  3504. +
  3505. +
  3506. +
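+/*
+ * The rmap_list of a vma_slot is kept in a pool of pages (rmap_list_pool).
+ * get_pool_index() returns which pool page holds the rmap_list_entry for page
+ * index @index within the vma, and index_page_offset() returns the entry's
+ * offset inside that pool page.
+ */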
  3507. +static inline unsigned long get_pool_index(struct vma_slot *slot,
  3508. +                      unsigned long index)
  3509. +{
  3510. +   unsigned long pool_index;
  3511. +
  3512. +   pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
  3513. +   if (pool_index >= slot->pool_size)
  3514. +       BUG();
  3515. +   return pool_index;
  3516. +}
  3517. +
  3518. +static inline unsigned long index_page_offset(unsigned long index)
  3519. +{
  3520. +   return offset_in_page(sizeof(struct rmap_list_entry *) * index);
  3521. +}
  3522. +
  3523. +static inline
  3524. +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
  3525. +                       unsigned long index, int need_alloc)
  3526. +{
  3527. +   unsigned long pool_index;
  3528. +   void *addr;
  3529. +
  3530. +
  3531. +   pool_index = get_pool_index(slot, index);
  3532. +   if (!slot->rmap_list_pool[pool_index]) {
  3533. +       if (!need_alloc)
  3534. +           return NULL;
  3535. +
  3536. +       slot->rmap_list_pool[pool_index] =
  3537. +           alloc_page(GFP_KERNEL | __GFP_ZERO);
  3538. +       BUG_ON(!slot->rmap_list_pool[pool_index]);
  3539. +   }
  3540. +
  3541. +   addr = kmap(slot->rmap_list_pool[pool_index]);
  3542. +   addr += index_page_offset(index);
  3543. +
  3544. +   return addr;
  3545. +}
  3546. +
  3547. +static inline void put_rmap_list_entry(struct vma_slot *slot,
  3548. +                      unsigned long index)
  3549. +{
  3550. +   unsigned long pool_index;
  3551. +
  3552. +   pool_index = get_pool_index(slot, index);
  3553. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  3554. +   kunmap(slot->rmap_list_pool[pool_index]);
  3555. +}
  3556. +
  3557. +static inline int entry_is_new(struct rmap_list_entry *entry)
  3558. +{
  3559. +   return !entry->item;
  3560. +}
  3561. +
  3562. +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
  3563. +                       unsigned long index)
  3564. +{
  3565. +   return slot->vma->vm_start + (index << PAGE_SHIFT);
  3566. +}
  3567. +
  3568. +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
  3569. +{
  3570. +   unsigned long addr;
  3571. +
  3572. +   if (is_addr(entry->addr))
  3573. +       addr = get_clean_addr(entry->addr);
  3574. +   else if (entry->item)
  3575. +       addr = get_rmap_addr(entry->item);
  3576. +   else
  3577. +       BUG();
  3578. +
  3579. +   return addr;
  3580. +}
  3581. +
  3582. +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
  3583. +{
  3584. +   if (is_addr(entry->addr))
  3585. +       return NULL;
  3586. +
  3587. +   return entry->item;
  3588. +}
  3589. +
  3590. +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
  3591. +                       unsigned long index)
  3592. +{
  3593. +   unsigned long pool_index;
  3594. +
  3595. +   pool_index = get_pool_index(slot, index);
  3596. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  3597. +   slot->pool_counts[pool_index]++;
  3598. +}
  3599. +
  3600. +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
  3601. +                       unsigned long index)
  3602. +{
  3603. +   unsigned long pool_index;
  3604. +
  3605. +   pool_index = get_pool_index(slot, index);
  3606. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  3607. +   BUG_ON(!slot->pool_counts[pool_index]);
  3608. +   slot->pool_counts[pool_index]--;
  3609. +}
  3610. +
  3611. +static inline int entry_has_rmap(struct rmap_list_entry *entry)
  3612. +{
  3613. +   return !is_addr(entry->addr) && entry->item;
  3614. +}
  3615. +
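+/*
+ * swap_entries() - Swap two rmap_list entries, keeping the entry_index
+ * back-pointers of their rmap_items and the per-pool-page counters consistent.
+ */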
  3616. +static inline void swap_entries(struct rmap_list_entry *entry1,
  3617. +               unsigned long index1,
  3618. +               struct rmap_list_entry *entry2,
  3619. +               unsigned long index2)
  3620. +{
  3621. +   struct rmap_list_entry tmp;
  3622. +
  3623. +   /* swapping two new entries is meaningless */
  3624. +   BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
  3625. +
  3626. +   tmp = *entry1;
  3627. +   *entry1 = *entry2;
  3628. +   *entry2 = tmp;
  3629. +
  3630. +   if (entry_has_rmap(entry1))
  3631. +       entry1->item->entry_index = index1;
  3632. +
  3633. +   if (entry_has_rmap(entry2))
  3634. +       entry2->item->entry_index = index2;
  3635. +
  3636. +   if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
  3637. +       inc_rmap_list_pool_count(entry1->item->slot, index1);
  3638. +       dec_rmap_list_pool_count(entry1->item->slot, index2);
  3639. +   } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
  3640. +       inc_rmap_list_pool_count(entry2->item->slot, index2);
  3641. +       dec_rmap_list_pool_count(entry2->item->slot, index1);
  3642. +   }
  3643. +}
  3644. +
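+/*
+ * free_entry_item() - If the entry still carries an rmap_item, turn it back
+ * into a plain address entry and free the rmap_item.
+ */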
  3645. +static inline void free_entry_item(struct rmap_list_entry *entry)
  3646. +{
  3647. +   unsigned long index;
  3648. +   struct rmap_item *item;
  3649. +
  3650. +   if (!is_addr(entry->addr)) {
  3651. +       BUG_ON(!entry->item);
  3652. +       item = entry->item;
  3653. +       entry->addr = get_rmap_addr(item);
  3654. +       set_is_addr(entry->addr);
  3655. +       index = item->entry_index;
  3656. +       remove_rmap_item_from_tree(item);
  3657. +       dec_rmap_list_pool_count(item->slot, index);
  3658. +       free_rmap_item(item);
  3659. +   }
  3660. +}
  3661. +
  3662. +static inline int pool_entry_boundary(unsigned long index)
  3663. +{
  3664. +   unsigned long linear_addr;
  3665. +
  3666. +   linear_addr = sizeof(struct rmap_list_entry *) * index;
  3667. +   return index && !offset_in_page(linear_addr);
  3668. +}
  3669. +
  3670. +static inline void try_free_last_pool(struct vma_slot *slot,
  3671. +                     unsigned long index)
  3672. +{
  3673. +   unsigned long pool_index;
  3674. +
  3675. +   pool_index = get_pool_index(slot, index);
  3676. +   if (slot->rmap_list_pool[pool_index] &&
  3677. +       !slot->pool_counts[pool_index]) {
  3678. +       __free_page(slot->rmap_list_pool[pool_index]);
  3679. +       slot->rmap_list_pool[pool_index] = NULL;
  3680. +       slot->flags |= UKSM_SLOT_NEED_SORT;
  3681. +   }
  3682. +
  3683. +}
  3684. +
  3685. +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
  3686. +                      struct rmap_item *item)
  3687. +{
  3688. +   return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
  3689. +}
  3690. +
  3691. +static int within_same_pool(struct vma_slot *slot,
  3692. +               unsigned long i, unsigned long j)
  3693. +{
  3694. +   unsigned long pool_i, pool_j;
  3695. +
  3696. +   pool_i = get_pool_index(slot, i);
  3697. +   pool_j = get_pool_index(slot, j);
  3698. +
  3699. +   return (pool_i == pool_j);
  3700. +}
  3701. +
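+/*
+ * sort_rmap_entry_list() - Move every rmap_list entry back to the position
+ * given by its virtual address (undoing the random permutation), then free
+ * pool pages that no longer contain any rmap_item.
+ */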
  3702. +static void sort_rmap_entry_list(struct vma_slot *slot)
  3703. +{
  3704. +   unsigned long i, j;
  3705. +   struct rmap_list_entry *entry, *swap_entry;
  3706. +
  3707. +   entry = get_rmap_list_entry(slot, 0, 0);
  3708. +   for (i = 0; i < slot->pages; ) {
  3709. +
  3710. +       if (!entry)
  3711. +           goto skip_whole_pool;
  3712. +
  3713. +       if (entry_is_new(entry))
  3714. +           goto next_entry;
  3715. +
  3716. +       if (is_addr(entry->addr)) {
  3717. +           entry->addr = 0;
  3718. +           goto next_entry;
  3719. +       }
  3720. +
  3721. +       j = vma_item_index(slot->vma, entry->item);
  3722. +       if (j == i)
  3723. +           goto next_entry;
  3724. +
  3725. +       if (within_same_pool(slot, i, j))
  3726. +           swap_entry = entry + j - i;
  3727. +       else
  3728. +           swap_entry = get_rmap_list_entry(slot, j, 1);
  3729. +
  3730. +       swap_entries(entry, i, swap_entry, j);
  3731. +       if (!within_same_pool(slot, i, j))
  3732. +           put_rmap_list_entry(slot, j);
  3733. +       continue;
  3734. +
  3735. +skip_whole_pool:
  3736. +       i += PAGE_SIZE / sizeof(*entry);
  3737. +       if (i < slot->pages)
  3738. +           entry = get_rmap_list_entry(slot, i, 0);
  3739. +       continue;
  3740. +
  3741. +next_entry:
  3742. +       if (i >= slot->pages - 1 ||
  3743. +           !within_same_pool(slot, i, i + 1)) {
  3744. +           put_rmap_list_entry(slot, i);
  3745. +           if (i + 1 < slot->pages)
  3746. +               entry = get_rmap_list_entry(slot, i + 1, 0);
  3747. +       } else
  3748. +           entry++;
  3749. +       i++;
  3750. +       continue;
  3751. +   }
  3752. +
  3753. +   /* Free empty pool pages which contain no rmap_item. */
  3754. +   /* This can be simplified to rely only on pool_counts once verified bug-free. */
  3755. +   for (i = 0; i < slot->pool_size; i++) {
  3756. +       unsigned char has_rmap;
  3757. +       void *addr;
  3758. +
  3759. +       if (!slot->rmap_list_pool[i])
  3760. +           continue;
  3761. +
  3762. +       has_rmap = 0;
  3763. +       addr = kmap(slot->rmap_list_pool[i]);
  3764. +       BUG_ON(!addr);
  3765. +       for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
  3766. +           entry = (struct rmap_list_entry *)addr + j;
  3767. +           if (is_addr(entry->addr))
  3768. +               continue;
  3769. +           if (!entry->item)
  3770. +               continue;
  3771. +           has_rmap = 1;
  3772. +       }
  3773. +       kunmap(slot->rmap_list_pool[i]);
  3774. +       if (!has_rmap) {
  3775. +           BUG_ON(slot->pool_counts[i]);
  3776. +           __free_page(slot->rmap_list_pool[i]);
  3777. +           slot->rmap_list_pool[i] = NULL;
  3778. +       }
  3779. +   }
  3780. +
  3781. +   slot->flags &= ~UKSM_SLOT_NEED_SORT;
  3782. +}
  3783. +
  3784. +/*
  3785. + * vma_fully_scanned() - return true if all pages in this slot have been scanned.
  3786. + */
  3787. +static inline int vma_fully_scanned(struct vma_slot *slot)
  3788. +{
  3789. +   return slot->pages_scanned && !(slot->pages_scanned % slot->pages);
  3790. +}
  3791. +
  3792. +/**
  3793. + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
  3794. + * its random permutation. The code that manages the random permutation
  3795. + * indexes is embedded in this function.
  3796. + */
  3797. +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot)
  3798. +{
  3799. +   unsigned long rand_range, addr, swap_index, scan_index;
  3800. +   struct rmap_item *item = NULL;
  3801. +   struct rmap_list_entry *scan_entry, *swap_entry = NULL;
  3802. +   struct page *page;
  3803. +
  3804. +   scan_index = swap_index = slot->pages_scanned % slot->pages;
  3805. +
  3806. +   if (pool_entry_boundary(scan_index))
  3807. +       try_free_last_pool(slot, scan_index - 1);
  3808. +
  3809. +   if (vma_fully_scanned(slot)) {
  3810. +       if (slot->flags & UKSM_SLOT_NEED_SORT)
  3811. +           slot->flags |= UKSM_SLOT_NEED_RERAND;
  3812. +       else
  3813. +           slot->flags &= ~UKSM_SLOT_NEED_RERAND;
  3814. +       if (slot->flags & UKSM_SLOT_NEED_SORT)
  3815. +           sort_rmap_entry_list(slot);
  3816. +   }
  3817. +
  3818. +   scan_entry = get_rmap_list_entry(slot, scan_index, 1);
  3819. +   if (entry_is_new(scan_entry)) {
  3820. +       scan_entry->addr = get_index_orig_addr(slot, scan_index);
  3821. +       set_is_addr(scan_entry->addr);
  3822. +   }
  3823. +
  3824. +   if (slot->flags & UKSM_SLOT_NEED_RERAND) {
  3825. +       rand_range = slot->pages - scan_index;
  3826. +       BUG_ON(!rand_range);
  3827. +       swap_index = scan_index + (random32() % rand_range);
  3828. +   }
  3829. +
  3830. +   if (swap_index != scan_index) {
  3831. +       swap_entry = get_rmap_list_entry(slot, swap_index, 1);
  3832. +       if (entry_is_new(swap_entry)) {
  3833. +           swap_entry->addr = get_index_orig_addr(slot,
  3834. +                                  swap_index);
  3835. +           set_is_addr(swap_entry->addr);
  3836. +       }
  3837. +       swap_entries(scan_entry, scan_index, swap_entry, swap_index);
  3838. +   }
  3839. +
  3840. +   addr = get_entry_address(scan_entry);
  3841. +   item = get_entry_item(scan_entry);
  3842. +   BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
  3843. +
  3844. +   page = follow_page(slot->vma, addr, FOLL_GET);
  3845. +   if (IS_ERR_OR_NULL(page))
  3846. +       goto nopage;
  3847. +
  3848. +   if (!PageAnon(page) && !page_trans_compound_anon(page))
  3849. +       goto putpage;
  3850. +
  3851. +   /* check whether it is the zero_page pfn or the uksm zero page */
  3852. +   if ((page_to_pfn(page) == zero_pfn)
  3853. +           || (page_to_pfn(page) == uksm_zero_pfn))
  3854. +       goto putpage;
  3855. +
  3856. +   flush_anon_page(slot->vma, page, addr);
  3857. +   flush_dcache_page(page);
  3858. +
  3859. +   if (!item) {
  3860. +       item = alloc_rmap_item();
  3861. +       if (item) {
  3862. +           /* It has already been zeroed */
  3863. +           item->slot = slot;
  3864. +           item->address = addr;
  3865. +           item->entry_index = scan_index;
  3866. +           scan_entry->item = item;
  3867. +           inc_rmap_list_pool_count(slot, scan_index);
  3868. +       } else
  3869. +           goto putpage;
  3870. +   }
  3871. +
  3872. +   BUG_ON(item->slot != slot);
  3873. +   /* the page may have changed */
  3874. +   item->page = page;
  3875. +   put_rmap_list_entry(slot, scan_index);
  3876. +   if (swap_entry)
  3877. +       put_rmap_list_entry(slot, swap_index);
  3878. +   return item;
  3879. +
  3880. +putpage:
  3881. +   put_page(page);
  3882. +   page = NULL;
  3883. +nopage:
  3884. +   /* no page, store addr back and free rmap_item if possible */
  3885. +   free_entry_item(scan_entry);
  3886. +   put_rmap_list_entry(slot, scan_index);
  3887. +   if (swap_entry)
  3888. +       put_rmap_list_entry(slot, swap_index);
  3889. +   return NULL;
  3890. +}
  3891. +
  3892. +static inline int in_stable_tree(struct rmap_item *rmap_item)
  3893. +{
  3894. +   return rmap_item->address & STABLE_FLAG;
  3895. +}
  3896. +
  3897. +/**
  3898. + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
  3899. + * mmap_sem locked.
  3900. + */
  3901. +static void scan_vma_one_page(struct vma_slot *slot)
  3902. +{
  3903. +   struct mm_struct *mm;
  3904. +   struct rmap_item *rmap_item = NULL;
  3905. +   struct vm_area_struct *vma = slot->vma;
  3906. +
  3907. +   mm = vma->vm_mm;
  3908. +   BUG_ON(!mm);
  3909. +   BUG_ON(!slot);
  3910. +
  3911. +   rmap_item = get_next_rmap_item(slot);
  3912. +   if (!rmap_item)
  3913. +       goto out1;
  3914. +
  3915. +   if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
  3916. +       goto out2;
  3917. +
  3918. +   cmp_and_merge_page(rmap_item);
  3919. +out2:
  3920. +   put_page(rmap_item->page);
  3921. +out1:
  3922. +   slot->pages_scanned++;
  3923. +   slot->flags |= UKSM_SLOT_SCANNED;
  3924. +   if (vma_fully_scanned(slot)) {
  3925. +       slot->flags |= UKSM_SLOT_FUL_SCANNED;
  3926. +       slot->rung->fully_scanned_slots++;
  3927. +       BUG_ON(!slot->rung->fully_scanned_slots);
  3928. +   }
  3929. +}
  3930. +
  3931. +static unsigned long get_vma_random_scan_num(struct vma_slot *slot,
  3932. +                        unsigned long scan_ratio)
  3933. +{
  3934. +   return slot->pages * scan_ratio / UKSM_SCAN_RATIO_MAX;
  3935. +}
  3936. +
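+/*
+ * vma_rung_enter() - Move @slot from its current scan rung to @rung (or a
+ * higher rung if the computed pages_to_scan would be zero), fixing up the
+ * scan cursors and slot counters of both rungs.
+ */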
  3937. +static inline void vma_rung_enter(struct vma_slot *slot,
  3938. +                 struct scan_rung *rung)
  3939. +{
  3940. +   unsigned long pages_to_scan;
  3941. +   struct scan_rung *old_rung = slot->rung;
  3942. +
  3943. +   /* leave the old rung it was in */
  3944. +   BUG_ON(list_empty(&slot->uksm_list));
  3945. +
  3946. +   if (old_rung->current_scan == &slot->uksm_list)
  3947. +       old_rung->current_scan = slot->uksm_list.next;
  3948. +   list_del_init(&slot->uksm_list);
  3949. +   old_rung->vma_num--;
  3950. +   if (slot->flags & UKSM_SLOT_FUL_SCANNED)
  3951. +       old_rung->fully_scanned_slots--;
  3952. +
  3953. +   if (old_rung->current_scan == &old_rung->vma_list) {
  3954. +       /* This rung finishes a round */
  3955. +       old_rung->flags |= UKSM_RUNG_ROUND_FINISHED;
  3956. +       old_rung->current_scan = old_rung->vma_list.next;
  3957. +       BUG_ON(old_rung->current_scan == &old_rung->vma_list &&
  3958. +              !list_empty(&old_rung->vma_list));
  3959. +   }
  3960. +
  3961. +   /* enter the new rung */
  3962. +   while (!(pages_to_scan =
  3963. +       get_vma_random_scan_num(slot, rung->scan_ratio))) {
  3964. +       rung++;
  3965. +       BUG_ON(rung > &uksm_scan_ladder[uksm_scan_ladder_size - 1]);
  3966. +   }
  3967. +   if (list_empty(&rung->vma_list))
  3968. +       rung->current_scan = &slot->uksm_list;
  3969. +   list_add(&slot->uksm_list, &rung->vma_list);
  3970. +   slot->rung = rung;
  3971. +   slot->pages_to_scan = pages_to_scan;
  3972. +   slot->rung->vma_num++;
  3973. +   if (slot->flags & UKSM_SLOT_FUL_SCANNED)
  3974. +       rung->fully_scanned_slots++;
  3975. +
  3976. +   BUG_ON(rung->current_scan == &rung->vma_list &&
  3977. +          !list_empty(&rung->vma_list));
  3978. +}
  3979. +
  3980. +static inline void vma_rung_up(struct vma_slot *slot)
  3981. +{
  3982. +   if (slot->rung == &uksm_scan_ladder[uksm_scan_ladder_size-1])
  3983. +       return;
  3984. +
  3985. +   vma_rung_enter(slot, slot->rung + 1);
  3986. +}
  3987. +
  3988. +static inline void vma_rung_down(struct vma_slot *slot)
  3989. +{
  3990. +   if (slot->rung == &uksm_scan_ladder[0])
  3991. +       return;
  3992. +
  3993. +   vma_rung_enter(slot, slot->rung - 1);
  3994. +}
  3995. +
  3996. +/**
  3997. + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
  3998. + */
  3999. +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
  4000. +{
  4001. +   struct vma_slot *slot2;
  4002. +   void **dup_slot;
  4003. +   unsigned long dup_pages;
  4004. +   unsigned long dedup_num, pages1, scanned1;
  4005. +   unsigned long ret;
  4006. +   int i;
  4007. +
  4008. +   if (!slot->pages_scanned)
  4009. +       return 0;
  4010. +
  4011. +   pages1 = slot->pages;
  4012. +   scanned1 = slot->pages_scanned - slot->last_scanned;
  4013. +   BUG_ON(scanned1 > slot->pages_scanned);
  4014. +
  4015. +   for (i = slot->uksm_index; i < uksm_vma_tree_index_end; i++) {
  4016. +       unsigned long pages2, scanned2;
  4017. +
  4018. +       dup_slot = radix_tree_lookup_slot(&slot->dup_tree, i);
  4019. +       if (!dup_slot)
  4020. +           continue;
  4021. +
  4022. +       dup_pages = (unsigned long)(*dup_slot) >> INDIRECT_OFFSET;
  4023. +
  4024. +       slot2 = radix_tree_lookup(&uksm_vma_tree, i);
  4025. +       BUG_ON(!slot2 || !slot2->pages_scanned);
  4026. +
  4027. +       pages2 = slot2->pages;
  4028. +       scanned2 = slot2->pages_scanned - slot2->last_scanned;
  4029. +       BUG_ON(scanned2 > slot2->pages_scanned);
  4030. +
  4031. +       BUG_ON(!scanned1 || !scanned2);
  4032. +
  4033. +       dedup_num = dup_pages * pages1 / scanned1 * pages2 / scanned2;
  4034. +       slot->dedup_num += dedup_num;
  4035. +       slot2->dedup_num += dedup_num;
  4036. +   }
  4037. +
  4038. +   ret = (slot->dedup_num * UKSM_DEDUP_RATIO_SCALE / pages1);
  4039. +
  4040. +   /* Thrashing area filtering */
  4041. +   if (uksm_thrash_threshold) {
  4042. +       if (slot->pages_cowed * 100 / slot->pages_merged
  4043. +           > uksm_thrash_threshold) {
  4044. +           ret = 0;
  4045. +       } else {
  4046. +           ret = ret * (slot->pages_merged - slot->pages_cowed)
  4047. +                 / slot->pages_merged;
  4048. +       }
  4049. +   }
  4050. +
  4051. +   return ret;
  4052. +}
  4053. +
  4054. +
  4055. +/**
  4056. + * stable_node_reinsert() - When the hash_strength has been adjusted, the
  4057. + * stable tree needs to be restructured; this is the function that re-inserts
  4058. + * a stable node.
  4059. + */
  4060. +static inline void stable_node_reinsert(struct stable_node *new_node,
  4061. +                   struct page *page,
  4062. +                   struct rb_root *root_treep,
  4063. +                   struct list_head *tree_node_listp,
  4064. +                   u32 hash)
  4065. +{
  4066. +   struct rb_node **new = &root_treep->rb_node;
  4067. +   struct rb_node *parent = NULL;
  4068. +   struct stable_node *stable_node;
  4069. +   struct tree_node *tree_node;
  4070. +   struct page *tree_page;
  4071. +   int cmp;
  4072. +
  4073. +   while (*new) {
  4074. +       int cmp;
  4075. +
  4076. +       tree_node = rb_entry(*new, struct tree_node, node);
  4077. +
  4078. +       cmp = hash_cmp(hash, tree_node->hash);
  4079. +
  4080. +       if (cmp < 0) {
  4081. +           parent = *new;
  4082. +           new = &parent->rb_left;
  4083. +       } else if (cmp > 0) {
  4084. +           parent = *new;
  4085. +           new = &parent->rb_right;
  4086. +       } else
  4087. +           break;
  4088. +   }
  4089. +
  4090. +   if (*new) {
  4091. +       /* find a stable tree node with same first level hash value */
  4092. +       stable_node_hash_max(new_node, page, hash);
  4093. +       if (tree_node->count == 1) {
  4094. +           stable_node = rb_entry(tree_node->sub_root.rb_node,
  4095. +                          struct stable_node, node);
  4096. +           tree_page = get_uksm_page(stable_node, 1, 0);
  4097. +           if (tree_page) {
  4098. +               stable_node_hash_max(stable_node,
  4099. +                             tree_page, hash);
  4100. +               put_page(tree_page);
  4101. +
  4102. +               /* prepare for stable node insertion */
  4103. +
  4104. +               cmp = hash_cmp(new_node->hash_max,
  4105. +                          stable_node->hash_max);
  4106. +               parent = &stable_node->node;
  4107. +               if (cmp < 0)
  4108. +                   new = &parent->rb_left;
  4109. +               else if (cmp > 0)
  4110. +                   new = &parent->rb_right;
  4111. +               else
  4112. +                   goto failed;
  4113. +
  4114. +               goto add_node;
  4115. +           } else {
  4116. +               /* the only stable_node was deleted, but the
  4117. +                * tree node was not.
  4118. +                */
  4119. +               goto tree_node_reuse;
  4120. +           }
  4121. +       }
  4122. +
  4123. +       /* well, search the collision subtree */
  4124. +       new = &tree_node->sub_root.rb_node;
  4125. +       parent = NULL;
  4126. +       BUG_ON(!*new);
  4127. +       while (*new) {
  4128. +           int cmp;
  4129. +
  4130. +           stable_node = rb_entry(*new, struct stable_node, node);
  4131. +
  4132. +           cmp = hash_cmp(new_node->hash_max,
  4133. +                      stable_node->hash_max);
  4134. +
  4135. +           if (cmp < 0) {
  4136. +               parent = *new;
  4137. +               new = &parent->rb_left;
  4138. +           } else if (cmp > 0) {
  4139. +               parent = *new;
  4140. +               new = &parent->rb_right;
  4141. +           } else {
  4142. +               /* oh, no, still a collision */
  4143. +               goto failed;
  4144. +           }
  4145. +       }
  4146. +
  4147. +       goto add_node;
  4148. +   }
  4149. +
  4150. +   /* no tree node found */
  4151. +   tree_node = alloc_tree_node(tree_node_listp);
  4152. +   if (!tree_node) {
  4153. +       printk(KERN_ERR "UKSM: memory allocation error!\n");
  4154. +       goto failed;
  4155. +   } else {
  4156. +       tree_node->hash = hash;
  4157. +       rb_link_node(&tree_node->node, parent, new);
  4158. +       rb_insert_color(&tree_node->node, root_treep);
  4159. +
  4160. +tree_node_reuse:
  4161. +       /* prepare for stable node insertion */
  4162. +       parent = NULL;
  4163. +       new = &tree_node->sub_root.rb_node;
  4164. +   }
  4165. +
  4166. +add_node:
  4167. +   rb_link_node(&new_node->node, parent, new);
  4168. +   rb_insert_color(&new_node->node, &tree_node->sub_root);
  4169. +   new_node->tree_node = tree_node;
  4170. +   tree_node->count++;
  4171. +   return;
  4172. +
  4173. +failed:
  4174. +   /* This can only happen when two nodes have collided
  4175. +    * on both hash levels.
  4176. +    */
  4177. +   new_node->tree_node = NULL;
  4178. +   return;
  4179. +}
  4180. +
  4181. +static inline void free_all_tree_nodes(struct list_head *list)
  4182. +{
  4183. +   struct tree_node *node, *tmp;
  4184. +
  4185. +   list_for_each_entry_safe(node, tmp, list, all_list) {
  4186. +       free_tree_node(node);
  4187. +   }
  4188. +}
  4189. +
  4190. +/**
  4191. + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
  4192. + * strength to the current hash_strength. It re-structures the whole tree.
  4193. + */
  4194. +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
  4195. +{
  4196. +   struct stable_node *node, *tmp;
  4197. +   struct rb_root *root_new_treep;
  4198. +   struct list_head *new_tree_node_listp;
  4199. +
  4200. +   stable_tree_index = (stable_tree_index + 1) % 2;
  4201. +   root_new_treep = &root_stable_tree[stable_tree_index];
  4202. +   new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
  4203. +   *root_new_treep = RB_ROOT;
  4204. +   BUG_ON(!list_empty(new_tree_node_listp));
  4205. +
  4206. +   /*
  4207. +    * we need to iterate safely: a node could be removed by get_uksm_page()
  4208. +    */
  4209. +   list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
  4210. +       void *addr;
  4211. +       struct page *node_page;
  4212. +       u32 hash;
  4213. +
  4214. +       /*
  4215. +        * We are completely re-structuring the stable nodes to a new
  4216. +        * stable tree. We don't bother unlinking from the old tree or
  4217. +        * touching the old tree_nodes; they will all be freed at once.
  4218. +        */
  4219. +       node_page = get_uksm_page(node, 0, 0);
  4220. +       if (!node_page)
  4221. +           continue;
  4222. +
  4223. +       if (node->tree_node) {
  4224. +           hash = node->tree_node->hash;
  4225. +
  4226. +           addr = kmap_atomic(node_page, KM_USER0);
  4227. +
  4228. +           hash = delta_hash(addr, prev_hash_strength,
  4229. +                     hash_strength, hash);
  4230. +           kunmap_atomic(addr, KM_USER0);
  4231. +       } else {
  4232. +           /*
  4233. +            * it was not inserted into the rbtree due to a collision
  4234. +            * in the last scan round.
  4235. +            */
  4236. +           hash = page_hash(node_page, hash_strength, 0);
  4237. +       }
  4238. +
  4239. +       stable_node_reinsert(node, node_page, root_new_treep,
  4240. +                    new_tree_node_listp, hash);
  4241. +       put_page(node_page);
  4242. +   }
  4243. +
  4244. +   root_stable_treep = root_new_treep;
  4245. +   free_all_tree_nodes(stable_tree_node_listp);
  4246. +   BUG_ON(!list_empty(stable_tree_node_listp));
  4247. +   stable_tree_node_listp = new_tree_node_listp;
  4248. +}
  4249. +
  4250. +static inline void inc_hash_strength(unsigned long delta)
  4251. +{
  4252. +   hash_strength += 1 << delta;
  4253. +   if (hash_strength > HASH_STRENGTH_MAX)
  4254. +       hash_strength = HASH_STRENGTH_MAX;
  4255. +}
  4256. +
  4257. +static inline void dec_hash_strength(unsigned long delta)
  4258. +{
  4259. +   unsigned long change = 1 << delta;
  4260. +
  4261. +   if (hash_strength <= change + 1)
  4262. +       hash_strength = 1;
  4263. +   else
  4264. +       hash_strength -= change;
  4265. +}
  4266. +
  4267. +static inline void inc_hash_strength_delta(void)
  4268. +{
  4269. +   hash_strength_delta++;
  4270. +   if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
  4271. +       hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
  4272. +}
  4273. +
  4274. +/*
  4275. +static inline unsigned long get_current_neg_ratio(void)
  4276. +{
  4277. +   if (!rshash_pos || rshash_neg > rshash_pos)
  4278. +       return 100;
  4279. +
  4280. +   return div64_u64(100 * rshash_neg , rshash_pos);
  4281. +}
  4282. +*/
  4283. +
  4284. +static inline unsigned long get_current_neg_ratio(void)
  4285. +{
  4286. +   u64 pos = benefit.pos;
  4287. +   u64 neg = benefit.neg;
  4288. +
  4289. +   if (!neg)
  4290. +       return 0;
  4291. +
  4292. +   if (!pos || neg > pos)
  4293. +       return 100;
  4294. +
  4295. +   if (neg > div64_u64(U64_MAX, 100))
  4296. +       pos = div64_u64(pos, 100);
  4297. +   else
  4298. +       neg *= 100;
  4299. +
  4300. +   return div64_u64(neg, pos);
  4301. +}
  4302. +
  4303. +static inline unsigned long get_current_benefit(void)
  4304. +{
  4305. +   u64 pos = benefit.pos;
  4306. +   u64 neg = benefit.neg;
  4307. +   u64 scanned = benefit.scanned;
  4308. +
  4309. +   if (neg > pos)
  4310. +       return 0;
  4311. +
  4312. +   return div64_u64((pos - neg), scanned);
  4313. +}
  4314. +
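+/*
+ * judge_rshash_direction() - Decide how the hash strength should move:
+ * GO_UP when the sampled negative ratio is high (> 90%), GO_DOWN after
+ * several rounds without any negative samples, OBSCURE when the benefit has
+ * drifted far from the stable benefit (or on the periodic forced re-probe),
+ * otherwise STILL.
+ */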
  4315. +static inline int judge_rshash_direction(void)
  4316. +{
  4317. +   u64 current_neg_ratio, stable_benefit;
  4318. +   u64 current_benefit, delta = 0;
  4319. +   int ret = STILL;
  4320. +
  4321. +   /* In case the system stays still for a long time, force a periodic re-probe. */
  4322. +   if (uksm_scan_round % 1024 == 3) {
  4323. +       ret = OBSCURE;
  4324. +       goto out;
  4325. +   }
  4326. +
  4327. +   current_neg_ratio = get_current_neg_ratio();
  4328. +
  4329. +   if (current_neg_ratio == 0) {
  4330. +       rshash_neg_cont_zero++;
  4331. +       if (rshash_neg_cont_zero > 2)
  4332. +           return GO_DOWN;
  4333. +       else
  4334. +           return STILL;
  4335. +   }
  4336. +   rshash_neg_cont_zero = 0;
  4337. +
  4338. +   if (current_neg_ratio > 90) {
  4339. +       ret = GO_UP;
  4340. +       goto out;
  4341. +   }
  4342. +
  4343. +   current_benefit = get_current_benefit();
  4344. +   stable_benefit = rshash_state.stable_benefit;
  4345. +
  4346. +   if (!stable_benefit) {
  4347. +       ret = OBSCURE;
  4348. +       goto out;
  4349. +   }
  4350. +
  4351. +   if (current_benefit > stable_benefit)
  4352. +       delta = current_benefit - stable_benefit;
  4353. +   else if (current_benefit < stable_benefit)
  4354. +       delta = stable_benefit - current_benefit;
  4355. +
  4356. +   delta = div64_u64(100 * delta , stable_benefit);
  4357. +
  4358. +   if (delta > 50) {
  4359. +       rshash_cont_obscure++;
  4360. +       if (rshash_cont_obscure > 2)
  4361. +           return OBSCURE;
  4362. +       else
  4363. +           return STILL;
  4364. +   }
  4365. +
  4366. +out:
  4367. +   rshash_cont_obscure = 0;
  4368. +   return ret;
  4369. +}
  4370. +
  4371. +/**
  4372. + * rshash_adjust() - The main function to control the random sampling state
  4373. + * machine for hash strength adaptation.
  4374. + */
  4375. +static void rshash_adjust(void)
  4376. +{
  4377. +   unsigned long prev_hash_strength = hash_strength;
  4378. +
  4379. +   if (uksm_pages_scanned == uksm_pages_scanned_last)
  4380. +       return;
  4381. +
  4382. +   encode_benefit();
  4383. +
  4384. +   switch (rshash_state.state) {
  4385. +   case RSHASH_STILL:
  4386. +       switch (judge_rshash_direction()) {
  4387. +       case GO_UP:
  4388. +           if (rshash_state.pre_direct == GO_DOWN)
  4389. +               hash_strength_delta = 0;
  4390. +
  4391. +           inc_hash_strength(hash_strength_delta);
  4392. +           inc_hash_strength_delta();
  4393. +           rshash_state.stable_benefit = get_current_benefit();
  4394. +           rshash_state.pre_direct = GO_UP;
  4395. +           break;
  4396. +
  4397. +       case GO_DOWN:
  4398. +           if (rshash_state.pre_direct == GO_UP)
  4399. +               hash_strength_delta = 0;
  4400. +
  4401. +           dec_hash_strength(hash_strength_delta);
  4402. +           inc_hash_strength_delta();
  4403. +           rshash_state.stable_benefit = get_current_benefit();
  4404. +           rshash_state.pre_direct = GO_DOWN;
  4405. +           break;
  4406. +
  4407. +       case OBSCURE:
  4408. +           rshash_state.stable_point = hash_strength;
  4409. +           rshash_state.turn_point_down = hash_strength;
  4410. +           rshash_state.turn_point_up = hash_strength;
  4411. +           rshash_state.turn_benefit_down = get_current_benefit();
  4412. +           rshash_state.turn_benefit_up = get_current_benefit();
  4413. +           rshash_state.lookup_window_index = 0;
  4414. +           rshash_state.state = RSHASH_TRYDOWN;
  4415. +           dec_hash_strength(hash_strength_delta);
  4416. +           inc_hash_strength_delta();
  4417. +           break;
  4418. +
  4419. +       case STILL:
  4420. +           break;
  4421. +       default:
  4422. +           BUG();
  4423. +       }
  4424. +       break;
  4425. +
  4426. +   case RSHASH_TRYDOWN:
  4427. +       if (rshash_state.lookup_window_index++ % 5 == 0)
  4428. +           rshash_state.below_count = 0;
  4429. +
  4430. +       if (get_current_benefit() < rshash_state.stable_benefit)
  4431. +           rshash_state.below_count++;
  4432. +       else if (get_current_benefit() >
  4433. +            rshash_state.turn_benefit_down) {
  4434. +           rshash_state.turn_point_down = hash_strength;
  4435. +           rshash_state.turn_benefit_down = get_current_benefit();
  4436. +       }
  4437. +
  4438. +       if (rshash_state.below_count >= 3 ||
  4439. +           judge_rshash_direction() == GO_UP ||
  4440. +           hash_strength == 1) {
  4441. +           hash_strength = rshash_state.stable_point;
  4442. +           hash_strength_delta = 0;
  4443. +           inc_hash_strength(hash_strength_delta);
  4444. +           inc_hash_strength_delta();
  4445. +           rshash_state.lookup_window_index = 0;
  4446. +           rshash_state.state = RSHASH_TRYUP;
  4447. +           hash_strength_delta = 0;
  4448. +       } else {
  4449. +           dec_hash_strength(hash_strength_delta);
  4450. +           inc_hash_strength_delta();
  4451. +       }
  4452. +       break;
  4453. +
  4454. +   case RSHASH_TRYUP:
  4455. +       if (rshash_state.lookup_window_index++ % 5 == 0)
  4456. +           rshash_state.below_count = 0;
  4457. +
  4458. +       if (get_current_benefit() < rshash_state.turn_benefit_down)
  4459. +           rshash_state.below_count++;
  4460. +       else if (get_current_benefit() > rshash_state.turn_benefit_up) {
  4461. +           rshash_state.turn_point_up = hash_strength;
  4462. +           rshash_state.turn_benefit_up = get_current_benefit();
  4463. +       }
  4464. +
  4465. +       if (rshash_state.below_count >= 3 ||
  4466. +           judge_rshash_direction() == GO_DOWN ||
  4467. +           hash_strength == HASH_STRENGTH_MAX) {
  4468. +           hash_strength = rshash_state.turn_benefit_up >
  4469. +               rshash_state.turn_benefit_down ?
  4470. +               rshash_state.turn_point_up :
  4471. +               rshash_state.turn_point_down;
  4472. +
  4473. +           rshash_state.state = RSHASH_PRE_STILL;
  4474. +       } else {
  4475. +           inc_hash_strength(hash_strength_delta);
  4476. +           inc_hash_strength_delta();
  4477. +       }
  4478. +
  4479. +       break;
  4480. +
  4481. +   case RSHASH_NEW:
  4482. +   case RSHASH_PRE_STILL:
  4483. +       rshash_state.stable_benefit = get_current_benefit();
  4484. +       rshash_state.state = RSHASH_STILL;
  4485. +       hash_strength_delta = 0;
  4486. +       break;
  4487. +   default:
  4488. +       BUG();
  4489. +   }
  4490. +
  4491. +   /* rshash_neg = rshash_pos = 0; */
  4492. +   reset_benefit();
  4493. +
  4494. +   if (prev_hash_strength != hash_strength)
  4495. +       stable_tree_delta_hash(prev_hash_strength);
  4496. +}
  4497. +
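+/*
+ * free_vma_dup_tree() - Remove every dup_tree entry that refers to @slot from
+ * the slots with smaller uksm_index, then empty @slot's own dup_tree.
+ */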
  4498. +static void free_vma_dup_tree(struct vma_slot *slot)
  4499. +{
  4500. +   struct vma_slot *tmp_slot;
  4501. +   int i;
  4502. +
  4503. +   /* step 1: free entries in smaller vmas' dup tree */
  4504. +   for (i = 0; i < slot->uksm_index; i++) {
  4505. +       tmp_slot = radix_tree_lookup(&uksm_vma_tree, i);
  4506. +       if (tmp_slot)
  4507. +           radix_tree_delete(&tmp_slot->dup_tree, slot->uksm_index);
  4508. +   }
  4509. +
  4510. +   /* step 2: free my own dup tree */
  4511. +   for (i = slot->uksm_index; i < uksm_vma_tree_index_end; i++)
  4512. +       radix_tree_delete(&slot->dup_tree, i);
  4513. +
  4514. +   BUG_ON(slot->dup_tree.rnode);
  4515. +}
  4516. +
  4517. +/**
  4518. + * round_update_ladder() - The main function that updates all the
  4519. + * adjustments whenever a scan round is finished.
  4520. + */
  4521. +static void round_update_ladder(void)
  4522. +{
  4523. +   int i;
  4524. +   struct vma_slot *slot, *tmp_slot;
  4525. +   unsigned long dedup_ratio_max = 0, dedup_ratio_mean = 0;
  4526. +   unsigned long threshold;
  4527. +
  4528. +   for (i = 0; i < uksm_vma_tree_index_end; i++) {
  4529. +       slot = radix_tree_lookup(&uksm_vma_tree, i);
  4530. +
  4531. +       if (slot) {
  4532. +           slot->dedup_ratio = cal_dedup_ratio(slot);
  4533. +           if (dedup_ratio_max < slot->dedup_ratio)
  4534. +               dedup_ratio_max = slot->dedup_ratio;
  4535. +           dedup_ratio_mean += slot->dedup_ratio;
  4536. +       }
  4537. +   }
  4538. +
  4539. +   dedup_ratio_mean /= uksm_vma_slot_num;
  4540. +   threshold = dedup_ratio_mean;
  4541. +
  4542. +   for (i = 0; i < uksm_vma_tree_index_end; i++) {
  4543. +       slot = radix_tree_lookup(&uksm_vma_tree, i);
  4544. +
  4545. +       if (slot) {
  4546. +           if (slot->dedup_ratio &&
  4547. +               slot->dedup_ratio >= threshold) {
  4548. +               vma_rung_up(slot);
  4549. +           } else {
  4550. +               vma_rung_down(slot);
  4551. +           }
  4552. +
  4553. +           free_vma_dup_tree(slot);
  4554. +           radix_tree_delete(&uksm_vma_tree, i);
  4555. +           uksm_vma_tree_num--;
  4556. +           slot->uksm_index = -1;
  4557. +           slot->flags &= ~UKSM_SLOT_SCANNED;
  4558. +           slot->dedup_ratio = 0;
  4559. +           slot->dedup_num = 0;
  4560. +       }
  4561. +   }
  4562. +
  4563. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4564. +       list_for_each_entry_safe(slot, tmp_slot,
  4565. +                    &uksm_scan_ladder[i].vma_list,
  4566. +                    uksm_list) {
  4567. +           /*
  4568. +            * These slots were scanned but are not in the
  4569. +            * dup-tracking tree, so their dedup_ratio must be 0.
  4570. +            */
  4571. +           if (slot->flags & UKSM_SLOT_SCANNED) {
  4572. +               BUG_ON(slot->dedup_ratio != 0);
  4573. +               vma_rung_down(slot);
  4574. +           }
  4575. +
  4576. +           slot->dedup_ratio = 0;
  4577. +       }
  4578. +   }
  4579. +
  4580. +   BUG_ON(uksm_vma_tree_num != 0);
  4581. +   uksm_vma_tree_index_end = 0;
  4582. +
  4583. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4584. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
  4585. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_BUSY_SEARCHED;
  4586. +
  4587. +       list_for_each_entry(slot, &uksm_scan_ladder[i].vma_list,
  4588. +                   uksm_list) {
  4589. +           slot->last_scanned = slot->pages_scanned;
  4590. +           slot->flags &= ~UKSM_SLOT_SCANNED;
  4591. +           slot->pages_cowed = 0;
  4592. +           slot->pages_merged = 0;
  4593. +           if (slot->flags & UKSM_SLOT_FUL_SCANNED) {
  4594. +               slot->flags &= ~UKSM_SLOT_FUL_SCANNED;
  4595. +               uksm_scan_ladder[i].fully_scanned_slots--;
  4596. +           }
  4597. +           BUG_ON(slot->uksm_index != -1);
  4598. +       }
  4599. +
  4600. +       BUG_ON(uksm_scan_ladder[i].fully_scanned_slots);
  4601. +   }
  4602. +
  4603. +   rshash_adjust();
  4604. +
  4605. +   //uksm_pages_scanned_last = uksm_pages_scanned;
  4606. +}
  4607. +
  4608. +static inline unsigned int uksm_pages_to_scan(unsigned int batch_pages)
  4609. +{
  4610. +   return totalram_pages * batch_pages / 1000000;
  4611. +}
  4612. +
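+/*
+ * cal_ladder_pages_to_scan() - Distribute the per-batch page budget across
+ * the scan rungs in proportion to their scan_ratio, and further throttle the
+ * two lowest rungs.
+ */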
  4613. +static inline void cal_ladder_pages_to_scan(unsigned int num)
  4614. +{
  4615. +   int i;
  4616. +
  4617. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4618. +       uksm_scan_ladder[i].pages_to_scan = num
  4619. +           * uksm_scan_ladder[i].scan_ratio / UKSM_SCAN_RATIO_MAX;
  4620. +   }
  4621. +   uksm_scan_ladder[0].pages_to_scan /= 16;
  4622. +   uksm_scan_ladder[1].pages_to_scan /= 4;
  4623. +}
  4624. +
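+/*
+ * Detach a slot from its rung and from uksm_vma_tree, free every rmap_item
+ * left in its rmap_list pool, then free the slot itself.
+ */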
  4625. +static inline void uksm_del_vma_slot(struct vma_slot *slot)
  4626. +{
  4627. +   int i, j;
  4628. +   struct rmap_list_entry *entry;
  4629. +   struct vma_slot *tmp;
  4630. +
  4631. +   /* Mutex lock contention may be intense here; is there a better way? */
  4632. +   BUG_ON(list_empty(&slot->uksm_list) || !slot->rung);
  4633. +
  4634. +   if (slot->rung->current_scan == &slot->uksm_list)
  4635. +       slot->rung->current_scan = slot->rung->current_scan->next;
  4636. +
  4637. +   list_del_init(&slot->uksm_list);
  4638. +   slot->rung->vma_num--;
  4639. +   if (slot->flags & UKSM_SLOT_FUL_SCANNED)
  4640. +       slot->rung->fully_scanned_slots--;
  4641. +
  4642. +   if (slot->rung->current_scan == &slot->rung->vma_list) {
  4643. +       /* This rung finishes a round */
  4644. +       slot->rung->flags |= UKSM_RUNG_ROUND_FINISHED;
  4645. +       slot->rung->current_scan = slot->rung->vma_list.next;
  4646. +       BUG_ON(slot->rung->current_scan == &slot->rung->vma_list
  4647. +              && !list_empty(&slot->rung->vma_list));
  4648. +   }
  4649. +
  4650. +   if (slot->uksm_index == -1)
  4651. +       goto skip;
  4652. +
  4653. +   tmp = radix_tree_delete(&uksm_vma_tree, slot->uksm_index);
  4654. +   BUG_ON(!tmp || tmp != slot);
  4655. +   free_vma_dup_tree(slot);
  4656. +   uksm_vma_tree_num--;
  4657. +   if (slot->uksm_index == uksm_vma_tree_index_end - 1)
  4658. +       uksm_vma_tree_index_end--;
  4659. +
  4660. +skip:
  4661. +   if (!slot->rmap_list_pool)
  4662. +       goto out;
  4663. +
  4664. +   for (i = 0; i < slot->pool_size; i++) {
  4665. +       void *addr;
  4666. +
  4667. +       if (!slot->rmap_list_pool[i])
  4668. +           continue;
  4669. +
  4670. +       addr = kmap(slot->rmap_list_pool[i]);
  4671. +       BUG_ON(!addr);
  4672. +       for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
  4673. +           entry = (struct rmap_list_entry *)addr + j;
  4674. +           if (is_addr(entry->addr))
  4675. +               continue;
  4676. +           if (!entry->item)
  4677. +               continue;
  4678. +
  4679. +           remove_rmap_item_from_tree(entry->item);
  4680. +           free_rmap_item(entry->item);
  4681. +           slot->pool_counts[i]--;
  4682. +       }
  4683. +       BUG_ON(slot->pool_counts[i]);
  4684. +       kunmap(slot->rmap_list_pool[i]);
  4685. +       __free_page(slot->rmap_list_pool[i]);
  4686. +   }
  4687. +   kfree(slot->rmap_list_pool);
  4688. +   kfree(slot->pool_counts);
  4689. +
  4690. +out:
  4691. +   slot->rung = NULL;
  4692. +   free_vma_slot(slot);
  4693. +   BUG_ON(!uksm_vma_slot_num);
  4694. +   uksm_vma_slot_num--;
  4695. +}
  4696. +
  4697. +
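+/*
+ * Drain the vma_slot_del list: every slot queued for deletion is torn down
+ * with uksm_del_vma_slot(). The list spinlock is dropped around each
+ * deletion, which may sleep (it uses kmap()).
+ */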
  4698. +static inline void cleanup_vma_slots(void)
  4699. +{
  4700. +   struct vma_slot *slot;
  4701. +
  4702. +   spin_lock(&vma_slot_list_lock);
  4703. +   while (!list_empty(&vma_slot_del)) {
  4704. +       slot = list_entry(vma_slot_del.next,
  4705. +                 struct vma_slot, slot_list);
  4706. +       list_del(&slot->slot_list);
  4707. +       spin_unlock(&vma_slot_list_lock);
  4708. +       uksm_del_vma_slot(slot);
  4709. +       spin_lock(&vma_slot_list_lock);
  4710. +   }
  4711. +   spin_unlock(&vma_slot_list_lock);
  4712. +}
  4713. +
  4714. +static inline int rung_fully_scanned(struct scan_rung *rung)
  4715. +{
  4716. +   return (rung->fully_scanned_slots == rung->vma_num &&
  4717. +       rung->fully_scanned_slots);
  4718. +}
  4719. +
  4720. +/**
  4721. + * uksm_do_scan()  - the main worker function.
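+ *
+ * Walk the scan ladder from the highest rung down to the lowest. Each rung
+ * scans one page at a time from its current slot until its pages_to_scan
+ * quota is used up, skipping slots whose mm is busy or exiting. Once every
+ * rung has finished a round, the ladder is rebalanced and the unstable
+ * tree is reset.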
  4722. + */
  4723. +static void uksm_do_scan(void)
  4724. +{
  4725. +   struct vma_slot *slot, *iter;
  4726. +   struct list_head *next_scan, *iter_head;
  4727. +   struct mm_struct *busy_mm;
  4728. +   unsigned char round_finished, all_rungs_empty;
  4729. +   int i, err;
  4730. +   unsigned long rest_pages;
  4731. +
  4732. +   might_sleep();
  4733. +
  4734. +   rest_pages = 0;
  4735. +repeat_all:
  4736. +   for (i = uksm_scan_ladder_size - 1; i >= 0; i--) {
  4737. +       struct scan_rung *rung = &uksm_scan_ladder[i];
  4738. +
  4739. +       if (!rung->pages_to_scan)
  4740. +           continue;
  4741. +
  4742. +       if (list_empty(&rung->vma_list)) {
  4743. +           rung->pages_to_scan = 0;
  4744. +           continue;
  4745. +       }
  4746. +
  4747. +       /*
  4748. +        * If a higher rung has been fully scanned, its remaining pages
  4749. +        * should be propagated to the lower rungs. This prevents the
  4750. +        * higher rung from sitting on an unused pages_to_scan quota
  4751. +        * for a long time.
  4752. +        */
  4754. +       if (rung_fully_scanned(rung)) {
  4755. +           rest_pages += rung->pages_to_scan;
  4756. +           rung->pages_to_scan = 0;
  4757. +           continue;
  4758. +       }
  4759. +
  4760. +       rung->pages_to_scan += rest_pages;
  4761. +       rest_pages = 0;
  4762. +       while (rung->pages_to_scan && likely(!freezing(current))) {
  4763. +cleanup:
  4764. +           cleanup_vma_slots();
  4765. +
  4766. +           if (list_empty(&rung->vma_list))
  4767. +               break;
  4768. +
  4769. +rescan:
  4770. +           BUG_ON(rung->current_scan == &rung->vma_list &&
  4771. +                  !list_empty(&rung->vma_list));
  4772. +
  4773. +           slot = list_entry(rung->current_scan,
  4774. +                    struct vma_slot, uksm_list);
  4775. +
  4776. +
  4777. +           if (slot->flags & UKSM_SLOT_FUL_SCANNED)
  4778. +               goto next_scan;
  4779. +
  4780. +           err = try_down_read_slot_mmap_sem(slot);
  4781. +           if (err == -ENOENT)
  4782. +               goto cleanup;
  4783. +
  4784. +           busy_mm = slot->mm;
  4785. +
  4786. +busy:
  4787. +           if (err == -EBUSY) {
  4788. +               /* skip other vmas on the same mm */
  4789. +               rung->flags |= UKSM_RUNG_BUSY_SEARCHED;
  4790. +               iter = slot;
  4791. +               iter_head = slot->uksm_list.next;
  4792. +
  4793. +               while (iter_head != &rung->vma_list) {
  4794. +                   iter = list_entry(iter_head,
  4795. +                             struct vma_slot,
  4796. +                             uksm_list);
  4797. +                   if (iter->vma->vm_mm != busy_mm)
  4798. +                       break;
  4799. +                   iter_head = iter_head->next;
  4800. +               }
  4801. +
  4802. +               if (iter->vma->vm_mm != busy_mm) {
  4803. +                   rung->current_scan = &iter->uksm_list;
  4804. +                   goto rescan;
  4805. +               } else {
  4806. +                   /* at the end, but still busy */
  4807. +                   rung->current_scan = iter->uksm_list.next;
  4808. +                   goto next_scan;
  4810. +               }
  4811. +           }
  4812. +
  4813. +           BUG_ON(!vma_can_enter(slot->vma));
  4814. +           if (uksm_test_exit(slot->vma->vm_mm)) {
  4815. +               busy_mm = slot->vma->vm_mm;
  4816. +               up_read(&slot->vma->vm_mm->mmap_sem);
  4817. +               err = -EBUSY;
  4818. +               goto busy;
  4819. +           }
  4820. +
  4821. +           if (rung->flags & UKSM_RUNG_BUSY_SEARCHED)
  4822. +               rung->flags &= ~UKSM_RUNG_BUSY_SEARCHED;
  4823. +           /* OK, we have taken the mmap_sem, ready to scan */
  4824. +           scan_vma_one_page(slot);
  4825. +           up_read(&slot->vma->vm_mm->mmap_sem);
  4826. +           rung->pages_to_scan--;
  4827. +
  4828. +           if ((slot->pages_scanned &&
  4829. +                slot->pages_scanned % slot->pages_to_scan == 0)
  4830. +               || (slot->flags & UKSM_SLOT_FUL_SCANNED)) {
  4831. +next_scan:
  4832. +               next_scan = rung->current_scan->next;
  4833. +               if (next_scan == &rung->vma_list) {
  4834. +                   /*
  4835. +                    * All the slots in this rung
  4836. +                    * have been traveled in this
  4837. +                    * round.
  4838. +                    */
  4839. +                   rung->flags |= UKSM_RUNG_ROUND_FINISHED;
  4840. +                   rung->current_scan =
  4841. +                       rung->vma_list.next;
  4842. +                   if (rung_fully_scanned(rung) ||
  4843. +                       (rung->flags &
  4844. +                        UKSM_RUNG_BUSY_SEARCHED)) {
  4845. +                       /*
  4846. +                        * All the pages in all slots
  4847. +                        * have been scanned. Or we
  4848. +                        * did not make any progress
  4849. +                        * because of busy mm.
  4850. +                        */
  4851. +                       rest_pages +=
  4852. +                           rung->pages_to_scan;
  4853. +                       rung->pages_to_scan = 0;
  4854. +                       break;
  4855. +                   }
  4856. +               } else {
  4857. +                   rung->current_scan = next_scan;
  4858. +               }
  4859. +           }
  4860. +
  4861. +           cond_resched();
  4862. +       }
  4863. +
  4864. +       if (freezing(current))
  4865. +           break;
  4866. +   }
  4867. +
  4868. +   if (freezing(current))
  4869. +       return;
  4870. +
  4871. +   round_finished = 1;
  4872. +   all_rungs_empty = 1;
  4873. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4874. +       struct scan_rung *rung = &uksm_scan_ladder[i];
  4875. +
  4876. +       if (!list_empty(&rung->vma_list)) {
  4877. +           all_rungs_empty = 0;
  4878. +           if (!(rung->flags & UKSM_RUNG_ROUND_FINISHED))
  4879. +               round_finished = 0;
  4880. +           break;
  4881. +       }
  4882. +   }
  4883. +
  4884. +   if (all_rungs_empty)
  4885. +       round_finished = 0;
  4886. +
  4887. +   cleanup_vma_slots();
  4888. +
  4889. +   if (round_finished) {
  4890. +       round_update_ladder();
  4891. +
  4892. +       /*
  4893. +        * A number of pages can hang around indefinitely on per-cpu
  4894. +        * pagevecs, raised page count preventing write_protect_page
  4895. +        * from merging them.  Though it doesn't really matter much,
  4896. +        * it is puzzling to see some stuck in pages_volatile until
  4897. +        * other activity jostles them out, and they also prevented
  4898. +        * LTP's KSM test from succeeding deterministically; so drain
  4899. +        * them here (here rather than on entry to uksm_do_scan(),
  4900. +        * so we don't IPI too often when pages_to_scan is set low).
  4901. +        */
  4902. +       lru_add_drain_all();
  4903. +
  4904. +       /* sync with uksm_remove_vma for rb_erase */
  4905. +       uksm_scan_round++;
  4906. +       root_unstable_tree = RB_ROOT;
  4907. +       free_all_tree_nodes(&unstable_tree_node_list);
  4908. +   }
  4909. +
  4910. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4911. +       struct scan_rung *rung = &uksm_scan_ladder[i];
  4912. +
  4913. +       /*
  4914. +        * Before we can go sleep, we should make sure that all the
  4915. +        * pages_to_scan quota for this scan has been finished
  4916. +        */
  4917. +       if (!list_empty(&rung->vma_list) && rung->pages_to_scan)
  4918. +           goto repeat_all;
  4919. +   }
  4920. +
  4921. +   cal_ladder_pages_to_scan(uksm_scan_batch_pages);
  4922. +}
  4923. +
  4924. +static int ksmd_should_run(void)
  4925. +{
  4926. +   return uksm_run & UKSM_RUN_MERGE;
  4927. +}
  4928. +
  4929. +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
  4930. +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
  4931. +
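+/*
+ * Number of pool pages needed to hold one struct rmap_list_entry for every
+ * page covered by the vma.
+ */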
  4932. +static inline unsigned long vma_pool_size(struct vm_area_struct *vma)
  4933. +{
  4934. +   return round_up(sizeof(struct rmap_list_entry) * vma_pages(vma),
  4935. +           PAGE_SIZE) >> PAGE_SHIFT;
  4936. +}
  4937. +
  4938. +/**
  4939. + * uksm_vma_enter() - add a new vma_slot to the lowest rung of the scan
  4940. + * ladder and allocate its rmap_list pool.
  4941. + *
  4942. + * @slot: the vma_slot to be added
  4943. + *
  4944. + * Return: 1 on success, 0 on failure.
  4945. + */
  4946. +static int uksm_vma_enter(struct vma_slot *slot)
  4947. +{
  4948. +   struct scan_rung *rung;
  4949. +   unsigned long pages_to_scan, pool_size;
  4950. +
  4951. +   BUG_ON(slot->pages != vma_pages(slot->vma));
  4952. +   rung = &uksm_scan_ladder[0];
  4953. +
  4954. +   pages_to_scan = get_vma_random_scan_num(slot, rung->scan_ratio);
  4955. +   if (pages_to_scan) {
  4956. +       if (list_empty(&rung->vma_list))
  4957. +           rung->current_scan = &slot->uksm_list;
  4958. +       BUG_ON(!list_empty(&slot->uksm_list));
  4959. +
  4960. +       list_add(&slot->uksm_list, &rung->vma_list);
  4961. +       slot->rung = rung;
  4962. +       slot->pages_to_scan = pages_to_scan;
  4963. +       slot->rung->vma_num++;
  4964. +       BUG_ON(PAGE_SIZE % sizeof(struct rmap_list_entry) != 0);
  4965. +
  4966. +       pool_size = vma_pool_size(slot->vma);
  4967. +
  4968. +       slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
  4969. +                          pool_size, GFP_NOWAIT);
  4970. +       slot->pool_counts = kzalloc(sizeof(unsigned long) * pool_size,
  4971. +                       GFP_NOWAIT);
  4972. +       slot->pool_size = pool_size;
  4973. +       if (!slot->rmap_list_pool)
  4974. +           goto failed;
  4975. +
  4976. +       if (!slot->pool_counts) {
  4977. +           kfree(slot->rmap_list_pool);
  4978. +           goto failed;
  4979. +       }
  4980. +
  4981. +       BUG_ON(rung->current_scan == &rung->vma_list &&
  4982. +              !list_empty(&rung->vma_list));
  4983. +
  4984. +       uksm_vma_slot_num++;
  4985. +       BUG_ON(!uksm_vma_slot_num);
  4986. +       return 1;
  4987. +   }
  4988. +
  4989. +failed:
  4990. +   return 0;
  4991. +}
  4992. +
  4993. +
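+/*
+ * Move all newly created slots from vma_slot_new into the lowest rung of
+ * the scan ladder; slots that cannot be added are parked on vma_slot_noadd
+ * for their creators to delete.
+ */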
  4994. +static void uksm_enter_all_slots(void)
  4995. +{
  4996. +   struct vma_slot *slot;
  4997. +   int added;
  4998. +
  4999. +   spin_lock(&vma_slot_list_lock);
  5000. +   while (!list_empty(&vma_slot_new)) {
  5001. +       slot = list_entry(vma_slot_new.next,
  5002. +                 struct vma_slot, slot_list);
  5003. +       /*
  5004. +        * Slots are sorted by ctime_j; if one is found to be too
  5005. +        * young, stop scanning the remaining ones.
  5006. +        */
  5007. +       /*
  5008. +
  5009. +           if (time_before(jiffies, slot->ctime_j +
  5010. +                   msecs_to_jiffies(1000))) {
  5011. +               spin_unlock(&vma_slot_list_lock);
  5012. +               return;
  5013. +           }
  5014. +       */
  5015. +
  5016. +       list_del_init(&slot->slot_list);
  5017. +       added = 0;
  5018. +       if (vma_can_enter(slot->vma))
  5019. +           added = uksm_vma_enter(slot);
  5020. +
  5021. +       if (!added) {
  5022. +           /* Move to the no-add list, to be deleted by its creator */
  5023. +           slot->ctime_j = jiffies;
  5024. +           list_del(&slot->slot_list);
  5025. +           list_add_tail(&slot->slot_list, &vma_slot_noadd);
  5026. +       }
  5027. +       spin_unlock(&vma_slot_list_lock);
  5028. +       cond_resched();
  5029. +       spin_lock(&vma_slot_list_lock);
  5030. +   }
  5031. +   spin_unlock(&vma_slot_list_lock);
  5032. +}
  5033. +/*
  5034. +static int uksm_usr_support_init(void)
  5035. +{
  5036. +   int err = -EINVAL;
  5037. +
  5038. +   err = sock_create(AF_INET, SOCK_DGRAM, 0, &uksm_usr_spt_sock);
  5039. +   if (err < 0)
  5040. +       goto out;
  5041. +
  5042. +   uksm_usr_spt.randid = random32();
  5043. +
  5044. +out:
  5045. +   if ((err < 0) && uksm_usr_spt_sock) {
  5046. +       sock_release(uksm_usr_spt_sock);
  5047. +       uksm_usr_spt_sock = NULL;
  5048. +   }
  5049. +
  5050. +   return err;
  5051. +}
  5052. +
  5053. +static void uksm_usr_support(void)
  5054. +{
  5055. +   struct msghdr uksm_usr_spt_msg;
  5056. +   struct iovec uksm_usr_spt_iov;
  5057. +   int err;
  5058. +
  5059. +   if (!uksm_usr_spt_enabled)*/
  5060. +/*     goto out;
  5061. +
  5062. +   if (!uksm_usr_spt_last)
  5063. +       goto ok;
  5064. +*/
  5065. +   /* timer wrap around, reset uksm_usr_spt_last. */
  5066. +/* if (jiffies < uksm_usr_spt_last) {
  5067. +       uksm_usr_spt_last = jiffies;
  5068. +       goto out;
  5069. +   }
  5070. +
  5071. +   if (jiffies_to_msecs(jiffies - uksm_usr_spt_last) <
  5072. +       UKSM_USR_SPT_INTVL_MSEC)
  5073. +       goto out;
  5074. +
  5075. +ok:
  5076. +   if (uksm_usr_spt_sock == NULL) {
  5077. +       err = uksm_usr_support_init();
  5078. +       if (err < 0) {
  5079. +           printk(KERN_ERR "UKSM: uksm_usr_support"
  5080. +                   "init failed exiting.");
  5081. +           return;
  5082. +       }
  5083. +   }
  5084. +
  5085. +   uksm_usr_spt_addr.sin_family = AF_INET;
  5086. +   uksm_usr_spt_addr.sin_port = htons(UKSM_USR_SPT_PORT);
  5087. +   uksm_usr_spt_addr.sin_addr.s_addr = in_aton("114.212.190.16");
  5088. +
  5089. +   uksm_usr_spt_iov.iov_base = &uksm_usr_spt;
  5090. +   uksm_usr_spt_iov.iov_len = sizeof(uksm_usr_spt);
  5091. +   uksm_usr_spt_msg.msg_name = (struct sockaddr *)&uksm_usr_spt_addr;
  5092. +   uksm_usr_spt_msg.msg_iov = &uksm_usr_spt_iov;
  5093. +   uksm_usr_spt_msg.msg_iovlen = 1;
  5094. +   uksm_usr_spt_msg.msg_control = NULL;
  5095. +   uksm_usr_spt_msg.msg_controllen = 0;
  5096. +   uksm_usr_spt_msg.msg_namelen = sizeof(uksm_usr_spt_addr);
  5097. +   uksm_usr_spt_msg.msg_flags = MSG_DONTWAIT;
  5098. +
  5099. +   sock_sendmsg(uksm_usr_spt_sock, &uksm_usr_spt_msg, sizeof(uksm_usr_spt));
  5100. +   uksm_usr_spt_last = jiffies;*/
  5101. +/*out:
  5102. +   return;
  5103. +}
  5104. +*/
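+/*
+ * Main loop of the uksmd kernel thread: while merging is enabled, pick up
+ * new slots and run one scan pass, then sleep for uksm_sleep_jiffies;
+ * otherwise wait until woken through the run knob.
+ */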
  5105. +static int uksm_scan_thread(void *nothing)
  5106. +{
  5107. +   set_freezable();
  5108. +   set_user_nice(current, 5);
  5109. +
  5110. +   while (!kthread_should_stop()) {
  5111. +/*     uksm_usr_support();*/
  5112. +
  5113. +       mutex_lock(&uksm_thread_mutex);
  5114. +       if (ksmd_should_run()) {
  5115. +           uksm_enter_all_slots();
  5116. +           uksm_do_scan();
  5117. +       }
  5118. +       mutex_unlock(&uksm_thread_mutex);
  5119. +
  5120. +       try_to_freeze();
  5121. +
  5122. +       if (ksmd_should_run()) {
  5123. +           schedule_timeout_interruptible(uksm_sleep_jiffies);
  5124. +           uksm_sleep_times++;
  5125. +       } else {
  5126. +           wait_event_freezable(uksm_thread_wait,
  5127. +               ksmd_should_run() || kthread_should_stop());
  5128. +       }
  5129. +   }
  5130. +   return 0;
  5131. +}
  5132. +
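+/*
+ * rmap walk for a KSM page: visit every rmap_item hanging off the page's
+ * stable node and every vma that maps it, accumulating the referenced
+ * count returned by page_referenced_one().
+ */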
  5133. +int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
  5134. +           unsigned long *vm_flags)
  5135. +{
  5136. +   struct stable_node *stable_node;
  5137. +   struct node_vma *node_vma;
  5138. +   struct rmap_item *rmap_item;
  5139. +   struct hlist_node *hlist, *rmap_hlist;
  5140. +   unsigned int mapcount = page_mapcount(page);
  5141. +   int referenced = 0;
  5142. +   int search_new_forks = 0;
  5143. +   unsigned long address;
  5144. +
  5145. +   VM_BUG_ON(!PageKsm(page));
  5146. +   VM_BUG_ON(!PageLocked(page));
  5147. +
  5148. +   stable_node = page_stable_node(page);
  5149. +   if (!stable_node)
  5150. +       return 0;
  5151. +
  5152. +
  5153. +again:
  5154. +   hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  5155. +       hlist_for_each_entry(rmap_item, rmap_hlist,
  5156. +                    &node_vma->rmap_hlist, hlist) {
  5157. +           struct anon_vma *anon_vma = rmap_item->anon_vma;
  5158. +           struct anon_vma_chain *vmac;
  5159. +           struct vm_area_struct *vma;
  5160. +
  5161. +           anon_vma_lock(anon_vma);
  5162. +           list_for_each_entry(vmac, &anon_vma->head,
  5163. +                       same_anon_vma) {
  5164. +               vma = vmac->vma;
  5165. +               address = get_rmap_addr(rmap_item);
  5166. +
  5167. +               if (address < vma->vm_start ||
  5168. +                   address >= vma->vm_end)
  5169. +                   continue;
  5170. +               /*
  5171. +                * Initially we examine only the vma which
  5172. +                * covers this rmap_item; but later, if there
  5173. +                * is still work to do, we examine covering
  5174. +                * vmas in other mms: in case they were forked
  5175. +                * from the original since ksmd passed.
  5176. +                */
  5177. +               if ((rmap_item->slot->vma == vma) ==
  5178. +                   search_new_forks)
  5179. +                   continue;
  5180. +
  5181. +               if (memcg &&
  5182. +                   !mm_match_cgroup(vma->vm_mm, memcg))
  5183. +                   continue;
  5184. +
  5185. +               referenced +=
  5186. +                   page_referenced_one(page, vma,
  5187. +                       address, &mapcount, vm_flags);
  5188. +               if (!search_new_forks || !mapcount)
  5189. +                   break;
  5190. +           }
  5191. +
  5192. +           anon_vma_unlock(anon_vma);
  5193. +           if (!mapcount)
  5194. +               goto out;
  5195. +       }
  5196. +   }
  5197. +   if (!search_new_forks++)
  5198. +       goto again;
  5199. +out:
  5200. +   return referenced;
  5201. +}
  5202. +
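+/*
+ * Unmap a KSM page from every vma that maps it, walking the same
+ * stable_node -> node_vma -> rmap_item chains as page_referenced_ksm().
+ */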
  5203. +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
  5204. +{
  5205. +   struct stable_node *stable_node;
  5206. +   struct node_vma *node_vma;
  5207. +   struct hlist_node *hlist, *rmap_hlist;
  5208. +   struct rmap_item *rmap_item;
  5209. +   int ret = SWAP_AGAIN;
  5210. +   int search_new_forks = 0;
  5211. +   unsigned long address;
  5212. +
  5213. +   VM_BUG_ON(!PageKsm(page));
  5214. +   VM_BUG_ON(!PageLocked(page));
  5215. +
  5216. +   stable_node = page_stable_node(page);
  5217. +   if (!stable_node)
  5218. +       return SWAP_FAIL;
  5219. +again:
  5220. +   hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  5221. +       hlist_for_each_entry(rmap_item, rmap_hlist,
  5222. +                    &node_vma->rmap_hlist, hlist) {
  5223. +           struct anon_vma *anon_vma = rmap_item->anon_vma;
  5224. +           struct anon_vma_chain *vmac;
  5225. +           struct vm_area_struct *vma;
  5226. +
  5227. +           anon_vma_lock(anon_vma);
  5228. +           list_for_each_entry(vmac, &anon_vma->head,
  5229. +                       same_anon_vma) {
  5230. +               vma = vmac->vma;
  5231. +               address = get_rmap_addr(rmap_item);
  5232. +
  5233. +               if (address < vma->vm_start ||
  5234. +                   address >= vma->vm_end)
  5235. +                   continue;
  5236. +               /*
  5237. +                * Initially we examine only the vma which
  5238. +                * covers this rmap_item; but later, if there
  5239. +                * is still work to do, we examine covering
  5240. +                * vmas in other mms: in case they were forked
  5241. +                * from the original since ksmd passed.
  5242. +                */
  5243. +               if ((rmap_item->slot->vma == vma) ==
  5244. +                   search_new_forks)
  5245. +                   continue;
  5246. +
  5247. +               ret = try_to_unmap_one(page, vma,
  5248. +                              address, flags);
  5249. +               if (ret != SWAP_AGAIN || !page_mapped(page)) {
  5250. +                   anon_vma_unlock(anon_vma);
  5251. +                   goto out;
  5252. +               }
  5253. +           }
  5254. +           anon_vma_unlock(anon_vma);
  5255. +       }
  5256. +   }
  5257. +   if (!search_new_forks++)
  5258. +       goto again;
  5259. +out:
  5260. +   return ret;
  5261. +}
  5262. +
  5263. +#ifdef CONFIG_MIGRATION
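+/* Generic rmap walk over a KSM page, used by page migration. */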
  5264. +int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
  5265. +         struct vm_area_struct *, unsigned long, void *), void *arg)
  5266. +{
  5267. +   struct stable_node *stable_node;
  5268. +   struct node_vma *node_vma;
  5269. +   struct hlist_node *hlist, *rmap_hlist;
  5270. +   struct rmap_item *rmap_item;
  5271. +   int ret = SWAP_AGAIN;
  5272. +   int search_new_forks = 0;
  5273. +   unsigned long address;
  5274. +
  5275. +   VM_BUG_ON(!PageKsm(page));
  5276. +   VM_BUG_ON(!PageLocked(page));
  5277. +
  5278. +   stable_node = page_stable_node(page);
  5279. +   if (!stable_node)
  5280. +       return ret;
  5281. +again:
  5282. +   hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  5283. +       hlist_for_each_entry(rmap_item, rmap_hlist,
  5284. +                    &node_vma->rmap_hlist, hlist) {
  5285. +           struct anon_vma *anon_vma = rmap_item->anon_vma;
  5286. +           struct anon_vma_chain *vmac;
  5287. +           struct vm_area_struct *vma;
  5288. +
  5289. +           anon_vma_lock(anon_vma);
  5290. +           list_for_each_entry(vmac, &anon_vma->head,
  5291. +                       same_anon_vma) {
  5292. +               vma = vmac->vma;
  5293. +               address = get_rmap_addr(rmap_item);
  5294. +
  5295. +               if (address < vma->vm_start ||
  5296. +                   address >= vma->vm_end)
  5297. +                   continue;
  5298. +
  5299. +               if ((rmap_item->slot->vma == vma) ==
  5300. +                   search_new_forks)
  5301. +                   continue;
  5302. +
  5303. +               ret = rmap_one(page, vma, address, arg);
  5304. +               if (ret != SWAP_AGAIN) {
  5305. +                   anon_vma_unlock(anon_vma);
  5306. +                   goto out;
  5307. +               }
  5308. +           }
  5309. +           anon_vma_unlock(anon_vma);
  5310. +       }
  5311. +   }
  5312. +   if (!search_new_forks++)
  5313. +       goto again;
  5314. +out:
  5315. +   return ret;
  5316. +}
  5317. +
  5318. +/* Common KSM interface, but the implementation may be UKSM-specific. */
  5319. +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
  5320. +{
  5321. +   struct stable_node *stable_node;
  5322. +
  5323. +   VM_BUG_ON(!PageLocked(oldpage));
  5324. +   VM_BUG_ON(!PageLocked(newpage));
  5325. +   VM_BUG_ON(newpage->mapping != oldpage->mapping);
  5326. +
  5327. +   stable_node = page_stable_node(newpage);
  5328. +   if (stable_node) {
  5329. +       VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
  5330. +       stable_node->kpfn = page_to_pfn(newpage);
  5331. +   }
  5332. +}
  5333. +#endif /* CONFIG_MIGRATION */
  5334. +
  5335. +#ifdef CONFIG_MEMORY_HOTREMOVE
  5336. +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
  5337. +                        unsigned long end_pfn)
  5338. +{
  5339. +   struct rb_node *node;
  5340. +
  5341. +   for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
  5342. +       struct stable_node *stable_node;
  5343. +
  5344. +       stable_node = rb_entry(node, struct stable_node, node);
  5345. +       if (stable_node->kpfn >= start_pfn &&
  5346. +           stable_node->kpfn < end_pfn)
  5347. +           return stable_node;
  5348. +   }
  5349. +   return NULL;
  5350. +}
  5351. +
  5352. +static int uksm_memory_callback(struct notifier_block *self,
  5353. +                  unsigned long action, void *arg)
  5354. +{
  5355. +   struct memory_notify *mn = arg;
  5356. +   struct stable_node *stable_node;
  5357. +
  5358. +   switch (action) {
  5359. +   case MEM_GOING_OFFLINE:
  5360. +       /*
  5361. +        * Keep it very simple for now: just lock out ksmd and
  5362. +        * MADV_UNMERGEABLE while any memory is going offline.
  5363. +        * mutex_lock_nested() is necessary because lockdep was alarmed
  5364. +        * that here we take uksm_thread_mutex inside notifier chain
  5365. +        * mutex, and later take notifier chain mutex inside
  5366. +        * uksm_thread_mutex to unlock it.   But that's safe because both
  5367. +        * are inside mem_hotplug_mutex.
  5368. +        */
  5369. +       mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
  5370. +       break;
  5371. +
  5372. +   case MEM_OFFLINE:
  5373. +       /*
  5374. +        * Most of the work is done by page migration; but there might
  5375. +        * be a few stable_nodes left over, still pointing to struct
  5376. +        * pages which have been offlined: prune those from the tree.
  5377. +        */
  5378. +       while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
  5379. +                   mn->start_pfn + mn->nr_pages)) != NULL)
  5380. +           remove_node_from_stable_tree(stable_node, 1, 1);
  5381. +       /* fallthrough */
  5382. +
  5383. +   case MEM_CANCEL_OFFLINE:
  5384. +       mutex_unlock(&uksm_thread_mutex);
  5385. +       break;
  5386. +   }
  5387. +   return NOTIFY_OK;
  5388. +}
  5389. +#endif /* CONFIG_MEMORY_HOTREMOVE */
  5390. +
  5391. +#ifdef CONFIG_SYSFS
  5392. +/*
  5393. + * This all compiles without CONFIG_SYSFS, but is a waste of space.
  5394. + */
  5395. +
  5396. +#define UKSM_ATTR_RO(_name) \
  5397. +   static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  5398. +#define UKSM_ATTR(_name) \
  5399. +   static struct kobj_attribute _name##_attr = \
  5400. +       __ATTR(_name, 0644, _name##_show, _name##_store)
  5401. +
  5402. +static ssize_t sleep_millisecs_show(struct kobject *kobj,
  5403. +                   struct kobj_attribute *attr, char *buf)
  5404. +{
  5405. +   return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
  5406. +}
  5407. +
  5408. +static ssize_t sleep_millisecs_store(struct kobject *kobj,
  5409. +                    struct kobj_attribute *attr,
  5410. +                    const char *buf, size_t count)
  5411. +{
  5412. +   unsigned long msecs;
  5413. +   int err;
  5414. +
  5415. +   err = strict_strtoul(buf, 10, &msecs);
  5416. +   if (err || msecs > UINT_MAX)
  5417. +       return -EINVAL;
  5418. +
  5419. +   uksm_sleep_jiffies = msecs_to_jiffies(msecs);
  5420. +
  5421. +   return count;
  5422. +}
  5423. +UKSM_ATTR(sleep_millisecs);
  5424. +
  5425. +static ssize_t min_scan_ratio_show(struct kobject *kobj,
  5426. +                   struct kobj_attribute *attr, char *buf)
  5427. +{
  5428. +   return sprintf(buf, "%u\n", uksm_min_scan_ratio);
  5429. +}
  5430. +
  5431. +static ssize_t min_scan_ratio_store(struct kobject *kobj,
  5432. +                    struct kobj_attribute *attr,
  5433. +                    const char *buf, size_t count)
  5434. +{
  5435. +   unsigned long msr;
  5436. +   int err;
  5437. +
  5438. +   err = strict_strtoul(buf, 10, &msr);
  5439. +   if (err || msr > UINT_MAX)
  5440. +       return -EINVAL;
  5441. +
  5442. +   uksm_min_scan_ratio = msr;
  5443. +
  5444. +   return count;
  5445. +}
  5446. +UKSM_ATTR(min_scan_ratio);
  5447. +
  5448. +static ssize_t scan_batch_pages_show(struct kobject *kobj,
  5449. +                 struct kobj_attribute *attr, char *buf)
  5450. +{
  5451. +   return sprintf(buf, "%lu\n", uksm_scan_batch_pages);
  5452. +}
  5453. +
  5454. +static ssize_t scan_batch_pages_store(struct kobject *kobj,
  5455. +                  struct kobj_attribute *attr,
  5456. +                  const char *buf, size_t count)
  5457. +{
  5458. +   int err;
  5459. +   unsigned long batch_pages;
  5460. +
  5461. +   err = strict_strtoul(buf, 10, &batch_pages);
  5462. +   if (err || batch_pages > UINT_MAX)
  5463. +       return -EINVAL;
  5464. +
  5465. +   uksm_scan_batch_pages = batch_pages;
  5466. +   cal_ladder_pages_to_scan(uksm_scan_batch_pages);
  5467. +
  5468. +   return count;
  5469. +}
  5470. +UKSM_ATTR(scan_batch_pages);
  5471. +
  5472. +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
  5473. +           char *buf)
  5474. +{
  5475. +   return sprintf(buf, "%u\n", uksm_run);
  5476. +}
  5477. +
  5478. +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
  5479. +            const char *buf, size_t count)
  5480. +{
  5481. +   int err;
  5482. +   unsigned long flags;
  5483. +
  5484. +   err = strict_strtoul(buf, 10, &flags);
  5485. +   if (err || flags > UINT_MAX)
  5486. +       return -EINVAL;
  5487. +   if (flags > UKSM_RUN_MERGE)
  5488. +       return -EINVAL;
  5489. +
  5490. +   mutex_lock(&uksm_thread_mutex);
  5491. +   if (uksm_run != flags)
  5492. +       uksm_run = flags;
  5494. +   mutex_unlock(&uksm_thread_mutex);
  5495. +
  5496. +   if (flags & UKSM_RUN_MERGE)
  5497. +       wake_up_interruptible(&uksm_thread_wait);
  5498. +
  5499. +   return count;
  5500. +}
  5501. +UKSM_ATTR(run);
  5502. +
  5503. +static ssize_t usr_spt_enabled_show(struct kobject *kobj, struct kobj_attribute *attr,
  5504. +           char *buf)
  5505. +{
  5506. +   return sprintf(buf, "%u\n", uksm_usr_spt_enabled);
  5507. +}
  5508. +
  5509. +static ssize_t usr_spt_enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
  5510. +            const char *buf, size_t count)
  5511. +{
  5512. +   int err;
  5513. +   unsigned long flags;
  5514. +
  5515. +   err = strict_strtoul(buf, 10, &flags);
  5516. +   if (err || flags > UINT_MAX)
  5517. +       return -EINVAL;
  5518. +
  5519. +   uksm_usr_spt_enabled = !!flags;
  5520. +
  5521. +   return count;
  5522. +}
  5523. +UKSM_ATTR(usr_spt_enabled);
  5524. +
  5525. +/* ------ */
  5526. +static ssize_t usr_spt_msg_show(struct kobject *kobj, struct kobj_attribute *attr,
  5527. +           char *buf)
  5528. +{
  5529. +   return sprintf(buf, "%s\n", uksm_usr_spt.usr_str);
  5530. +}
  5531. +
  5532. +static ssize_t usr_spt_msg_store(struct kobject *kobj, struct kobj_attribute *attr,
  5533. +            const char *buf, size_t count)
  5534. +{
  5535. +   int s;
  5536. +
  5537. +   s = sizeof(uksm_usr_spt.usr_str);
  5538. +   strncpy(uksm_usr_spt.usr_str, buf, s);
  5539. +   uksm_usr_spt.usr_str[s - 1] = '\0';
  5540. +
  5541. +   return count;
  5542. +}
  5543. +UKSM_ATTR(usr_spt_msg);
  5544. +
  5545. +static ssize_t usr_spt_flags_show(struct kobject *kobj, struct kobj_attribute *attr,
  5546. +           char *buf)
  5547. +{
  5548. +   return sprintf(buf, "%x\n", uksm_usr_spt.flags);
  5549. +}
  5550. +
  5551. +static ssize_t usr_spt_flags_store(struct kobject *kobj, struct kobj_attribute *attr,
  5552. +            const char *buf, size_t count)
  5553. +{
  5554. +   int err;
  5555. +   unsigned long flags;
  5556. +
  5557. +   err = strict_strtoul(buf, 10, &flags);
  5558. +   if (err || flags > (u8)(~0U))
  5559. +       return -EINVAL;
  5560. +
  5561. +   uksm_usr_spt.flags = (unsigned char) flags;
  5562. +
  5563. +   return count;
  5564. +}
  5565. +UKSM_ATTR(usr_spt_flags);
  5566. +
  5567. +/* ------ */
  5568. +
  5569. +static ssize_t thrash_threshold_show(struct kobject *kobj,
  5570. +                    struct kobj_attribute *attr, char *buf)
  5571. +{
  5572. +   return sprintf(buf, "%u\n", uksm_thrash_threshold);
  5573. +}
  5574. +
  5575. +static ssize_t thrash_threshold_store(struct kobject *kobj,
  5576. +                     struct kobj_attribute *attr,
  5577. +                     const char *buf, size_t count)
  5578. +{
  5579. +   int err;
  5580. +   unsigned long flags;
  5581. +
  5582. +   err = strict_strtoul(buf, 10, &flags);
  5583. +   if (err || flags > 99)
  5584. +       return -EINVAL;
  5585. +
  5586. +   uksm_thrash_threshold = flags;
  5587. +
  5588. +   return count;
  5589. +}
  5590. +UKSM_ATTR(thrash_threshold);
  5591. +
  5592. +static ssize_t pages_shared_show(struct kobject *kobj,
  5593. +                struct kobj_attribute *attr, char *buf)
  5594. +{
  5595. +   return sprintf(buf, "%lu\n", uksm_pages_shared);
  5596. +}
  5597. +UKSM_ATTR_RO(pages_shared);
  5598. +
  5599. +static ssize_t pages_sharing_show(struct kobject *kobj,
  5600. +                 struct kobj_attribute *attr, char *buf)
  5601. +{
  5602. +   return sprintf(buf, "%lu\n", uksm_pages_sharing);
  5603. +}
  5604. +UKSM_ATTR_RO(pages_sharing);
  5605. +
  5606. +static ssize_t pages_unshared_show(struct kobject *kobj,
  5607. +                  struct kobj_attribute *attr, char *buf)
  5608. +{
  5609. +   return sprintf(buf, "%lu\n", uksm_pages_unshared);
  5610. +}
  5611. +UKSM_ATTR_RO(pages_unshared);
  5612. +
  5613. +static ssize_t full_scans_show(struct kobject *kobj,
  5614. +                  struct kobj_attribute *attr, char *buf)
  5615. +{
  5616. +   return sprintf(buf, "%llu\n", uksm_scan_round);
  5617. +}
  5618. +UKSM_ATTR_RO(full_scans);
  5619. +
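+/*
+ * The scanned-pages counter is kept scaled by a power of two to avoid u64
+ * overflow; when scaling has happened it is reported as "<value> * 2^<base>".
+ */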
  5620. +static ssize_t pages_scanned_show(struct kobject *kobj,
  5621. +                 struct kobj_attribute *attr, char *buf)
  5622. +{
  5623. +   unsigned long base = 0;
  5624. +   u64 delta, ret;
  5625. +
  5626. +   if (pages_scanned_stored) {
  5627. +       base = pages_scanned_base;
  5628. +       ret = pages_scanned_stored;
  5629. +       delta = uksm_pages_scanned >> base;
  5630. +       if (CAN_OVERFLOW_U64(ret, delta)) {
  5631. +           ret >>= 1;
  5632. +           delta >>= 1;
  5633. +           base++;
  5634. +           ret += delta;
  5635. +       }
  5636. +   } else {
  5637. +       ret = uksm_pages_scanned;
  5638. +   }
  5639. +
  5640. +   while (ret > ULONG_MAX) {
  5641. +       ret >>= 1;
  5642. +       base++;
  5643. +   }
  5644. +
  5645. +   if (base)
  5646. +       return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
  5647. +   else
  5648. +       return sprintf(buf, "%lu\n", (unsigned long)ret);
  5649. +}
  5650. +UKSM_ATTR_RO(pages_scanned);
  5651. +
  5652. +static ssize_t hash_strength_show(struct kobject *kobj,
  5653. +                 struct kobj_attribute *attr, char *buf)
  5654. +{
  5655. +   return sprintf(buf, "%lu\n", hash_strength);
  5656. +}
  5657. +UKSM_ATTR_RO(hash_strength);
  5658. +
  5659. +static ssize_t sleep_times_show(struct kobject *kobj,
  5660. +                 struct kobj_attribute *attr, char *buf)
  5661. +{
  5662. +   return sprintf(buf, "%llu\n", uksm_sleep_times);
  5663. +}
  5664. +UKSM_ATTR_RO(sleep_times);
  5665. +
  5666. +
  5667. +static struct attribute *uksm_attrs[] = {
  5668. +   &sleep_millisecs_attr.attr,
  5669. +   &scan_batch_pages_attr.attr,
  5670. +   &run_attr.attr,
  5671. +   &usr_spt_enabled_attr.attr,
  5672. +   &usr_spt_msg_attr.attr,
  5673. +   &usr_spt_flags_attr.attr,
  5674. +   &pages_shared_attr.attr,
  5675. +   &pages_sharing_attr.attr,
  5676. +   &pages_unshared_attr.attr,
  5677. +   &full_scans_attr.attr,
  5678. +   &min_scan_ratio_attr.attr,
  5679. +   &pages_scanned_attr.attr,
  5680. +   &hash_strength_attr.attr,
  5681. +   &sleep_times_attr.attr,
  5682. +   &thrash_threshold_attr.attr,
  5683. +   NULL,
  5684. +};
  5685. +
  5686. +static struct attribute_group uksm_attr_group = {
  5687. +   .attrs = uksm_attrs,
  5688. +   .name = "uksm",
  5689. +};
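+/*
+ * The attributes above are exposed under /sys/kernel/mm/uksm/ (the group is
+ * registered on mm_kobj with the name "uksm"). As a rough usage sketch,
+ * assuming sysfs is mounted at /sys and that UKSM_RUN_MERGE is 1 as in ksm.c:
+ *
+ *   echo 1  > /sys/kernel/mm/uksm/run              # let uksmd merge pages
+ *   echo 20 > /sys/kernel/mm/uksm/sleep_millisecs  # scan interval in ms
+ *   cat /sys/kernel/mm/uksm/pages_sharing          # merge statistics
+ */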
  5690. +#endif /* CONFIG_SYSFS */
  5691. +
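+/*
+ * Initialize every rung of the scan ladder: scan_ratio grows geometrically
+ * from uksm_min_scan_ratio by a factor of uksm_scan_ratio_delta per rung.
+ */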
  5692. +static inline void init_scan_ladder(void)
  5693. +{
  5694. +   int i;
  5695. +   unsigned long mul = 1;
  5696. +
  5697. +   for (i = 0; i < uksm_scan_ladder_size; i++,
  5698. +         mul *= uksm_scan_ratio_delta) {
  5699. +
  5700. +       uksm_scan_ladder[i].scan_ratio = uksm_min_scan_ratio * mul;
  5701. +       INIT_LIST_HEAD(&uksm_scan_ladder[i].vma_list);
  5702. +       uksm_scan_ladder[i].vma_num = 0;
  5703. +       uksm_scan_ladder[i].fully_scanned_slots = 0;
  5704. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
  5705. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_BUSY_SEARCHED;
  5706. +   }
  5707. +
  5708. +   cal_ladder_pages_to_scan(uksm_scan_batch_pages);
  5709. +}
  5710. +
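+/*
+ * Calibrate memcmp_cost: time full-strength hashing against memcmp() on two
+ * pages that differ only in their last byte, and record the cost of a whole
+ * page comparison relative to the per-sample hashing cost.
+ */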
  5711. +static inline int cal_positive_negative_costs(void)
  5712. +{
  5713. +   struct page *p1, *p2;
  5714. +   unsigned char *addr1, *addr2;
  5715. +   unsigned long i, time_start, hash_cost;
  5716. +   unsigned long loopnum = 0;
  5717. +
  5718. +   /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
  5719. +   volatile u32 hash;
  5720. +   volatile int ret;
  5721. +
  5722. +   p1 = alloc_page(GFP_KERNEL);
  5723. +   if (!p1)
  5724. +       return -ENOMEM;
  5725. +
  5726. +   p2 = alloc_page(GFP_KERNEL);
  5727. +   if (!p2) {
  5728. +       __free_page(p1);
+       return -ENOMEM;
+   }
  5729. +
  5730. +   addr1 = kmap_atomic(p1, KM_USER0);
  5731. +   addr2 = kmap_atomic(p2, KM_USER1);
  5732. +   memset(addr1, random32(), PAGE_SIZE);
  5733. +   memcpy(addr2, addr1, PAGE_SIZE);
  5734. +
  5735. +   /* make sure that the two pages differ in last byte */
  5736. +   addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
  5737. +   kunmap_atomic(addr2, KM_USER1);
  5738. +   kunmap_atomic(addr1, KM_USER0);
  5739. +
  5740. +   time_start = jiffies;
  5741. +   while (jiffies - time_start < 100) {
  5742. +       for (i = 0; i < 100; i++)
  5743. +           hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
  5744. +       loopnum += 100;
  5745. +   }
  5746. +   hash_cost = (jiffies - time_start);
  5747. +
  5748. +   time_start = jiffies;
  5749. +   for (i = 0; i < loopnum; i++)
  5750. +       ret = pages_identical(p1, p2);
  5751. +   memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
  5752. +   memcmp_cost /= hash_cost;
  5753. +   printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
  5754. +            "hash=%u cmp_ret=%d.\n",
  5755. +          memcmp_cost, hash, ret);
  5756. +
  5757. +   __free_page(p1);
  5758. +   __free_page(p2);
  5759. +   return 0;
  5760. +}
  5761. +
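+/*
+ * Precompute the hash of an all-zero page at every hash strength so that
+ * full zero pages can be recognized at whatever strength is in use.
+ */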
  5762. +static int init_zeropage_hash_table(void)
  5763. +{
  5764. +   struct page *page;
  5765. +   char *addr;
  5766. +   int i;
  5767. +
  5768. +   page = alloc_page(GFP_KERNEL);
  5769. +   if (!page)
  5770. +       return -ENOMEM;
  5771. +
  5772. +   addr = kmap_atomic(page, KM_USER0);
  5773. +   memset(addr, 0, PAGE_SIZE);
  5774. +   kunmap_atomic(addr, KM_USER0);
  5775. +
  5776. +   zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
  5777. +       GFP_KERNEL);
  5778. +   if (!zero_hash_table) {
  5779. +       __free_page(page);
+       return -ENOMEM;
+   }
  5780. +
  5781. +   for (i = 0; i < HASH_STRENGTH_MAX; i++)
  5782. +       zero_hash_table[i] = page_hash(page, i, 0);
  5783. +
  5784. +   __free_page(page);
  5785. +
  5786. +   return 0;
  5787. +}
  5788. +
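+/*
+ * Build random_nums as a random permutation of [0, HASH_STRENGTH_FULL)
+ * using a Fisher-Yates shuffle (it drives which parts of a page are sampled
+ * at reduced hash strength), reset rshash_state, and run the hash/memcmp
+ * cost calibration.
+ */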
  5789. +static inline int init_random_sampling(void)
  5790. +{
  5791. +   unsigned long i;
  5792. +   random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
  5793. +   if (!random_nums)
  5794. +       return -ENOMEM;
  5795. +
  5796. +   for (i = 0; i < HASH_STRENGTH_FULL; i++)
  5797. +       random_nums[i] = i;
  5798. +
  5799. +   for (i = 0; i < HASH_STRENGTH_FULL; i++) {
  5800. +       unsigned long rand_range, swap_index, tmp;
  5801. +
  5802. +       rand_range = HASH_STRENGTH_FULL - i;
  5803. +       swap_index = i + random32() % rand_range;
  5804. +       tmp = random_nums[i];
  5805. +       random_nums[i] =  random_nums[swap_index];
  5806. +       random_nums[swap_index] = tmp;
  5807. +   }
  5808. +
  5809. +   rshash_state.state = RSHASH_NEW;
  5810. +   rshash_state.below_count = 0;
  5811. +   rshash_state.lookup_window_index = 0;
  5812. +
  5813. +   return cal_positive_negative_costs();
  5814. +}
  5815. +
  5816. +static int __init uksm_slab_init(void)
  5817. +{
  5818. +   rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
  5819. +   if (!rmap_item_cache)
  5820. +       goto out;
  5821. +
  5822. +   stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
  5823. +   if (!stable_node_cache)
  5824. +       goto out_free1;
  5825. +
  5826. +   node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
  5827. +   if (!node_vma_cache)
  5828. +       goto out_free2;
  5829. +
  5830. +   vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
  5831. +   if (!vma_slot_cache)
  5832. +       goto out_free3;
  5833. +
  5834. +   tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
  5835. +   if (!tree_node_cache)
  5836. +       goto out_free4;
  5837. +
  5838. +   return 0;
  5839. +
  5840. +out_free4:
  5841. +   kmem_cache_destroy(vma_slot_cache);
  5842. +out_free3:
  5843. +   kmem_cache_destroy(node_vma_cache);
  5844. +out_free2:
  5845. +   kmem_cache_destroy(stable_node_cache);
  5846. +out_free1:
  5847. +   kmem_cache_destroy(rmap_item_cache);
  5848. +out:
  5849. +   return -ENOMEM;
  5850. +}
  5851. +
  5852. +static void __init uksm_slab_free(void)
  5853. +{
  5854. +   kmem_cache_destroy(stable_node_cache);
  5855. +   kmem_cache_destroy(rmap_item_cache);
  5856. +   kmem_cache_destroy(node_vma_cache);
  5857. +   kmem_cache_destroy(vma_slot_cache);
  5858. +   kmem_cache_destroy(tree_node_cache);
  5859. +}
  5860. +
  5861. +/* Common interface shared with KSM, but the behaviour here differs. */
  5862. +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
  5863. +       unsigned long end, int advice, unsigned long *vm_flags)
  5864. +{
  5865. +   int err;
  5866. +
  5867. +   switch (advice) {
  5868. +   case MADV_MERGEABLE:
  5869. +       return 0;       /* just ignore the advice */
  5870. +
  5871. +   case MADV_UNMERGEABLE:
  5872. +       if (!(*vm_flags & VM_MERGEABLE))
  5873. +           return 0;       /* just ignore the advice */
  5874. +
  5875. +       if (vma->anon_vma) {
  5876. +           err = unmerge_uksm_pages(vma, start, end);
  5877. +           if (err)
  5878. +               return err;
  5879. +       }
  5880. +
  5881. +       uksm_remove_vma(vma);
  5882. +       *vm_flags &= ~VM_MERGEABLE;
  5883. +       break;
  5884. +   }
  5885. +
  5886. +   return 0;
  5887. +}
  5888. +
  5889. +/* Common interface shared with KSM; identical to the KSM version. */
  5890. +struct page *ksm_does_need_to_copy(struct page *page,
  5891. +           struct vm_area_struct *vma, unsigned long address)
  5892. +{
  5893. +   struct page *new_page;
  5894. +
  5895. +   new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  5896. +   if (new_page) {
  5897. +       copy_user_highpage(new_page, page, address, vma);
  5898. +
  5899. +       SetPageDirty(new_page);
  5900. +       __SetPageUptodate(new_page);
  5901. +       SetPageSwapBacked(new_page);
  5902. +       __set_page_locked(new_page);
  5903. +
  5904. +       if (page_evictable(new_page, vma))
  5905. +           lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
  5906. +       else
  5907. +           add_page_to_unevictable_list(new_page);
  5908. +   }
  5909. +
  5910. +   return new_page;
  5911. +}
  5912. +
  5913. +static int __init uksm_init(void)
  5914. +{
  5915. +   struct task_struct *uksm_thread;
  5916. +   int err;
  5917. +   unsigned int sr = uksm_min_scan_ratio;
  5918. +
  5919. +   uksm_sleep_jiffies = msecs_to_jiffies(10);
  5920. +
  5921. +   uksm_scan_ladder_size = 1;
  5922. +   while (sr < UKSM_SCAN_RATIO_MAX) {
  5923. +       sr *= uksm_scan_ratio_delta;
  5924. +       uksm_scan_ladder_size++;
  5925. +   }
  5926. +   uksm_scan_ladder = kzalloc(sizeof(struct scan_rung) *
  5927. +                 uksm_scan_ladder_size, GFP_KERNEL);
  5928. +   if (!uksm_scan_ladder) {
  5929. +       printk(KERN_ERR "UKSM: scan ladder allocation failed, size=%d\n",
  5930. +              uksm_scan_ladder_size);
  5931. +       err = -ENOMEM;
  5932. +       goto out;
  5933. +   }
  5934. +   init_scan_ladder();
  5935. +
  5936. +   INIT_RADIX_TREE(&uksm_vma_tree, GFP_KERNEL);
  5937. +
  5938. +   err = init_random_sampling();
  5939. +   if (err)
  5940. +       goto out_free2;
  5941. +
  5942. +   err = uksm_slab_init();
  5943. +   if (err)
  5944. +       goto out_free1;
  5945. +
  5946. +   err = init_zeropage_hash_table();
  5947. +   if (err)
  5948. +       goto out_free0;
  5949. +
  5950. +   uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
  5951. +   if (IS_ERR(uksm_thread)) {
  5952. +       printk(KERN_ERR "uksm: creating kthread failed\n");
  5953. +       err = PTR_ERR(uksm_thread);
  5954. +       goto out_free;
  5955. +   }
  5956. +
  5957. +#ifdef CONFIG_SYSFS
  5958. +   err = sysfs_create_group(mm_kobj, &uksm_attr_group);
  5959. +   if (err) {
  5960. +       printk(KERN_ERR "uksm: register sysfs failed\n");
  5961. +       kthread_stop(uksm_thread);
  5962. +       goto out_free;
  5963. +   }
  5964. +#else
  5965. +   uksm_run = UKSM_RUN_MERGE;  /* no way for user to start it */
  5966. +
  5967. +#endif /* CONFIG_SYSFS */
  5968. +
  5969. +#ifdef CONFIG_MEMORY_HOTREMOVE
  5970. +   /*
  5971. +    * Choose a high priority since the callback takes uksm_thread_mutex:
  5972. +    * later callbacks could only be taking locks which nest within that.
  5973. +    */
  5974. +   hotplug_memory_notifier(uksm_memory_callback, 100);
  5975. +#endif
  5976. +   return 0;
  5977. +
  5978. +out_free:
  5979. +   kfree(zero_hash_table);
  5980. +out_free0:
  5981. +   uksm_slab_free();
  5982. +out_free1:
  5983. +   kfree(random_nums);
  5984. +out_free2:
  5985. +   kfree(uksm_scan_ladder);
  5986. +out:
  5987. +   return err;
  5988. +}
  5989. +
  5990. +#ifdef MODULE
  5991. +module_init(uksm_init)
  5992. +#else
  5993. +late_initcall(uksm_init);
  5994. +#endif
  5995. +