uksm-0.1.1.1-for-v3.4.0.patch

  1. diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
  2. index 5481c8b..7141876 100644
  3. --- a/Documentation/vm/00-INDEX
  4. +++ b/Documentation/vm/00-INDEX
  5. @@ -14,6 +14,8 @@ hwpoison.txt
  6.     - explains what hwpoison is
  7.  ksm.txt
  8.     - how to use the Kernel Samepage Merging feature.
  9. +uksm.txt
  10. +   - Introduction to Ultra KSM
  11.  locking
  12.     - info on how locking and synchronization is done in the Linux vm code.
  13.  map_hugetlb.c
  14. diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
  15. new file mode 100644
  16. index 0000000..e18f6d8
  17. --- /dev/null
  18. +++ b/Documentation/vm/uksm.txt
  19. @@ -0,0 +1,50 @@
  20. +The Ultra Kernel Samepage Merging feature
  21. +----------------------------------------------
  22. +/*
  23. + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
  24. + *
  25. + * This is an improvement upon KSM. Some basic data structures and routines
  26. + * are borrowed from ksm.c .
  27. + *
  28. + * Its new features:
  29. + * 1. Full system scan:
  30. + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
  31. + *      interaction to submit a memory area to KSM is no longer needed.
  32. + *
  33. + * 2. Rich area detection:
  34. + *      It automatically detects rich areas containing abundant duplicated
  35. + *      pages. Rich areas are scanned at full speed; poor areas are
  36. + *      sampled at a reduced speed with very low CPU consumption.
  37. + *
  38. + * 3. Ultra Per-page scan speed improvement:
  39. + *      A new hash algorithm is proposed. As a result, on a machine with
  40. + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
  41. + *      can scan memory areas that do not contain duplicated pages at a speed of
  42. + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
  43. + *      477MB/sec ~ 923MB/sec.
  44. + *
  45. + * 4. Thrashing area avoidance:
  46. + *      A thrashing area (a VMA with frequent KSM page break-outs) can be
  47. + *      filtered out. Benchmarks show this is more efficient than KSM's per-page
  48. + *      hash-value-based volatile page detection.
  49. + *
  50. + *
  51. + * 5. Misc changes upon KSM:
  52. + *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
  53. + *        comparison. It's much faster than the default C version on x86.
  54. + *      * rmap_item now has a struct page *page member to loosely cache an
  55. + *        address->page mapping, which avoids many costly calls to
  56. + *        follow_page().
  57. + *      * The VMA creation/exit procedures are hooked so Ultra KSM is notified.
  58. + *      * try_to_merge_two_pages() can now revert a pte if it fails. No break_
  59. + *        ksm is needed in this case.
  60. + *
  61. + * 6. Full zero page consideration (contributed by Figo Zhang)
  62. + *    uksmd now treats full zero pages as special pages and merges them into a
  63. + *    special unswappable uksm zero page.
  64. + */
  65. +
  66. +ChangeLog:
  67. +
  68. +2012-05-05 Creation of this document
  69. +2012-05-08 UKSM 0.1.1.1: libc crash bug fix, API cleanup, doc cleanup.
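
As a rough userspace illustration of point 1 in the documentation above (this sketch is not part of the patch; the only kernel interface it uses is madvise(2) with MADV_MERGEABLE, the opt-in that legacy KSM requires and that UKSM makes unnecessary):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 64UL << 20;    /* 64 MB of anonymous memory */
    void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Legacy KSM: the region must be explicitly marked mergeable. */
    if (madvise(buf, len, MADV_MERGEABLE))
        perror("madvise(MADV_MERGEABLE)");

    /* UKSM: nothing to do -- uksmd finds this anonymous VMA by itself. */
    return 0;
}
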
  70. diff --git a/fs/exec.c b/fs/exec.c
  71. index ae42277..c1c65bc 100644
  72. --- a/fs/exec.c
  73. +++ b/fs/exec.c
  74. @@ -55,6 +55,7 @@
  75.  #include <linux/pipe_fs_i.h>
  76.  #include <linux/oom.h>
  77.  #include <linux/compat.h>
  78. +#include <linux/ksm.h>
  79.  
  80.  #include <asm/uaccess.h>
  81.  #include <asm/mmu_context.h>
  82. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
  83. index 80e4645..33f9e9b 100644
  84. --- a/fs/proc/meminfo.c
  85. +++ b/fs/proc/meminfo.c
  86. @@ -87,6 +87,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
  87.         "SUnreclaim:     %8lu kB\n"
  88.         "KernelStack:    %8lu kB\n"
  89.         "PageTables:     %8lu kB\n"
  90. +#ifdef CONFIG_UKSM
  91. +       "KsmZeroPages:   %8lu kB\n"
  92. +#endif
  93.  #ifdef CONFIG_QUICKLIST
  94.         "Quicklists:     %8lu kB\n"
  95.  #endif
  96. @@ -146,6 +149,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
  97.         K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
  98.         global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
  99.         K(global_page_state(NR_PAGETABLE)),
  100. +#ifdef CONFIG_UKSM
  101. +       K(global_page_state(NR_UKSM_ZERO_PAGES)),
  102. +#endif
  103.  #ifdef CONFIG_QUICKLIST
  104.         K(quicklist_total_size()),
  105.  #endif
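
For reference, a minimal userspace sketch (not from the patch) that reads back the KsmZeroPages counter the hunk above adds to /proc/meminfo when CONFIG_UKSM is enabled:

#include <stdio.h>
#include <string.h>

int main(void)
{
    char line[256];
    FILE *f = fopen("/proc/meminfo", "r");

    if (!f) {
        perror("fopen");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        if (!strncmp(line, "KsmZeroPages:", 13)) {
            fputs(line, stdout);    /* e.g. "KsmZeroPages:     1234 kB" */
            break;
        }
    }
    fclose(f);
    return 0;
}
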
  106. diff --git a/include/linux/ksm.h b/include/linux/ksm.h
  107. index 3319a69..f4edf33 100644
  108. --- a/include/linux/ksm.h
  109. +++ b/include/linux/ksm.h
  110. @@ -22,21 +22,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
  111.  #ifdef CONFIG_KSM
  112.  int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
  113.         unsigned long end, int advice, unsigned long *vm_flags);
  114. -int __ksm_enter(struct mm_struct *mm);
  115. -void __ksm_exit(struct mm_struct *mm);
  116. -
  117. -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
  118. -{
  119. -   if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
  120. -       return __ksm_enter(mm);
  121. -   return 0;
  122. -}
  123. -
  124. -static inline void ksm_exit(struct mm_struct *mm)
  125. -{
  126. -   if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
  127. -       __ksm_exit(mm);
  128. -}
  129.  
  130.  /*
  131.   * A KSM page is one of those write-protected "shared pages" or "merged pages"
  132. @@ -90,6 +75,33 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
  133.           struct vm_area_struct *, unsigned long, void *), void *arg);
  134.  void ksm_migrate_page(struct page *newpage, struct page *oldpage);
  135.  
  136. +#ifdef CONFIG_KSM_LEGACY
  137. +int __ksm_enter(struct mm_struct *mm);
  138. +void __ksm_exit(struct mm_struct *mm);
  139. +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
  140. +{
  141. +   if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
  142. +       return __ksm_enter(mm);
  143. +   return 0;
  144. +}
  145. +
  146. +static inline void ksm_exit(struct mm_struct *mm)
  147. +{
  148. +   if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
  149. +       __ksm_exit(mm);
  150. +}
  151. +
  152. +#elif defined(CONFIG_UKSM)
  153. +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
  154. +{
  155. +   return 0;
  156. +}
  157. +
  158. +static inline void ksm_exit(struct mm_struct *mm)
  159. +{
  160. +}
  161. +#endif /* !CONFIG_UKSM */
  162. +
  163.  #else  /* !CONFIG_KSM */
  164.  
  165.  static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
  166. @@ -142,4 +154,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
  167.  #endif /* CONFIG_MMU */
  168.  #endif /* !CONFIG_KSM */
  169.  
  170. +#include <linux/uksm.h>
  171. +
  172.  #endif /* __LINUX_KSM_H */
  173. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
  174. index 3cc3062..9d8642d7 100644
  175. --- a/include/linux/mm_types.h
  176. +++ b/include/linux/mm_types.h
  177. @@ -252,6 +252,9 @@ struct vm_area_struct {
  178.  #ifdef CONFIG_NUMA
  179.     struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
  180.  #endif
  181. +#ifdef CONFIG_UKSM
  182. +   struct vma_slot *uksm_vma_slot;
  183. +#endif
  184.  };
  185.  
  186.  struct core_thread {
  187. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
  188. index 650ba2f..2d1475f 100644
  189. --- a/include/linux/mmzone.h
  190. +++ b/include/linux/mmzone.h
  191. @@ -116,6 +116,9 @@ enum zone_stat_item {
  192.     NUMA_OTHER,     /* allocation from other node */
  193.  #endif
  194.     NR_ANON_TRANSPARENT_HUGEPAGES,
  195. +#ifdef CONFIG_UKSM
  196. +   NR_UKSM_ZERO_PAGES,
  197. +#endif
  198.     NR_VM_ZONE_STAT_ITEMS };
  199.  
  200.  /*
  201. diff --git a/include/linux/uksm.h b/include/linux/uksm.h
  202. new file mode 100644
  203. index 0000000..27b80bf
  204. --- /dev/null
  205. +++ b/include/linux/uksm.h
  206. @@ -0,0 +1,115 @@
  207. +#ifndef __LINUX_UKSM_H
  208. +#define __LINUX_UKSM_H
  209. +/*
  210. + * Memory merging support.
  211. + *
  212. + * This code enables dynamic sharing of identical pages found in different
  213. + * memory areas, even if they are not shared by fork().
  214. + */
  215. +
  216. +/* if !CONFIG_UKSM this file should not be compiled at all. */
  217. +#ifdef CONFIG_UKSM
  218. +
  219. +#include <linux/bitops.h>
  220. +#include <linux/mm.h>
  221. +#include <linux/pagemap.h>
  222. +#include <linux/rmap.h>
  223. +#include <linux/sched.h>
  224. +
  225. +extern unsigned long zero_pfn __read_mostly;
  226. +extern unsigned long uksm_zero_pfn __read_mostly;
  227. +extern struct page *empty_uksm_zero_page;
  228. +
  229. +/* must be done before linked to mm */
  230. +extern void uksm_vma_add_new(struct vm_area_struct *vma);
  231. +extern void uksm_remove_vma(struct vm_area_struct *vma);
  232. +
  233. +struct vma_slot {
  234. +   struct list_head uksm_list;
  235. +   struct list_head slot_list;
  236. +   unsigned long dedup_ratio;
  237. +   unsigned long dedup_num;
  238. +   int uksm_index; /* -1 if vma is not in inter-table,
  239. +               positive otherwise */
  240. +   unsigned long pages_scanned;
  241. +   unsigned long last_scanned;
  242. +   unsigned long pages_to_scan;
  243. +   struct scan_rung *rung;
  244. +   struct page **rmap_list_pool;
  245. +   unsigned long *pool_counts;
  246. +   unsigned long pool_size;
  247. +   struct vm_area_struct *vma;
  248. +   struct mm_struct *mm;
  249. +   unsigned long ctime_j;
  250. +   unsigned long pages;
  251. +   unsigned long flags;
  252. +   unsigned long pages_cowed; /* pages cowed this round */
  253. +   unsigned long pages_merged; /* pages merged this round */
  254. +
  255. +   /* used for dup vma pair */
  256. +   struct radix_tree_root dup_tree;
  257. +};
  258. +
  259. +static inline void uksm_unmap_zero_page(pte_t pte)
  260. +{
  261. +   if (pte_pfn(pte) == uksm_zero_pfn)
  262. +       __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
  263. +}
  264. +
  265. +static inline void uksm_map_zero_page(pte_t pte)
  266. +{
  267. +   if (pte_pfn(pte) == uksm_zero_pfn)
  268. +       __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
  269. +}
  270. +
  271. +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
  272. +{
  273. +   if (vma->uksm_vma_slot && PageKsm(page))
  274. +       vma->uksm_vma_slot->pages_cowed++;
  275. +}
  276. +
  277. +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
  278. +{
  279. +   if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
  280. +       vma->uksm_vma_slot->pages_cowed++;
  281. +}
  282. +
  283. +
  284. +/*
  285. + * A wrapper around BUG_ON() for places where the uksm zero page must not
  286. + * appear. TODO: remove this once the uksm zero page patch is stable enough.
  287. + */
  288. +static inline void uksm_bugon_zeropage(pte_t pte)
  289. +{
  290. +   BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
  291. +}
  292. +#else
  293. +static inline void uksm_vma_add_new(struct vm_area_struct *vma)
  294. +{
  295. +}
  296. +
  297. +static inline void uksm_remove_vma(struct vm_area_struct *vma)
  298. +{
  299. +}
  300. +
  301. +static inline void uksm_unmap_zero_page(pte_t pte)
  302. +{
  303. +}
  304. +
  305. +static inline void uksm_map_zero_page(pte_t pte)
  306. +{
  307. +}
  308. +
  309. +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
  310. +{
  311. +}
  312. +
  313. +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
  314. +{
  315. +}
  316. +
  317. +static inline void uksm_bugon_zeropage(pte_t pte)
  318. +{
  319. +}
  320. +#endif /* !CONFIG_UKSM */
  321. +#endif /* __LINUX_UKSM_H */
  322. diff --git a/kernel/fork.c b/kernel/fork.c
  323. index 26a7a67..c23317e 100644
  324. --- a/kernel/fork.c
  325. +++ b/kernel/fork.c
  326. @@ -361,7 +361,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
  327.                 goto fail_nomem;
  328.             charge = len;
  329.         }
  330. -       tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
  331. +       tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
  332.         if (!tmp)
  333.             goto fail_nomem;
  334.         *tmp = *mpnt;
  335. @@ -413,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
  336.         __vma_link_rb(mm, tmp, rb_link, rb_parent);
  337.         rb_link = &tmp->vm_rb.rb_right;
  338.         rb_parent = &tmp->vm_rb;
  339. -
  340. +       uksm_vma_add_new(tmp);
  341.         mm->map_count++;
  342.         retval = copy_page_range(mm, oldmm, mpnt);
  343.  
  344. diff --git a/mm/Kconfig b/mm/Kconfig
  345. index e338407..8df1b4f 100644
  346. --- a/mm/Kconfig
  347. +++ b/mm/Kconfig
  348. @@ -245,6 +245,32 @@ config KSM
  349.       See Documentation/vm/ksm.txt for more information: KSM is inactive
  350.       until a program has madvised that an area is MADV_MERGEABLE, and
  351.       root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
  352. +choice
  353. +   prompt "Choose UKSM/KSM strategy"
  354. +   default UKSM
  355. +   depends on KSM
  356. +   help
  357. +     This option allows you to select a UKSM/KSM strategy.
  358. +
  359. +config UKSM
  360. +   bool "Ultra-KSM for page merging"
  361. +   depends on KSM
  362. +   help
  363. +   UKSM is inspired by the Linux kernel's KSM (Kernel Samepage
  364. +   Merging), but has a fundamentally rewritten core algorithm. With
  365. +   this advanced algorithm, UKSM can transparently scan all anonymously
  366. +   mapped user space memory with significantly improved scan speed
  367. +   and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit
  368. +   from UKSM. UKSM now has its first stable release and its first
  369. +   real-world enterprise user. For more information, please visit its
  370. +   project page: www.kerneldedup.org
  371. +
  372. +config KSM_LEGACY
  373. +   bool "Legacy KSM implementation"
  374. +   depends on KSM
  375. +   help
  376. +   The legacy KSM implementation from Red Hat.
  377. +endchoice
  378.  
  379.  config DEFAULT_MMAP_MIN_ADDR
  380.          int "Low address space to protect from user allocation"
  381. diff --git a/mm/Makefile b/mm/Makefile
  382. index 50ec00e..c551bae 100644
  383. --- a/mm/Makefile
  384. +++ b/mm/Makefile
  385. @@ -34,7 +34,8 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
  386.  obj-$(CONFIG_SLOB) += slob.o
  387.  obj-$(CONFIG_COMPACTION) += compaction.o
  388.  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
  389. -obj-$(CONFIG_KSM) += ksm.o
  390. +obj-$(CONFIG_KSM_LEGACY) += ksm.o
  391. +obj-$(CONFIG_UKSM) += uksm.o
  392.  obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
  393.  obj-$(CONFIG_SLAB) += slab.o
  394.  obj-$(CONFIG_SLUB) += slub.o
  395. diff --git a/mm/memory.c b/mm/memory.c
  396. index 10b4dda..be73fff 100644
  397. --- a/mm/memory.c
  398. +++ b/mm/memory.c
  399. @@ -112,6 +112,37 @@ __setup("norandmaps", disable_randmaps);
  400.  unsigned long zero_pfn __read_mostly;
  401.  unsigned long highest_memmap_pfn __read_mostly;
  402.  
  403. +#ifdef CONFIG_UKSM
  404. +unsigned long uksm_zero_pfn __read_mostly;
  405. +struct page *empty_uksm_zero_page;
  406. +
  407. +static int __init setup_uksm_zero_page(void)
  408. +{
  409. +   unsigned long addr;
  410. +   addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
  411. +   if (!addr)
  412. +       panic("Oh boy, that early out of memory?");
  413. +
  414. +   empty_uksm_zero_page = virt_to_page((void *) addr);
  415. +   SetPageReserved(empty_uksm_zero_page);
  416. +
  417. +   uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
  418. +
  419. +   return 0;
  420. +}
  421. +core_initcall(setup_uksm_zero_page);
  422. +
  423. +static inline int is_uksm_zero_pfn(unsigned long pfn)
  424. +{
  425. +   return pfn == uksm_zero_pfn;
  426. +}
  427. +#else
  428. +static inline int is_uksm_zero_pfn(unsigned long pfn)
  429. +{
  430. +   return 0;
  431. +}
  432. +#endif
  433. +
  434.  /*
  435.   * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
  436.   */
  437. @@ -716,8 +716,10 @@ static inline int is_cow_mapping(vm_flags_t flags)
  438.  #ifndef is_zero_pfn
  439.  static inline int is_zero_pfn(unsigned long pfn)
  440.  {
  441. -   return pfn == zero_pfn;
  442. +   return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
  443.  }
  444. +#else
  445. +#define is_zero_pfn(pfn)   (is_zero_pfn(pfn) || is_uksm_zero_pfn(pfn))
  446.  #endif
  447.  
  448.  #ifndef my_zero_pfn
  449. @@ -903,6 +903,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
  450.             rss[MM_ANONPAGES]++;
  451.         else
  452.             rss[MM_FILEPAGES]++;
  453. +
  454. +       /* Should return NULL in vm_normal_page() */
  455. +       uksm_bugon_zeropage(pte);
  456. +   } else {
  457. +       uksm_map_zero_page(pte);
  458.     }
  459.  
  460.  out_set_pte:
  461. @@ -1138,8 +1176,10 @@ again:
  462.             ptent = ptep_get_and_clear_full(mm, addr, pte,
  463.                             tlb->fullmm);
  464.             tlb_remove_tlb_entry(tlb, pte, addr);
  465. -           if (unlikely(!page))
  466. +           if (unlikely(!page)) {
  467. +               uksm_unmap_zero_page(ptent);
  468.                 continue;
  469. +           }
  470.             if (unlikely(details) && details->nonlinear_vma
  471.                 && linear_page_index(details->nonlinear_vma,
  472.                         addr) != page->index)
  473. @@ -1712,7 +1752,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  474.                 page = vm_normal_page(vma, start, *pte);
  475.                 if (!page) {
  476.                     if (!(gup_flags & FOLL_DUMP) &&
  477. -                        is_zero_pfn(pte_pfn(*pte)))
  478. +                       (is_zero_pfn(pte_pfn(*pte))))
  479.                         page = pte_page(*pte);
  480.                     else {
  481.                         pte_unmap(pte);
  482. @@ -2472,8 +2512,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
  483.             clear_page(kaddr);
  484.         kunmap_atomic(kaddr);
  485.         flush_dcache_page(dst);
  486. -   } else
  487. +   } else {
  488.         copy_user_highpage(dst, src, va, vma);
  489. +       uksm_cow_page(vma, src);
  490. +   }
  491.  }
  492.  
  493.  /*
  494. @@ -2671,6 +2713,7 @@ gotten:
  495.         new_page = alloc_zeroed_user_highpage_movable(vma, address);
  496.         if (!new_page)
  497.             goto oom;
  498. +       uksm_cow_pte(vma, orig_pte);
  499.     } else {
  500.         new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  501.         if (!new_page)
  502. @@ -2692,8 +2735,11 @@ gotten:
  503.                 dec_mm_counter_fast(mm, MM_FILEPAGES);
  504.                 inc_mm_counter_fast(mm, MM_ANONPAGES);
  505.             }
  506. -       } else
  507. +           uksm_bugon_zeropage(orig_pte);
  508. +       } else {
  509. +           uksm_unmap_zero_page(orig_pte);
  510.             inc_mm_counter_fast(mm, MM_ANONPAGES);
  511. +       }
  512.         flush_cache_page(vma, address, pte_pfn(orig_pte));
  513.         entry = mk_pte(new_page, vma->vm_page_prot);
  514.         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
  515. diff --git a/mm/mmap.c b/mm/mmap.c
  516. index da15a79..482c503 100644
  517. --- a/mm/mmap.c
  518. +++ b/mm/mmap.c
  519. @@ -30,6 +30,7 @@
  520.  #include <linux/perf_event.h>
  521.  #include <linux/audit.h>
  522.  #include <linux/khugepaged.h>
  523. +#include <linux/ksm.h>
  524.  
  525.  #include <asm/uaccess.h>
  526.  #include <asm/cacheflush.h>
  527. @@ -236,6 +237,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
  528.             removed_exe_file_vma(vma->vm_mm);
  529.     }
  530.     mpol_put(vma_policy(vma));
  531. +   uksm_remove_vma(vma);
  532.     kmem_cache_free(vm_area_cachep, vma);
  533.     return next;
  534.  }
  535. @@ -501,9 +501,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
  536.     long adjust_next = 0;
  537.     int remove_next = 0;
  538.  
  539. +/*
  540. + * to avoid deadlock, uksm_remove_vma() must be done before any spin_lock is
  541. + * acquired
  542. + */
  543. +   uksm_remove_vma(vma);
  544. +
  545.     if (next && !insert) {
  546.         struct vm_area_struct *exporter = NULL;
  547.  
  548. +       uksm_remove_vma(next);
  549.         if (end >= next->vm_end) {
  550.             /*
  551.              * vma expands, overlapping all the next, and
  552. @@ -635,10 +635,15 @@ again:            remove_next = 1 + (end > next->vm_end);
  553.          */
  554.         if (remove_next == 2) {
  555.             next = vma->vm_next;
  556. +           uksm_remove_vma(next);
  557.             goto again;
  558.         }
  559. +   } else {
  560. +       if (next && !insert)
  561. +           uksm_vma_add_new(next);
  562.     }
  563.  
  564. +   uksm_vma_add_new(vma);
  565.     validate_mm(mm);
  566.  
  567.     return 0;
  568. @@ -1357,6 +1357,7 @@ munmap_back:
  569.  
  570.     vma_link(mm, vma, prev, rb_link, rb_parent);
  571.     file = vma->vm_file;
  572. +   uksm_vma_add_new(vma);
  573.  
  574.     /* Once vma denies write, undo our temporary denial count */
  575.     if (correct_wcount)
  576. @@ -1383,6 +1383,7 @@ unmap_and_free_vma:
  577.     unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
  578.     charged = 0;
  579.  free_vma:
  580. +   uksm_remove_vma(vma);
  581.     kmem_cache_free(vm_area_cachep, vma);
  582.  unacct_error:
  583.     if (charged)
  584. @@ -2012,6 +2012,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
  585.     else
  586.         err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
  587.  
  588. +   uksm_vma_add_new(new);
  589. +
  590.     /* Success. */
  591.     if (!err)
  592.         return 0;
  593. @@ -2253,6 +2253,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
  594.     vma->vm_flags = flags;
  595.     vma->vm_page_prot = vm_get_page_prot(flags);
  596.     vma_link(mm, vma, prev, rb_link, rb_parent);
  597. +   uksm_vma_add_new(vma);
  598.  out:
  599.     perf_event_mmap(vma);
  600.     mm->total_vm += len >> PAGE_SHIFT;
  601. @@ -2285,6 +2285,12 @@ void exit_mmap(struct mm_struct *mm)
  602.     /* mm's last user has gone, and its about to be pulled down */
  603.     mmu_notifier_release(mm);
  604.  
  605. +   /*
  606. +    * Taking write lock on mmap_sem does not harm others,
  607. +    * but it's crucial for uksm to avoid races.
  608. +    */
  609. +   down_write(&mm->mmap_sem);
  610. +
  611.     if (mm->locked_vm) {
  612.         vma = mm->mmap;
  613.         while (vma) {
  614. @@ -2318,6 +2318,11 @@ void exit_mmap(struct mm_struct *mm)
  615.     while (vma)
  616.         vma = remove_vma(vma);
  617.  
  618. +   mm->mmap = NULL;
  619. +   mm->mm_rb = RB_ROOT;
  620. +   mm->mmap_cache = NULL;
  621. +   up_write(&mm->mmap_sem);
  622. +
  623.     BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
  624.  }
  625.  
  626. @@ -2427,6 +2427,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
  627.             if (new_vma->vm_ops && new_vma->vm_ops->open)
  628.                 new_vma->vm_ops->open(new_vma);
  629.             vma_link(mm, new_vma, prev, rb_link, rb_parent);
  630. +           uksm_vma_add_new(new_vma);
  631.         }
  632.     }
  633.     return new_vma;
  634. @@ -2532,10 +2532,10 @@ int install_special_mapping(struct mm_struct *mm,
  635.     ret = insert_vm_struct(mm, vma);
  636.     if (ret)
  637.         goto out;
  638. -
  639.     mm->total_vm += len >> PAGE_SHIFT;
  640.  
  641.     perf_event_mmap(vma);
  642. +   uksm_vma_add_new(vma);
  643.  
  644.     return 0;
  645.  
  646. diff --git a/mm/uksm.c b/mm/uksm.c
  647. new file mode 100644
  648. index 0000000..c22787b
  649. --- /dev/null
  650. +++ b/mm/uksm.c
  651. @@ -0,0 +1,5344 @@
  652. +/*
  653. + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
  654. + *
  655. + * This is an improvement upon KSM. Some basic data structures and routines
  656. + * are borrowed from ksm.c .
  657. + *
  658. + * Its new features:
  659. + * 1. Full system scan:
  660. + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
  661. + *      interaction to submit a memory area to KSM is no longer needed.
  662. + *
  663. + * 2. Rich area detection:
  664. + *      It automatically detects rich areas containing abundant duplicated
  665. + *      pages. Rich areas are scanned at full speed; poor areas are
  666. + *      sampled at a reduced speed with very low CPU consumption.
  667. + *
  668. + * 3. Ultra Per-page scan speed improvement:
  669. + *      A new hash algorithm is proposed. As a result, on a machine with
  670. + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
  671. + *      can scan memory areas that do not contain duplicated pages at a speed of
  672. + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
  673. + *      477MB/sec ~ 923MB/sec.
  674. + *
  675. + * 4. Thrashing area avoidance:
  676. + *      A thrashing area (a VMA with frequent KSM page break-outs) can be
  677. + *      filtered out. Benchmarks show this is more efficient than KSM's per-page
  678. + *      hash-value-based volatile page detection.
  679. + *
  680. + *
  681. + * 5. Misc changes upon KSM:
  682. + *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
  683. + *        comparison. It's much faster than the default C version on x86.
  684. + *      * rmap_item now has a struct page *page member to loosely cache an
  685. + *        address->page mapping, which avoids many costly calls to
  686. + *        follow_page().
  687. + *      * The VMA creation/exit procedures are hooked so Ultra KSM is notified.
  688. + *      * try_to_merge_two_pages() can now revert a pte if it fails. No break_
  689. + *        ksm is needed in this case.
  690. + *
  691. + * 6. Full zero page consideration (contributed by Figo Zhang)
  692. + *    uksmd now treats full zero pages as special pages and merges them into a
  693. + *    special unswappable uksm zero page.
  694. + */
  695. +
  696. +#include <linux/errno.h>
  697. +#include <linux/mm.h>
  698. +#include <linux/fs.h>
  699. +#include <linux/mman.h>
  700. +#include <linux/sched.h>
  701. +#include <linux/rwsem.h>
  702. +#include <linux/pagemap.h>
  703. +#include <linux/rmap.h>
  704. +#include <linux/spinlock.h>
  705. +#include <linux/jhash.h>
  706. +#include <linux/delay.h>
  707. +#include <linux/kthread.h>
  708. +#include <linux/wait.h>
  709. +#include <linux/slab.h>
  710. +#include <linux/rbtree.h>
  711. +#include <linux/memory.h>
  712. +#include <linux/mmu_notifier.h>
  713. +#include <linux/swap.h>
  714. +#include <linux/ksm.h>
  715. +#include <linux/crypto.h>
  716. +#include <linux/scatterlist.h>
  717. +#include <crypto/hash.h>
  718. +#include <linux/random.h>
  719. +#include <linux/math64.h>
  720. +#include <linux/gcd.h>
  721. +#include <linux/freezer.h>
  722. +#include <linux/in.h>
  723. +#include <linux/ip.h>
  724. +#include <linux/udp.h>
  725. +
  726. +
  727. +#include <asm/tlbflush.h>
  728. +#include "internal.h"
  729. +
  730. +#ifdef CONFIG_X86
  731. +#undef memcmp
  732. +
  733. +#ifdef CONFIG_X86_32
  734. +#define memcmp memcmpx86_32
  735. +/*
  736. + * Compare 4-byte-aligned address s1 and s2, with length n
  737. + */
  738. +int memcmpx86_32(void *s1, void *s2, size_t n)
  739. +{
  740. +   size_t num = n / 4;
  741. +   register int res;
  742. +
  743. +   __asm__ __volatile__
  744. +   (
  745. +    "testl %3,%3\n\t"
  746. +    "repe; cmpsd\n\t"
  747. +    "je        1f\n\t"
  748. +    "sbbl      %0,%0\n\t"
  749. +    "orl       $1,%0\n"
  750. +    "1:"
  751. +    : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
  752. +    : "0" (0)
  753. +    : "cc");
  754. +
  755. +   return res;
  756. +}
  757. +
  758. +/*
  759. + * Check whether the page is all zeros.
  760. + */
  761. +static int is_full_zero(const void *s1, size_t len)
  762. +{
  763. +   unsigned char same;
  764. +
  765. +   len /= 4;
  766. +
  767. +   __asm__ __volatile__
  768. +   ("repe; scasl;"
  769. +    "sete %0"
  770. +    : "=qm" (same), "+D" (s1), "+c" (len)
  771. +    : "a" (0)
  772. +    : "cc");
  773. +
  774. +   return same;
  775. +}
  776. +
  777. +
  778. +#elif defined(CONFIG_X86_64)
  779. +#define memcmp memcmpx86_64
  780. +/*
  781. + * Compare 8-byte-aligned address s1 and s2, with length n
  782. + */
  783. +int memcmpx86_64(void *s1, void *s2, size_t n)
  784. +{
  785. +   size_t num = n / 8;
  786. +   register int res;
  787. +
  788. +   __asm__ __volatile__
  789. +   (
  790. +    "testq %q3,%q3\n\t"
  791. +    "repe; cmpsq\n\t"
  792. +    "je        1f\n\t"
  793. +    "sbbq      %q0,%q0\n\t"
  794. +    "orq       $1,%q0\n"
  795. +    "1:"
  796. +    : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
  797. +    : "0" (0)
  798. +    : "cc");
  799. +
  800. +   return res;
  801. +}
  802. +
  803. +static int is_full_zero(const void *s1, size_t len)
  804. +{
  805. +   unsigned char same;
  806. +
  807. +   len /= 8;
  808. +
  809. +   __asm__ __volatile__
  810. +   ("repe; scasq;"
  811. +    "sete %0"
  812. +    : "=qm" (same), "+D" (s1), "+c" (len)
  813. +    : "a" (0)
  814. +    : "cc");
  815. +
  816. +   return same;
  817. +}
  818. +
  819. +#endif
  820. +#else
  821. +static int is_full_zero(const void *s1, size_t len)
  822. +{
  823. +   const unsigned long *src = s1;
  824. +   size_t i;
  825. +
  826. +   len /= sizeof(*src);
  827. +
  828. +   for (i = 0; i < len; i++) {
  829. +       if (src[i])
  830. +           return 0;
  831. +   }
  832. +
  833. +   return 1;
  834. +}
  835. +#endif
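
To make the fallback semantics concrete, here is a minimal userspace sketch (not from the patch) exercising the same word-by-word logic as the generic is_full_zero() above; the x86 variants implement the identical check with rep-prefixed string instructions. PAGE_SIZE is hard-coded here only for the sketch.

#include <assert.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

/* Same word-by-word logic as the generic fallback above. */
static int is_full_zero(const void *s1, size_t len)
{
    const unsigned long *src = s1;
    size_t i;

    len /= sizeof(*src);
    for (i = 0; i < len; i++)
        if (src[i])
            return 0;
    return 1;
}

int main(void)
{
    unsigned char *page = calloc(1, PAGE_SIZE);

    assert(page && is_full_zero(page, PAGE_SIZE) == 1);
    page[PAGE_SIZE - 1] = 0xff;             /* dirty a single byte */
    assert(is_full_zero(page, PAGE_SIZE) == 0);
    free(page);
    return 0;
}
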
  836. +
  837. +#define U64_MAX        (~((u64)0))
  838. +
  839. +/* Basic data structure definition below */
  840. +#define UKSM_RUNG_ROUND_FINISHED  (1<<0)
  841. +#define UKSM_RUNG_BUSY_SEARCHED   (1<<1)
  842. +
  843. +/* Each rung of this ladder is a list of VMAs having a same scan ratio */
  844. +struct scan_rung {
  845. +   struct list_head vma_list;
  846. +   struct list_head *current_scan;
  847. +   unsigned long flags;
  848. +   unsigned int pages_to_scan;
  849. +   unsigned long fully_scanned_slots;
  850. +   unsigned long scan_ratio;
  851. +   unsigned long vma_num;
  852. +};
  853. +
  854. +#define UKSM_SLOT_NEED_SORT (1<<0)
  855. +#define UKSM_SLOT_NEED_RERAND (1<<1)
  856. +#define UKSM_SLOT_SCANNED     (1<<2) /* It's scanned in this round */
  857. +#define UKSM_SLOT_FUL_SCANNED (1<<3)
  858. +
  859. +/**
  860. + * node of either the stable or unstable rbtree
  861. + *
  862. + */
  863. +struct tree_node {
  864. +   struct rb_node node; /* link in the main (un)stable rbtree */
  865. +   struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
  866. +   u32 hash;
  867. +   unsigned long count; /* how many sublevel tree nodes */
  868. +   struct list_head all_list; /* all tree nodes in stable/unstable tree */
  869. +};
  870. +
  871. +/**
  872. + * struct stable_node - node of the stable rbtree
  873. + * @node: rb node of this ksm page in the stable tree
  874. + * @hlist: hlist head of rmap_items using this ksm page
  875. + * @kpfn: page frame number of this ksm page
  876. + */
  877. +struct stable_node {
  878. +   struct rb_node node; /* link in sub-rbtree */
  879. +   struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
  880. +   struct hlist_head hlist;
  881. +   unsigned long kpfn;
  882. +   u32 hash_max; /* if == 0 then it has not been calculated yet */
  883. +   //struct vm_area_struct *old_vma;
  884. +   struct list_head all_list; /* in a list for all stable nodes */
  885. +};
  886. +
  887. +/**
  888. + * struct node_vma - group rmap_items linked in a same stable
  889. + * node together.
  890. + */
  891. +struct node_vma {
  892. +   union {
  893. +       struct vma_slot *slot;
  894. +       unsigned long key;  /* slot is used as key sorted on hlist */
  895. +   };
  896. +   struct hlist_node hlist;
  897. +   struct hlist_head rmap_hlist;
  898. +   struct stable_node *head;
  899. +   unsigned long last_update;
  900. +};
  901. +
  902. +/**
  903. + * struct rmap_item - reverse mapping item for virtual addresses
  904. + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
  905. + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
  906. + * @mm: the memory structure this rmap_item is pointing into
  907. + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
  908. + * @node: rb node of this rmap_item in the unstable tree
  909. + * @head: pointer to stable_node heading this list in the stable tree
  910. + * @hlist: link into hlist of rmap_items hanging off that stable_node
  911. + */
  912. +struct rmap_item {
  913. +   struct vma_slot *slot;
  914. +   struct page *page;
  915. +   unsigned long address;  /* + low bits used for flags below */
  916. +   /* Appended to the (un)stable tree in which scan round */
  917. +   unsigned long append_round;
  918. +
  919. +   /* Which rung scan turn it was last scanned */
  920. +   //unsigned long last_scan;
  921. +   unsigned long entry_index;
  922. +   union {
  923. +       struct {/* when in unstable tree */
  924. +           struct rb_node node;
  925. +           struct tree_node *tree_node;
  926. +           u32 hash_max;
  927. +       };
  928. +       struct { /* when in stable tree */
  929. +           struct node_vma *head;
  930. +           struct hlist_node hlist;
  931. +           struct anon_vma *anon_vma;
  932. +       };
  933. +   };
  934. +} __attribute__((aligned(4)));
  935. +
  936. +struct rmap_list_entry {
  937. +   union {
  938. +       struct rmap_item *item;
  939. +       unsigned long addr;
  940. +   };
  941. +   // lowest bit is used as the is_addr tag
  942. +   //unsigned char is_addr;
  943. +} __attribute__((aligned(4))); // 4-byte aligned to fit into pages
  944. +
  945. +
  946. +/* Basic data structure definition ends */
  947. +#define USR_SPT_MAGIC_LEN 16
  948. +#define USR_SPT_STR_LEN 32
  949. +#define UKSM_USR_SPT_PORT 52013
  950. +#define UKSM_USR_SPT_INTVL_MSEC 120000
  951. +
  952. +#define  UKSM_USR_SPT_NO_FROM  1
  953. +
  954. +struct uksm_usr_spt_s {
  955. +   char magic[16];
  956. +   u32 randid;
  957. +   unsigned char flags;
  958. +   char usr_str[32];
  959. +} __attribute__ ((packed));
  960. +
  961. +static struct uksm_usr_spt_s uksm_usr_spt = {
  962. +   .magic = "UKSM_USR_SPT_OK1",
  963. +   .usr_str = "none",
  964. +};
  965. +/*static struct sockaddr_in uksm_usr_spt_addr;
  966. +static struct socket *uksm_usr_spt_sock = NULL;
  967. +static unsigned long uksm_usr_spt_last;*/
  968. +static int uksm_usr_spt_enabled = 0;
  969. +
  970. +/*
  971. + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
  972. + * The flags use the low bits of rmap_item.address
  973. + */
  974. +#define UNSTABLE_FLAG  0x1
  975. +#define STABLE_FLAG    0x2
  976. +#define get_rmap_addr(x)   ((x)->address & PAGE_MASK)
  977. +
  978. +/*
  979. + * rmap_list_entry helpers
  980. + */
  981. +#define IS_ADDR_FLAG   1
  982. +#define is_addr(ptr)       ((unsigned long)(ptr) & IS_ADDR_FLAG)
  983. +#define set_is_addr(ptr)   ((ptr) |= IS_ADDR_FLAG)
  984. +#define get_clean_addr(ptr)    (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
  985. +
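
The rmap_list_entry union and the IS_ADDR_FLAG macros above implement pointer tagging: since struct rmap_item is at least 4-byte aligned, the lowest bit of a real pointer is always zero and can be reused to mark entries that hold a plain address instead. A standalone sketch of the same trick (illustrative only, not from the patch; the union name here is made up):

#include <assert.h>
#include <stdio.h>

#define IS_ADDR_FLAG 1UL

struct rmap_item;                        /* opaque for this sketch */

union rmap_entry {
    struct rmap_item *item;              /* aligned(4), so the low bit is 0 */
    unsigned long addr;                  /* page-aligned address, tagged    */
};

int main(void)
{
    union rmap_entry e;

    e.addr = 0x10002000UL;               /* pretend page-aligned address */
    e.addr |= IS_ADDR_FLAG;              /* set_is_addr()                */

    assert(e.addr & IS_ADDR_FLAG);       /* is_addr() -> true            */
    printf("clean addr: %#lx\n",
           e.addr & ~IS_ADDR_FLAG);      /* get_clean_addr()             */
    return 0;
}
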
  986. +
  987. +/*
  988. + * High speed caches for frequently allocated and freed structs
  989. + */
  990. +static struct kmem_cache *rmap_item_cache;
  991. +static struct kmem_cache *stable_node_cache;
  992. +static struct kmem_cache *node_vma_cache;
  993. +static struct kmem_cache *vma_slot_cache;
  994. +static struct kmem_cache *tree_node_cache;
  995. +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
  996. +       sizeof(struct __struct), __alignof__(struct __struct),\
  997. +       (__flags), NULL)
  998. +
  999. +/* The scan rounds ksmd is currently in */
  1000. +static unsigned long long uksm_scan_round = 1;
  1001. +
  1002. +/* The number of pages that have been scanned since startup */
  1003. +static u64 uksm_pages_scanned;
  1004. +
  1005. +/* The number of pages that had been scanned when the last scan round finished */
  1006. +static u64 uksm_pages_scanned_last;
  1007. +
  1008. +/* If the scanned number is too large, we encode it here */
  1009. +static u64 pages_scanned_stored;
  1010. +static unsigned long pages_scanned_base;
  1011. +
  1012. +/* The number of nodes in the stable tree */
  1013. +static unsigned long uksm_pages_shared;
  1014. +
  1015. +/* The number of page slots additionally sharing those nodes */
  1016. +static unsigned long uksm_pages_sharing;
  1017. +
  1018. +/* The number of nodes in the unstable tree */
  1019. +static unsigned long uksm_pages_unshared;
  1020. +
  1021. +/*
  1022. + * Number of pages ksmd should scan in one batch. This is the top speed for
  1023. + * richly duplicated areas.
  1024. + */
  1025. +static unsigned long uksm_scan_batch_pages = 60000;
  1026. +
  1027. +/* Milliseconds ksmd should sleep between batches */
  1028. +static unsigned int uksm_sleep_jiffies;
  1029. +
  1030. +/*
  1031. + * The threshold used to filter out thrashing areas,
  1032. + * If it == 0, filtering is disabled; otherwise it is the percentage upper bound
  1033. + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
  1034. + * will be considered as having a zero duplication ratio.
  1035. + */
  1036. +static unsigned int uksm_thrash_threshold = 50;
  1037. +
  1038. +/* To avoid floating point arithmetic, this is the scale factor for a
  1039. + * deduplication ratio number.
  1040. + */
  1041. +#define UKSM_DEDUP_RATIO_SCALE 100
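
A hypothetical illustration of this fixed-point convention (not the exact formula used later in this file): with UKSM_DEDUP_RATIO_SCALE == 100, a deduplication ratio is kept as a plain scaled integer, so no floating point is needed in kernel code. The helper name below is made up for the sketch.

#include <stdio.h>

#define UKSM_DEDUP_RATIO_SCALE 100

/* merged pages per scanned pages, as a scaled integer (37 means 37%) */
static unsigned long dedup_ratio(unsigned long merged, unsigned long scanned)
{
    if (!scanned)
        return 0;
    return merged * UKSM_DEDUP_RATIO_SCALE / scanned;
}

int main(void)
{
    printf("%lu\n", dedup_ratio(370, 1000));    /* prints 37 */
    return 0;
}
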
  1042. +
  1043. +
  1044. +#define UKSM_SCAN_RATIO_MAX    125
  1045. +
  1046. +/* minimum scan ratio for a vma, in unit of 1/UKSM_SCAN_RATIO_MAX */
  1047. +static unsigned int uksm_min_scan_ratio = 1;
  1048. +
  1049. +/*
  1050. + * After each scan round, the scan ratio of an area with a big deduplication
  1051. + * ratio is upgraded by *=uksm_scan_ratio_delta
  1052. + */
  1053. +static unsigned int uksm_scan_ratio_delta = 5;
  1054. +
  1055. +#define INDIRECT_OFFSET        1
  1056. +
  1057. +/*
  1058. + * For mapping of vma_slot and its index in inter-vma duplication number
  1059. + * table
  1060. + */
  1061. +static struct radix_tree_root uksm_vma_tree;
  1062. +static unsigned long uksm_vma_tree_num;
  1063. +static unsigned long uksm_vma_tree_index_end;
  1064. +
  1065. +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
  1066. +static struct scan_rung *uksm_scan_ladder;
  1067. +static unsigned int uksm_scan_ladder_size;
  1068. +
  1069. +/* The number of VMAs we are keeping track of */
  1070. +static unsigned long uksm_vma_slot_num;
  1071. +
  1072. +/* How many times the ksmd has slept since startup */
  1073. +static u64 uksm_sleep_times;
  1074. +
  1075. +#define UKSM_RUN_STOP  0
  1076. +#define UKSM_RUN_MERGE 1
  1077. +static unsigned int uksm_run = UKSM_RUN_MERGE;
  1078. +
  1079. +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
  1080. +static DEFINE_MUTEX(uksm_thread_mutex);
  1081. +
  1082. +/*
  1083. + * List vma_slot_new is for newly created vma_slot waiting to be added by
  1084. + * ksmd. If one cannot be added (e.g. because it is too small), it is moved to
  1085. + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
  1086. + * VMA has been removed/freed.
  1087. + */
  1088. +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
  1089. +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
  1090. +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
  1091. +static DEFINE_SPINLOCK(vma_slot_list_lock);
  1092. +
  1093. +/* The unstable tree heads */
  1094. +static struct rb_root root_unstable_tree = RB_ROOT;
  1095. +
  1096. +/*
  1097. + * All tree_nodes are in a list to be freed at once when unstable tree is
  1098. + * freed after each scan round.
  1099. + */
  1100. +static struct list_head unstable_tree_node_list =
  1101. +               LIST_HEAD_INIT(unstable_tree_node_list);
  1102. +
  1103. +/* List contains all stable nodes */
  1104. +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
  1105. +
  1106. +/*
  1107. + * When the hash strength is changed, the stable tree must be delta_hashed and
  1108. + * re-structured. We use two sets of the structs below to speed up the
  1109. + * re-structuring of the stable tree.
  1110. + */
  1111. +static struct list_head
  1112. +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
  1113. +               LIST_HEAD_INIT(stable_tree_node_list[1])};
  1114. +
  1115. +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
  1116. +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
  1117. +static struct rb_root *root_stable_treep = &root_stable_tree[0];
  1118. +static unsigned long stable_tree_index;
  1119. +
  1120. +/* The hash strength needed to hash a full page */
  1121. +#define HASH_STRENGTH_FULL     (PAGE_SIZE / sizeof(u32))
  1122. +
  1123. +/* The hash strength needed for loop-back hashing */
  1124. +#define HASH_STRENGTH_MAX      (HASH_STRENGTH_FULL + 10)
  1125. +
  1126. +/* The random offsets in a page */
  1127. +static u32 *random_nums;
  1128. +
  1129. +/* The hash strength */
  1130. +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
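
A sketch of the sampling idea behind hash_strength (illustrative only; the mixing step below is made up and is not the hash routine used in this file, and PAGE_WORDS/sample_hash are names invented for the sketch): instead of hashing all PAGE_SIZE/4 32-bit words of a page, only hash_strength words are read at precomputed offsets from random_nums[], so a low strength is cheap and HASH_STRENGTH_FULL degenerates to hashing the whole page.

#include <stdint.h>
#include <stdio.h>

#define PAGE_WORDS 1024              /* PAGE_SIZE / sizeof(u32) on 4K pages */

static uint32_t sample_hash(const uint32_t *page_words,
                            const uint32_t *random_nums,
                            unsigned long hash_strength)
{
    uint32_t hash = 0;
    unsigned long i;

    for (i = 0; i < hash_strength; i++) {
        /* random_nums[i] is a word index in [0, PAGE_WORDS) */
        uint32_t w = page_words[random_nums[i]];

        /* simple mix for illustration only */
        hash ^= w + 0x9e3779b9u + (hash << 6) + (hash >> 2);
    }
    return hash;
}

int main(void)
{
    static uint32_t page[PAGE_WORDS];
    static uint32_t offsets[PAGE_WORDS];
    unsigned long i;

    for (i = 0; i < PAGE_WORDS; i++) {
        page[i] = (uint32_t)(i * 2654435761u);          /* arbitrary content */
        offsets[i] = (uint32_t)((i * 7) % PAGE_WORDS);  /* stand-in for random_nums */
    }

    /* A weak (cheap) strength samples few words; full strength reads them all. */
    printf("strength 64:   %08x\n", sample_hash(page, offsets, 64));
    printf("full strength: %08x\n", sample_hash(page, offsets, PAGE_WORDS));
    return 0;
}
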
  1131. +
  1132. +/* The delta value each time the hash strength increases or decreases */
  1133. +static unsigned long hash_strength_delta;
  1134. +#define HASH_STRENGTH_DELTA_MAX    5
  1135. +
  1136. +/* The time we have saved due to random_sample_hash */
  1137. +static u64 rshash_pos;
  1138. +
  1139. +/* The time we have wasted due to hash collision */
  1140. +static u64 rshash_neg;
  1141. +
  1142. +struct uksm_benefit {
  1143. +   u64 pos;
  1144. +   u64 neg;
  1145. +   u64 scanned;
  1146. +   unsigned long base;
  1147. +} benefit;
  1148. +
  1149. +/*
  1150. + * The relative cost of memcmp, compared to 1 time unit of random sample
  1151. + * hash, this value is tested when ksm module is initialized
  1152. + */
  1153. +static unsigned long memcmp_cost;
  1154. +
  1155. +static unsigned long  rshash_neg_cont_zero;
  1156. +static unsigned long  rshash_cont_obscure;
  1157. +
  1158. +/* The possible states of hash strength adjustment heuristic */
  1159. +enum rshash_states {
  1160. +       RSHASH_STILL,
  1161. +       RSHASH_TRYUP,
  1162. +       RSHASH_TRYDOWN,
  1163. +       RSHASH_NEW,
  1164. +       RSHASH_PRE_STILL,
  1165. +};
  1166. +
  1167. +/* The possible direction we are about to adjust hash strength */
  1168. +enum rshash_direct {
  1169. +   GO_UP,
  1170. +   GO_DOWN,
  1171. +   OBSCURE,
  1172. +   STILL,
  1173. +};
  1174. +
  1175. +/* random sampling hash state machine */
  1176. +static struct {
  1177. +   enum rshash_states state;
  1178. +   enum rshash_direct pre_direct;
  1179. +   u8 below_count;
  1180. +   /* Keep a lookup window of size 5; if above_count/below_count > 3
  1181. +    * in this window, we stop trying.
  1182. +    */
  1183. +   u8 lookup_window_index;
  1184. +   u64 stable_benefit;
  1185. +   unsigned long turn_point_down;
  1186. +   unsigned long turn_benefit_down;
  1187. +   unsigned long turn_point_up;
  1188. +   unsigned long turn_benefit_up;
  1189. +   unsigned long stable_point;
  1190. +} rshash_state;
  1191. +
  1192. +/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
  1193. +static u32 *zero_hash_table;
  1194. +
  1195. +static inline struct node_vma *alloc_node_vma(void)
  1196. +{
  1197. +   struct node_vma *node_vma;
  1198. +   node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
  1199. +   if (node_vma) {
  1200. +       INIT_HLIST_HEAD(&node_vma->rmap_hlist);
  1201. +       INIT_HLIST_NODE(&node_vma->hlist);
  1202. +       node_vma->last_update = 0;
  1203. +   }
  1204. +   return node_vma;
  1205. +}
  1206. +
  1207. +static inline void free_node_vma(struct node_vma *node_vma)
  1208. +{
  1209. +   kmem_cache_free(node_vma_cache, node_vma);
  1210. +}
  1211. +
  1212. +
  1213. +static inline struct vma_slot *alloc_vma_slot(void)
  1214. +{
  1215. +   struct vma_slot *slot;
  1216. +
  1217. +   /*
  1218. +    * In case uksm has not been initialized yet.
  1219. +    * TODO: reconsider the call site of uksm_init() in the future.
  1220. +    */
  1221. +   if (!vma_slot_cache)
  1222. +       return NULL;
  1223. +
  1224. +   slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
  1225. +   if (slot) {
  1226. +       INIT_LIST_HEAD(&slot->uksm_list);
  1227. +       INIT_LIST_HEAD(&slot->slot_list);
  1228. +       INIT_RADIX_TREE(&slot->dup_tree, GFP_KERNEL);
  1229. +       slot->uksm_index = -1;
  1230. +       slot->flags |= UKSM_SLOT_NEED_RERAND;
  1231. +   }
  1232. +   return slot;
  1233. +}
  1234. +
  1235. +static inline void free_vma_slot(struct vma_slot *vma_slot)
  1236. +{
  1237. +   kmem_cache_free(vma_slot_cache, vma_slot);
  1238. +}
  1239. +
  1240. +
  1241. +
  1242. +static inline struct rmap_item *alloc_rmap_item(void)
  1243. +{
  1244. +   struct rmap_item *rmap_item;
  1245. +
  1246. +   rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
  1247. +   if (rmap_item) {
  1248. +       /* bug on lowest bit is not clear for flag use */
  1249. +       BUG_ON(is_addr(rmap_item));
  1250. +   }
  1251. +   return rmap_item;
  1252. +}
  1253. +
  1254. +static inline void free_rmap_item(struct rmap_item *rmap_item)
  1255. +{
  1256. +   rmap_item->slot = NULL; /* debug safety */
  1257. +   kmem_cache_free(rmap_item_cache, rmap_item);
  1258. +}
  1259. +
  1260. +static inline struct stable_node *alloc_stable_node(void)
  1261. +{
  1262. +   struct stable_node *node;
  1263. +   node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
  1264. +   if (!node)
  1265. +       return NULL;
  1266. +
  1267. +   INIT_HLIST_HEAD(&node->hlist);
  1268. +   list_add(&node->all_list, &stable_node_list);
  1269. +   return node;
  1270. +}
  1271. +
  1272. +static inline void free_stable_node(struct stable_node *stable_node)
  1273. +{
  1274. +   list_del(&stable_node->all_list);
  1275. +   kmem_cache_free(stable_node_cache, stable_node);
  1276. +}
  1277. +
  1278. +static inline struct tree_node *alloc_tree_node(struct list_head *list)
  1279. +{
  1280. +   struct tree_node *node;
  1281. +   node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
  1282. +   if (!node)
  1283. +       return NULL;
  1284. +
  1285. +   list_add(&node->all_list, list);
  1286. +   return node;
  1287. +}
  1288. +
  1289. +static inline void free_tree_node(struct tree_node *node)
  1290. +{
  1291. +   list_del(&node->all_list);
  1292. +   kmem_cache_free(tree_node_cache, node);
  1293. +}
  1294. +
  1295. +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
  1296. +{
  1297. +   struct anon_vma *anon_vma = rmap_item->anon_vma;
  1298. +
  1299. +   put_anon_vma(anon_vma);
  1300. +}
  1301. +
  1302. +
  1303. +/**
  1304. + * Remove a stable node from the stable tree. It may be unlinked from its
  1305. + * tree_node, and the parent tree_node may be removed if it becomes empty.
  1306. + *
  1307. + * @stable_node       The node to be removed
  1308. + * @unlink_rb         Will this node be unlinked from the rbtree?
  1309. + * @remove_tree_node  Will its tree_node be removed if empty?
  1310. + */
  1311. +static void remove_node_from_stable_tree(struct stable_node *stable_node,
  1312. +                    int unlink_rb,  int remove_tree_node)
  1313. +{
  1314. +   struct node_vma *node_vma;
  1315. +   struct rmap_item *rmap_item;
  1316. +   struct hlist_node *hlist, *rmap_hlist, *n;
  1317. +
  1318. +   if (!hlist_empty(&stable_node->hlist)) {
  1319. +       hlist_for_each_entry_safe(node_vma, hlist, n,
  1320. +                     &stable_node->hlist, hlist) {
  1321. +           hlist_for_each_entry(rmap_item, rmap_hlist,
  1322. +                        &node_vma->rmap_hlist, hlist) {
  1323. +               uksm_pages_sharing--;
  1324. +
  1325. +               uksm_drop_anon_vma(rmap_item);
  1326. +               rmap_item->address &= PAGE_MASK;
  1327. +           }
  1328. +           free_node_vma(node_vma);
  1329. +           cond_resched();
  1330. +       }
  1331. +
  1332. +       /* the last one is counted as shared */
  1333. +       uksm_pages_shared--;
  1334. +       uksm_pages_sharing++;
  1335. +   }
  1336. +
  1337. +   if (stable_node->tree_node && unlink_rb) {
  1338. +       rb_erase(&stable_node->node,
  1339. +            &stable_node->tree_node->sub_root);
  1340. +
  1341. +       if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
  1342. +           remove_tree_node) {
  1343. +           rb_erase(&stable_node->tree_node->node,
  1344. +                root_stable_treep);
  1345. +           free_tree_node(stable_node->tree_node);
  1346. +       } else {
  1347. +           stable_node->tree_node->count--;
  1348. +       }
  1349. +   }
  1350. +
  1351. +   free_stable_node(stable_node);
  1352. +}
  1353. +
  1354. +
  1355. +/*
  1356. + * get_uksm_page: checks if the page indicated by the stable node
  1357. + * is still its ksm page, despite having held no reference to it.
  1358. + * In which case we can trust the content of the page, and it
  1359. + * returns the gotten page; but if the page has now been zapped,
  1360. + * remove the stale node from the stable tree and return NULL.
  1361. + *
  1362. + * You would expect the stable_node to hold a reference to the ksm page.
  1363. + * But if it increments the page's count, swapping out has to wait for
  1364. + * ksmd to come around again before it can free the page, which may take
  1365. + * seconds or even minutes: much too unresponsive.  So instead we use a
  1366. + * "keyhole reference": access to the ksm page from the stable node peeps
  1367. + * out through its keyhole to see if that page still holds the right key,
  1368. + * pointing back to this stable node.  This relies on freeing a PageAnon
  1369. + * page to reset its page->mapping to NULL, and relies on no other use of
  1370. + * a page to put something that might look like our key in page->mapping.
  1371. + *
  1372. + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
  1373. + * but this is different - made simpler by uksm_thread_mutex being held, but
  1374. + * interesting for assuming that no other use of the struct page could ever
  1375. + * put our expected_mapping into page->mapping (or a field of the union which
  1376. + * coincides with page->mapping).  The RCU calls are not for KSM at all, but
  1377. + * to keep the page_count protocol described with page_cache_get_speculative.
  1378. + *
  1379. + * Note: it is possible that get_uksm_page() will return NULL one moment,
  1380. + * then page the next, if the page is in between page_freeze_refs() and
  1381. + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
  1382. + * is on its way to being freed; but it is an anomaly to bear in mind.
  1383. + *
  1384. + * @unlink_rb:         if the removal of this node will firstly unlink from
  1385. + * its rbtree. stable_node_reinsert will prevent this when restructuring the
  1386. + * node from its old tree.
  1387. + *
  1388. + * @remove_tree_node:  if this is the last one of its tree_node, will the
  1389. + * tree_node be freed ? If we are inserting stable node, this tree_node may
  1390. + * be reused, so don't free it.
  1391. + */
  1392. +static struct page *get_uksm_page(struct stable_node *stable_node,
  1393. +                int unlink_rb, int remove_tree_node)
  1394. +{
  1395. +   struct page *page;
  1396. +   void *expected_mapping;
  1397. +
  1398. +   page = pfn_to_page(stable_node->kpfn);
  1399. +   expected_mapping = (void *)stable_node +
  1400. +               (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
  1401. +   rcu_read_lock();
  1402. +   if (page->mapping != expected_mapping)
  1403. +       goto stale;
  1404. +   if (!get_page_unless_zero(page))
  1405. +       goto stale;
  1406. +   if (page->mapping != expected_mapping) {
  1407. +       put_page(page);
  1408. +       goto stale;
  1409. +   }
  1410. +   rcu_read_unlock();
  1411. +   return page;
  1412. +stale:
  1413. +   rcu_read_unlock();
  1414. +   remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
  1415. +
  1416. +   return NULL;
  1417. +}
  1418. +
  1419. +/*
  1420. + * Removing rmap_item from stable or unstable tree.
  1421. + * This function will clean the information from the stable/unstable tree.
  1422. + */
  1423. +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
  1424. +{
  1425. +   if (rmap_item->address & STABLE_FLAG) {
  1426. +       struct stable_node *stable_node;
  1427. +       struct node_vma *node_vma;
  1428. +       struct page *page;
  1429. +
  1430. +       node_vma = rmap_item->head;
  1431. +       stable_node = node_vma->head;
  1432. +       page = get_uksm_page(stable_node, 1, 1);
  1433. +       if (!page)
  1434. +           goto out;
  1435. +
  1436. +       /*
  1437. +        * page lock is needed because it's racing with
  1438. +        * try_to_unmap_ksm(), etc.
  1439. +        */
  1440. +       lock_page(page);
  1441. +       hlist_del(&rmap_item->hlist);
  1442. +
  1443. +       if (hlist_empty(&node_vma->rmap_hlist)) {
  1444. +           hlist_del(&node_vma->hlist);
  1445. +           free_node_vma(node_vma);
  1446. +       }
  1447. +       unlock_page(page);
  1448. +
  1449. +       put_page(page);
  1450. +       if (hlist_empty(&stable_node->hlist)) {
  1451. +           /* do NOT call remove_node_from_stable_tree() here,
  1452. +            * it's possible for a forked rmap_item to not be in
  1453. +            * the stable tree while the in-tree rmap_items have
  1454. +            * already been deleted.
  1455. +            */
  1456. +           uksm_pages_shared--;
  1457. +       } else
  1458. +           uksm_pages_sharing--;
  1459. +
  1460. +
  1461. +       uksm_drop_anon_vma(rmap_item);
  1462. +   } else if (rmap_item->address & UNSTABLE_FLAG) {
  1463. +       /*
  1464. +        * Usually ksmd can and must skip the rb_erase, because
  1465. +        * root_unstable_tree was already reset to RB_ROOT.
  1466. +        * But be careful when an mm is exiting: do the rb_erase
  1467. +        * if this rmap_item was inserted by this scan, rather
  1468. +        * than left over from before.
  1469. +        */
  1470. +       if (rmap_item->append_round == uksm_scan_round) {
  1471. +           rb_erase(&rmap_item->node,
  1472. +                &rmap_item->tree_node->sub_root);
  1473. +           if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
  1474. +               rb_erase(&rmap_item->tree_node->node,
  1475. +                    &root_unstable_tree);
  1476. +
  1477. +               free_tree_node(rmap_item->tree_node);
  1478. +           } else
  1479. +               rmap_item->tree_node->count--;
  1480. +       }
  1481. +       uksm_pages_unshared--;
  1482. +   }
  1483. +
  1484. +   rmap_item->address &= PAGE_MASK;
  1485. +   rmap_item->hash_max = 0;
  1486. +
  1487. +out:
  1488. +   cond_resched();     /* we're called from many long loops */
  1489. +}
  1490. +
  1491. +/**
  1492. + * Need to do two things:
  1493. + * 1. check if the slot was moved to the del list
  1494. + * 2. make sure the mmap_sem is manipulated under a valid vma.
  1495. + *
  1496. + * One concern here is that in some cases this may cause
  1497. + * vma_slot_list_lock() waiters to be serialized further by some
  1498. + * sem->wait_lock; can this really be expensive?
  1499. + *
  1500. + *
  1501. + * @return
  1502. + * 0: if successfully locked mmap_sem
  1503. + * -ENOENT: this slot was moved to del list
  1504. + * -EBUSY: vma lock failed
  1505. + */
  1506. +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
  1507. +{
  1508. +   struct vm_area_struct *vma;
  1509. +   struct mm_struct *mm;
  1510. +   struct rw_semaphore *sem;
  1511. +
  1512. +   spin_lock(&vma_slot_list_lock);
  1513. +
   1514. +   /* slot_list was removed from the new list and re-inited when the slot
   1515. +    * entered uksm_list. If it is not empty now, it must have been moved
   1516. +    * to the del list. */
  1517. +   if (!list_empty(&slot->slot_list)) {
  1518. +       spin_unlock(&vma_slot_list_lock);
  1519. +       return -ENOENT;
  1520. +   }
  1521. +
  1522. +   BUG_ON(slot->pages != vma_pages(slot->vma));
  1523. +   /* Ok, vma still valid */
  1524. +   vma = slot->vma;
  1525. +   mm = vma->vm_mm;
  1526. +   sem = &mm->mmap_sem;
  1527. +   if (down_read_trylock(sem)) {
  1528. +       spin_unlock(&vma_slot_list_lock);
  1529. +       return 0;
  1530. +   }
  1531. +
  1532. +   spin_unlock(&vma_slot_list_lock);
  1533. +   return -EBUSY;
  1534. +}
  1535. +
  1536. +static inline unsigned long
  1537. +vma_page_address(struct page *page, struct vm_area_struct *vma)
  1538. +{
  1539. +   pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  1540. +   unsigned long address;
  1541. +
  1542. +   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  1543. +   if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
  1544. +       /* page should be within @vma mapping range */
  1545. +       return -EFAULT;
  1546. +   }
  1547. +   return address;
  1548. +}
  1549. +
  1550. +/*
  1551. + * Test if the mm is exiting
  1552. + */
  1553. +static inline bool uksm_test_exit(struct mm_struct *mm)
  1554. +{
  1555. +   return atomic_read(&mm->mm_users) == 0;
  1556. +}
  1557. +
  1558. +/* return 0 on success with the item's mmap_sem locked */
  1559. +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
  1560. +{
  1561. +   struct mm_struct *mm;
  1562. +   struct vma_slot *slot = item->slot;
  1563. +   int err = -EINVAL;
  1564. +
  1565. +   struct page *page;
  1566. +
  1567. +   BUG_ON(!item->slot);
  1568. +   /*
  1569. +    * try_down_read_slot_mmap_sem() returns non-zero if the slot
  1570. +    * has been removed by uksm_remove_vma().
  1571. +    */
  1572. +   if (try_down_read_slot_mmap_sem(slot))
  1573. +       return -EBUSY;
  1574. +
  1575. +   mm = slot->vma->vm_mm;
  1576. +
  1577. +   if (uksm_test_exit(mm))
  1578. +       goto failout_up;
  1579. +
  1580. +   page = item->page;
  1581. +   rcu_read_lock();
  1582. +   if (!get_page_unless_zero(page)) {
  1583. +       rcu_read_unlock();
  1584. +       goto failout_up;
  1585. +   }
  1586. +
  1587. +   /* No need to consider huge page here. */
  1588. +   if (item->slot->vma->anon_vma != page_anon_vma(page) ||
  1589. +       vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
  1590. +       /*
  1591. +        * TODO:
   1592. +        * should we release this item because of its stale page
  1593. +        * mapping?
  1594. +        */
  1595. +       put_page(page);
  1596. +       rcu_read_unlock();
  1597. +       goto failout_up;
  1598. +   }
  1599. +   rcu_read_unlock();
  1600. +   return 0;
  1601. +
  1602. +failout_up:
  1603. +   up_read(&mm->mmap_sem);
  1604. +   return err;
  1605. +}
  1606. +
  1607. +/*
   1608. + * What kind of VMA is considered?
  1609. + */
  1610. +static inline int vma_can_enter(struct vm_area_struct *vma)
  1611. +{
  1612. +   return !(vma->vm_flags & (VM_PFNMAP | VM_IO  | VM_DONTEXPAND |
  1613. +                 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
  1614. +                 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO |
  1615. +                 VM_SHARED  | VM_MAYSHARE | VM_GROWSUP
  1616. +                 | VM_GROWSDOWN));
  1617. +}
  1618. +
  1619. +/*
   1620. + * Called whenever a fresh new vma is created. A new vma_slot
   1621. + * is created and inserted into a global list. Must be called
   1622. + * after the vma is inserted into its mm.
  1623. + */
  1624. +void uksm_vma_add_new(struct vm_area_struct *vma)
  1625. +{
  1626. +   struct vma_slot *slot;
  1627. +
  1628. +   if (!vma_can_enter(vma)) {
  1629. +       vma->uksm_vma_slot = NULL;
  1630. +       return;
  1631. +   }
  1632. +
  1633. +   slot = alloc_vma_slot();
  1634. +   if (!slot) {
  1635. +       vma->uksm_vma_slot = NULL;
  1636. +       return;
  1637. +   }
  1638. +
  1639. +   vma->uksm_vma_slot = slot;
  1640. +   vma->vm_flags |= VM_MERGEABLE;
  1641. +   slot->vma = vma;
  1642. +   slot->mm = vma->vm_mm;
  1643. +   slot->ctime_j = jiffies;
  1644. +   slot->pages = vma_pages(vma);
  1645. +   spin_lock(&vma_slot_list_lock);
  1646. +   list_add_tail(&slot->slot_list, &vma_slot_new);
  1647. +   spin_unlock(&vma_slot_list_lock);
  1648. +}
  1649. +
  1650. +/*
  1651. + * Called after vma is unlinked from its mm
  1652. + */
  1653. +void uksm_remove_vma(struct vm_area_struct *vma)
  1654. +{
  1655. +   struct vma_slot *slot;
  1656. +
  1657. +   if (!vma->uksm_vma_slot)
  1658. +       return;
  1659. +
  1660. +   slot = vma->uksm_vma_slot;
  1661. +   spin_lock(&vma_slot_list_lock);
  1662. +   if (list_empty(&slot->slot_list)) {
  1663. +       /**
  1664. +        * This slot has been added by ksmd, so move to the del list
   1665. +        * waiting for ksmd to free it.
  1666. +        */
  1667. +       list_add_tail(&slot->slot_list, &vma_slot_del);
  1668. +   } else {
  1669. +       /**
   1670. +        * It's still on the new list. It's OK to free the slot directly.
  1671. +        */
  1672. +       list_del(&slot->slot_list);
  1673. +       free_vma_slot(slot);
  1674. +   }
  1675. +   spin_unlock(&vma_slot_list_lock);
  1676. +   vma->uksm_vma_slot = NULL;
  1677. +}
  1678. +
  1679. +/*   32/3 < they < 32/2 */
  1680. +#define shiftl 8
  1681. +#define shiftr 12
  1682. +
  1683. +#define HASH_FROM_TO(from, to)                 \
  1684. +for (index = from; index < to; index++) {      \
  1685. +   pos = random_nums[index];           \
  1686. +   hash += key[pos];               \
  1687. +   hash += (hash << shiftl);           \
  1688. +   hash ^= (hash >> shiftr);           \
  1689. +}
  1690. +
  1691. +
  1692. +#define HASH_FROM_DOWN_TO(from, to)            \
  1693. +for (index = from - 1; index >= to; index--) {     \
  1694. +   hash ^= (hash >> shiftr);           \
  1695. +   hash ^= (hash >> (shiftr*2));           \
  1696. +   hash -= (hash << shiftl);           \
  1697. +   hash += (hash << (shiftl*2));           \
  1698. +   pos = random_nums[index];           \
  1699. +   hash -= key[pos];               \
  1700. +}
  1701. +
  1702. +/*
  1703. + * The main random sample hash function.
  1704. + */
  1705. +static u32 random_sample_hash(void *addr, u32 hash_strength)
  1706. +{
  1707. +   u32 hash = 0xdeadbeef;
  1708. +   int index, pos, loop = hash_strength;
  1709. +   u32 *key = (u32 *)addr;
  1710. +
  1711. +   if (loop > HASH_STRENGTH_FULL)
  1712. +       loop = HASH_STRENGTH_FULL;
  1713. +
  1714. +   HASH_FROM_TO(0, loop);
  1715. +
  1716. +   if (hash_strength > HASH_STRENGTH_FULL) {
  1717. +       loop = hash_strength - HASH_STRENGTH_FULL;
  1718. +       HASH_FROM_TO(0, loop);
  1719. +   }
  1720. +
  1721. +   return hash;
  1722. +}
  1723. +
  1724. +
  1725. +/**
  1726. + * It's used when hash strength is adjusted
  1727. + *
   1728. + * @addr The page's virtual address
   1729. + * @from The original hash strength
   1730. + * @to   The hash strength to change to
   1731. + * @hash The hash value calculated with the "from" hash strength
   1732. + *
   1733. + * return the new hash value calculated at the "to" hash strength
  1734. + */
  1735. +static u32 delta_hash(void *addr, int from, int to, u32 hash)
  1736. +{
  1737. +   u32 *key = (u32 *)addr;
  1738. +   int index, pos; /* make sure they are int type */
  1739. +
  1740. +   if (to > from) {
  1741. +       if (from >= HASH_STRENGTH_FULL) {
  1742. +           from -= HASH_STRENGTH_FULL;
  1743. +           to -= HASH_STRENGTH_FULL;
  1744. +           HASH_FROM_TO(from, to);
  1745. +       } else if (to <= HASH_STRENGTH_FULL) {
  1746. +           HASH_FROM_TO(from, to);
  1747. +       } else {
  1748. +           HASH_FROM_TO(from, HASH_STRENGTH_FULL);
  1749. +           HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
  1750. +       }
  1751. +   } else {
  1752. +       if (from <= HASH_STRENGTH_FULL) {
  1753. +           HASH_FROM_DOWN_TO(from, to);
  1754. +       } else if (to >= HASH_STRENGTH_FULL) {
  1755. +           from -= HASH_STRENGTH_FULL;
  1756. +           to -= HASH_STRENGTH_FULL;
  1757. +           HASH_FROM_DOWN_TO(from, to);
  1758. +       } else {
  1759. +           HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
  1760. +           HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
  1761. +       }
  1762. +   }
  1763. +
  1764. +   return hash;
  1765. +}
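
The two routines above are the core of the sampling hash: random_sample_hash() mixes a configurable number of randomly chosen 32-bit words of the page into the hash value, and delta_hash() extends (or unwinds) an existing hash value when the sampling strength changes instead of rehashing from scratch. Below is a minimal user-space sketch of the forward direction only, with its own hypothetical sample table and the same shift-style mixing; it is not the kernel code, it just demonstrates the invariant that extending a hash from strength "from" to "to" equals hashing at strength "to" directly.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define WORDS_PER_PAGE 1024   /* a 4096-byte page seen as 32-bit words */
#define SHIFTL 8
#define SHIFTR 12

/* hypothetical random permutation of word indices (stands in for random_nums) */
static int sample_pos[WORDS_PER_PAGE];

static void init_sample_pos(void)
{
	int i, j, tmp;

	for (i = 0; i < WORDS_PER_PAGE; i++)
		sample_pos[i] = i;
	srand(42);
	for (i = WORDS_PER_PAGE - 1; i > 0; i--) {
		j = rand() % (i + 1);
		tmp = sample_pos[i];
		sample_pos[i] = sample_pos[j];
		sample_pos[j] = tmp;
	}
}

/* forward mixing loop, same shape as the HASH_FROM_TO macro */
static uint32_t sample_hash(const uint32_t *key, int strength)
{
	uint32_t hash = 0xdeadbeef;
	int i;

	for (i = 0; i < strength; i++) {
		hash += key[sample_pos[i]];
		hash += hash << SHIFTL;
		hash ^= hash >> SHIFTR;
	}
	return hash;
}

/* continue an existing hash from strength 'from' up to a larger 'to' */
static uint32_t extend_hash(const uint32_t *key, int from, int to, uint32_t hash)
{
	int i;

	for (i = from; i < to; i++) {
		hash += key[sample_pos[i]];
		hash += hash << SHIFTL;
		hash ^= hash >> SHIFTR;
	}
	return hash;
}

int main(void)
{
	static uint32_t page[WORDS_PER_PAGE];
	uint32_t weak, full;
	int i;

	init_sample_pos();
	for (i = 0; i < WORDS_PER_PAGE; i++)
		page[i] = (uint32_t)rand();

	weak = sample_hash(page, 128);            /* a sampled, cheap hash */
	full = sample_hash(page, WORDS_PER_PAGE); /* the full-strength hash */

	/* strengthening the weak hash incrementally matches hashing at full strength */
	assert(extend_hash(page, 128, WORDS_PER_PAGE, weak) == full);
	printf("weak=%08x full=%08x\n", (unsigned)weak, (unsigned)full);
	return 0;
}

The matching HASH_FROM_DOWN_TO macro can unwind these steps because each mixing operation (the add of key[pos], the multiply implied by hash += hash << shiftl, and the xor-shift) is invertible modulo 2^32.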
  1766. +
  1767. +
  1768. +
  1769. +
  1770. +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
  1771. +
  1772. +/**
  1773. + *
  1774. + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
  1775. + * has finished.
  1776. + *
  1777. + */
  1778. +static inline void encode_benefit(void)
  1779. +{
  1780. +   u64 scanned_delta, pos_delta, neg_delta;
  1781. +   unsigned long base = benefit.base;
  1782. +
  1783. +   scanned_delta = (uksm_pages_scanned - uksm_pages_scanned_last) >> base;
  1784. +   pos_delta = rshash_pos >> base;
  1785. +   neg_delta = rshash_neg >> base;
  1786. +
  1787. +   if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
  1788. +       CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
  1789. +       CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
  1790. +       benefit.scanned >>= 1;
  1791. +       benefit.neg >>= 1;
  1792. +       benefit.pos >>= 1;
  1793. +       benefit.base++;
  1794. +       scanned_delta >>= 1;
  1795. +       pos_delta >>= 1;
  1796. +       neg_delta >>= 1;
  1797. +   }
  1798. +
  1799. +   benefit.pos += pos_delta;
  1800. +   benefit.neg += neg_delta;
  1801. +   benefit.scanned += scanned_delta;
  1802. +
  1803. +   BUG_ON(!benefit.scanned);
  1804. +
  1805. +   rshash_pos = rshash_neg = 0;
  1806. +
  1807. +   /* -1 to make rshash_adjust() work */
  1808. +   uksm_pages_scanned_last = uksm_pages_scanned - 1;
  1809. +}
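
encode_benefit() above folds the raw rshash_pos/rshash_neg/scanned counters into one set of accumulators that share a common "base" shift: whenever any of them is about to overflow, all of them (including the incoming deltas) are halved and base is bumped, so the ratios between the three values are preserved. Below is a small user-space sketch of that scaled-accumulator pattern, with hypothetical field names; it is a model of the idea, not the kernel code.

#include <stdint.h>
#include <stdio.h>

struct scaled_benefit {
	uint64_t pos, neg, scanned;
	unsigned int base;   /* common right-shift applied to incoming deltas */
};

static int would_overflow(uint64_t cur, uint64_t delta)
{
	return UINT64_MAX - cur < delta;
}

static void accumulate(struct scaled_benefit *b,
		       uint64_t pos, uint64_t neg, uint64_t scanned)
{
	pos >>= b->base;
	neg >>= b->base;
	scanned >>= b->base;

	if (would_overflow(b->pos, pos) || would_overflow(b->neg, neg) ||
	    would_overflow(b->scanned, scanned)) {
		/* halve everything and remember it via 'base': the ratios survive */
		b->pos >>= 1;
		b->neg >>= 1;
		b->scanned >>= 1;
		b->base++;
		pos >>= 1;
		neg >>= 1;
		scanned >>= 1;
	}

	b->pos += pos;
	b->neg += neg;
	b->scanned += scanned;
}

int main(void)
{
	struct scaled_benefit b = { 0, 0, 0, 0 };

	accumulate(&b, UINT64_MAX / 2, UINT64_MAX / 4, 1000);
	accumulate(&b, UINT64_MAX, UINT64_MAX / 2, 1000); /* large enough to force a rescale */
	printf("pos=%llu neg=%llu scanned=%llu base=%u\n",
	       (unsigned long long)b.pos, (unsigned long long)b.neg,
	       (unsigned long long)b.scanned, b.base);
	return 0;
}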
  1810. +
  1811. +static inline void reset_benefit(void)
  1812. +{
  1813. +   benefit.pos = 0;
  1814. +   benefit.neg = 0;
  1815. +   benefit.base = 0;
  1816. +   benefit.scanned = 0;
  1817. +}
  1818. +
  1819. +static inline void inc_rshash_pos(unsigned long delta)
  1820. +{
  1821. +   if (CAN_OVERFLOW_U64(rshash_pos, delta))
  1822. +       encode_benefit();
  1823. +
  1824. +   rshash_pos += delta;
  1825. +}
  1826. +
  1827. +static inline void inc_rshash_neg(unsigned long delta)
  1828. +{
  1829. +   if (CAN_OVERFLOW_U64(rshash_neg, delta))
  1830. +       encode_benefit();
  1831. +
  1832. +   rshash_neg += delta;
  1833. +}
  1834. +
  1835. +
  1836. +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
  1837. +               int cost_accounting)
  1838. +{
  1839. +   u32 val;
  1840. +   unsigned long delta;
  1841. +
  1842. +   void *addr = kmap_atomic(page, KM_USER0);
  1843. +
  1844. +   val = random_sample_hash(addr, hash_strength);
  1845. +   kunmap_atomic(addr, KM_USER0);
  1846. +
  1847. +   if (cost_accounting) {
  1848. +       if (HASH_STRENGTH_FULL > hash_strength)
  1849. +           delta = HASH_STRENGTH_FULL - hash_strength;
  1850. +       else
  1851. +           delta = 0;
  1852. +
  1853. +       inc_rshash_pos(delta);
  1854. +   }
  1855. +
  1856. +   return val;
  1857. +}
  1858. +
  1859. +static int memcmp_pages(struct page *page1, struct page *page2,
  1860. +           int cost_accounting)
  1861. +{
  1862. +   char *addr1, *addr2;
  1863. +   int ret;
  1864. +
  1865. +   addr1 = kmap_atomic(page1, KM_USER0);
  1866. +   addr2 = kmap_atomic(page2, KM_USER1);
  1867. +   ret = memcmp(addr1, addr2, PAGE_SIZE);
  1868. +   kunmap_atomic(addr2, KM_USER1);
  1869. +   kunmap_atomic(addr1, KM_USER0);
  1870. +
  1871. +   if (cost_accounting)
  1872. +       inc_rshash_neg(memcmp_cost);
  1873. +
  1874. +   return ret;
  1875. +}
  1876. +
  1877. +static inline int pages_identical(struct page *page1, struct page *page2)
  1878. +{
  1879. +   return !memcmp_pages(page1, page2, 0);
  1880. +}
  1881. +
  1882. +static inline int is_page_full_zero(struct page *page)
  1883. +{
  1884. +   char *addr;
  1885. +   int ret;
  1886. +
  1887. +   addr = kmap_atomic(page, KM_USER0);
  1888. +   ret = is_full_zero(addr, PAGE_SIZE);
  1889. +   kunmap_atomic(addr, KM_USER0);
  1890. +
  1891. +   return ret;
  1892. +}
  1893. +
  1894. +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
  1895. +                 pte_t *orig_pte, pte_t *old_pte)
  1896. +{
  1897. +   struct mm_struct *mm = vma->vm_mm;
  1898. +   unsigned long addr;
  1899. +   pte_t *ptep;
  1900. +   spinlock_t *ptl;
  1901. +   int swapped;
  1902. +   int err = -EFAULT;
  1903. +
  1904. +   addr = page_address_in_vma(page, vma);
  1905. +   if (addr == -EFAULT)
  1906. +       goto out;
  1907. +
  1908. +   BUG_ON(PageTransCompound(page));
  1909. +   ptep = page_check_address(page, mm, addr, &ptl, 0);
  1910. +   if (!ptep)
  1911. +       goto out;
  1912. +
  1913. +   if (old_pte)
  1914. +       *old_pte = *ptep;
  1915. +
  1916. +   if (pte_write(*ptep) || pte_dirty(*ptep)) {
  1917. +       pte_t entry;
  1918. +
  1919. +       swapped = PageSwapCache(page);
  1920. +       flush_cache_page(vma, addr, page_to_pfn(page));
  1921. +       /*
   1922. +        * Ok, this is tricky: when get_user_pages_fast() runs, it doesn't
   1923. +        * take any lock, therefore the check that we are going to make
   1924. +        * with the page count against the map count is racy and
   1925. +        * O_DIRECT can happen right after the check.
   1926. +        * So we clear the pte and flush the tlb before the check;
   1927. +        * this assures us that no O_DIRECT can happen after the check
   1928. +        * or in the middle of the check.
  1929. +        */
  1930. +       entry = ptep_clear_flush(vma, addr, ptep);
  1931. +       /*
  1932. +        * Check that no O_DIRECT or similar I/O is in progress on the
  1933. +        * page
  1934. +        */
  1935. +       if (page_mapcount(page) + 1 + swapped != page_count(page)) {
  1936. +           set_pte_at(mm, addr, ptep, entry);
  1937. +           goto out_unlock;
  1938. +       }
  1939. +       if (pte_dirty(entry))
  1940. +           set_page_dirty(page);
  1941. +       entry = pte_mkclean(pte_wrprotect(entry));
  1942. +       set_pte_at_notify(mm, addr, ptep, entry);
  1943. +   }
  1944. +   *orig_pte = *ptep;
  1945. +   err = 0;
  1946. +
  1947. +out_unlock:
  1948. +   pte_unmap_unlock(ptep, ptl);
  1949. +out:
  1950. +   return err;
  1951. +}
  1952. +
   1953. +#define MERGE_ERR_PGERR        1 /* the page is invalid, cannot continue */
  1954. +#define MERGE_ERR_COLLI        2 /* there is a collision */
  1955. +#define MERGE_ERR_CHANGED  3 /* the page has changed since last hash */
  1956. +
  1957. +
  1958. +/**
  1959. + * replace_page - replace page in vma by new ksm page
  1960. + * @vma:      vma that holds the pte pointing to page
  1961. + * @page:     the page we are replacing by kpage
  1962. + * @kpage:    the ksm page we replace page by
  1963. + * @orig_pte: the original value of the pte
  1964. + *
  1965. + * Returns 0 on success, MERGE_ERR_PGERR on failure.
  1966. + */
  1967. +static int replace_page(struct vm_area_struct *vma, struct page *page,
  1968. +           struct page *kpage, pte_t orig_pte)
  1969. +{
  1970. +   struct mm_struct *mm = vma->vm_mm;
  1971. +   pgd_t *pgd;
  1972. +   pud_t *pud;
  1973. +   pmd_t *pmd;
  1974. +   pte_t *ptep;
  1975. +   spinlock_t *ptl;
  1976. +   pte_t entry;
  1977. +
  1978. +   unsigned long addr;
  1979. +   int err = MERGE_ERR_PGERR;
  1980. +
  1981. +   addr = page_address_in_vma(page, vma);
  1982. +   if (addr == -EFAULT)
  1983. +       goto out;
  1984. +
  1985. +   pgd = pgd_offset(mm, addr);
  1986. +   if (!pgd_present(*pgd))
  1987. +       goto out;
  1988. +
  1989. +   pud = pud_offset(pgd, addr);
  1990. +   if (!pud_present(*pud))
  1991. +       goto out;
  1992. +
  1993. +   pmd = pmd_offset(pud, addr);
  1994. +   BUG_ON(pmd_trans_huge(*pmd));
  1995. +   if (!pmd_present(*pmd))
  1996. +       goto out;
  1997. +
  1998. +   ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
  1999. +   if (!pte_same(*ptep, orig_pte)) {
  2000. +       pte_unmap_unlock(ptep, ptl);
  2001. +       goto out;
  2002. +   }
  2003. +
  2004. +   flush_cache_page(vma, addr, pte_pfn(*ptep));
  2005. +   ptep_clear_flush(vma, addr, ptep);
  2006. +   entry = mk_pte(kpage, vma->vm_page_prot);
  2007. +
  2008. +   /* special treatment is needed for zero_page */
  2009. +   if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
  2010. +               (page_to_pfn(kpage) == zero_pfn))
  2011. +       entry = pte_mkspecial(entry);
  2012. +   else {
  2013. +       get_page(kpage);
  2014. +       page_add_anon_rmap(kpage, vma, addr);
  2015. +   }
  2016. +
  2017. +   set_pte_at_notify(mm, addr, ptep, entry);
  2018. +
  2019. +   page_remove_rmap(page);
  2020. +   if (!page_mapped(page))
  2021. +       try_to_free_swap(page);
  2022. +   put_page(page);
  2023. +
  2024. +   pte_unmap_unlock(ptep, ptl);
  2025. +   err = 0;
  2026. +out:
  2027. +   return err;
  2028. +}
  2029. +
  2030. +
  2031. +/**
   2032. + *  Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
   2033. + *  A zero hash value at HASH_STRENGTH_MAX is used to indicate that the
   2034. + *  hash_max member has not been calculated yet.
   2035. + *
   2036. + * @page The page that needs to be hashed
  2037. + * @hash_old The hash value calculated with current hash strength
  2038. + *
  2039. + * return the new hash value calculated at HASH_STRENGTH_MAX
  2040. + */
  2041. +static inline u32 page_hash_max(struct page *page, u32 hash_old)
  2042. +{
  2043. +   u32 hash_max = 0;
  2044. +   void *addr;
  2045. +
  2046. +   addr = kmap_atomic(page, KM_USER0);
  2047. +   hash_max = delta_hash(addr, hash_strength,
  2048. +                 HASH_STRENGTH_MAX, hash_old);
  2049. +
  2050. +   kunmap_atomic(addr, KM_USER0);
  2051. +
  2052. +   if (!hash_max)
  2053. +       hash_max = 1;
  2054. +
  2055. +   inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
  2056. +   return hash_max;
  2057. +}
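
page_hash_max() reserves the value 0 as the "not calculated yet" marker for the cached hash_max fields, so a full-strength hash that genuinely comes out as 0 is nudged to 1 before being stored (the same convention is used by rmap_item_hash_max() and stable_node_hash_max() further down). A tiny user-space sketch of that lazy-cache convention follows; the stand-in hash is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for the full-strength page hash */
static uint32_t toy_full_hash(const char *page)
{
	(void)page;
	return 0;   /* pretend the real hash happened to be exactly zero */
}

/* 0 in *cache means "not computed yet", so a real 0 is never stored */
static uint32_t cached_hash_max(const char *page, uint32_t *cache)
{
	if (!*cache) {
		uint32_t h = toy_full_hash(page);

		*cache = h ? h : 1;
	}
	return *cache;
}

int main(void)
{
	uint32_t cache = 0;
	char page[64] = { 0 };

	printf("%u\n", (unsigned)cached_hash_max(page, &cache)); /* 1, not 0 */
	printf("%u\n", (unsigned)cached_hash_max(page, &cache)); /* cached, still 1 */
	return 0;
}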
  2058. +
  2059. +/*
  2060. + * We compare the hash again, to ensure that it is really a hash collision
  2061. + * instead of being caused by page write.
   2062. + * instead of being caused by a page write.
  2063. +static inline int check_collision(struct rmap_item *rmap_item,
  2064. +                 u32 hash)
  2065. +{
  2066. +   int err;
  2067. +   struct page *page = rmap_item->page;
  2068. +
   2069. +   /* if this rmap_item has already been hash_maxed, then the collision
   2070. +    * must have appeared in the second-level rbtree search. In this case we
   2071. +    * check whether its hash_max value has changed. Otherwise, the collision
   2072. +    * happened in the first-level rbtree search, so we check against its
   2073. +    * current hash value.
   2074. +    */
  2075. +   if (rmap_item->hash_max) {
  2076. +       inc_rshash_neg(memcmp_cost);
  2077. +       inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
  2078. +
  2079. +       if (rmap_item->hash_max == page_hash_max(page, hash))
  2080. +           err = MERGE_ERR_COLLI;
  2081. +       else
  2082. +           err = MERGE_ERR_CHANGED;
  2083. +   } else {
  2084. +       inc_rshash_neg(memcmp_cost + hash_strength);
  2085. +
  2086. +       if (page_hash(page, hash_strength, 0) == hash)
  2087. +           err = MERGE_ERR_COLLI;
  2088. +       else
  2089. +           err = MERGE_ERR_CHANGED;
  2090. +   }
  2091. +
  2092. +   return err;
  2093. +}
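
check_collision() distinguishes "the two pages really hash the same but differ" from "the page was written after it was hashed" simply by re-hashing the page at the strength it was hashed with before and comparing against the recorded value. Below is a minimal user-space sketch of that decision, using an ordinary 32-bit hash (FNV-1a, purely for illustration) and hypothetical names.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ERR_COLLI   2  /* a genuine hash collision */
#define ERR_CHANGED 3  /* the page changed after it was hashed */

/* any 32-bit hash works for the illustration; FNV-1a keeps it short */
static uint32_t hash32(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint32_t h = 0x811c9dc5u;

	while (len--) {
		h ^= *p++;
		h *= 0x01000193u;
	}
	return h;
}

/*
 * When a page that hashed equal to another fails the byte compare,
 * re-hash it: an unchanged hash means the earlier match really was a
 * collision, a changed hash means the page was written to meanwhile.
 */
static int classify_mismatch(const void *page, size_t len, uint32_t old_hash)
{
	return hash32(page, len) == old_hash ? ERR_COLLI : ERR_CHANGED;
}

int main(void)
{
	char page[4096] = "some page contents";
	uint32_t h = hash32(page, sizeof(page));

	printf("%d\n", classify_mismatch(page, sizeof(page), h)); /* 2: collision */
	page[0] = 'X';                       /* the page changed after hashing */
	printf("%d\n", classify_mismatch(page, sizeof(page), h)); /* 3: changed */
	return 0;
}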
  2094. +
  2095. +static struct page *page_trans_compound_anon(struct page *page)
  2096. +{
  2097. +   if (PageTransCompound(page)) {
  2098. +       struct page *head = compound_trans_head(page);
  2099. +       /*
   2100. +        * head may actually be split and freed from under
   2101. +        * us, but that's ok here.
  2102. +        */
  2103. +       if (PageAnon(head))
  2104. +           return head;
  2105. +   }
  2106. +   return NULL;
  2107. +}
  2108. +
  2109. +static int page_trans_compound_anon_split(struct page *page)
  2110. +{
  2111. +   int ret = 0;
  2112. +   struct page *transhuge_head = page_trans_compound_anon(page);
  2113. +   if (transhuge_head) {
  2114. +       /* Get the reference on the head to split it. */
  2115. +       if (get_page_unless_zero(transhuge_head)) {
  2116. +           /*
  2117. +            * Recheck we got the reference while the head
  2118. +            * was still anonymous.
  2119. +            */
  2120. +           if (PageAnon(transhuge_head))
  2121. +               ret = split_huge_page(transhuge_head);
  2122. +           else
  2123. +               /*
  2124. +                * Retry later if split_huge_page run
   2125. +                * Retry later if split_huge_page ran
  2126. +                */
  2127. +               ret = 1;
  2128. +           put_page(transhuge_head);
  2129. +       } else
   2130. +           /* Retry later if split_huge_page ran from under us. */
  2131. +           ret = 1;
  2132. +   }
  2133. +   return ret;
  2134. +}
  2135. +
  2136. +/**
   2137. + * Try to merge rmap_item->page with a kpage in a stable node. kpage must
   2138. + * already be a ksm page.
  2139. + *
  2140. + * @return 0 if the pages were merged, -EFAULT otherwise.
  2141. + */
  2142. +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
  2143. +                     struct page *kpage, u32 hash)
  2144. +{
  2145. +   struct vm_area_struct *vma = rmap_item->slot->vma;
  2146. +   struct mm_struct *mm = vma->vm_mm;
  2147. +   pte_t orig_pte = __pte(0);
  2148. +   int err = MERGE_ERR_PGERR;
  2149. +   struct page *page;
  2150. +
  2151. +   if (uksm_test_exit(mm))
  2152. +       goto out;
  2153. +
  2154. +   page = rmap_item->page;
  2155. +
  2156. +   if (page == kpage) { /* ksm page forked */
  2157. +       err = 0;
  2158. +       goto out;
  2159. +   }
  2160. +
  2161. +   if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  2162. +       goto out;
  2163. +   BUG_ON(PageTransCompound(page));
  2164. +
  2165. +   if (!PageAnon(page) || !PageKsm(kpage))
  2166. +       goto out;
  2167. +
  2168. +   /*
  2169. +    * We need the page lock to read a stable PageSwapCache in
  2170. +    * write_protect_page().  We use trylock_page() instead of
  2171. +    * lock_page() because we don't want to wait here - we
  2172. +    * prefer to continue scanning and merging different pages,
  2173. +    * then come back to this page when it is unlocked.
  2174. +    */
  2175. +   if (!trylock_page(page))
  2176. +       goto out;
  2177. +   /*
  2178. +    * If this anonymous page is mapped only here, its pte may need
  2179. +    * to be write-protected.  If it's mapped elsewhere, all of its
  2180. +    * ptes are necessarily already write-protected.  But in either
  2181. +    * case, we need to lock and check page_count is not raised.
  2182. +    */
  2183. +   if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
  2184. +       if (pages_identical(page, kpage))
  2185. +           err = replace_page(vma, page, kpage, orig_pte);
  2186. +       else
  2187. +           err = check_collision(rmap_item, hash);
  2188. +   }
  2189. +
  2190. +   if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
  2191. +       munlock_vma_page(page);
  2192. +       if (!PageMlocked(kpage)) {
  2193. +           unlock_page(page);
  2194. +           lock_page(kpage);
  2195. +           mlock_vma_page(kpage);
  2196. +           page = kpage;       /* for final unlock */
  2197. +       }
  2198. +   }
  2199. +
  2200. +   unlock_page(page);
  2201. +out:
  2202. +   return err;
  2203. +}
  2204. +
  2205. +
  2206. +
  2207. +/**
  2208. + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
  2209. + * to restore a page mapping that has been changed in try_to_merge_two_pages.
  2210. + *
  2211. + * @return 0 on success.
  2212. + */
  2213. +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
  2214. +                pte_t orig_pte, pte_t wprt_pte)
  2215. +{
  2216. +   struct mm_struct *mm = vma->vm_mm;
  2217. +   pgd_t *pgd;
  2218. +   pud_t *pud;
  2219. +   pmd_t *pmd;
  2220. +   pte_t *ptep;
  2221. +   spinlock_t *ptl;
  2222. +
  2223. +   int err = -EFAULT;
  2224. +
  2225. +   pgd = pgd_offset(mm, addr);
  2226. +   if (!pgd_present(*pgd))
  2227. +       goto out;
  2228. +
  2229. +   pud = pud_offset(pgd, addr);
  2230. +   if (!pud_present(*pud))
  2231. +       goto out;
  2232. +
  2233. +   pmd = pmd_offset(pud, addr);
  2234. +   if (!pmd_present(*pmd))
  2235. +       goto out;
  2236. +
  2237. +   ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
  2238. +   if (!pte_same(*ptep, wprt_pte)) {
  2239. +       /* already copied, let it be */
  2240. +       pte_unmap_unlock(ptep, ptl);
  2241. +       goto out;
  2242. +   }
  2243. +
  2244. +   /*
   2245. +    * Good, still here. As long as we still hold the ksm page, it cannot
   2246. +    * return to the free page pool, so there is no way a pte could have been
   2247. +    * changed to another page and back to this one. Also remember that ksm
   2248. +    * pages are not reused in do_wp_page(). So it's safe to restore the
   2249. +    * original pte.
  2250. +    */
  2251. +   flush_cache_page(vma, addr, pte_pfn(*ptep));
  2252. +   ptep_clear_flush(vma, addr, ptep);
  2253. +   set_pte_at_notify(mm, addr, ptep, orig_pte);
  2254. +
  2255. +   pte_unmap_unlock(ptep, ptl);
  2256. +   err = 0;
  2257. +out:
  2258. +   return err;
  2259. +}
  2260. +
  2261. +/**
   2262. + * try_to_merge_two_pages() - take two identical pages and prepare
   2263. + * them to be merged into one page (rmap_item->page)
   2264. + *
   2265. + * @return 0 if we successfully merged two identical pages into
   2266. + *         one ksm page. MERGE_ERR_COLLI if it was only a hash collision
   2267. + *         found in the rbtree search. MERGE_ERR_CHANGED if rmap_item has
   2268. + *         been changed since it was hashed. MERGE_ERR_PGERR otherwise.
  2269. + *
  2270. + */
  2271. +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
  2272. +                 struct rmap_item *tree_rmap_item,
  2273. +                 u32 hash)
  2274. +{
  2275. +   pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
  2276. +   pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
  2277. +   struct vm_area_struct *vma1 = rmap_item->slot->vma;
  2278. +   struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
  2279. +   struct page *page = rmap_item->page;
  2280. +   struct page *tree_page = tree_rmap_item->page;
  2281. +   int err = MERGE_ERR_PGERR;
  2282. +   struct address_space *saved_mapping;
  2283. +
  2284. +
  2285. +   if (rmap_item->page == tree_rmap_item->page)
  2286. +       goto out;
  2287. +
  2288. +   if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  2289. +       goto out;
  2290. +   BUG_ON(PageTransCompound(page));
  2291. +
  2292. +   if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
  2293. +       goto out;
  2294. +   BUG_ON(PageTransCompound(tree_page));
  2295. +
  2296. +   if (!PageAnon(page) || !PageAnon(tree_page))
  2297. +       goto out;
  2298. +
  2299. +   if (!trylock_page(page))
  2300. +       goto out;
  2301. +
  2302. +
  2303. +   if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
  2304. +       unlock_page(page);
  2305. +       goto out;
  2306. +   }
  2307. +
  2308. +   /*
  2309. +    * While we hold page lock, upgrade page from
  2310. +    * PageAnon+anon_vma to PageKsm+NULL stable_node:
  2311. +    * stable_tree_insert() will update stable_node.
  2312. +    */
  2313. +   saved_mapping = page->mapping;
  2314. +   set_page_stable_node(page, NULL);
  2315. +   mark_page_accessed(page);
  2316. +   unlock_page(page);
  2317. +
  2318. +   if (!trylock_page(tree_page))
  2319. +       goto restore_out;
  2320. +
  2321. +   if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
  2322. +       unlock_page(tree_page);
  2323. +       goto restore_out;
  2324. +   }
  2325. +
  2326. +   if (pages_identical(page, tree_page)) {
  2327. +       err = replace_page(vma2, tree_page, page, wprt_pte2);
  2328. +       if (err) {
  2329. +           unlock_page(tree_page);
  2330. +           goto restore_out;
  2331. +       }
  2332. +
  2333. +       if ((vma2->vm_flags & VM_LOCKED)) {
  2334. +           munlock_vma_page(tree_page);
  2335. +           if (!PageMlocked(page)) {
  2336. +               unlock_page(tree_page);
  2337. +               lock_page(page);
  2338. +               mlock_vma_page(page);
  2339. +               tree_page = page; /* for final unlock */
  2340. +           }
  2341. +       }
  2342. +
  2343. +       unlock_page(tree_page);
  2344. +
  2345. +       goto out; /* success */
  2346. +
  2347. +   } else {
  2348. +       if (page_hash(page, hash_strength, 0) ==
  2349. +           page_hash(tree_page, hash_strength, 0)) {
  2350. +           inc_rshash_neg(memcmp_cost + hash_strength * 2);
  2351. +           err = MERGE_ERR_COLLI;
  2352. +       } else
  2353. +           err = MERGE_ERR_CHANGED;
  2354. +
  2355. +       unlock_page(tree_page);
  2356. +   }
  2357. +
  2358. +restore_out:
  2359. +   lock_page(page);
  2360. +   if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
  2361. +                 orig_pte1, wprt_pte1))
  2362. +       page->mapping = saved_mapping;
  2363. +
  2364. +   unlock_page(page);
  2365. +out:
  2366. +   return err;
  2367. +}
  2368. +
  2369. +static inline int hash_cmp(u32 new_val, u32 node_val)
  2370. +{
  2371. +   if (new_val > node_val)
  2372. +       return 1;
  2373. +   else if (new_val < node_val)
  2374. +       return -1;
  2375. +   else
  2376. +       return 0;
  2377. +}
  2378. +
  2379. +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
  2380. +{
  2381. +   u32 hash_max = item->hash_max;
  2382. +
  2383. +   if (!hash_max) {
  2384. +       hash_max = page_hash_max(item->page, hash);
  2385. +
  2386. +       item->hash_max = hash_max;
  2387. +   }
  2388. +
  2389. +   return hash_max;
  2390. +}
  2391. +
  2392. +
  2393. +
  2394. +/**
  2395. + * stable_tree_search() - search the stable tree for a page
  2396. + *
  2397. + * @item:  the rmap_item we are comparing with
  2398. + * @hash:  the hash value of this item->page already calculated
  2399. + *
   2400. + * @return     the page we have found, NULL otherwise. A reference on the
   2401. + *             returned page has already been taken.
  2402. + */
  2403. +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
  2404. +{
  2405. +   struct rb_node *node = root_stable_treep->rb_node;
  2406. +   struct tree_node *tree_node;
  2407. +   unsigned long hash_max;
  2408. +   struct page *page = item->page;
  2409. +   struct stable_node *stable_node;
  2410. +
  2411. +   stable_node = page_stable_node(page);
  2412. +   if (stable_node) {
  2413. +       /* ksm page forked, that is
  2414. +        * if (PageKsm(page) && !in_stable_tree(rmap_item))
   2415. +        * its reference was actually taken once outside.
  2416. +        */
  2417. +       get_page(page);
  2418. +       return page;
  2419. +   }
  2420. +
  2421. +   while (node) {
  2422. +       int cmp;
  2423. +
  2424. +       tree_node = rb_entry(node, struct tree_node, node);
  2425. +
  2426. +       cmp = hash_cmp(hash, tree_node->hash);
  2427. +
  2428. +       if (cmp < 0)
  2429. +           node = node->rb_left;
  2430. +       else if (cmp > 0)
  2431. +           node = node->rb_right;
  2432. +       else
  2433. +           break;
  2434. +   }
  2435. +
  2436. +   if (!node)
  2437. +       return NULL;
  2438. +
  2439. +   if (tree_node->count == 1) {
  2440. +       stable_node = rb_entry(tree_node->sub_root.rb_node,
  2441. +                      struct stable_node, node);
  2442. +       BUG_ON(!stable_node);
  2443. +
  2444. +       goto get_page_out;
  2445. +   }
  2446. +
  2447. +   /*
  2448. +    * ok, we have to search the second
   2449. +    * level subtree, hash the page to
   2450. +    * full strength.
  2451. +    */
  2452. +   node = tree_node->sub_root.rb_node;
  2453. +   BUG_ON(!node);
  2454. +   hash_max = rmap_item_hash_max(item, hash);
  2455. +
  2456. +   while (node) {
  2457. +       int cmp;
  2458. +
  2459. +       stable_node = rb_entry(node, struct stable_node, node);
  2460. +
  2461. +       cmp = hash_cmp(hash_max, stable_node->hash_max);
  2462. +
  2463. +       if (cmp < 0)
  2464. +           node = node->rb_left;
  2465. +       else if (cmp > 0)
  2466. +           node = node->rb_right;
  2467. +       else
  2468. +           goto get_page_out;
  2469. +   }
  2470. +
  2471. +   return NULL;
  2472. +
  2473. +get_page_out:
  2474. +   page = get_uksm_page(stable_node, 1, 1);
  2475. +   return page;
  2476. +}
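
stable_tree_search() is a two-level lookup: the first-level rbtree is keyed by the cheap sampled hash, and only when a first-level tree_node holds more than one stable page does it pay for the full-strength hash_max and descend into the second-level subtree. The sketch below models the same laziness in user space with flat arrays instead of rbtrees; the types and the toy hash are hypothetical, not the kernel data structures.

#include <stdint.h>
#include <stdio.h>

struct leaf {                 /* second level: one stable page */
	uint32_t hash_max;
	const char *page;
};

struct bucket {               /* first level: all pages sharing one sampled hash */
	uint32_t hash;
	int count;
	struct leaf leaves[4];
};

static const char *two_level_lookup(struct bucket *buckets, int nbuckets,
				    uint32_t hash,
				    uint32_t (*calc_hash_max)(const char *),
				    const char *page)
{
	int i, j;

	for (i = 0; i < nbuckets; i++) {
		struct bucket *b = &buckets[i];
		uint32_t hmax;

		if (b->hash != hash)
			continue;

		if (b->count == 1)          /* single occupant: cheap hash suffices */
			return b->leaves[0].page;

		hmax = calc_hash_max(page); /* collision bucket: pay for the strong hash */
		for (j = 0; j < b->count; j++)
			if (b->leaves[j].hash_max == hmax)
				return b->leaves[j].page;
		return NULL;
	}
	return NULL;
}

/* hypothetical stand-in for page_hash_max(): just the first byte */
static uint32_t toy_hash_max(const char *page)
{
	return (uint32_t)(unsigned char)page[0];
}

int main(void)
{
	struct bucket tree[] = {
		{ 10, 1, { { 0, "only-page" } } },
		{ 20, 2, { { 'a', "page-a" }, { 'b', "page-b" } } },
	};

	printf("%s\n", two_level_lookup(tree, 2, 10, toy_hash_max, "x"));
	printf("%s\n", two_level_lookup(tree, 2, 20, toy_hash_max, "b-like"));
	return 0;
}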
  2477. +
  2478. +static int try_merge_rmap_item(struct rmap_item *item,
  2479. +                  struct page *kpage,
  2480. +                  struct page *tree_page)
  2481. +{
  2482. +   spinlock_t *ptl;
  2483. +   pte_t *ptep;
  2484. +   unsigned long addr;
  2485. +   struct vm_area_struct *vma = item->slot->vma;
  2486. +
  2487. +   addr = get_rmap_addr(item);
  2488. +   ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
  2489. +   if (!ptep)
  2490. +       return 0;
  2491. +
  2492. +   if (pte_write(*ptep)) {
  2493. +       /* has changed, abort! */
  2494. +       pte_unmap_unlock(ptep, ptl);
  2495. +       return 0;
  2496. +   }
  2497. +
  2498. +   get_page(tree_page);
  2499. +   page_add_anon_rmap(tree_page, vma, addr);
  2500. +
  2501. +   flush_cache_page(vma, addr, pte_pfn(*ptep));
  2502. +   ptep_clear_flush(vma, addr, ptep);
  2503. +   set_pte_at_notify(vma->vm_mm, addr, ptep,
  2504. +             mk_pte(tree_page, vma->vm_page_prot));
  2505. +
  2506. +   page_remove_rmap(kpage);
  2507. +   put_page(kpage);
  2508. +
  2509. +   pte_unmap_unlock(ptep, ptl);
  2510. +
  2511. +   return 1;
  2512. +}
  2513. +
  2514. +/**
   2515. + * try_merge_with_stable() - when two rmap_items need to be inserted
   2516. + * into the stable tree and the page was found to be identical to a stable
   2517. + * ksm page, this is the last chance we can merge them into one.
  2518. + *
  2519. + * @item1: the rmap_item holding the page which we wanted to insert
  2520. + *         into stable tree.
  2521. + * @item2: the other rmap_item we found when unstable tree search
   2522. + * @kpage: the page currently mapped by the two rmap_items
  2523. + * @tree_page:     the page we found identical in stable tree node
  2524. + * @success1:  return if item1 is successfully merged
  2525. + * @success2:  return if item2 is successfully merged
  2526. + */
  2527. +static void try_merge_with_stable(struct rmap_item *item1,
  2528. +                 struct rmap_item *item2,
  2529. +                 struct page **kpage,
  2530. +                 struct page *tree_page,
  2531. +                 int *success1, int *success2)
  2532. +{
  2533. +   struct vm_area_struct *vma1 = item1->slot->vma;
  2534. +   struct vm_area_struct *vma2 = item2->slot->vma;
  2535. +   *success1 = 0;
  2536. +   *success2 = 0;
  2537. +
  2538. +   if (unlikely(*kpage == tree_page)) {
  2539. +       /* I don't think this can really happen */
  2540. +       printk(KERN_WARNING "UKSM: unexpected condition detected in "
  2541. +           "try_merge_with_stable() -- *kpage == tree_page !\n");
  2542. +       *success1 = 1;
  2543. +       *success2 = 1;
  2544. +       return;
  2545. +   }
  2546. +
  2547. +   if (!PageAnon(*kpage) || !PageKsm(*kpage))
  2548. +       goto failed;
  2549. +
  2550. +   if (!trylock_page(tree_page))
  2551. +       goto failed;
  2552. +
  2553. +   /* If the oldpage is still ksm and still pointed
  2554. +    * to in the right place, and still write protected,
   2555. +    * we are confident it has not changed; no need to
   2556. +    * memcmp anymore.
   2557. +    * Beware, we cannot take nested pte locks,
  2558. +    * deadlock risk.
  2559. +    */
  2560. +   if (!try_merge_rmap_item(item1, *kpage, tree_page))
  2561. +       goto unlock_failed;
  2562. +
   2563. +   /* ok, then vma2; remember that pte1 is already set */
  2564. +   if (!try_merge_rmap_item(item2, *kpage, tree_page))
  2565. +       goto success_1;
  2566. +
  2567. +   *success2 = 1;
  2568. +success_1:
  2569. +   *success1 = 1;
  2570. +
  2571. +
  2572. +   if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
  2573. +       (*success2 && vma2->vm_flags & VM_LOCKED)) {
  2574. +       munlock_vma_page(*kpage);
  2575. +       if (!PageMlocked(tree_page))
  2576. +           mlock_vma_page(tree_page);
  2577. +   }
  2578. +
  2579. +   /*
   2580. +    * We do not need oldpage any more in the caller, so we can release the
   2581. +    * lock now.
  2582. +    */
  2583. +   unlock_page(*kpage);
  2584. +   *kpage = tree_page; /* Get unlocked outside. */
  2585. +   return;
  2586. +
  2587. +unlock_failed:
  2588. +   unlock_page(tree_page);
  2589. +failed:
  2590. +   return;
  2591. +}
  2592. +
  2593. +static inline void stable_node_hash_max(struct stable_node *node,
  2594. +                    struct page *page, u32 hash)
  2595. +{
  2596. +   u32 hash_max = node->hash_max;
  2597. +
  2598. +   if (!hash_max) {
  2599. +       hash_max = page_hash_max(page, hash);
  2600. +       node->hash_max = hash_max;
  2601. +   }
  2602. +}
  2603. +
  2604. +static inline
  2605. +struct stable_node *new_stable_node(struct tree_node *tree_node,
  2606. +                   struct page *kpage, u32 hash_max)
  2607. +{
  2608. +   struct stable_node *new_stable_node;
  2609. +
  2610. +   new_stable_node = alloc_stable_node();
  2611. +   if (!new_stable_node)
  2612. +       return NULL;
  2613. +
  2614. +   new_stable_node->kpfn = page_to_pfn(kpage);
  2615. +   new_stable_node->hash_max = hash_max;
  2616. +   new_stable_node->tree_node = tree_node;
  2617. +   set_page_stable_node(kpage, new_stable_node);
  2618. +
  2619. +   return new_stable_node;
  2620. +}
  2621. +
  2622. +static inline
  2623. +struct stable_node *first_level_insert(struct tree_node *tree_node,
  2624. +                      struct rmap_item *rmap_item,
  2625. +                      struct rmap_item *tree_rmap_item,
  2626. +                      struct page **kpage, u32 hash,
  2627. +                      int *success1, int *success2)
  2628. +{
  2629. +   int cmp;
  2630. +   struct page *tree_page;
  2631. +   u32 hash_max = 0;
  2632. +   struct stable_node *stable_node, *new_snode;
  2633. +   struct rb_node *parent = NULL, **new;
  2634. +
  2635. +   /* this tree node contains no sub-tree yet */
  2636. +   stable_node = rb_entry(tree_node->sub_root.rb_node,
  2637. +                  struct stable_node, node);
  2638. +
  2639. +   tree_page = get_uksm_page(stable_node, 1, 0);
  2640. +   if (tree_page) {
  2641. +       cmp = memcmp_pages(*kpage, tree_page, 1);
  2642. +       if (!cmp) {
  2643. +           try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
  2644. +                         tree_page, success1, success2);
  2645. +           put_page(tree_page);
  2646. +           if (!*success1 && !*success2)
  2647. +               goto failed;
  2648. +
  2649. +           return stable_node;
  2650. +
  2651. +       } else {
  2652. +           /*
   2653. +            * collision in the first level; try to create a subtree.
   2654. +            * A new node needs to be created.
  2655. +            */
  2656. +           put_page(tree_page);
  2657. +
  2658. +           stable_node_hash_max(stable_node, tree_page,
  2659. +                        tree_node->hash);
  2660. +           hash_max = rmap_item_hash_max(rmap_item, hash);
  2661. +           cmp = hash_cmp(hash_max, stable_node->hash_max);
  2662. +
  2663. +           parent = &stable_node->node;
  2664. +           if (cmp < 0) {
  2665. +               new = &parent->rb_left;
  2666. +           } else if (cmp > 0) {
  2667. +               new = &parent->rb_right;
  2668. +           } else {
  2669. +               goto failed;
  2670. +           }
  2671. +       }
  2672. +
  2673. +   } else {
   2674. +       /* the only stable_node was deleted, we reuse its tree_node.
  2675. +        */
  2676. +       parent = NULL;
  2677. +       new = &tree_node->sub_root.rb_node;
  2678. +   }
  2679. +
  2680. +   new_snode = new_stable_node(tree_node, *kpage, hash_max);
  2681. +   if (!new_snode)
  2682. +       goto failed;
  2683. +
  2684. +   rb_link_node(&new_snode->node, parent, new);
  2685. +   rb_insert_color(&new_snode->node, &tree_node->sub_root);
  2686. +   tree_node->count++;
  2687. +   *success1 = *success2 = 1;
  2688. +
  2689. +   return new_snode;
  2690. +
  2691. +failed:
  2692. +   return NULL;
  2693. +}
  2694. +
  2695. +static inline
  2696. +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
  2697. +                     struct rmap_item *rmap_item,
  2698. +                     struct rmap_item *tree_rmap_item,
  2699. +                     struct page **kpage, u32 hash,
  2700. +                     int *success1, int *success2)
  2701. +{
  2702. +   struct page *tree_page;
  2703. +   u32 hash_max;
  2704. +   struct stable_node *stable_node, *new_snode;
  2705. +   struct rb_node *parent, **new;
  2706. +
  2707. +research:
  2708. +   parent = NULL;
  2709. +   new = &tree_node->sub_root.rb_node;
  2710. +   BUG_ON(!*new);
  2711. +   hash_max = rmap_item_hash_max(rmap_item, hash);
  2712. +   while (*new) {
  2713. +       int cmp;
  2714. +
  2715. +       stable_node = rb_entry(*new, struct stable_node, node);
  2716. +
  2717. +       cmp = hash_cmp(hash_max, stable_node->hash_max);
  2718. +
  2719. +       if (cmp < 0) {
  2720. +           parent = *new;
  2721. +           new = &parent->rb_left;
  2722. +       } else if (cmp > 0) {
  2723. +           parent = *new;
  2724. +           new = &parent->rb_right;
  2725. +       } else {
  2726. +           tree_page = get_uksm_page(stable_node, 1, 0);
  2727. +           if (tree_page) {
  2728. +               cmp = memcmp_pages(*kpage, tree_page, 1);
  2729. +               if (!cmp) {
  2730. +                   try_merge_with_stable(rmap_item,
  2731. +                       tree_rmap_item, kpage,
  2732. +                       tree_page, success1, success2);
  2733. +
  2734. +                   put_page(tree_page);
  2735. +                   if (!*success1 && !*success2)
  2736. +                       goto failed;
  2737. +                   /*
  2738. +                    * successfully merged with a stable
  2739. +                    * node
  2740. +                    */
  2741. +                   return stable_node;
  2742. +               } else {
  2743. +                   put_page(tree_page);
  2744. +                   goto failed;
  2745. +               }
  2746. +           } else {
  2747. +               /*
   2748. +                * the stable node may be deleted,
   2749. +                * and the subtree may be
   2750. +                * restructured; cannot
   2751. +                * continue, re-search it.
  2752. +                */
  2753. +               if (tree_node->count) {
  2754. +                   goto research;
  2755. +               } else {
   2756. +               /* reuse the tree node */
  2757. +                   parent = NULL;
  2758. +                   new = &tree_node->sub_root.rb_node;
  2759. +               }
  2760. +           }
  2761. +       }
  2762. +   }
  2763. +
  2764. +   new_snode = new_stable_node(tree_node, *kpage, hash_max);
  2765. +   if (!new_snode)
  2766. +       goto failed;
  2767. +
  2768. +   rb_link_node(&new_snode->node, parent, new);
  2769. +   rb_insert_color(&new_snode->node, &tree_node->sub_root);
  2770. +   tree_node->count++;
  2771. +   *success1 = *success2 = 1;
  2772. +
  2773. +   return new_snode;
  2774. +
  2775. +failed:
  2776. +   return NULL;
  2777. +}
  2778. +
  2779. +
  2780. +/**
  2781. + * stable_tree_insert() - try to insert a merged page in unstable tree to
  2782. + * the stable tree
  2783. + *
   2784. + * @kpage:     the page that needs to be inserted
  2785. + * @hash:      the current hash of this page
  2786. + * @rmap_item:     the rmap_item being scanned
  2787. + * @tree_rmap_item:    the rmap_item found on unstable tree
  2788. + * @success1:      return if rmap_item is merged
  2789. + * @success2:      return if tree_rmap_item is merged
  2790. + *
  2791. + * @return         the stable_node on stable tree if at least one
  2792. + *             rmap_item is inserted into stable tree, NULL
  2793. + *             otherwise.
  2794. + */
  2795. +static struct stable_node *
  2796. +stable_tree_insert(struct page **kpage, u32 hash,
  2797. +          struct rmap_item *rmap_item,
  2798. +          struct rmap_item *tree_rmap_item,
  2799. +          int *success1, int *success2)
  2800. +{
  2801. +   struct rb_node **new = &root_stable_treep->rb_node;
  2802. +   struct rb_node *parent = NULL;
  2803. +   struct stable_node *stable_node;
  2804. +   struct tree_node *tree_node;
  2805. +   u32 hash_max = 0;
  2806. +
  2807. +   *success1 = *success2 = 0;
  2808. +
  2809. +   while (*new) {
  2810. +       int cmp;
  2811. +
  2812. +       tree_node = rb_entry(*new, struct tree_node, node);
  2813. +
  2814. +       cmp = hash_cmp(hash, tree_node->hash);
  2815. +
  2816. +       if (cmp < 0) {
  2817. +           parent = *new;
  2818. +           new = &parent->rb_left;
  2819. +       } else if (cmp > 0) {
  2820. +           parent = *new;
  2821. +           new = &parent->rb_right;
  2822. +       } else
  2823. +           break;
  2824. +   }
  2825. +
  2826. +   if (*new) {
  2827. +       if (tree_node->count == 1) {
  2828. +           stable_node = first_level_insert(tree_node, rmap_item,
  2829. +                       tree_rmap_item, kpage,
  2830. +                       hash, success1, success2);
  2831. +       } else {
  2832. +           stable_node = stable_subtree_insert(tree_node,
  2833. +                   rmap_item, tree_rmap_item, kpage,
  2834. +                   hash, success1, success2);
  2835. +       }
  2836. +   } else {
  2837. +
  2838. +       /* no tree node found */
  2839. +       tree_node = alloc_tree_node(stable_tree_node_listp);
  2840. +       if (!tree_node) {
  2841. +           stable_node = NULL;
  2842. +           goto out;
  2843. +       }
  2844. +
  2845. +       stable_node = new_stable_node(tree_node, *kpage, hash_max);
  2846. +       if (!stable_node) {
  2847. +           free_tree_node(tree_node);
  2848. +           goto out;
  2849. +       }
  2850. +
  2851. +       tree_node->hash = hash;
  2852. +       rb_link_node(&tree_node->node, parent, new);
  2853. +       rb_insert_color(&tree_node->node, root_stable_treep);
  2854. +       parent = NULL;
  2855. +       new = &tree_node->sub_root.rb_node;
  2856. +
  2857. +       rb_link_node(&stable_node->node, parent, new);
  2858. +       rb_insert_color(&stable_node->node, &tree_node->sub_root);
  2859. +       tree_node->count++;
  2860. +       *success1 = *success2 = 1;
  2861. +   }
  2862. +
  2863. +out:
  2864. +   return stable_node;
  2865. +}
  2866. +
  2867. +
  2868. +/**
  2869. + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
  2870. + *
  2871. + * @return     0 on success, -EBUSY if unable to lock the mmap_sem,
  2872. + *             -EINVAL if the page mapping has been changed.
  2873. + */
  2874. +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
  2875. +{
  2876. +   int err;
  2877. +
  2878. +   err = get_mergeable_page_lock_mmap(tree_rmap_item);
  2879. +
  2880. +   if (err == -EINVAL) {
   2881. +       /* its page mapping has been changed, remove it */
  2882. +       remove_rmap_item_from_tree(tree_rmap_item);
  2883. +   }
  2884. +
   2885. +   /* On success, the page reference is taken and mmap_sem is locked. */
  2886. +   return err;
  2887. +}
  2888. +
  2889. +
  2890. +/**
   2891. + * unstable_tree_search_insert() - search the unstable tree for an rmap_item
   2892. + * with the same hash value. Get its page and trylock its mmap_sem.
  2893. + */
  2894. +static inline
  2895. +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
  2896. +                         u32 hash)
  2897. +
  2898. +{
  2899. +   struct rb_node **new = &root_unstable_tree.rb_node;
  2900. +   struct rb_node *parent = NULL;
  2901. +   struct tree_node *tree_node;
  2902. +   u32 hash_max;
  2903. +   struct rmap_item *tree_rmap_item;
  2904. +
  2905. +   while (*new) {
  2906. +       int cmp;
  2907. +
  2908. +       tree_node = rb_entry(*new, struct tree_node, node);
  2909. +
  2910. +       cmp = hash_cmp(hash, tree_node->hash);
  2911. +
  2912. +       if (cmp < 0) {
  2913. +           parent = *new;
  2914. +           new = &parent->rb_left;
  2915. +       } else if (cmp > 0) {
  2916. +           parent = *new;
  2917. +           new = &parent->rb_right;
  2918. +       } else
  2919. +           break;
  2920. +   }
  2921. +
  2922. +   if (*new) {
  2923. +       /* got the tree_node */
  2924. +       if (tree_node->count == 1) {
  2925. +           tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
  2926. +                         struct rmap_item, node);
  2927. +           BUG_ON(!tree_rmap_item);
  2928. +
  2929. +           goto get_page_out;
  2930. +       }
  2931. +
  2932. +       /* well, search the collision subtree */
  2933. +       new = &tree_node->sub_root.rb_node;
  2934. +       BUG_ON(!*new);
  2935. +       hash_max = rmap_item_hash_max(rmap_item, hash);
  2936. +
  2937. +       while (*new) {
  2938. +           int cmp;
  2939. +
  2940. +           tree_rmap_item = rb_entry(*new, struct rmap_item,
  2941. +                         node);
  2942. +
  2943. +           cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
  2944. +           parent = *new;
  2945. +           if (cmp < 0)
  2946. +               new = &parent->rb_left;
  2947. +           else if (cmp > 0)
  2948. +               new = &parent->rb_right;
  2949. +           else
  2950. +               goto get_page_out;
  2951. +       }
  2952. +   } else {
  2953. +       /* alloc a new tree_node */
  2954. +       tree_node = alloc_tree_node(&unstable_tree_node_list);
  2955. +       if (!tree_node)
  2956. +           return NULL;
  2957. +
  2958. +       tree_node->hash = hash;
  2959. +       rb_link_node(&tree_node->node, parent, new);
  2960. +       rb_insert_color(&tree_node->node, &root_unstable_tree);
  2961. +       parent = NULL;
  2962. +       new = &tree_node->sub_root.rb_node;
  2963. +   }
  2964. +
   2965. +   /* not found even in the sub-tree */
  2966. +   rmap_item->tree_node = tree_node;
  2967. +   rmap_item->address |= UNSTABLE_FLAG;
  2968. +   rmap_item->append_round = uksm_scan_round;
  2969. +   rb_link_node(&rmap_item->node, parent, new);
  2970. +   rb_insert_color(&rmap_item->node, &tree_node->sub_root);
  2971. +
  2972. +   uksm_pages_unshared++;
  2973. +   return NULL;
  2974. +
  2975. +get_page_out:
  2976. +   if (tree_rmap_item->page == rmap_item->page)
  2977. +       return NULL;
  2978. +
  2979. +   if (get_tree_rmap_item_page(tree_rmap_item))
  2980. +       return NULL;
  2981. +
  2982. +   return tree_rmap_item;
  2983. +}
  2984. +
  2985. +static void enter_vma_tree(struct vma_slot *slot)
  2986. +{
  2987. +   unsigned long i;
  2988. +   int ret;
  2989. +
  2990. +   i = uksm_vma_tree_index_end;
  2991. +
  2992. +   ret = radix_tree_insert(&uksm_vma_tree, i, slot);
  2993. +   BUG_ON(ret);
  2994. +
  2995. +   slot->uksm_index = i;
  2996. +   uksm_vma_tree_num++;
  2997. +   uksm_vma_tree_index_end++;
  2998. +}
  2999. +
  3000. +static inline void get_sub_dup_vma(struct vma_slot **slot,
  3001. +                  struct vma_slot **sub_slot)
  3002. +{
  3003. +   struct vma_slot *tmp;
  3004. +
  3005. +   if ((*slot)->uksm_index > (*sub_slot)->uksm_index) {
  3006. +       tmp = *slot;
  3007. +       *slot = *sub_slot;
  3008. +       *sub_slot = tmp;
  3009. +   }
  3010. +}
  3011. +
  3012. +/*
   3013. + * Increment or decrement the dup-page count stored in a slot; return the
   3014. + * count after the operation.
  3015. + */
  3016. +static inline unsigned long dup_pages_mod(void **slot, int inc)
  3017. +{
  3018. +   unsigned long item, ret;
  3019. +
  3020. +   item = (unsigned long)(*slot) >> INDIRECT_OFFSET;
  3021. +   if (inc) {
  3022. +       item++;
  3023. +       BUG_ON(!item);
  3024. +   } else {
  3025. +       BUG_ON(!item);
  3026. +       item--;
  3027. +   }
  3028. +   ret = item;
  3029. +   item <<= INDIRECT_OFFSET;
  3030. +   *slot = (void *)item;
  3031. +
  3032. +   return ret;
  3033. +}
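
dup_pages_mod() stores a plain counter directly in a radix-tree slot instead of a pointer; the counter is kept shifted left by INDIRECT_OFFSET so the stored value can never be mistaken for an indirect (internal-node) pointer, whose tag lives in the low bit of the slot value. Below is a user-space sketch of that encoding; the INDIRECT_OFFSET value of 1 is only an assumption for the example.

#include <assert.h>
#include <stdio.h>

#define INDIRECT_OFFSET 1   /* assumed value, only for the illustration */

/* bump the counter packed into a slot, return the new count */
static unsigned long counter_inc(void **slot)
{
	unsigned long count = (unsigned long)(*slot) >> INDIRECT_OFFSET;

	count++;
	*slot = (void *)(count << INDIRECT_OFFSET);
	return count;
}

/* decrement the packed counter, return the new count */
static unsigned long counter_dec(void **slot)
{
	unsigned long count = (unsigned long)(*slot) >> INDIRECT_OFFSET;

	assert(count > 0);
	count--;
	*slot = (void *)(count << INDIRECT_OFFSET);
	return count;
}

int main(void)
{
	void *slot = (void *)(1UL << INDIRECT_OFFSET);  /* counter starts at 1 */

	printf("%lu\n", counter_inc(&slot));  /* 2 */
	printf("%lu\n", counter_dec(&slot));  /* 1 */
	printf("%lu\n", counter_dec(&slot));  /* 0: the caller would delete the entry */
	return 0;
}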
  3034. +
  3035. +static void inc_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
  3036. +{
  3037. +   void **dup_slot;
  3038. +   unsigned long dup_pages;
  3039. +   int ret;
  3040. +
  3041. +   if (slot->uksm_index == -1)
  3042. +       enter_vma_tree(slot);
  3043. +
  3044. +   if (sub_slot->uksm_index == -1)
  3045. +       enter_vma_tree(sub_slot);
  3046. +
  3047. +   get_sub_dup_vma(&slot, &sub_slot);
  3048. +
  3049. +   dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->uksm_index);
  3050. +   if (dup_slot)
  3051. +       goto found;
  3052. +
  3053. +   /*
  3054. +    * In order to store dup_pages in radix tree, we must make
  3055. +    * radix_tree_is_indirect_ptr() happy.
  3056. +    */
  3057. +   dup_pages = 1 << INDIRECT_OFFSET;
  3058. +
  3059. +   /* no such entry yet, insert one */
  3060. +   ret = radix_tree_insert(&slot->dup_tree, sub_slot->uksm_index,
  3061. +               (void *)dup_pages);
  3062. +   BUG_ON(ret);
  3063. +
  3064. +   return;
  3065. +
  3066. +found:
  3067. +   dup_pages_mod(dup_slot, 1);
  3068. +}
  3069. +
  3070. +static void dec_dup_vma(struct vma_slot *slot, struct vma_slot *sub_slot)
  3071. +{
  3072. +   void **dup_slot;
  3073. +   unsigned long dup_pages;
  3074. +
  3075. +   BUG_ON(slot->uksm_index == -1 || sub_slot->uksm_index == -1);
  3076. +
  3077. +   get_sub_dup_vma(&slot, &sub_slot);
  3078. +
  3079. +   dup_slot = radix_tree_lookup_slot(&slot->dup_tree, sub_slot->uksm_index);
  3080. +   BUG_ON(!dup_slot);
  3081. +
  3082. +   dup_pages = dup_pages_mod(dup_slot, 0);
  3083. +
  3084. +   /* dup_pages == 0, we need to kick it out */
  3085. +   if (!dup_pages)
  3086. +       radix_tree_delete(&slot->dup_tree, sub_slot->uksm_index);
  3087. +}
  3088. +
  3089. +static void hold_anon_vma(struct rmap_item *rmap_item,
  3090. +             struct anon_vma *anon_vma)
  3091. +{
  3092. +   rmap_item->anon_vma = anon_vma;
  3093. +   get_anon_vma(anon_vma);
  3094. +}
  3095. +
  3096. +
  3097. +/**
  3098. + * stable_tree_append() - append a rmap_item to a stable node. Deduplication
   3099. + * ratio statistics are updated in this function.
  3100. + *
  3101. + */
  3102. +static void stable_tree_append(struct rmap_item *rmap_item,
  3103. +                  struct stable_node *stable_node)
  3104. +{
  3105. +   struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_iter = NULL;
  3106. +   struct hlist_node *hlist, *cont_p = NULL;
  3107. +   unsigned long key = (unsigned long)rmap_item->slot;
  3108. +
  3109. +   BUG_ON(!stable_node);
  3110. +   rmap_item->address |= STABLE_FLAG;
  3111. +   rmap_item->append_round = uksm_scan_round;
  3112. +
  3113. +   if (hlist_empty(&stable_node->hlist)) {
  3114. +       uksm_pages_shared++;
  3115. +       goto node_vma_new;
  3116. +   } else {
  3117. +       uksm_pages_sharing++;
  3118. +   }
  3119. +
  3120. +   hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  3121. +       if (node_vma->last_update == uksm_scan_round)
  3122. +           inc_dup_vma(rmap_item->slot, node_vma->slot);
  3123. +
  3124. +       if (node_vma->key >= key)
  3125. +           break;
  3126. +   }
  3127. +
  3128. +   cont_p = hlist;
  3129. +
  3130. +   if (node_vma && node_vma->key == key) {
  3131. +       if (node_vma->last_update == uksm_scan_round) {
  3132. +           /*
  3133. +            * We consider this page an inner duplicate, so cancel
  3134. +            * the other updates.
  3135. +            */
  3136. +           hlist_for_each_entry(node_vma_iter, hlist,
  3137. +                        &stable_node->hlist, hlist) {
  3138. +               if (node_vma_iter->key == key)
  3139. +                   break;
  3140. +
  3141. +               /* only need to increase the same vma */
  3142. +               if (node_vma_iter->last_update ==
  3143. +                   uksm_scan_round) {
  3144. +                   dec_dup_vma(rmap_item->slot,
  3145. +                           node_vma_iter->slot);
  3146. +               }
  3147. +           }
  3148. +       } else {
  3149. +           /*
  3150. +            * Although it's the same vma, it contains no duplicate for
  3151. +            * this round. Continue scanning the other vmas.
  3152. +            */
  3153. +           hlist_for_each_entry_continue(node_vma_iter,
  3154. +                             hlist, hlist) {
  3155. +               if (node_vma_iter->last_update ==
  3156. +                   uksm_scan_round) {
  3157. +                   inc_dup_vma(rmap_item->slot,
  3158. +                           node_vma_iter->slot);
  3159. +               }
  3160. +           }
  3161. +
  3162. +       }
  3163. +
  3164. +       goto node_vma_ok;
  3165. +   }
  3166. +
  3167. +node_vma_new:
  3168. +   /* no node_vma for the same vma yet, alloc a new node_vma */
  3169. +   new_node_vma = alloc_node_vma();
  3170. +   BUG_ON(!new_node_vma);
  3171. +   new_node_vma->head = stable_node;
  3172. +   new_node_vma->slot = rmap_item->slot;
  3173. +
  3174. +   if (!node_vma) {
  3175. +       hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
  3176. +   } else if (node_vma->key != key) {
  3177. +       if (node_vma->key < key)
  3178. +           hlist_add_after(&node_vma->hlist, &new_node_vma->hlist);
  3179. +       else {
  3180. +           hlist_for_each_entry_continue(node_vma_iter, cont_p,
  3181. +                             hlist) {
  3182. +               if (node_vma_iter->last_update ==
  3183. +                   uksm_scan_round) {
  3184. +                   inc_dup_vma(rmap_item->slot,
  3185. +                           node_vma_iter->slot);
  3186. +               }
  3187. +           }
  3188. +           hlist_add_before(&new_node_vma->hlist,
  3189. +                    &node_vma->hlist);
  3190. +       }
  3191. +
  3192. +   }
  3193. +   node_vma = new_node_vma;
  3194. +
  3195. +node_vma_ok: /* ok, ready to add to the list */
  3196. +   rmap_item->head = node_vma;
  3197. +   hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
  3198. +   node_vma->last_update = uksm_scan_round;
  3199. +   hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
  3200. +   rmap_item->slot->pages_merged++;
  3201. +}
  3202. +
  3203. +/*
  3204. + * We use break_ksm to break COW on a ksm page: it's a stripped down
  3205. + *
  3206. + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
  3207. + *     put_page(page);
  3208. + *
  3209. + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
  3210. + * in case the application has unmapped and remapped mm,addr meanwhile.
  3211. + * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
  3212. + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
  3213. + */
  3214. +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
  3215. +{
  3216. +   struct page *page;
  3217. +   int ret = 0;
  3218. +
  3219. +   do {
  3220. +       cond_resched();
  3221. +       page = follow_page(vma, addr, FOLL_GET);
  3222. +       if (IS_ERR_OR_NULL(page))
  3223. +           break;
  3224. +       if (PageKsm(page)) {
  3225. +           ret = handle_mm_fault(vma->vm_mm, vma, addr,
  3226. +                         FAULT_FLAG_WRITE);
  3227. +       } else
  3228. +           ret = VM_FAULT_WRITE;
  3229. +       put_page(page);
  3230. +   } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
  3231. +   /*
  3232. +    * We must loop because handle_mm_fault() may back out if there's
  3233. +    * any difficulty e.g. if pte accessed bit gets updated concurrently.
  3234. +    *
  3235. +    * VM_FAULT_WRITE is what we have been hoping for: it indicates that
  3236. +    * COW has been broken, even if the vma does not permit VM_WRITE;
  3237. +    * but note that a concurrent fault might break PageKsm for us.
  3238. +    *
  3239. +    * VM_FAULT_SIGBUS could occur if we race with truncation of the
  3240. +    * backing file, which also invalidates anonymous pages: that's
  3241. +    * okay, that truncation will have unmapped the PageKsm for us.
  3242. +    *
  3243. +    * VM_FAULT_OOM: at the time of writing (late July 2009), setting
  3244. +    * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
  3245. +    * current task has TIF_MEMDIE set, and will be OOM killed on return
  3246. +    * to user; and ksmd, having no mm, would never be chosen for that.
  3247. +    *
  3248. +    * But if the mm is in a limited mem_cgroup, then the fault may fail
  3249. +    * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
  3250. +    * even ksmd can fail in this way - though it's usually breaking ksm
  3251. +    * just to undo a merge it made a moment before, so unlikely to oom.
  3252. +    *
  3253. +    * That's a pity: we might therefore have more kernel pages allocated
  3254. +    * than we're counting as nodes in the stable tree; but uksm_do_scan
  3255. +    * will retry to break_cow on each pass, so should recover the page
  3256. +    * in due course.  The important thing is to not let VM_MERGEABLE
  3257. +    * be cleared while any such pages might remain in the area.
  3258. +    */
  3259. +   return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
  3260. +}
  3261. +
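+/*
+ * break_cow() - Break COW on the ksm page mapped at this rmap_item's address,
+ * unless the mm is already exiting.
+ */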
  3262. +static void break_cow(struct rmap_item *rmap_item)
  3263. +{
  3264. +   struct vm_area_struct *vma = rmap_item->slot->vma;
  3265. +   struct mm_struct *mm = vma->vm_mm;
  3266. +   unsigned long addr = get_rmap_addr(rmap_item);
  3267. +
  3268. +   if (uksm_test_exit(mm))
  3269. +       goto out;
  3270. +
  3271. +   break_ksm(vma, addr);
  3272. +out:
  3273. +   return;
  3274. +}
  3275. +
  3276. +/*
  3277. + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
  3278. + * than check every pte of a given vma, the locking doesn't quite work for
  3279. + * that - an rmap_item is assigned to the stable tree after inserting ksm
  3280. + * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
  3281. + * rmap_items from parent to child at fork time (so as not to waste time
  3282. + * if exit comes before the next scan reaches it).
  3283. + *
  3284. + * Similarly, although we'd like to remove rmap_items (so updating counts
  3285. + * and freeing memory) when unmerging an area, it's easier to leave that
  3286. + * to the next pass of ksmd - consider, for example, how ksmd might be
  3287. + * in cmp_and_merge_page on one of the rmap_items we would be removing.
  3288. + */
  3289. +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
  3290. +             unsigned long start, unsigned long end)
  3291. +{
  3292. +   unsigned long addr;
  3293. +   int err = 0;
  3294. +
  3295. +   for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
  3296. +       if (uksm_test_exit(vma->vm_mm))
  3297. +           break;
  3298. +       if (signal_pending(current))
  3299. +           err = -ERESTARTSYS;
  3300. +       else
  3301. +           err = break_ksm(vma, addr);
  3302. +   }
  3303. +   return err;
  3304. +}
  3305. +
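+/*
+ * inc_uksm_pages_scanned() - Bump uksm_pages_scanned. When the counter
+ * reaches U64_MAX, the accumulated value is folded into pages_scanned_stored
+ * (scaled by pages_scanned_base) and counting restarts from zero.
+ */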
  3306. +static inline void inc_uksm_pages_scanned(void)
  3307. +{
  3308. +   u64 delta;
  3309. +
  3310. +
  3311. +   if (uksm_pages_scanned == U64_MAX) {
  3312. +       encode_benefit();
  3313. +
  3314. +       delta = uksm_pages_scanned >> pages_scanned_base;
  3315. +
  3316. +       if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
  3317. +           pages_scanned_stored >>= 1;
  3318. +           delta >>= 1;
  3319. +           pages_scanned_base++;
  3320. +       }
  3321. +
  3322. +       pages_scanned_stored += delta;
  3323. +
  3324. +       uksm_pages_scanned = uksm_pages_scanned_last = 0;
  3325. +   }
  3326. +
  3327. +   uksm_pages_scanned++;
  3328. +}
  3329. +
  3330. +static inline int find_zero_page_hash(int strength, u32 hash)
  3331. +{
  3332. +   return (zero_hash_table[strength] == hash);
  3333. +}
  3334. +
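+/*
+ * cmp_and_merge_zero_page() - Try to merge an anonymous page whose content is
+ * all zero with the unswappable uksm zero page: write-protect it and, if it
+ * is still full of zeros, replace its pte with a mapping to the zero page.
+ */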
  3335. +static
  3336. +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
  3337. +{
  3338. +   struct page *zero_page = empty_uksm_zero_page;
  3339. +   struct mm_struct *mm = vma->vm_mm;
  3340. +   pte_t orig_pte = __pte(0);
  3341. +   int err = -EFAULT;
  3342. +
  3343. +   if (uksm_test_exit(mm))
  3344. +       goto out;
  3345. +
  3346. +   if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  3347. +       goto out;
  3348. +   BUG_ON(PageTransCompound(page));
  3349. +
  3350. +   if (!PageAnon(page))
  3351. +       goto out;
  3352. +
  3353. +   if (!trylock_page(page))
  3354. +       goto out;
  3355. +
  3356. +   if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
  3357. +       if (is_page_full_zero(page))
  3358. +           err = replace_page(vma, page, zero_page, orig_pte);
  3359. +   }
  3360. +
  3361. +   unlock_page(page);
  3362. +out:
  3363. +   return err;
  3364. +}
  3365. +
  3366. +/*
  3367. + * cmp_and_merge_page() - first see if page can be merged into the stable
  3368. + * tree; if not, compare hash to previous and if it's the same, see if page
  3369. + * can be inserted into the unstable tree, or merged with a page already there
  3370. + * and both transferred to the stable tree.
  3371. + *
  3372. + * @rmap_item: the reverse mapping item of the page (rmap_item->page) for
  3373. + * which we are searching an identical page.
  3374. + */
  3375. +static void cmp_and_merge_page(struct rmap_item *rmap_item)
  3376. +{
  3377. +   struct rmap_item *tree_rmap_item;
  3378. +   struct page *page;
  3379. +   struct page *kpage = NULL;
  3380. +   u32 hash, hash_max;
  3381. +   int err;
  3382. +   unsigned int success1, success2;
  3383. +   struct stable_node *snode;
  3384. +   int cmp;
  3385. +   struct rb_node *parent = NULL, **new;
  3386. +
  3387. +   remove_rmap_item_from_tree(rmap_item);
  3388. +   inc_uksm_pages_scanned();
  3389. +
  3390. +   page = rmap_item->page;
  3391. +
  3392. +   hash = page_hash(page, hash_strength, 1);
  3393. +
  3394. +   /* if the page content is all zero, re-map it to the zero page */
  3395. +   if (find_zero_page_hash(hash_strength, hash)) {
  3396. +       if (!cmp_and_merge_zero_page(rmap_item->slot->vma, page)) {
  3397. +           inc_dup_vma(rmap_item->slot, rmap_item->slot);
  3398. +           rmap_item->slot->pages_merged++;
  3399. +           __inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
  3400. +           return;
  3401. +       } else {
  3402. +           inc_rshash_neg(memcmp_cost / 2);
  3403. +       }
  3404. +   }
  3405. +   //uksm_pages_scanned++;
  3406. +
  3407. +   /* We first start with searching the page inside the stable tree */
  3408. +   kpage = stable_tree_search(rmap_item, hash);
  3409. +   if (kpage) {
  3410. +       err = try_to_merge_with_uksm_page(rmap_item, kpage,
  3411. +                        hash);
  3412. +       if (!err) {
  3413. +           /*
  3414. +            * The page was successfully merged, add
  3415. +            * its rmap_item to the stable tree.
  3416. +            * page lock is needed because it's
  3417. +            * racing with try_to_unmap_ksm(), etc.
  3418. +            */
  3419. +           lock_page(kpage);
  3420. +           stable_tree_append(rmap_item, page_stable_node(kpage));
  3421. +           unlock_page(kpage);
  3422. +           put_page(kpage);
  3423. +           return; /* success */
  3424. +       }
  3425. +       put_page(kpage);
  3426. +
  3427. +       /*
  3428. +        * If it's a collision and it has already been searched in the
  3429. +        * sub-rbtree (hash_max != 0), we want to abort, because if it is
  3430. +        * successfully merged in the unstable tree, the collision tends
  3431. +        * to happen again.
  3432. +        */
  3433. +       if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
  3434. +           return;
  3435. +   }
  3436. +
  3437. +   tree_rmap_item =
  3438. +       unstable_tree_search_insert(rmap_item, hash);
  3439. +   if (tree_rmap_item) {
  3440. +       err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
  3441. +       /*
  3442. +        * As soon as we merge this page, we want to remove the
  3443. +        * rmap_item of the page we have merged with from the unstable
  3444. +        * tree, and insert it instead as new node in the stable tree.
  3445. +        */
  3446. +       if (!err) {
  3447. +           kpage = page;
  3448. +           remove_rmap_item_from_tree(tree_rmap_item);
  3449. +           lock_page(kpage);
  3450. +           snode = stable_tree_insert(&kpage, hash,
  3451. +                          rmap_item, tree_rmap_item,
  3452. +                          &success1, &success2);
  3453. +
  3454. +           if (success1)
  3455. +               stable_tree_append(rmap_item, snode);
  3456. +           if (success2)
  3457. +               stable_tree_append(tree_rmap_item, snode);
  3458. +
  3459. +           /*
  3460. +            * The original kpage may be unlocked inside
  3461. +            * stable_tree_insert() already. This page
  3462. +            * should be unlocked before doing
  3463. +            * break_cow().
  3464. +            */
  3465. +           unlock_page(kpage);
  3466. +
  3467. +           if (!success1)
  3468. +               break_cow(rmap_item);
  3469. +
  3470. +           if (!success2)
  3471. +               break_cow(tree_rmap_item);
  3472. +
  3473. +       } else if (err == MERGE_ERR_COLLI) {
  3474. +           if (tree_rmap_item->tree_node->count == 1) {
  3475. +               rmap_item_hash_max(tree_rmap_item,
  3476. +               tree_rmap_item->tree_node->hash);
  3477. +           } else
  3478. +               BUG_ON(!(tree_rmap_item->hash_max));
  3479. +
  3480. +           hash_max = rmap_item_hash_max(rmap_item, hash);
  3481. +           cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
  3482. +           parent = &tree_rmap_item->node;
  3483. +           if (cmp < 0)
  3484. +               new = &parent->rb_left;
  3485. +           else if (cmp > 0)
  3486. +               new = &parent->rb_right;
  3487. +           else
  3488. +               goto put_up_out;
  3489. +
  3490. +           rmap_item->tree_node = tree_rmap_item->tree_node;
  3491. +           rmap_item->address |= UNSTABLE_FLAG;
  3492. +           rmap_item->append_round = uksm_scan_round;
  3493. +           rb_link_node(&rmap_item->node, parent, new);
  3494. +           rb_insert_color(&rmap_item->node,
  3495. +                   &tree_rmap_item->tree_node->sub_root);
  3496. +           rmap_item->tree_node->count++;
  3497. +       }
  3498. +put_up_out:
  3499. +       put_page(tree_rmap_item->page);
  3500. +       up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
  3501. +   }
  3502. +}
  3503. +
  3504. +
  3505. +
  3506. +
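+/*
+ * The rmap_list of a vma_slot is kept in a pool of pages (rmap_list_pool).
+ * get_pool_index() returns which pool page holds the rmap_list_entry for page
+ * index @index within the vma, and index_page_offset() returns the entry's
+ * offset inside that pool page.
+ */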
  3507. +static inline unsigned long get_pool_index(struct vma_slot *slot,
  3508. +                      unsigned long index)
  3509. +{
  3510. +   unsigned long pool_index;
  3511. +
  3512. +   pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
  3513. +   if (pool_index >= slot->pool_size)
  3514. +       BUG();
  3515. +   return pool_index;
  3516. +}
  3517. +
  3518. +static inline unsigned long index_page_offset(unsigned long index)
  3519. +{
  3520. +   return offset_in_page(sizeof(struct rmap_list_entry *) * index);
  3521. +}
  3522. +
  3523. +static inline
  3524. +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
  3525. +                       unsigned long index, int need_alloc)
  3526. +{
  3527. +   unsigned long pool_index;
  3528. +   void *addr;
  3529. +
  3530. +
  3531. +   pool_index = get_pool_index(slot, index);
  3532. +   if (!slot->rmap_list_pool[pool_index]) {
  3533. +       if (!need_alloc)
  3534. +           return NULL;
  3535. +
  3536. +       slot->rmap_list_pool[pool_index] =
  3537. +           alloc_page(GFP_KERNEL | __GFP_ZERO);
  3538. +       BUG_ON(!slot->rmap_list_pool[pool_index]);
  3539. +   }
  3540. +
  3541. +   addr = kmap(slot->rmap_list_pool[pool_index]);
  3542. +   addr += index_page_offset(index);
  3543. +
  3544. +   return addr;
  3545. +}
  3546. +
  3547. +static inline void put_rmap_list_entry(struct vma_slot *slot,
  3548. +                      unsigned long index)
  3549. +{
  3550. +   unsigned long pool_index;
  3551. +
  3552. +   pool_index = get_pool_index(slot, index);
  3553. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  3554. +   kunmap(slot->rmap_list_pool[pool_index]);
  3555. +}
  3556. +
  3557. +static inline int entry_is_new(struct rmap_list_entry *entry)
  3558. +{
  3559. +   return !entry->item;
  3560. +}
  3561. +
  3562. +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
  3563. +                       unsigned long index)
  3564. +{
  3565. +   return slot->vma->vm_start + (index << PAGE_SHIFT);
  3566. +}
  3567. +
  3568. +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
  3569. +{
  3570. +   unsigned long addr;
  3571. +
  3572. +   if (is_addr(entry->addr))
  3573. +       addr = get_clean_addr(entry->addr);
  3574. +   else if (entry->item)
  3575. +       addr = get_rmap_addr(entry->item);
  3576. +   else
  3577. +       BUG();
  3578. +
  3579. +   return addr;
  3580. +}
  3581. +
  3582. +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
  3583. +{
  3584. +   if (is_addr(entry->addr))
  3585. +       return NULL;
  3586. +
  3587. +   return entry->item;
  3588. +}
  3589. +
  3590. +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
  3591. +                       unsigned long index)
  3592. +{
  3593. +   unsigned long pool_index;
  3594. +
  3595. +   pool_index = get_pool_index(slot, index);
  3596. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  3597. +   slot->pool_counts[pool_index]++;
  3598. +}
  3599. +
  3600. +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
  3601. +                       unsigned long index)
  3602. +{
  3603. +   unsigned long pool_index;
  3604. +
  3605. +   pool_index = get_pool_index(slot, index);
  3606. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  3607. +   BUG_ON(!slot->pool_counts[pool_index]);
  3608. +   slot->pool_counts[pool_index]--;
  3609. +}
  3610. +
  3611. +static inline int entry_has_rmap(struct rmap_list_entry *entry)
  3612. +{
  3613. +   return !is_addr(entry->addr) && entry->item;
  3614. +}
  3615. +
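+/*
+ * swap_entries() - Swap two rmap_list entries, keeping the entry_index
+ * back-pointers of their rmap_items and the per-pool-page counters consistent.
+ */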
  3616. +static inline void swap_entries(struct rmap_list_entry *entry1,
  3617. +               unsigned long index1,
  3618. +               struct rmap_list_entry *entry2,
  3619. +               unsigned long index2)
  3620. +{
  3621. +   struct rmap_list_entry tmp;
  3622. +
  3623. +   /* swapping two new entries is meaningless */
  3624. +   BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
  3625. +
  3626. +   tmp = *entry1;
  3627. +   *entry1 = *entry2;
  3628. +   *entry2 = tmp;
  3629. +
  3630. +   if (entry_has_rmap(entry1))
  3631. +       entry1->item->entry_index = index1;
  3632. +
  3633. +   if (entry_has_rmap(entry2))
  3634. +       entry2->item->entry_index = index2;
  3635. +
  3636. +   if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
  3637. +       inc_rmap_list_pool_count(entry1->item->slot, index1);
  3638. +       dec_rmap_list_pool_count(entry1->item->slot, index2);
  3639. +   } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
  3640. +       inc_rmap_list_pool_count(entry2->item->slot, index2);
  3641. +       dec_rmap_list_pool_count(entry2->item->slot, index1);
  3642. +   }
  3643. +}
  3644. +
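+/*
+ * free_entry_item() - If the entry still carries an rmap_item, turn it back
+ * into a plain address entry and free the rmap_item.
+ */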
  3645. +static inline void free_entry_item(struct rmap_list_entry *entry)
  3646. +{
  3647. +   unsigned long index;
  3648. +   struct rmap_item *item;
  3649. +
  3650. +   if (!is_addr(entry->addr)) {
  3651. +       BUG_ON(!entry->item);
  3652. +       item = entry->item;
  3653. +       entry->addr = get_rmap_addr(item);
  3654. +       set_is_addr(entry->addr);
  3655. +       index = item->entry_index;
  3656. +       remove_rmap_item_from_tree(item);
  3657. +       dec_rmap_list_pool_count(item->slot, index);
  3658. +       free_rmap_item(item);
  3659. +   }
  3660. +}
  3661. +
  3662. +static inline int pool_entry_boundary(unsigned long index)
  3663. +{
  3664. +   unsigned long linear_addr;
  3665. +
  3666. +   linear_addr = sizeof(struct rmap_list_entry *) * index;
  3667. +   return index && !offset_in_page(linear_addr);
  3668. +}
  3669. +
  3670. +static inline void try_free_last_pool(struct vma_slot *slot,
  3671. +                     unsigned long index)
  3672. +{
  3673. +   unsigned long pool_index;
  3674. +
  3675. +   pool_index = get_pool_index(slot, index);
  3676. +   if (slot->rmap_list_pool[pool_index] &&
  3677. +       !slot->pool_counts[pool_index]) {
  3678. +       __free_page(slot->rmap_list_pool[pool_index]);
  3679. +       slot->rmap_list_pool[pool_index] = NULL;
  3680. +       slot->flags |= UKSM_SLOT_NEED_SORT;
  3681. +   }
  3682. +
  3683. +}
  3684. +
  3685. +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
  3686. +                      struct rmap_item *item)
  3687. +{
  3688. +   return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
  3689. +}
  3690. +
  3691. +static int within_same_pool(struct vma_slot *slot,
  3692. +               unsigned long i, unsigned long j)
  3693. +{
  3694. +   unsigned long pool_i, pool_j;
  3695. +
  3696. +   pool_i = get_pool_index(slot, i);
  3697. +   pool_j = get_pool_index(slot, j);
  3698. +
  3699. +   return (pool_i == pool_j);
  3700. +}
  3701. +
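+/*
+ * sort_rmap_entry_list() - Move every rmap_list entry back to the position
+ * given by its virtual address (undoing the random permutation), then free
+ * pool pages that no longer contain any rmap_item.
+ */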
  3702. +static void sort_rmap_entry_list(struct vma_slot *slot)
  3703. +{
  3704. +   unsigned long i, j;
  3705. +   struct rmap_list_entry *entry, *swap_entry;
  3706. +
  3707. +   entry = get_rmap_list_entry(slot, 0, 0);
  3708. +   for (i = 0; i < slot->pages; ) {
  3709. +
  3710. +       if (!entry)
  3711. +           goto skip_whole_pool;
  3712. +
  3713. +       if (entry_is_new(entry))
  3714. +           goto next_entry;
  3715. +
  3716. +       if (is_addr(entry->addr)) {
  3717. +           entry->addr = 0;
  3718. +           goto next_entry;
  3719. +       }
  3720. +
  3721. +       j = vma_item_index(slot->vma, entry->item);
  3722. +       if (j == i)
  3723. +           goto next_entry;
  3724. +
  3725. +       if (within_same_pool(slot, i, j))
  3726. +           swap_entry = entry + j - i;
  3727. +       else
  3728. +           swap_entry = get_rmap_list_entry(slot, j, 1);
  3729. +
  3730. +       swap_entries(entry, i, swap_entry, j);
  3731. +       if (!within_same_pool(slot, i, j))
  3732. +           put_rmap_list_entry(slot, j);
  3733. +       continue;
  3734. +
  3735. +skip_whole_pool:
  3736. +       i += PAGE_SIZE / sizeof(*entry);
  3737. +       if (i < slot->pages)
  3738. +           entry = get_rmap_list_entry(slot, i, 0);
  3739. +       continue;
  3740. +
  3741. +next_entry:
  3742. +       if (i >= slot->pages - 1 ||
  3743. +           !within_same_pool(slot, i, i + 1)) {
  3744. +           put_rmap_list_entry(slot, i);
  3745. +           if (i + 1 < slot->pages)
  3746. +               entry = get_rmap_list_entry(slot, i + 1, 0);
  3747. +       } else
  3748. +           entry++;
  3749. +       i++;
  3750. +       continue;
  3751. +   }
  3752. +
  3753. +   /* Free empty pool pages which contain no rmap_item. */
  3754. +   /* This can be simplified to rely only on pool_counts once verified bug-free. */
  3755. +   for (i = 0; i < slot->pool_size; i++) {
  3756. +       unsigned char has_rmap;
  3757. +       void *addr;
  3758. +
  3759. +       if (!slot->rmap_list_pool[i])
  3760. +           continue;
  3761. +
  3762. +       has_rmap = 0;
  3763. +       addr = kmap(slot->rmap_list_pool[i]);
  3764. +       BUG_ON(!addr);
  3765. +       for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
  3766. +           entry = (struct rmap_list_entry *)addr + j;
  3767. +           if (is_addr(entry->addr))
  3768. +               continue;
  3769. +           if (!entry->item)
  3770. +               continue;
  3771. +           has_rmap = 1;
  3772. +       }
  3773. +       kunmap(slot->rmap_list_pool[i]);
  3774. +       if (!has_rmap) {
  3775. +           BUG_ON(slot->pool_counts[i]);
  3776. +           __free_page(slot->rmap_list_pool[i]);
  3777. +           slot->rmap_list_pool[i] = NULL;
  3778. +       }
  3779. +   }
  3780. +
  3781. +   slot->flags &= ~UKSM_SLOT_NEED_SORT;
  3782. +}
  3783. +
  3784. +/*
  3785. + * vma_fully_scanned() - return true if all pages in this slot have been scanned.
  3786. + */
  3787. +static inline int vma_fully_scanned(struct vma_slot *slot)
  3788. +{
  3789. +   return slot->pages_scanned && !(slot->pages_scanned % slot->pages);
  3790. +}
  3791. +
  3792. +/**
  3793. + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
  3794. + * its random permutation. The code that manages the random permutation
  3795. + * indexes is embedded in this function.
  3796. + */
  3797. +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot)
  3798. +{
  3799. +   unsigned long rand_range, addr, swap_index, scan_index;
  3800. +   struct rmap_item *item = NULL;
  3801. +   struct rmap_list_entry *scan_entry, *swap_entry = NULL;
  3802. +   struct page *page;
  3803. +
  3804. +   scan_index = swap_index = slot->pages_scanned % slot->pages;
  3805. +
  3806. +   if (pool_entry_boundary(scan_index))
  3807. +       try_free_last_pool(slot, scan_index - 1);
  3808. +
  3809. +   if (vma_fully_scanned(slot)) {
  3810. +       if (slot->flags & UKSM_SLOT_NEED_SORT)
  3811. +           slot->flags |= UKSM_SLOT_NEED_RERAND;
  3812. +       else
  3813. +           slot->flags &= ~UKSM_SLOT_NEED_RERAND;
  3814. +       if (slot->flags & UKSM_SLOT_NEED_SORT)
  3815. +           sort_rmap_entry_list(slot);
  3816. +   }
  3817. +
  3818. +   scan_entry = get_rmap_list_entry(slot, scan_index, 1);
  3819. +   if (entry_is_new(scan_entry)) {
  3820. +       scan_entry->addr = get_index_orig_addr(slot, scan_index);
  3821. +       set_is_addr(scan_entry->addr);
  3822. +   }
  3823. +
  3824. +   if (slot->flags & UKSM_SLOT_NEED_RERAND) {
  3825. +       rand_range = slot->pages - scan_index;
  3826. +       BUG_ON(!rand_range);
  3827. +       swap_index = scan_index + (random32() % rand_range);
  3828. +   }
  3829. +
  3830. +   if (swap_index != scan_index) {
  3831. +       swap_entry = get_rmap_list_entry(slot, swap_index, 1);
  3832. +       if (entry_is_new(swap_entry)) {
  3833. +           swap_entry->addr = get_index_orig_addr(slot,
  3834. +                                  swap_index);
  3835. +           set_is_addr(swap_entry->addr);
  3836. +       }
  3837. +       swap_entries(scan_entry, scan_index, swap_entry, swap_index);
  3838. +   }
  3839. +
  3840. +   addr = get_entry_address(scan_entry);
  3841. +   item = get_entry_item(scan_entry);
  3842. +   BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
  3843. +
  3844. +   page = follow_page(slot->vma, addr, FOLL_GET);
  3845. +   if (IS_ERR_OR_NULL(page))
  3846. +       goto nopage;
  3847. +
  3848. +   if (!PageAnon(page) && !page_trans_compound_anon(page))
  3849. +       goto putpage;
  3850. +
  3851. +   /* check whether it is the zero_page pfn or the uksm zero page */
  3852. +   if ((page_to_pfn(page) == zero_pfn)
  3853. +           || (page_to_pfn(page) == uksm_zero_pfn))
  3854. +       goto putpage;
  3855. +
  3856. +   flush_anon_page(slot->vma, page, addr);
  3857. +   flush_dcache_page(page);
  3858. +
  3859. +   if (!item) {
  3860. +       item = alloc_rmap_item();
  3861. +       if (item) {
  3862. +           /* It has already been zeroed */
  3863. +           item->slot = slot;
  3864. +           item->address = addr;
  3865. +           item->entry_index = scan_index;
  3866. +           scan_entry->item = item;
  3867. +           inc_rmap_list_pool_count(slot, scan_index);
  3868. +       } else
  3869. +           goto putpage;
  3870. +   }
  3871. +
  3872. +   BUG_ON(item->slot != slot);
  3873. +   /* the page may have changed */
  3874. +   item->page = page;
  3875. +   put_rmap_list_entry(slot, scan_index);
  3876. +   if (swap_entry)
  3877. +       put_rmap_list_entry(slot, swap_index);
  3878. +   return item;
  3879. +
  3880. +putpage:
  3881. +   put_page(page);
  3882. +   page = NULL;
  3883. +nopage:
  3884. +   /* no page, store addr back and free rmap_item if possible */
  3885. +   free_entry_item(scan_entry);
  3886. +   put_rmap_list_entry(slot, scan_index);
  3887. +   if (swap_entry)
  3888. +       put_rmap_list_entry(slot, swap_index);
  3889. +   return NULL;
  3890. +}
  3891. +
  3892. +static inline int in_stable_tree(struct rmap_item *rmap_item)
  3893. +{
  3894. +   return rmap_item->address & STABLE_FLAG;
  3895. +}
  3896. +
  3897. +/**
  3898. + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
  3899. + * mmap_sem locked.
  3900. + */
  3901. +static void scan_vma_one_page(struct vma_slot *slot)
  3902. +{
  3903. +   struct mm_struct *mm;
  3904. +   struct rmap_item *rmap_item = NULL;
  3905. +   struct vm_area_struct *vma = slot->vma;
  3906. +
  3907. +   mm = vma->vm_mm;
  3908. +   BUG_ON(!mm);
  3909. +   BUG_ON(!slot);
  3910. +
  3911. +   rmap_item = get_next_rmap_item(slot);
  3912. +   if (!rmap_item)
  3913. +       goto out1;
  3914. +
  3915. +   if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
  3916. +       goto out2;
  3917. +
  3918. +   cmp_and_merge_page(rmap_item);
  3919. +out2:
  3920. +   put_page(rmap_item->page);
  3921. +out1:
  3922. +   slot->pages_scanned++;
  3923. +   slot->flags |= UKSM_SLOT_SCANNED;
  3924. +   if (vma_fully_scanned(slot)) {
  3925. +       slot->flags |= UKSM_SLOT_FUL_SCANNED;
  3926. +       slot->rung->fully_scanned_slots++;
  3927. +       BUG_ON(!slot->rung->fully_scanned_slots);
  3928. +   }
  3929. +}
  3930. +
  3931. +static unsigned long get_vma_random_scan_num(struct vma_slot *slot,
  3932. +                        unsigned long scan_ratio)
  3933. +{
  3934. +   return slot->pages * scan_ratio / UKSM_SCAN_RATIO_MAX;
  3935. +}
  3936. +
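+/*
+ * vma_rung_enter() - Move @slot from its current scan rung to @rung (or a
+ * higher rung if the computed pages_to_scan would be zero), fixing up the
+ * scan cursors and slot counters of both rungs.
+ */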
  3937. +static inline void vma_rung_enter(struct vma_slot *slot,
  3938. +                 struct scan_rung *rung)
  3939. +{
  3940. +   unsigned long pages_to_scan;
  3941. +   struct scan_rung *old_rung = slot->rung;
  3942. +
  3943. +   /* leave the old rung it was in */
  3944. +   BUG_ON(list_empty(&slot->uksm_list));
  3945. +
  3946. +   if (old_rung->current_scan == &slot->uksm_list)
  3947. +       old_rung->current_scan = slot->uksm_list.next;
  3948. +   list_del_init(&slot->uksm_list);
  3949. +   old_rung->vma_num--;
  3950. +   if (slot->flags & UKSM_SLOT_FUL_SCANNED)
  3951. +       old_rung->fully_scanned_slots--;
  3952. +
  3953. +   if (old_rung->current_scan == &old_rung->vma_list) {
  3954. +       /* This rung finishes a round */
  3955. +       old_rung->flags |= UKSM_RUNG_ROUND_FINISHED;
  3956. +       old_rung->current_scan = old_rung->vma_list.next;
  3957. +       BUG_ON(old_rung->current_scan == &old_rung->vma_list &&
  3958. +              !list_empty(&old_rung->vma_list));
  3959. +   }
  3960. +
  3961. +   /* enter the new rung */
  3962. +   while (!(pages_to_scan =
  3963. +       get_vma_random_scan_num(slot, rung->scan_ratio))) {
  3964. +       rung++;
  3965. +       BUG_ON(rung > &uksm_scan_ladder[uksm_scan_ladder_size - 1]);
  3966. +   }
  3967. +   if (list_empty(&rung->vma_list))
  3968. +       rung->current_scan = &slot->uksm_list;
  3969. +   list_add(&slot->uksm_list, &rung->vma_list);
  3970. +   slot->rung = rung;
  3971. +   slot->pages_to_scan = pages_to_scan;
  3972. +   slot->rung->vma_num++;
  3973. +   if (slot->flags & UKSM_SLOT_FUL_SCANNED)
  3974. +       rung->fully_scanned_slots++;
  3975. +
  3976. +   BUG_ON(rung->current_scan == &rung->vma_list &&
  3977. +          !list_empty(&rung->vma_list));
  3978. +}
  3979. +
  3980. +static inline void vma_rung_up(struct vma_slot *slot)
  3981. +{
  3982. +   if (slot->rung == &uksm_scan_ladder[uksm_scan_ladder_size-1])
  3983. +       return;
  3984. +
  3985. +   vma_rung_enter(slot, slot->rung + 1);
  3986. +}
  3987. +
  3988. +static inline void vma_rung_down(struct vma_slot *slot)
  3989. +{
  3990. +   if (slot->rung == &uksm_scan_ladder[0])
  3991. +       return;
  3992. +
  3993. +   vma_rung_enter(slot, slot->rung - 1);
  3994. +}
  3995. +
  3996. +/**
  3997. + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
  3998. + */
  3999. +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
  4000. +{
  4001. +   struct vma_slot *slot2;
  4002. +   void **dup_slot;
  4003. +   unsigned long dup_pages;
  4004. +   unsigned long dedup_num, pages1, scanned1;
  4005. +   unsigned long ret;
  4006. +   int i;
  4007. +
  4008. +   if (!slot->pages_scanned)
  4009. +       return 0;
  4010. +
  4011. +   pages1 = slot->pages;
  4012. +   scanned1 = slot->pages_scanned - slot->last_scanned;
  4013. +   BUG_ON(scanned1 > slot->pages_scanned);
  4014. +
  4015. +   for (i = slot->uksm_index; i < uksm_vma_tree_index_end; i++) {
  4016. +       unsigned long pages2, scanned2;
  4017. +
  4018. +       dup_slot = radix_tree_lookup_slot(&slot->dup_tree, i);
  4019. +       if (!dup_slot)
  4020. +           continue;
  4021. +
  4022. +       dup_pages = (unsigned long)(*dup_slot) >> INDIRECT_OFFSET;
  4023. +
  4024. +       slot2 = radix_tree_lookup(&uksm_vma_tree, i);
  4025. +       BUG_ON(!slot2 || !slot2->pages_scanned);
  4026. +
  4027. +       pages2 = slot2->pages;
  4028. +       scanned2 = slot2->pages_scanned - slot2->last_scanned;
  4029. +       BUG_ON(scanned2 > slot2->pages_scanned);
  4030. +
  4031. +       BUG_ON(!scanned1 || !scanned2);
  4032. +
  4033. +       dedup_num = dup_pages * pages1 / scanned1 * pages2 / scanned2;
  4034. +       slot->dedup_num += dedup_num;
  4035. +       slot2->dedup_num += dedup_num;
  4036. +   }
  4037. +
  4038. +   ret = (slot->dedup_num * UKSM_DEDUP_RATIO_SCALE / pages1);
  4039. +
  4040. +   /* Thrashing area filtering */
  4041. +   if (uksm_thrash_threshold) {
  4042. +       if (slot->pages_cowed * 100 / slot->pages_merged
  4043. +           > uksm_thrash_threshold) {
  4044. +           ret = 0;
  4045. +       } else {
  4046. +           ret = ret * (slot->pages_merged - slot->pages_cowed)
  4047. +                 / slot->pages_merged;
  4048. +       }
  4049. +   }
  4050. +
  4051. +   return ret;
  4052. +}
  4053. +
  4054. +
  4055. +/**
  4056. + * stable_node_reinsert() - When the hash_strength has been adjusted, the
  4057. + * stable tree needs to be restructured; this is the function that re-inserts
  4058. + * a stable node.
  4059. + */
  4060. +static inline void stable_node_reinsert(struct stable_node *new_node,
  4061. +                   struct page *page,
  4062. +                   struct rb_root *root_treep,
  4063. +                   struct list_head *tree_node_listp,
  4064. +                   u32 hash)
  4065. +{
  4066. +   struct rb_node **new = &root_treep->rb_node;
  4067. +   struct rb_node *parent = NULL;
  4068. +   struct stable_node *stable_node;
  4069. +   struct tree_node *tree_node;
  4070. +   struct page *tree_page;
  4071. +   int cmp;
  4072. +
  4073. +   while (*new) {
  4074. +       int cmp;
  4075. +
  4076. +       tree_node = rb_entry(*new, struct tree_node, node);
  4077. +
  4078. +       cmp = hash_cmp(hash, tree_node->hash);
  4079. +
  4080. +       if (cmp < 0) {
  4081. +           parent = *new;
  4082. +           new = &parent->rb_left;
  4083. +       } else if (cmp > 0) {
  4084. +           parent = *new;
  4085. +           new = &parent->rb_right;
  4086. +       } else
  4087. +           break;
  4088. +   }
  4089. +
  4090. +   if (*new) {
  4091. +       /* find a stable tree node with same first level hash value */
  4092. +       stable_node_hash_max(new_node, page, hash);
  4093. +       if (tree_node->count == 1) {
  4094. +           stable_node = rb_entry(tree_node->sub_root.rb_node,
  4095. +                          struct stable_node, node);
  4096. +           tree_page = get_uksm_page(stable_node, 1, 0);
  4097. +           if (tree_page) {
  4098. +               stable_node_hash_max(stable_node,
  4099. +                             tree_page, hash);
  4100. +               put_page(tree_page);
  4101. +
  4102. +               /* prepare for stable node insertion */
  4103. +
  4104. +               cmp = hash_cmp(new_node->hash_max,
  4105. +                          stable_node->hash_max);
  4106. +               parent = &stable_node->node;
  4107. +               if (cmp < 0)
  4108. +                   new = &parent->rb_left;
  4109. +               else if (cmp > 0)
  4110. +                   new = &parent->rb_right;
  4111. +               else
  4112. +                   goto failed;
  4113. +
  4114. +               goto add_node;
  4115. +           } else {
  4116. +               /* the only stable_node was deleted, but the
  4117. +                * tree node was not.
  4118. +                */
  4119. +               goto tree_node_reuse;
  4120. +           }
  4121. +       }
  4122. +
  4123. +       /* well, search the collision subtree */
  4124. +       new = &tree_node->sub_root.rb_node;
  4125. +       parent = NULL;
  4126. +       BUG_ON(!*new);
  4127. +       while (*new) {
  4128. +           int cmp;
  4129. +
  4130. +           stable_node = rb_entry(*new, struct stable_node, node);
  4131. +
  4132. +           cmp = hash_cmp(new_node->hash_max,
  4133. +                      stable_node->hash_max);
  4134. +
  4135. +           if (cmp < 0) {
  4136. +               parent = *new;
  4137. +               new = &parent->rb_left;
  4138. +           } else if (cmp > 0) {
  4139. +               parent = *new;
  4140. +               new = &parent->rb_right;
  4141. +           } else {
  4142. +               /* oh, no, still a collision */
  4143. +               goto failed;
  4144. +           }
  4145. +       }
  4146. +
  4147. +       goto add_node;
  4148. +   }
  4149. +
  4150. +   /* no tree node found */
  4151. +   tree_node = alloc_tree_node(tree_node_listp);
  4152. +   if (!tree_node) {
  4153. +       printk(KERN_ERR "UKSM: memory allocation error!\n");
  4154. +       goto failed;
  4155. +   } else {
  4156. +       tree_node->hash = hash;
  4157. +       rb_link_node(&tree_node->node, parent, new);
  4158. +       rb_insert_color(&tree_node->node, root_treep);
  4159. +
  4160. +tree_node_reuse:
  4161. +       /* prepare for stable node insertion */
  4162. +       parent = NULL;
  4163. +       new = &tree_node->sub_root.rb_node;
  4164. +   }
  4165. +
  4166. +add_node:
  4167. +   rb_link_node(&new_node->node, parent, new);
  4168. +   rb_insert_color(&new_node->node, &tree_node->sub_root);
  4169. +   new_node->tree_node = tree_node;
  4170. +   tree_node->count++;
  4171. +   return;
  4172. +
  4173. +failed:
  4174. +   /* This can only happen when two nodes have collided
  4175. +    * on both hash levels.
  4176. +    */
  4177. +   new_node->tree_node = NULL;
  4178. +   return;
  4179. +}
  4180. +
  4181. +static inline void free_all_tree_nodes(struct list_head *list)
  4182. +{
  4183. +   struct tree_node *node, *tmp;
  4184. +
  4185. +   list_for_each_entry_safe(node, tmp, list, all_list) {
  4186. +       free_tree_node(node);
  4187. +   }
  4188. +}
  4189. +
  4190. +/**
  4191. + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
  4192. + * strength to the current hash_strength. It re-structures the whole tree.
  4193. + */
  4194. +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
  4195. +{
  4196. +   struct stable_node *node, *tmp;
  4197. +   struct rb_root *root_new_treep;
  4198. +   struct list_head *new_tree_node_listp;
  4199. +
  4200. +   stable_tree_index = (stable_tree_index + 1) % 2;
  4201. +   root_new_treep = &root_stable_tree[stable_tree_index];
  4202. +   new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
  4203. +   *root_new_treep = RB_ROOT;
  4204. +   BUG_ON(!list_empty(new_tree_node_listp));
  4205. +
  4206. +   /*
  4207. +    * we need to iterate safely: a node could be removed by get_uksm_page()
  4208. +    */
  4209. +   list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
  4210. +       void *addr;
  4211. +       struct page *node_page;
  4212. +       u32 hash;
  4213. +
  4214. +       /*
  4215. +        * We are completely re-structuring the stable nodes to a new
  4216. +        * stable tree. We don't bother unlinking from the old tree or
  4217. +        * touching the old tree_nodes; they will all be freed at once.
  4218. +        */
  4219. +       node_page = get_uksm_page(node, 0, 0);
  4220. +       if (!node_page)
  4221. +           continue;
  4222. +
  4223. +       if (node->tree_node) {
  4224. +           hash = node->tree_node->hash;
  4225. +
  4226. +           addr = kmap_atomic(node_page, KM_USER0);
  4227. +
  4228. +           hash = delta_hash(addr, prev_hash_strength,
  4229. +                     hash_strength, hash);
  4230. +           kunmap_atomic(addr, KM_USER0);
  4231. +       } else {
  4232. +           /*
  4233. +            * it was not inserted into the rbtree due to a collision
  4234. +            * in the last scan round.
  4235. +            */
  4236. +           hash = page_hash(node_page, hash_strength, 0);
  4237. +       }
  4238. +
  4239. +       stable_node_reinsert(node, node_page, root_new_treep,
  4240. +                    new_tree_node_listp, hash);
  4241. +       put_page(node_page);
  4242. +   }
  4243. +
  4244. +   root_stable_treep = root_new_treep;
  4245. +   free_all_tree_nodes(stable_tree_node_listp);
  4246. +   BUG_ON(!list_empty(stable_tree_node_listp));
  4247. +   stable_tree_node_listp = new_tree_node_listp;
  4248. +}
  4249. +
  4250. +static inline void inc_hash_strength(unsigned long delta)
  4251. +{
  4252. +   hash_strength += 1 << delta;
  4253. +   if (hash_strength > HASH_STRENGTH_MAX)
  4254. +       hash_strength = HASH_STRENGTH_MAX;
  4255. +}
  4256. +
  4257. +static inline void dec_hash_strength(unsigned long delta)
  4258. +{
  4259. +   unsigned long change = 1 << delta;
  4260. +
  4261. +   if (hash_strength <= change + 1)
  4262. +       hash_strength = 1;
  4263. +   else
  4264. +       hash_strength -= change;
  4265. +}
  4266. +
  4267. +static inline void inc_hash_strength_delta(void)
  4268. +{
  4269. +   hash_strength_delta++;
  4270. +   if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
  4271. +       hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
  4272. +}
  4273. +
  4274. +/*
  4275. +static inline unsigned long get_current_neg_ratio(void)
  4276. +{
  4277. +   if (!rshash_pos || rshash_neg > rshash_pos)
  4278. +       return 100;
  4279. +
  4280. +   return div64_u64(100 * rshash_neg , rshash_pos);
  4281. +}
  4282. +*/
  4283. +
  4284. +static inline unsigned long get_current_neg_ratio(void)
  4285. +{
  4286. +   u64 pos = benefit.pos;
  4287. +   u64 neg = benefit.neg;
  4288. +
  4289. +   if (!neg)
  4290. +       return 0;
  4291. +
  4292. +   if (!pos || neg > pos)
  4293. +       return 100;
  4294. +
  4295. +   if (neg > div64_u64(U64_MAX, 100))
  4296. +       pos = div64_u64(pos, 100);
  4297. +   else
  4298. +       neg *= 100;
  4299. +
  4300. +   return div64_u64(neg, pos);
  4301. +}
  4302. +
  4303. +static inline unsigned long get_current_benefit(void)
  4304. +{
  4305. +   u64 pos = benefit.pos;
  4306. +   u64 neg = benefit.neg;
  4307. +   u64 scanned = benefit.scanned;
  4308. +
  4309. +   if (neg > pos)
  4310. +       return 0;
  4311. +
  4312. +   return div64_u64((pos - neg), scanned);
  4313. +}
  4314. +
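+/*
+ * judge_rshash_direction() - Decide how the hash strength should move:
+ * GO_UP when the sampled negative ratio is high (> 90%), GO_DOWN after
+ * several rounds without any negative samples, OBSCURE when the benefit has
+ * drifted far from the stable benefit (or on the periodic forced re-probe),
+ * otherwise STILL.
+ */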
  4315. +static inline int judge_rshash_direction(void)
  4316. +{
  4317. +   u64 current_neg_ratio, stable_benefit;
  4318. +   u64 current_benefit, delta = 0;
  4319. +   int ret = STILL;
  4320. +
  4321. +   /* In case the system stays still for a long time, force a periodic re-probe. */
  4322. +   if (uksm_scan_round % 1024 == 3) {
  4323. +       ret = OBSCURE;
  4324. +       goto out;
  4325. +   }
  4326. +
  4327. +   current_neg_ratio = get_current_neg_ratio();
  4328. +
  4329. +   if (current_neg_ratio == 0) {
  4330. +       rshash_neg_cont_zero++;
  4331. +       if (rshash_neg_cont_zero > 2)
  4332. +           return GO_DOWN;
  4333. +       else
  4334. +           return STILL;
  4335. +   }
  4336. +   rshash_neg_cont_zero = 0;
  4337. +
  4338. +   if (current_neg_ratio > 90) {
  4339. +       ret = GO_UP;
  4340. +       goto out;
  4341. +   }
  4342. +
  4343. +   current_benefit = get_current_benefit();
  4344. +   stable_benefit = rshash_state.stable_benefit;
  4345. +
  4346. +   if (!stable_benefit) {
  4347. +       ret = OBSCURE;
  4348. +       goto out;
  4349. +   }
  4350. +
  4351. +   if (current_benefit > stable_benefit)
  4352. +       delta = current_benefit - stable_benefit;
  4353. +   else if (current_benefit < stable_benefit)
  4354. +       delta = stable_benefit - current_benefit;
  4355. +
  4356. +   delta = div64_u64(100 * delta , stable_benefit);
  4357. +
  4358. +   if (delta > 50) {
  4359. +       rshash_cont_obscure++;
  4360. +       if (rshash_cont_obscure > 2)
  4361. +           return OBSCURE;
  4362. +       else
  4363. +           return STILL;
  4364. +   }
  4365. +
  4366. +out:
  4367. +   rshash_cont_obscure = 0;
  4368. +   return ret;
  4369. +}
  4370. +
  4371. +/**
  4372. + * rshash_adjust() - The main function to control the random sampling state
  4373. + * machine for hash strength adaptation.
  4374. + */
  4375. +static void rshash_adjust(void)
  4376. +{
  4377. +   unsigned long prev_hash_strength = hash_strength;
  4378. +
  4379. +   if (uksm_pages_scanned == uksm_pages_scanned_last)
  4380. +       return;
  4381. +
  4382. +   encode_benefit();
  4383. +
  4384. +   switch (rshash_state.state) {
  4385. +   case RSHASH_STILL:
  4386. +       switch (judge_rshash_direction()) {
  4387. +       case GO_UP:
  4388. +           if (rshash_state.pre_direct == GO_DOWN)
  4389. +               hash_strength_delta = 0;
  4390. +
  4391. +           inc_hash_strength(hash_strength_delta);
  4392. +           inc_hash_strength_delta();
  4393. +           rshash_state.stable_benefit = get_current_benefit();
  4394. +           rshash_state.pre_direct = GO_UP;
  4395. +           break;
  4396. +
  4397. +       case GO_DOWN:
  4398. +           if (rshash_state.pre_direct == GO_UP)
  4399. +               hash_strength_delta = 0;
  4400. +
  4401. +           dec_hash_strength(hash_strength_delta);
  4402. +           inc_hash_strength_delta();
  4403. +           rshash_state.stable_benefit = get_current_benefit();
  4404. +           rshash_state.pre_direct = GO_DOWN;
  4405. +           break;
  4406. +
  4407. +       case OBSCURE:
  4408. +           rshash_state.stable_point = hash_strength;
  4409. +           rshash_state.turn_point_down = hash_strength;
  4410. +           rshash_state.turn_point_up = hash_strength;
  4411. +           rshash_state.turn_benefit_down = get_current_benefit();
  4412. +           rshash_state.turn_benefit_up = get_current_benefit();
  4413. +           rshash_state.lookup_window_index = 0;
  4414. +           rshash_state.state = RSHASH_TRYDOWN;
  4415. +           dec_hash_strength(hash_strength_delta);
  4416. +           inc_hash_strength_delta();
  4417. +           break;
  4418. +
  4419. +       case STILL:
  4420. +           break;
  4421. +       default:
  4422. +           BUG();
  4423. +       }
  4424. +       break;
  4425. +
  4426. +   case RSHASH_TRYDOWN:
  4427. +       if (rshash_state.lookup_window_index++ % 5 == 0)
  4428. +           rshash_state.below_count = 0;
  4429. +
  4430. +       if (get_current_benefit() < rshash_state.stable_benefit)
  4431. +           rshash_state.below_count++;
  4432. +       else if (get_current_benefit() >
  4433. +            rshash_state.turn_benefit_down) {
  4434. +           rshash_state.turn_point_down = hash_strength;
  4435. +           rshash_state.turn_benefit_down = get_current_benefit();
  4436. +       }
  4437. +
  4438. +       if (rshash_state.below_count >= 3 ||
  4439. +           judge_rshash_direction() == GO_UP ||
  4440. +           hash_strength == 1) {
  4441. +           hash_strength = rshash_state.stable_point;
  4442. +           hash_strength_delta = 0;
  4443. +           inc_hash_strength(hash_strength_delta);
  4444. +           inc_hash_strength_delta();
  4445. +           rshash_state.lookup_window_index = 0;
  4446. +           rshash_state.state = RSHASH_TRYUP;
  4447. +           hash_strength_delta = 0;
  4448. +       } else {
  4449. +           dec_hash_strength(hash_strength_delta);
  4450. +           inc_hash_strength_delta();
  4451. +       }
  4452. +       break;
  4453. +
  4454. +   case RSHASH_TRYUP:
  4455. +       if (rshash_state.lookup_window_index++ % 5 == 0)
  4456. +           rshash_state.below_count = 0;
  4457. +
  4458. +       if (get_current_benefit() < rshash_state.turn_benefit_down)
  4459. +           rshash_state.below_count++;
  4460. +       else if (get_current_benefit() > rshash_state.turn_benefit_up) {
  4461. +           rshash_state.turn_point_up = hash_strength;
  4462. +           rshash_state.turn_benefit_up = get_current_benefit();
  4463. +       }
  4464. +
  4465. +       if (rshash_state.below_count >= 3 ||
  4466. +           judge_rshash_direction() == GO_DOWN ||
  4467. +           hash_strength == HASH_STRENGTH_MAX) {
  4468. +           hash_strength = rshash_state.turn_benefit_up >
  4469. +               rshash_state.turn_benefit_down ?
  4470. +               rshash_state.turn_point_up :
  4471. +               rshash_state.turn_point_down;
  4472. +
  4473. +           rshash_state.state = RSHASH_PRE_STILL;
  4474. +       } else {
  4475. +           inc_hash_strength(hash_strength_delta);
  4476. +           inc_hash_strength_delta();
  4477. +       }
  4478. +
  4479. +       break;
  4480. +
  4481. +   case RSHASH_NEW:
  4482. +   case RSHASH_PRE_STILL:
  4483. +       rshash_state.stable_benefit = get_current_benefit();
  4484. +       rshash_state.state = RSHASH_STILL;
  4485. +       hash_strength_delta = 0;
  4486. +       break;
  4487. +   default:
  4488. +       BUG();
  4489. +   }
  4490. +
  4491. +   /* rshash_neg = rshash_pos = 0; */
  4492. +   reset_benefit();
  4493. +
  4494. +   if (prev_hash_strength != hash_strength)
  4495. +       stable_tree_delta_hash(prev_hash_strength);
  4496. +}
  4497. +
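+/*
+ * free_vma_dup_tree() - Remove every dup_tree entry that refers to @slot from
+ * the slots with smaller uksm_index, then empty @slot's own dup_tree.
+ */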
  4498. +static void free_vma_dup_tree(struct vma_slot *slot)
  4499. +{
  4500. +   struct vma_slot *tmp_slot;
  4501. +   int i;
  4502. +
  4503. +   /* step 1: free entries in smaller vmas' dup tree */
  4504. +   for (i = 0; i < slot->uksm_index; i++) {
  4505. +       tmp_slot = radix_tree_lookup(&uksm_vma_tree, i);
  4506. +       if (tmp_slot)
  4507. +           radix_tree_delete(&tmp_slot->dup_tree, slot->uksm_index);
  4508. +   }
  4509. +
  4510. +   /* step 2: free my own dup tree */
  4511. +   for (i = slot->uksm_index; i < uksm_vma_tree_index_end; i++)
  4512. +       radix_tree_delete(&slot->dup_tree, i);
  4513. +
  4514. +   BUG_ON(slot->dup_tree.rnode);
  4515. +}
  4516. +
  4517. +/**
  4518. + * round_update_ladder() - The main function that updates all the
  4519. + * adjustments whenever a scan round is finished.
  4520. + */
  4521. +static void round_update_ladder(void)
  4522. +{
  4523. +   int i;
  4524. +   struct vma_slot *slot, *tmp_slot;
  4525. +   unsigned long dedup_ratio_max = 0, dedup_ratio_mean = 0;
  4526. +   unsigned long threshold;
  4527. +
  4528. +   for (i = 0; i < uksm_vma_tree_index_end; i++) {
  4529. +       slot = radix_tree_lookup(&uksm_vma_tree, i);
  4530. +
  4531. +       if (slot) {
  4532. +           slot->dedup_ratio = cal_dedup_ratio(slot);
  4533. +           if (dedup_ratio_max < slot->dedup_ratio)
  4534. +               dedup_ratio_max = slot->dedup_ratio;
  4535. +           dedup_ratio_mean += slot->dedup_ratio;
  4536. +       }
  4537. +   }
  4538. +
  4539. +   dedup_ratio_mean /= uksm_vma_slot_num;
  4540. +   threshold = dedup_ratio_mean;
  4541. +
  4542. +   for (i = 0; i < uksm_vma_tree_index_end; i++) {
  4543. +       slot = radix_tree_lookup(&uksm_vma_tree, i);
  4544. +
  4545. +       if (slot) {
  4546. +           if (slot->dedup_ratio &&
  4547. +               slot->dedup_ratio >= threshold) {
  4548. +               vma_rung_up(slot);
  4549. +           } else {
  4550. +               vma_rung_down(slot);
  4551. +           }
  4552. +
  4553. +           free_vma_dup_tree(slot);
  4554. +           radix_tree_delete(&uksm_vma_tree, i);
  4555. +           uksm_vma_tree_num--;
  4556. +           slot->uksm_index = -1;
  4557. +           slot->flags &= ~UKSM_SLOT_SCANNED;
  4558. +           slot->dedup_ratio = 0;
  4559. +           slot->dedup_num = 0;
  4560. +       }
  4561. +   }
  4562. +
  4563. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4564. +       list_for_each_entry_safe(slot, tmp_slot,
  4565. +                    &uksm_scan_ladder[i].vma_list,
  4566. +                    uksm_list) {
  4567. +           /*
  4568. +            * These slots were scanned but are not in the
  4569. +            * dup-tracking tree, so their dedup_ratio must be 0.
  4570. +            */
  4571. +           if (slot->flags & UKSM_SLOT_SCANNED) {
  4572. +               BUG_ON(slot->dedup_ratio != 0);
  4573. +               vma_rung_down(slot);
  4574. +           }
  4575. +
  4576. +           slot->dedup_ratio = 0;
  4577. +       }
  4578. +   }
  4579. +
  4580. +   BUG_ON(uksm_vma_tree_num != 0);
  4581. +   uksm_vma_tree_index_end = 0;
  4582. +
  4583. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4584. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
  4585. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_BUSY_SEARCHED;
  4586. +
  4587. +       list_for_each_entry(slot, &uksm_scan_ladder[i].vma_list,
  4588. +                   uksm_list) {
  4589. +           slot->last_scanned = slot->pages_scanned;
  4590. +           slot->flags &= ~UKSM_SLOT_SCANNED;
  4591. +           slot->pages_cowed = 0;
  4592. +           slot->pages_merged = 0;
  4593. +           if (slot->flags & UKSM_SLOT_FUL_SCANNED) {
  4594. +               slot->flags &= ~UKSM_SLOT_FUL_SCANNED;
  4595. +               uksm_scan_ladder[i].fully_scanned_slots--;
  4596. +           }
  4597. +           BUG_ON(slot->uksm_index != -1);
  4598. +       }
  4599. +
  4600. +       BUG_ON(uksm_scan_ladder[i].fully_scanned_slots);
  4601. +   }
  4602. +
  4603. +   rshash_adjust();
  4604. +
  4605. +   //uksm_pages_scanned_last = uksm_pages_scanned;
  4606. +}
  4607. +
  4608. +static inline unsigned int uksm_pages_to_scan(unsigned int batch_pages)
  4609. +{
  4610. +   return totalram_pages * batch_pages / 1000000;
  4611. +}
  4612. +
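+/*
+ * cal_ladder_pages_to_scan() - Distribute the per-batch page budget across
+ * the scan rungs in proportion to their scan_ratio, and further throttle the
+ * two lowest rungs.
+ */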
  4613. +static inline void cal_ladder_pages_to_scan(unsigned int num)
  4614. +{
  4615. +   int i;
  4616. +
  4617. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4618. +       uksm_scan_ladder[i].pages_to_scan = num
  4619. +           * uksm_scan_ladder[i].scan_ratio / UKSM_SCAN_RATIO_MAX;
  4620. +   }
  4621. +   uksm_scan_ladder[0].pages_to_scan /= 16;
  4622. +   uksm_scan_ladder[1].pages_to_scan /= 4;
  4623. +}
  4624. +
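+/*
+ * Detach a slot from its rung and from uksm_vma_tree, free every rmap_item
+ * left in its rmap_list pool, then free the slot itself.
+ */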
  4625. +static inline void uksm_del_vma_slot(struct vma_slot *slot)
  4626. +{
  4627. +   int i, j;
  4628. +   struct rmap_list_entry *entry;
  4629. +   struct vma_slot *tmp;
  4630. +
  4631. +   /* Mutex lock contention may be intense here; is there a better way? */
  4632. +   BUG_ON(list_empty(&slot->uksm_list) || !slot->rung);
  4633. +
  4634. +   if (slot->rung->current_scan == &slot->uksm_list)
  4635. +       slot->rung->current_scan = slot->rung->current_scan->next;
  4636. +
  4637. +   list_del_init(&slot->uksm_list);
  4638. +   slot->rung->vma_num--;
  4639. +   if (slot->flags & UKSM_SLOT_FUL_SCANNED)
  4640. +       slot->rung->fully_scanned_slots--;
  4641. +
  4642. +   if (slot->rung->current_scan == &slot->rung->vma_list) {
  4643. +       /* This rung finishes a round */
  4644. +       slot->rung->flags |= UKSM_RUNG_ROUND_FINISHED;
  4645. +       slot->rung->current_scan = slot->rung->vma_list.next;
  4646. +       BUG_ON(slot->rung->current_scan == &slot->rung->vma_list
  4647. +              && !list_empty(&slot->rung->vma_list));
  4648. +   }
  4649. +
  4650. +   if (slot->uksm_index == -1)
  4651. +       goto skip;
  4652. +
  4653. +   tmp = radix_tree_delete(&uksm_vma_tree, slot->uksm_index);
  4654. +   BUG_ON(!tmp || tmp != slot);
  4655. +   free_vma_dup_tree(slot);
  4656. +   uksm_vma_tree_num--;
  4657. +   if (slot->uksm_index == uksm_vma_tree_index_end - 1)
  4658. +       uksm_vma_tree_index_end--;
  4659. +
  4660. +skip:
  4661. +   if (!slot->rmap_list_pool)
  4662. +       goto out;
  4663. +
  4664. +   for (i = 0; i < slot->pool_size; i++) {
  4665. +       void *addr;
  4666. +
  4667. +       if (!slot->rmap_list_pool[i])
  4668. +           continue;
  4669. +
  4670. +       addr = kmap(slot->rmap_list_pool[i]);
  4671. +       BUG_ON(!addr);
  4672. +       for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
  4673. +           entry = (struct rmap_list_entry *)addr + j;
  4674. +           if (is_addr(entry->addr))
  4675. +               continue;
  4676. +           if (!entry->item)
  4677. +               continue;
  4678. +
  4679. +           remove_rmap_item_from_tree(entry->item);
  4680. +           free_rmap_item(entry->item);
  4681. +           slot->pool_counts[i]--;
  4682. +       }
  4683. +       BUG_ON(slot->pool_counts[i]);
  4684. +       kunmap(slot->rmap_list_pool[i]);
  4685. +       __free_page(slot->rmap_list_pool[i]);
  4686. +   }
  4687. +   kfree(slot->rmap_list_pool);
  4688. +   kfree(slot->pool_counts);
  4689. +
  4690. +out:
  4691. +   slot->rung = NULL;
  4692. +   free_vma_slot(slot);
  4693. +   BUG_ON(!uksm_vma_slot_num);
  4694. +   uksm_vma_slot_num--;
  4695. +}
  4696. +
  4697. +
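+/*
+ * Drain the vma_slot_del list: every slot queued for deletion is torn down
+ * with uksm_del_vma_slot(). The list spinlock is dropped around each
+ * deletion, which may sleep (it uses kmap()).
+ */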
  4698. +static inline void cleanup_vma_slots(void)
  4699. +{
  4700. +   struct vma_slot *slot;
  4701. +
  4702. +   spin_lock(&vma_slot_list_lock);
  4703. +   while (!list_empty(&vma_slot_del)) {
  4704. +       slot = list_entry(vma_slot_del.next,
  4705. +                 struct vma_slot, slot_list);
  4706. +       list_del(&slot->slot_list);
  4707. +       spin_unlock(&vma_slot_list_lock);
  4708. +       uksm_del_vma_slot(slot);
  4709. +       spin_lock(&vma_slot_list_lock);
  4710. +   }
  4711. +   spin_unlock(&vma_slot_list_lock);
  4712. +}
  4713. +
  4714. +static inline int rung_fully_scanned(struct scan_rung *rung)
  4715. +{
  4716. +   return (rung->fully_scanned_slots == rung->vma_num &&
  4717. +       rung->fully_scanned_slots);
  4718. +}
  4719. +
  4720. +/**
  4721. + * uksm_do_scan()  - the main worker function.
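+ *
+ * Walk the scan ladder from the highest rung down to the lowest. Each rung
+ * scans one page at a time from its current slot until its pages_to_scan
+ * quota is used up, skipping slots whose mm is busy or exiting. Once every
+ * rung has finished a round, the ladder is rebalanced and the unstable
+ * tree is reset.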
  4722. + */
  4723. +static void uksm_do_scan(void)
  4724. +{
  4725. +   struct vma_slot *slot, *iter;
  4726. +   struct list_head *next_scan, *iter_head;
  4727. +   struct mm_struct *busy_mm;
  4728. +   unsigned char round_finished, all_rungs_empty;
  4729. +   int i, err;
  4730. +   unsigned long rest_pages;
  4731. +
  4732. +   might_sleep();
  4733. +
  4734. +   rest_pages = 0;
  4735. +repeat_all:
  4736. +   for (i = uksm_scan_ladder_size - 1; i >= 0; i--) {
  4737. +       struct scan_rung *rung = &uksm_scan_ladder[i];
  4738. +
  4739. +       if (!rung->pages_to_scan)
  4740. +           continue;
  4741. +
  4742. +       if (list_empty(&rung->vma_list)) {
  4743. +           rung->pages_to_scan = 0;
  4744. +           continue;
  4745. +       }
  4746. +
  4747. +       /*
  4748. +        * If a higher rung has been fully scanned, its remaining pages
  4749. +        * should be propagated to the lower rungs. This prevents the
  4750. +        * higher rung from sitting on an unused pages_to_scan quota
  4751. +        * for a long time.
  4752. +        */
  4754. +       if (rung_fully_scanned(rung)) {
  4755. +           rest_pages += rung->pages_to_scan;
  4756. +           rung->pages_to_scan = 0;
  4757. +           continue;
  4758. +       }
  4759. +
  4760. +       rung->pages_to_scan += rest_pages;
  4761. +       rest_pages = 0;
  4762. +       while (rung->pages_to_scan && likely(!freezing(current))) {
  4763. +cleanup:
  4764. +           cleanup_vma_slots();
  4765. +
  4766. +           if (list_empty(&rung->vma_list))
  4767. +               break;
  4768. +
  4769. +rescan:
  4770. +           BUG_ON(rung->current_scan == &rung->vma_list &&
  4771. +                  !list_empty(&rung->vma_list));
  4772. +
  4773. +           slot = list_entry(rung->current_scan,
  4774. +                    struct vma_slot, uksm_list);
  4775. +
  4776. +
  4777. +           if (slot->flags & UKSM_SLOT_FUL_SCANNED)
  4778. +               goto next_scan;
  4779. +
  4780. +           err = try_down_read_slot_mmap_sem(slot);
  4781. +           if (err == -ENOENT)
  4782. +               goto cleanup;
  4783. +
  4784. +           busy_mm = slot->mm;
  4785. +
  4786. +busy:
  4787. +           if (err == -EBUSY) {
  4788. +               /* skip other vmas on the same mm */
  4789. +               rung->flags |= UKSM_RUNG_BUSY_SEARCHED;
  4790. +               iter = slot;
  4791. +               iter_head = slot->uksm_list.next;
  4792. +
  4793. +               while (iter_head != &rung->vma_list) {
  4794. +                   iter = list_entry(iter_head,
  4795. +                             struct vma_slot,
  4796. +                             uksm_list);
  4797. +                   if (iter->vma->vm_mm != busy_mm)
  4798. +                       break;
  4799. +                   iter_head = iter_head->next;
  4800. +               }
  4801. +
  4802. +               if (iter->vma->vm_mm != busy_mm) {
  4803. +                   rung->current_scan = &iter->uksm_list;
  4804. +                   goto rescan;
  4805. +               } else {
  4806. +                   /* at the end, but still busy */
  4807. +                   rung->current_scan = iter->uksm_list.next;
  4808. +                   goto next_scan;
  4810. +               }
  4811. +           }
  4812. +
  4813. +           BUG_ON(!vma_can_enter(slot->vma));
  4814. +           if (uksm_test_exit(slot->vma->vm_mm)) {
  4815. +               busy_mm = slot->vma->vm_mm;
  4816. +               up_read(&slot->vma->vm_mm->mmap_sem);
  4817. +               err = -EBUSY;
  4818. +               goto busy;
  4819. +           }
  4820. +
  4821. +           if (rung->flags & UKSM_RUNG_BUSY_SEARCHED)
  4822. +               rung->flags &= ~UKSM_RUNG_BUSY_SEARCHED;
  4823. +           /* OK, we have taken the mmap_sem, ready to scan */
  4824. +           scan_vma_one_page(slot);
  4825. +           up_read(&slot->vma->vm_mm->mmap_sem);
  4826. +           rung->pages_to_scan--;
  4827. +
  4828. +           if ((slot->pages_scanned &&
  4829. +                slot->pages_scanned % slot->pages_to_scan == 0)
  4830. +               || (slot->flags & UKSM_SLOT_FUL_SCANNED)) {
  4831. +next_scan:
  4832. +               next_scan = rung->current_scan->next;
  4833. +               if (next_scan == &rung->vma_list) {
  4834. +                   /*
  4835. +                    * All the slots in this rung
  4836. +                    * have been traveled in this
  4837. +                    * round.
  4838. +                    */
  4839. +                   rung->flags |= UKSM_RUNG_ROUND_FINISHED;
  4840. +                   rung->current_scan =
  4841. +                       rung->vma_list.next;
  4842. +                   if (rung_fully_scanned(rung) ||
  4843. +                       (rung->flags &
  4844. +                        UKSM_RUNG_BUSY_SEARCHED)) {
  4845. +                       /*
  4846. +                        * All the pages in all slots
  4847. +                        * have been scanned. Or we
  4848. +                        * did not make any progress
  4849. +                        * because of busy mm.
  4850. +                        */
  4851. +                       rest_pages +=
  4852. +                           rung->pages_to_scan;
  4853. +                       rung->pages_to_scan = 0;
  4854. +                       break;
  4855. +                   }
  4856. +               } else {
  4857. +                   rung->current_scan = next_scan;
  4858. +               }
  4859. +           }
  4860. +
  4861. +           cond_resched();
  4862. +       }
  4863. +
  4864. +       if (freezing(current))
  4865. +           break;
  4866. +   }
  4867. +
  4868. +   if (freezing(current))
  4869. +       return;
  4870. +
  4871. +   round_finished = 1;
  4872. +   all_rungs_empty = 1;
  4873. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4874. +       struct scan_rung *rung = &uksm_scan_ladder[i];
  4875. +
  4876. +       if (!list_empty(&rung->vma_list)) {
  4877. +           all_rungs_empty = 0;
  4878. +           if (!(rung->flags & UKSM_RUNG_ROUND_FINISHED))
  4879. +               round_finished = 0;
  4880. +           break;
  4881. +       }
  4882. +   }
  4883. +
  4884. +   if (all_rungs_empty)
  4885. +       round_finished = 0;
  4886. +
  4887. +   cleanup_vma_slots();
  4888. +
  4889. +   if (round_finished) {
  4890. +       round_update_ladder();
  4891. +
  4892. +       /*
  4893. +        * A number of pages can hang around indefinitely on per-cpu
  4894. +        * pagevecs, raised page count preventing write_protect_page
  4895. +        * from merging them.  Though it doesn't really matter much,
  4896. +        * it is puzzling to see some stuck in pages_volatile until
  4897. +        * other activity jostles them out, and they also prevented
  4898. +        * LTP's KSM test from succeeding deterministically; so drain
  4899. +        * them here (here rather than on entry to uksm_do_scan(),
  4900. +        * so we don't IPI too often when pages_to_scan is set low).
  4901. +        */
  4902. +       lru_add_drain_all();
  4903. +
  4904. +       /* sync with uksm_remove_vma for rb_erase */
  4905. +       uksm_scan_round++;
  4906. +       root_unstable_tree = RB_ROOT;
  4907. +       free_all_tree_nodes(&unstable_tree_node_list);
  4908. +   }
  4909. +
  4910. +   for (i = 0; i < uksm_scan_ladder_size; i++) {
  4911. +       struct scan_rung *rung = &uksm_scan_ladder[i];
  4912. +
  4913. +       /*
  4914. +        * Before we can go sleep, we should make sure that all the
  4915. +        * pages_to_scan quota for this scan has been finished
  4916. +        */
  4917. +       if (!list_empty(&rung->vma_list) && rung->pages_to_scan)
  4918. +           goto repeat_all;
  4919. +   }
  4920. +
  4921. +   cal_ladder_pages_to_scan(uksm_scan_batch_pages);
  4922. +}
  4923. +
  4924. +static int ksmd_should_run(void)
  4925. +{
  4926. +   return uksm_run & UKSM_RUN_MERGE;
  4927. +}
  4928. +
  4929. +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
  4930. +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
  4931. +
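+/*
+ * Number of pool pages needed to hold one struct rmap_list_entry for every
+ * page covered by the vma.
+ */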
  4932. +static inline unsigned long vma_pool_size(struct vm_area_struct *vma)
  4933. +{
  4934. +   return round_up(sizeof(struct rmap_list_entry) * vma_pages(vma),
  4935. +           PAGE_SIZE) >> PAGE_SHIFT;
  4936. +}
  4937. +
  4938. +/**
  4939. + * uksm_vma_enter() - add a new vma_slot to the lowest rung of the scan
  4940. + * ladder and allocate its rmap_list pool.
  4941. + *
  4942. + * @slot: the vma_slot to be added
  4943. + *
  4944. + * Return: 1 on success, 0 on failure.
  4945. + */
  4946. +static int uksm_vma_enter(struct vma_slot *slot)
  4947. +{
  4948. +   struct scan_rung *rung;
  4949. +   unsigned long pages_to_scan, pool_size;
  4950. +
  4951. +   BUG_ON(slot->pages != vma_pages(slot->vma));
  4952. +   rung = &uksm_scan_ladder[0];
  4953. +
  4954. +   pages_to_scan = get_vma_random_scan_num(slot, rung->scan_ratio);
  4955. +   if (pages_to_scan) {
  4956. +       if (list_empty(&rung->vma_list))
  4957. +           rung->current_scan = &slot->uksm_list;
  4958. +       BUG_ON(!list_empty(&slot->uksm_list));
  4959. +
  4960. +       list_add(&slot->uksm_list, &rung->vma_list);
  4961. +       slot->rung = rung;
  4962. +       slot->pages_to_scan = pages_to_scan;
  4963. +       slot->rung->vma_num++;
  4964. +       BUG_ON(PAGE_SIZE % sizeof(struct rmap_list_entry) != 0);
  4965. +
  4966. +       pool_size = vma_pool_size(slot->vma);
  4967. +
  4968. +       slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
  4969. +                          pool_size, GFP_NOWAIT);
  4970. +       slot->pool_counts = kzalloc(sizeof(unsigned long) * pool_size,
  4971. +                       GFP_NOWAIT);
  4972. +       slot->pool_size = pool_size;
  4973. +       if (!slot->rmap_list_pool)
  4974. +           goto failed;
  4975. +
  4976. +       if (!slot->pool_counts) {
  4977. +           kfree(slot->rmap_list_pool);
  4978. +           goto failed;
  4979. +       }
  4980. +
  4981. +       BUG_ON(rung->current_scan == &rung->vma_list &&
  4982. +              !list_empty(&rung->vma_list));
  4983. +
  4984. +       uksm_vma_slot_num++;
  4985. +       BUG_ON(!uksm_vma_slot_num);
  4986. +       return 1;
  4987. +   }
  4988. +
  4989. +failed:
  4990. +   return 0;
  4991. +}
  4992. +
  4993. +
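+/*
+ * Move all newly created slots from vma_slot_new into the lowest rung of
+ * the scan ladder; slots that cannot be added are parked on vma_slot_noadd
+ * for their creators to delete.
+ */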
  4994. +static void uksm_enter_all_slots(void)
  4995. +{
  4996. +   struct vma_slot *slot;
  4997. +   int added;
  4998. +
  4999. +   spin_lock(&vma_slot_list_lock);
  5000. +   while (!list_empty(&vma_slot_new)) {
  5001. +       slot = list_entry(vma_slot_new.next,
  5002. +                 struct vma_slot, slot_list);
  5003. +       /*
  5004. +        * Slots are sorted by ctime_j; if one is found to be too
  5005. +        * young, stop scanning the remaining ones.
  5006. +        */
  5007. +       /*
  5008. +
  5009. +           if (time_before(jiffies, slot->ctime_j +
  5010. +                   msecs_to_jiffies(1000))) {
  5011. +               spin_unlock(&vma_slot_list_lock);
  5012. +               return;
  5013. +           }
  5014. +       */
  5015. +
  5016. +       list_del_init(&slot->slot_list);
  5017. +       added = 0;
  5018. +       if (vma_can_enter(slot->vma))
  5019. +           added = uksm_vma_enter(slot);
  5020. +
  5021. +       if (!added) {
  5022. +           /* Move to the no-add list, to be deleted by its creator */
  5023. +           slot->ctime_j = jiffies;
  5024. +           list_del(&slot->slot_list);
  5025. +           list_add_tail(&slot->slot_list, &vma_slot_noadd);
  5026. +       }
  5027. +       spin_unlock(&vma_slot_list_lock);
  5028. +       cond_resched();
  5029. +       spin_lock(&vma_slot_list_lock);
  5030. +   }
  5031. +   spin_unlock(&vma_slot_list_lock);
  5032. +}
  5033. +/*
  5034. +static int uksm_usr_support_init(void)
  5035. +{
  5036. +   int err = -EINVAL;
  5037. +
  5038. +   err = sock_create(AF_INET, SOCK_DGRAM, 0, &uksm_usr_spt_sock);
  5039. +   if (err < 0)
  5040. +       goto out;
  5041. +
  5042. +   uksm_usr_spt.randid = random32();
  5043. +
  5044. +out:
  5045. +   if ((err < 0) && uksm_usr_spt_sock) {
  5046. +       sock_release(uksm_usr_spt_sock);
  5047. +       uksm_usr_spt_sock = NULL;
  5048. +   }
  5049. +
  5050. +   return err;
  5051. +}
  5052. +
  5053. +static void uksm_usr_support(void)
  5054. +{
  5055. +   struct msghdr uksm_usr_spt_msg;
  5056. +   struct iovec uksm_usr_spt_iov;
  5057. +   int err;
  5058. +
  5059. +   if (!uksm_usr_spt_enabled)*/
  5060. +/*     goto out;
  5061. +
  5062. +   if (!uksm_usr_spt_last)
  5063. +       goto ok;
  5064. +*/
  5065. +   /* timer wrap around, reset uksm_usr_spt_last. */
  5066. +/* if (jiffies < uksm_usr_spt_last) {
  5067. +       uksm_usr_spt_last = jiffies;
  5068. +       goto out;
  5069. +   }
  5070. +
  5071. +   if (jiffies_to_msecs(jiffies - uksm_usr_spt_last) <
  5072. +       UKSM_USR_SPT_INTVL_MSEC)
  5073. +       goto out;
  5074. +
  5075. +ok:
  5076. +   if (uksm_usr_spt_sock == NULL) {
  5077. +       err = uksm_usr_support_init();
  5078. +       if (err < 0) {
  5079. +           printk(KERN_ERR "UKSM: uksm_usr_support"
  5080. +                   "init failed exiting.");
  5081. +           return;
  5082. +       }
  5083. +   }
  5084. +
  5085. +   uksm_usr_spt_addr.sin_family = AF_INET;
  5086. +   uksm_usr_spt_addr.sin_port = htons(UKSM_USR_SPT_PORT);
  5087. +   uksm_usr_spt_addr.sin_addr.s_addr = in_aton("114.212.190.16");
  5088. +
  5089. +   uksm_usr_spt_iov.iov_base = &uksm_usr_spt;
  5090. +   uksm_usr_spt_iov.iov_len = sizeof(uksm_usr_spt);
  5091. +   uksm_usr_spt_msg.msg_name = (struct sockaddr *)&uksm_usr_spt_addr;
  5092. +   uksm_usr_spt_msg.msg_iov = &uksm_usr_spt_iov;
  5093. +   uksm_usr_spt_msg.msg_iovlen = 1;
  5094. +   uksm_usr_spt_msg.msg_control = NULL;
  5095. +   uksm_usr_spt_msg.msg_controllen = 0;
  5096. +   uksm_usr_spt_msg.msg_namelen = sizeof(uksm_usr_spt_addr);
  5097. +   uksm_usr_spt_msg.msg_flags = MSG_DONTWAIT;
  5098. +
  5099. +   sock_sendmsg(uksm_usr_spt_sock, &uksm_usr_spt_msg, sizeof(uksm_usr_spt));
  5100. +   uksm_usr_spt_last = jiffies;*/
  5101. +/*out:
  5102. +   return;
  5103. +}
  5104. +*/
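+/*
+ * Main loop of the uksmd kernel thread: while merging is enabled, pick up
+ * new slots and run one scan pass, then sleep for uksm_sleep_jiffies;
+ * otherwise wait until woken through the run knob.
+ */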
  5105. +static int uksm_scan_thread(void *nothing)
  5106. +{
  5107. +   set_freezable();
  5108. +   set_user_nice(current, 5);
  5109. +
  5110. +   while (!kthread_should_stop()) {
  5111. +/*     uksm_usr_support();*/
  5112. +
  5113. +       mutex_lock(&uksm_thread_mutex);
  5114. +       if (ksmd_should_run()) {
  5115. +           uksm_enter_all_slots();
  5116. +           uksm_do_scan();
  5117. +       }
  5118. +       mutex_unlock(&uksm_thread_mutex);
  5119. +
  5120. +       try_to_freeze();
  5121. +
  5122. +       if (ksmd_should_run()) {
  5123. +           schedule_timeout_interruptible(uksm_sleep_jiffies);
  5124. +           uksm_sleep_times++;
  5125. +       } else {
  5126. +           wait_event_freezable(uksm_thread_wait,
  5127. +               ksmd_should_run() || kthread_should_stop());
  5128. +       }
  5129. +   }
  5130. +   return 0;
  5131. +}
  5132. +
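+/*
+ * rmap walk for a KSM page: visit every rmap_item hanging off the page's
+ * stable node and every vma that maps it, accumulating the referenced
+ * count returned by page_referenced_one().
+ */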
  5133. +int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
  5134. +           unsigned long *vm_flags)
  5135. +{
  5136. +   struct stable_node *stable_node;
  5137. +   struct node_vma *node_vma;
  5138. +   struct rmap_item *rmap_item;
  5139. +   struct hlist_node *hlist, *rmap_hlist;
  5140. +   unsigned int mapcount = page_mapcount(page);
  5141. +   int referenced = 0;
  5142. +   int search_new_forks = 0;
  5143. +   unsigned long address;
  5144. +
  5145. +   VM_BUG_ON(!PageKsm(page));
  5146. +   VM_BUG_ON(!PageLocked(page));
  5147. +
  5148. +   stable_node = page_stable_node(page);
  5149. +   if (!stable_node)
  5150. +       return 0;
  5151. +
  5152. +
  5153. +again:
  5154. +   hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  5155. +       hlist_for_each_entry(rmap_item, rmap_hlist,
  5156. +                    &node_vma->rmap_hlist, hlist) {
  5157. +           struct anon_vma *anon_vma = rmap_item->anon_vma;
  5158. +           struct anon_vma_chain *vmac;
  5159. +           struct vm_area_struct *vma;
  5160. +
  5161. +           anon_vma_lock(anon_vma);
  5162. +           list_for_each_entry(vmac, &anon_vma->head,
  5163. +                       same_anon_vma) {
  5164. +               vma = vmac->vma;
  5165. +               address = get_rmap_addr(rmap_item);
  5166. +
  5167. +               if (address < vma->vm_start ||
  5168. +                   address >= vma->vm_end)
  5169. +                   continue;
  5170. +               /*
  5171. +                * Initially we examine only the vma which
  5172. +                * covers this rmap_item; but later, if there
  5173. +                * is still work to do, we examine covering
  5174. +                * vmas in other mms: in case they were forked
  5175. +                * from the original since ksmd passed.
  5176. +                */
  5177. +               if ((rmap_item->slot->vma == vma) ==
  5178. +                   search_new_forks)
  5179. +                   continue;
  5180. +
  5181. +               if (memcg &&
  5182. +                   !mm_match_cgroup(vma->vm_mm, memcg))
  5183. +                   continue;
  5184. +
  5185. +               referenced +=
  5186. +                   page_referenced_one(page, vma,
  5187. +                       address, &mapcount, vm_flags);
  5188. +               if (!search_new_forks || !mapcount)
  5189. +                   break;
  5190. +           }
  5191. +
  5192. +           anon_vma_unlock(anon_vma);
  5193. +           if (!mapcount)
  5194. +               goto out;
  5195. +       }
  5196. +   }
  5197. +   if (!search_new_forks++)
  5198. +       goto again;
  5199. +out:
  5200. +   return referenced;
  5201. +}
  5202. +
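+/*
+ * Unmap a KSM page from every vma that maps it, walking the same
+ * stable_node -> node_vma -> rmap_item chains as page_referenced_ksm().
+ */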
  5203. +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
  5204. +{
  5205. +   struct stable_node *stable_node;
  5206. +   struct node_vma *node_vma;
  5207. +   struct hlist_node *hlist, *rmap_hlist;
  5208. +   struct rmap_item *rmap_item;
  5209. +   int ret = SWAP_AGAIN;
  5210. +   int search_new_forks = 0;
  5211. +   unsigned long address;
  5212. +
  5213. +   VM_BUG_ON(!PageKsm(page));
  5214. +   VM_BUG_ON(!PageLocked(page));
  5215. +
  5216. +   stable_node = page_stable_node(page);
  5217. +   if (!stable_node)
  5218. +       return SWAP_FAIL;
  5219. +again:
  5220. +   hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  5221. +       hlist_for_each_entry(rmap_item, rmap_hlist,
  5222. +                    &node_vma->rmap_hlist, hlist) {
  5223. +           struct anon_vma *anon_vma = rmap_item->anon_vma;
  5224. +           struct anon_vma_chain *vmac;
  5225. +           struct vm_area_struct *vma;
  5226. +
  5227. +           anon_vma_lock(anon_vma);
  5228. +           list_for_each_entry(vmac, &anon_vma->head,
  5229. +                       same_anon_vma) {
  5230. +               vma = vmac->vma;
  5231. +               address = get_rmap_addr(rmap_item);
  5232. +
  5233. +               if (address < vma->vm_start ||
  5234. +                   address >= vma->vm_end)
  5235. +                   continue;
  5236. +               /*
  5237. +                * Initially we examine only the vma which
  5238. +                * covers this rmap_item; but later, if there
  5239. +                * is still work to do, we examine covering
  5240. +                * vmas in other mms: in case they were forked
  5241. +                * from the original since ksmd passed.
  5242. +                */
  5243. +               if ((rmap_item->slot->vma == vma) ==
  5244. +                   search_new_forks)
  5245. +                   continue;
  5246. +
  5247. +               ret = try_to_unmap_one(page, vma,
  5248. +                              address, flags);
  5249. +               if (ret != SWAP_AGAIN || !page_mapped(page)) {
  5250. +                   anon_vma_unlock(anon_vma);
  5251. +                   goto out;
  5252. +               }
  5253. +           }
  5254. +           anon_vma_unlock(anon_vma);
  5255. +       }
  5256. +   }
  5257. +   if (!search_new_forks++)
  5258. +       goto again;
  5259. +out:
  5260. +   return ret;
  5261. +}
  5262. +
  5263. +#ifdef CONFIG_MIGRATION
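+/* Generic rmap walk over a KSM page, used by page migration. */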
  5264. +int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
  5265. +         struct vm_area_struct *, unsigned long, void *), void *arg)
  5266. +{
  5267. +   struct stable_node *stable_node;
  5268. +   struct node_vma *node_vma;
  5269. +   struct hlist_node *hlist, *rmap_hlist;
  5270. +   struct rmap_item *rmap_item;
  5271. +   int ret = SWAP_AGAIN;
  5272. +   int search_new_forks = 0;
  5273. +   unsigned long address;
  5274. +
  5275. +   VM_BUG_ON(!PageKsm(page));
  5276. +   VM_BUG_ON(!PageLocked(page));
  5277. +
  5278. +   stable_node = page_stable_node(page);
  5279. +   if (!stable_node)
  5280. +       return ret;
  5281. +again:
  5282. +   hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  5283. +       hlist_for_each_entry(rmap_item, rmap_hlist,
  5284. +                    &node_vma->rmap_hlist, hlist) {
  5285. +           struct anon_vma *anon_vma = rmap_item->anon_vma;
  5286. +           struct anon_vma_chain *vmac;
  5287. +           struct vm_area_struct *vma;
  5288. +
  5289. +           anon_vma_lock(anon_vma);
  5290. +           list_for_each_entry(vmac, &anon_vma->head,
  5291. +                       same_anon_vma) {
  5292. +               vma = vmac->vma;
  5293. +               address = get_rmap_addr(rmap_item);
  5294. +
  5295. +               if (address < vma->vm_start ||
  5296. +                   address >= vma->vm_end)
  5297. +                   continue;
  5298. +
  5299. +               if ((rmap_item->slot->vma == vma) ==
  5300. +                   search_new_forks)
  5301. +                   continue;
  5302. +
  5303. +               ret = rmap_one(page, vma, address, arg);
  5304. +               if (ret != SWAP_AGAIN) {
  5305. +                   anon_vma_unlock(anon_vma);
  5306. +                   goto out;
  5307. +               }
  5308. +           }
  5309. +           anon_vma_unlock(anon_vma);
  5310. +       }
  5311. +   }
  5312. +   if (!search_new_forks++)
  5313. +       goto again;
  5314. +out:
  5315. +   return ret;
  5316. +}
  5317. +
  5318. +/* Common KSM interface, but the implementation may be UKSM-specific. */
  5319. +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
  5320. +{
  5321. +   struct stable_node *stable_node;
  5322. +
  5323. +   VM_BUG_ON(!PageLocked(oldpage));
  5324. +   VM_BUG_ON(!PageLocked(newpage));
  5325. +   VM_BUG_ON(newpage->mapping != oldpage->mapping);
  5326. +
  5327. +   stable_node = page_stable_node(newpage);
  5328. +   if (stable_node) {
  5329. +       VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
  5330. +       stable_node->kpfn = page_to_pfn(newpage);
  5331. +   }
  5332. +}
  5333. +#endif /* CONFIG_MIGRATION */
  5334. +
  5335. +#ifdef CONFIG_MEMORY_HOTREMOVE
  5336. +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
  5337. +                        unsigned long end_pfn)
  5338. +{
  5339. +   struct rb_node *node;
  5340. +
  5341. +   for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
  5342. +       struct stable_node *stable_node;
  5343. +
  5344. +       stable_node = rb_entry(node, struct stable_node, node);
  5345. +       if (stable_node->kpfn >= start_pfn &&
  5346. +           stable_node->kpfn < end_pfn)
  5347. +           return stable_node;
  5348. +   }
  5349. +   return NULL;
  5350. +}
  5351. +
  5352. +static int uksm_memory_callback(struct notifier_block *self,
  5353. +                  unsigned long action, void *arg)
  5354. +{
  5355. +   struct memory_notify *mn = arg;
  5356. +   struct stable_node *stable_node;
  5357. +
  5358. +   switch (action) {
  5359. +   case MEM_GOING_OFFLINE:
  5360. +       /*
  5361. +        * Keep it very simple for now: just lock out ksmd and
  5362. +        * MADV_UNMERGEABLE while any memory is going offline.
  5363. +        * mutex_lock_nested() is necessary because lockdep was alarmed
  5364. +        * that here we take uksm_thread_mutex inside notifier chain
  5365. +        * mutex, and later take notifier chain mutex inside
  5366. +        * uksm_thread_mutex to unlock it.   But that's safe because both
  5367. +        * are inside mem_hotplug_mutex.
  5368. +        */
  5369. +       mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
  5370. +       break;
  5371. +
  5372. +   case MEM_OFFLINE:
  5373. +       /*
  5374. +        * Most of the work is done by page migration; but there might
  5375. +        * be a few stable_nodes left over, still pointing to struct
  5376. +        * pages which have been offlined: prune those from the tree.
  5377. +        */
  5378. +       while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
  5379. +                   mn->start_pfn + mn->nr_pages)) != NULL)
  5380. +           remove_node_from_stable_tree(stable_node, 1, 1);
  5381. +       /* fallthrough */
  5382. +
  5383. +   case MEM_CANCEL_OFFLINE:
  5384. +       mutex_unlock(&uksm_thread_mutex);
  5385. +       break;
  5386. +   }
  5387. +   return NOTIFY_OK;
  5388. +}
  5389. +#endif /* CONFIG_MEMORY_HOTREMOVE */
  5390. +
  5391. +#ifdef CONFIG_SYSFS
  5392. +/*
  5393. + * This all compiles without CONFIG_SYSFS, but is a waste of space.
  5394. + */
  5395. +
  5396. +#define UKSM_ATTR_RO(_name) \
  5397. +   static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  5398. +#define UKSM_ATTR(_name) \
  5399. +   static struct kobj_attribute _name##_attr = \
  5400. +       __ATTR(_name, 0644, _name##_show, _name##_store)
  5401. +
  5402. +static ssize_t sleep_millisecs_show(struct kobject *kobj,
  5403. +                   struct kobj_attribute *attr, char *buf)
  5404. +{
  5405. +   return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
  5406. +}
  5407. +
  5408. +static ssize_t sleep_millisecs_store(struct kobject *kobj,
  5409. +                    struct kobj_attribute *attr,
  5410. +                    const char *buf, size_t count)
  5411. +{
  5412. +   unsigned long msecs;
  5413. +   int err;
  5414. +
  5415. +   err = strict_strtoul(buf, 10, &msecs);
  5416. +   if (err || msecs > UINT_MAX)
  5417. +       return -EINVAL;
  5418. +
  5419. +   uksm_sleep_jiffies = msecs_to_jiffies(msecs);
  5420. +
  5421. +   return count;
  5422. +}
  5423. +UKSM_ATTR(sleep_millisecs);
  5424. +
  5425. +static ssize_t min_scan_ratio_show(struct kobject *kobj,
  5426. +                   struct kobj_attribute *attr, char *buf)
  5427. +{
  5428. +   return sprintf(buf, "%u\n", uksm_min_scan_ratio);
  5429. +}
  5430. +
  5431. +static ssize_t min_scan_ratio_store(struct kobject *kobj,
  5432. +                    struct kobj_attribute *attr,
  5433. +                    const char *buf, size_t count)
  5434. +{
  5435. +   unsigned long msr;
  5436. +   int err;
  5437. +
  5438. +   err = strict_strtoul(buf, 10, &msr);
  5439. +   if (err || msr > UINT_MAX)
  5440. +       return -EINVAL;
  5441. +
  5442. +   uksm_min_scan_ratio = msr;
  5443. +
  5444. +   return count;
  5445. +}
  5446. +UKSM_ATTR(min_scan_ratio);
  5447. +
  5448. +static ssize_t scan_batch_pages_show(struct kobject *kobj,
  5449. +                 struct kobj_attribute *attr, char *buf)
  5450. +{
  5451. +   return sprintf(buf, "%lu\n", uksm_scan_batch_pages);
  5452. +}
  5453. +
  5454. +static ssize_t scan_batch_pages_store(struct kobject *kobj,
  5455. +                  struct kobj_attribute *attr,
  5456. +                  const char *buf, size_t count)
  5457. +{
  5458. +   int err;
  5459. +   unsigned long batch_pages;
  5460. +
  5461. +   err = strict_strtoul(buf, 10, &batch_pages);
  5462. +   if (err || batch_pages > UINT_MAX)
  5463. +       return -EINVAL;
  5464. +
  5465. +   uksm_scan_batch_pages = batch_pages;
  5466. +   cal_ladder_pages_to_scan(uksm_scan_batch_pages);
  5467. +
  5468. +   return count;
  5469. +}
  5470. +UKSM_ATTR(scan_batch_pages);
  5471. +
  5472. +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
  5473. +           char *buf)
  5474. +{
  5475. +   return sprintf(buf, "%u\n", uksm_run);
  5476. +}
  5477. +
  5478. +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
  5479. +            const char *buf, size_t count)
  5480. +{
  5481. +   int err;
  5482. +   unsigned long flags;
  5483. +
  5484. +   err = strict_strtoul(buf, 10, &flags);
  5485. +   if (err || flags > UINT_MAX)
  5486. +       return -EINVAL;
  5487. +   if (flags > UKSM_RUN_MERGE)
  5488. +       return -EINVAL;
  5489. +
  5490. +   mutex_lock(&uksm_thread_mutex);
  5491. +   if (uksm_run != flags)
  5492. +       uksm_run = flags;
  5494. +   mutex_unlock(&uksm_thread_mutex);
  5495. +
  5496. +   if (flags & UKSM_RUN_MERGE)
  5497. +       wake_up_interruptible(&uksm_thread_wait);
  5498. +
  5499. +   return count;
  5500. +}
  5501. +UKSM_ATTR(run);
  5502. +
  5503. +static ssize_t usr_spt_enabled_show(struct kobject *kobj, struct kobj_attribute *attr,
  5504. +           char *buf)
  5505. +{
  5506. +   return sprintf(buf, "%u\n", uksm_usr_spt_enabled);
  5507. +}
  5508. +
  5509. +static ssize_t usr_spt_enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
  5510. +            const char *buf, size_t count)
  5511. +{
  5512. +   int err;
  5513. +   unsigned long flags;
  5514. +
  5515. +   err = strict_strtoul(buf, 10, &flags);
  5516. +   if (err || flags > UINT_MAX)
  5517. +       return -EINVAL;
  5518. +
  5519. +   uksm_usr_spt_enabled = !!flags;
  5520. +
  5521. +   return count;
  5522. +}
  5523. +UKSM_ATTR(usr_spt_enabled);
  5524. +
  5525. +/* ------ */
  5526. +static ssize_t usr_spt_msg_show(struct kobject *kobj, struct kobj_attribute *attr,
  5527. +           char *buf)
  5528. +{
  5529. +   return sprintf(buf, "%s\n", uksm_usr_spt.usr_str);
  5530. +}
  5531. +
  5532. +static ssize_t usr_spt_msg_store(struct kobject *kobj, struct kobj_attribute *attr,
  5533. +            const char *buf, size_t count)
  5534. +{
  5535. +   int s;
  5536. +
  5537. +   s = sizeof(uksm_usr_spt.usr_str);
  5538. +   strncpy(uksm_usr_spt.usr_str, buf, s);
  5539. +   uksm_usr_spt.usr_str[s - 1] = '\0';
  5540. +
  5541. +   return count;
  5542. +}
  5543. +UKSM_ATTR(usr_spt_msg);
  5544. +
  5545. +static ssize_t usr_spt_flags_show(struct kobject *kobj, struct kobj_attribute *attr,
  5546. +           char *buf)
  5547. +{
  5548. +   return sprintf(buf, "%x\n", uksm_usr_spt.flags);
  5549. +}
  5550. +
  5551. +static ssize_t usr_spt_flags_store(struct kobject *kobj, struct kobj_attribute *attr,
  5552. +            const char *buf, size_t count)
  5553. +{
  5554. +   int err;
  5555. +   unsigned long flags;
  5556. +
  5557. +   err = strict_strtoul(buf, 10, &flags);
  5558. +   if (err || flags > (u8)(~0U))
  5559. +       return -EINVAL;
  5560. +
  5561. +   uksm_usr_spt.flags = (unsigned char) flags;
  5562. +
  5563. +   return count;
  5564. +}
  5565. +UKSM_ATTR(usr_spt_flags);
  5566. +
  5567. +/* ------ */
  5568. +
  5569. +static ssize_t thrash_threshold_show(struct kobject *kobj,
  5570. +                    struct kobj_attribute *attr, char *buf)
  5571. +{
  5572. +   return sprintf(buf, "%u\n", uksm_thrash_threshold);
  5573. +}
  5574. +
  5575. +static ssize_t thrash_threshold_store(struct kobject *kobj,
  5576. +                     struct kobj_attribute *attr,
  5577. +                     const char *buf, size_t count)
  5578. +{
  5579. +   int err;
  5580. +   unsigned long flags;
  5581. +
  5582. +   err = strict_strtoul(buf, 10, &flags);
  5583. +   if (err || flags > 99)
  5584. +       return -EINVAL;
  5585. +
  5586. +   uksm_thrash_threshold = flags;
  5587. +
  5588. +   return count;
  5589. +}
  5590. +UKSM_ATTR(thrash_threshold);
  5591. +
  5592. +static ssize_t pages_shared_show(struct kobject *kobj,
  5593. +                struct kobj_attribute *attr, char *buf)
  5594. +{
  5595. +   return sprintf(buf, "%lu\n", uksm_pages_shared);
  5596. +}
  5597. +UKSM_ATTR_RO(pages_shared);
  5598. +
  5599. +static ssize_t pages_sharing_show(struct kobject *kobj,
  5600. +                 struct kobj_attribute *attr, char *buf)
  5601. +{
  5602. +   return sprintf(buf, "%lu\n", uksm_pages_sharing);
  5603. +}
  5604. +UKSM_ATTR_RO(pages_sharing);
  5605. +
  5606. +static ssize_t pages_unshared_show(struct kobject *kobj,
  5607. +                  struct kobj_attribute *attr, char *buf)
  5608. +{
  5609. +   return sprintf(buf, "%lu\n", uksm_pages_unshared);
  5610. +}
  5611. +UKSM_ATTR_RO(pages_unshared);
  5612. +
  5613. +static ssize_t full_scans_show(struct kobject *kobj,
  5614. +                  struct kobj_attribute *attr, char *buf)
  5615. +{
  5616. +   return sprintf(buf, "%llu\n", uksm_scan_round);
  5617. +}
  5618. +UKSM_ATTR_RO(full_scans);
  5619. +
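+/*
+ * The scanned-pages counter is kept scaled by a power of two to avoid u64
+ * overflow; when scaling has happened it is reported as "<value> * 2^<base>".
+ */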
  5620. +static ssize_t pages_scanned_show(struct kobject *kobj,
  5621. +                 struct kobj_attribute *attr, char *buf)
  5622. +{
  5623. +   unsigned long base = 0;
  5624. +   u64 delta, ret;
  5625. +
  5626. +   if (pages_scanned_stored) {
  5627. +       base = pages_scanned_base;
  5628. +       ret = pages_scanned_stored;
  5629. +       delta = uksm_pages_scanned >> base;
  5630. +       if (CAN_OVERFLOW_U64(ret, delta)) {
  5631. +           ret >>= 1;
  5632. +           delta >>= 1;
  5633. +           base++;
  5634. +           ret += delta;
  5635. +       }
  5636. +   } else {
  5637. +       ret = uksm_pages_scanned;
  5638. +   }
  5639. +
  5640. +   while (ret > ULONG_MAX) {
  5641. +       ret >>= 1;
  5642. +       base++;
  5643. +   }
  5644. +
  5645. +   if (base)
  5646. +       return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
  5647. +   else
  5648. +       return sprintf(buf, "%lu\n", (unsigned long)ret);
  5649. +}
  5650. +UKSM_ATTR_RO(pages_scanned);
  5651. +
  5652. +static ssize_t hash_strength_show(struct kobject *kobj,
  5653. +                 struct kobj_attribute *attr, char *buf)
  5654. +{
  5655. +   return sprintf(buf, "%lu\n", hash_strength);
  5656. +}
  5657. +UKSM_ATTR_RO(hash_strength);
  5658. +
  5659. +static ssize_t sleep_times_show(struct kobject *kobj,
  5660. +                 struct kobj_attribute *attr, char *buf)
  5661. +{
  5662. +   return sprintf(buf, "%llu\n", uksm_sleep_times);
  5663. +}
  5664. +UKSM_ATTR_RO(sleep_times);
  5665. +
  5666. +
  5667. +static struct attribute *uksm_attrs[] = {
  5668. +   &sleep_millisecs_attr.attr,
  5669. +   &scan_batch_pages_attr.attr,
  5670. +   &run_attr.attr,
  5671. +   &usr_spt_enabled_attr.attr,
  5672. +   &usr_spt_msg_attr.attr,
  5673. +   &usr_spt_flags_attr.attr,
  5674. +   &pages_shared_attr.attr,
  5675. +   &pages_sharing_attr.attr,
  5676. +   &pages_unshared_attr.attr,
  5677. +   &full_scans_attr.attr,
  5678. +   &min_scan_ratio_attr.attr,
  5679. +   &pages_scanned_attr.attr,
  5680. +   &hash_strength_attr.attr,
  5681. +   &sleep_times_attr.attr,
  5682. +   &thrash_threshold_attr.attr,
  5683. +   NULL,
  5684. +};
  5685. +
  5686. +static struct attribute_group uksm_attr_group = {
  5687. +   .attrs = uksm_attrs,
  5688. +   .name = "uksm",
  5689. +};
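+/*
+ * The attributes above are exposed under /sys/kernel/mm/uksm/ (the group is
+ * registered on mm_kobj with the name "uksm"). As a rough usage sketch,
+ * assuming sysfs is mounted at /sys and that UKSM_RUN_MERGE is 1 as in ksm.c:
+ *
+ *   echo 1  > /sys/kernel/mm/uksm/run              # let uksmd merge pages
+ *   echo 20 > /sys/kernel/mm/uksm/sleep_millisecs  # scan interval in ms
+ *   cat /sys/kernel/mm/uksm/pages_sharing          # merge statistics
+ */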
  5690. +#endif /* CONFIG_SYSFS */
  5691. +
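+/*
+ * Initialize every rung of the scan ladder: scan_ratio grows geometrically
+ * from uksm_min_scan_ratio by a factor of uksm_scan_ratio_delta per rung.
+ */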
  5692. +static inline void init_scan_ladder(void)
  5693. +{
  5694. +   int i;
  5695. +   unsigned long mul = 1;
  5696. +
  5697. +   for (i = 0; i < uksm_scan_ladder_size; i++,
  5698. +         mul *= uksm_scan_ratio_delta) {
  5699. +
  5700. +       uksm_scan_ladder[i].scan_ratio = uksm_min_scan_ratio * mul;
  5701. +       INIT_LIST_HEAD(&uksm_scan_ladder[i].vma_list);
  5702. +       uksm_scan_ladder[i].vma_num = 0;
  5703. +       uksm_scan_ladder[i].fully_scanned_slots = 0;
  5704. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
  5705. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_BUSY_SEARCHED;
  5706. +   }
  5707. +
  5708. +   cal_ladder_pages_to_scan(uksm_scan_batch_pages);
  5709. +}
  5710. +
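+/*
+ * Calibrate memcmp_cost: time full-strength hashing against memcmp() on two
+ * pages that differ only in their last byte, and record the cost of a whole
+ * page comparison relative to the per-sample hashing cost.
+ */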
  5711. +static inline int cal_positive_negative_costs(void)
  5712. +{
  5713. +   struct page *p1, *p2;
  5714. +   unsigned char *addr1, *addr2;
  5715. +   unsigned long i, time_start, hash_cost;
  5716. +   unsigned long loopnum = 0;
  5717. +
  5718. +   /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
  5719. +   volatile u32 hash;
  5720. +   volatile int ret;
  5721. +
  5722. +   p1 = alloc_page(GFP_KERNEL);
  5723. +   if (!p1)
  5724. +       return -ENOMEM;
  5725. +
  5726. +   p2 = alloc_page(GFP_KERNEL);
  5727. +   if (!p2) {
  5728. +       __free_page(p1);
+       return -ENOMEM;
+   }
  5729. +
  5730. +   addr1 = kmap_atomic(p1, KM_USER0);
  5731. +   addr2 = kmap_atomic(p2, KM_USER1);
  5732. +   memset(addr1, random32(), PAGE_SIZE);
  5733. +   memcpy(addr2, addr1, PAGE_SIZE);
  5734. +
  5735. +   /* make sure that the two pages differ in last byte */
  5736. +   addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
  5737. +   kunmap_atomic(addr2, KM_USER1);
  5738. +   kunmap_atomic(addr1, KM_USER0);
  5739. +
  5740. +   time_start = jiffies;
  5741. +   while (jiffies - time_start < 100) {
  5742. +       for (i = 0; i < 100; i++)
  5743. +           hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
  5744. +       loopnum += 100;
  5745. +   }
  5746. +   hash_cost = (jiffies - time_start);
  5747. +
  5748. +   time_start = jiffies;
  5749. +   for (i = 0; i < loopnum; i++)
  5750. +       ret = pages_identical(p1, p2);
  5751. +   memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
  5752. +   memcmp_cost /= hash_cost;
  5753. +   printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
  5754. +            "hash=%u cmp_ret=%d.\n",
  5755. +          memcmp_cost, hash, ret);
  5756. +
  5757. +   __free_page(p1);
  5758. +   __free_page(p2);
  5759. +   return 0;
  5760. +}
  5761. +
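+/*
+ * Precompute the hash of an all-zero page at every hash strength so that
+ * full zero pages can be recognized at whatever strength is in use.
+ */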
  5762. +static int init_zeropage_hash_table(void)
  5763. +{
  5764. +   struct page *page;
  5765. +   char *addr;
  5766. +   int i;
  5767. +
  5768. +   page = alloc_page(GFP_KERNEL);
  5769. +   if (!page)
  5770. +       return -ENOMEM;
  5771. +
  5772. +   addr = kmap_atomic(page, KM_USER0);
  5773. +   memset(addr, 0, PAGE_SIZE);
  5774. +   kunmap_atomic(addr, KM_USER0);
  5775. +
  5776. +   zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
  5777. +       GFP_KERNEL);
  5778. +   if (!zero_hash_table) {
  5779. +       __free_page(page);
+       return -ENOMEM;
+   }
  5780. +
  5781. +   for (i = 0; i < HASH_STRENGTH_MAX; i++)
  5782. +       zero_hash_table[i] = page_hash(page, i, 0);
  5783. +
  5784. +   __free_page(page);
  5785. +
  5786. +   return 0;
  5787. +}
  5788. +
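+/*
+ * Build random_nums as a random permutation of [0, HASH_STRENGTH_FULL)
+ * using a Fisher-Yates shuffle (it drives which parts of a page are sampled
+ * at reduced hash strength), reset rshash_state, and run the hash/memcmp
+ * cost calibration.
+ */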
  5789. +static inline int init_random_sampling(void)
  5790. +{
  5791. +   unsigned long i;
  5792. +   random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
  5793. +   if (!random_nums)
  5794. +       return -ENOMEM;
  5795. +
  5796. +   for (i = 0; i < HASH_STRENGTH_FULL; i++)
  5797. +       random_nums[i] = i;
  5798. +
  5799. +   for (i = 0; i < HASH_STRENGTH_FULL; i++) {
  5800. +       unsigned long rand_range, swap_index, tmp;
  5801. +
  5802. +       rand_range = HASH_STRENGTH_FULL - i;
  5803. +       swap_index = i + random32() % rand_range;
  5804. +       tmp = random_nums[i];
  5805. +       random_nums[i] =  random_nums[swap_index];
  5806. +       random_nums[swap_index] = tmp;
  5807. +   }
  5808. +
  5809. +   rshash_state.state = RSHASH_NEW;
  5810. +   rshash_state.below_count = 0;
  5811. +   rshash_state.lookup_window_index = 0;
  5812. +
  5813. +   return cal_positive_negative_costs();
  5814. +}
  5815. +
  5816. +static int __init uksm_slab_init(void)
  5817. +{
  5818. +   rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
  5819. +   if (!rmap_item_cache)
  5820. +       goto out;
  5821. +
  5822. +   stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
  5823. +   if (!stable_node_cache)
  5824. +       goto out_free1;
  5825. +
  5826. +   node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
  5827. +   if (!node_vma_cache)
  5828. +       goto out_free2;
  5829. +
  5830. +   vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
  5831. +   if (!vma_slot_cache)
  5832. +       goto out_free3;
  5833. +
  5834. +   tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
  5835. +   if (!tree_node_cache)
  5836. +       goto out_free4;
  5837. +
  5838. +   return 0;
  5839. +
  5840. +out_free4:
  5841. +   kmem_cache_destroy(vma_slot_cache);
  5842. +out_free3:
  5843. +   kmem_cache_destroy(node_vma_cache);
  5844. +out_free2:
  5845. +   kmem_cache_destroy(stable_node_cache);
  5846. +out_free1:
  5847. +   kmem_cache_destroy(rmap_item_cache);
  5848. +out:
  5849. +   return -ENOMEM;
  5850. +}
  5851. +
  5852. +static void __init uksm_slab_free(void)
  5853. +{
  5854. +   kmem_cache_destroy(stable_node_cache);
  5855. +   kmem_cache_destroy(rmap_item_cache);
  5856. +   kmem_cache_destroy(node_vma_cache);
  5857. +   kmem_cache_destroy(vma_slot_cache);
  5858. +   kmem_cache_destroy(tree_node_cache);
  5859. +}
  5860. +
  5861. +/* Common interface shared with KSM, but the behaviour here differs. */
  5862. +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
  5863. +       unsigned long end, int advice, unsigned long *vm_flags)
  5864. +{
  5865. +   int err;
  5866. +
  5867. +   switch (advice) {
  5868. +   case MADV_MERGEABLE:
  5869. +       return 0;       /* just ignore the advice */
  5870. +
  5871. +   case MADV_UNMERGEABLE:
  5872. +       if (!(*vm_flags & VM_MERGEABLE))
  5873. +           return 0;       /* just ignore the advice */
  5874. +
  5875. +       if (vma->anon_vma) {
  5876. +           err = unmerge_uksm_pages(vma, start, end);
  5877. +           if (err)
  5878. +               return err;
  5879. +       }
  5880. +
  5881. +       uksm_remove_vma(vma);
  5882. +       *vm_flags &= ~VM_MERGEABLE;
  5883. +       break;
  5884. +   }
  5885. +
  5886. +   return 0;
  5887. +}
  5888. +
  5889. +/* Common interface shared with KSM; identical to the KSM version. */
  5890. +struct page *ksm_does_need_to_copy(struct page *page,
  5891. +           struct vm_area_struct *vma, unsigned long address)
  5892. +{
  5893. +   struct page *new_page;
  5894. +
  5895. +   new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  5896. +   if (new_page) {
  5897. +       copy_user_highpage(new_page, page, address, vma);
  5898. +
  5899. +       SetPageDirty(new_page);
  5900. +       __SetPageUptodate(new_page);
  5901. +       SetPageSwapBacked(new_page);
  5902. +       __set_page_locked(new_page);
  5903. +
  5904. +       if (page_evictable(new_page, vma))
  5905. +           lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
  5906. +       else
  5907. +           add_page_to_unevictable_list(new_page);
  5908. +   }
  5909. +
  5910. +   return new_page;
  5911. +}
  5912. +
  5913. +static int __init uksm_init(void)
  5914. +{
  5915. +   struct task_struct *uksm_thread;
  5916. +   int err;
  5917. +   unsigned int sr = uksm_min_scan_ratio;
  5918. +
  5919. +   uksm_sleep_jiffies = msecs_to_jiffies(10);
  5920. +
  5921. +   uksm_scan_ladder_size = 1;
  5922. +   while (sr < UKSM_SCAN_RATIO_MAX) {
  5923. +       sr *= uksm_scan_ratio_delta;
  5924. +       uksm_scan_ladder_size++;
  5925. +   }
  5926. +   uksm_scan_ladder = kzalloc(sizeof(struct scan_rung) *
  5927. +                 uksm_scan_ladder_size, GFP_KERNEL);
  5928. +   if (!uksm_scan_ladder) {
  5929. +       printk(KERN_ERR "UKSM: scan ladder allocation failed, size=%d\n",
  5930. +              uksm_scan_ladder_size);
  5931. +       err = -ENOMEM;
  5932. +       goto out;
  5933. +   }
  5934. +   init_scan_ladder();
  5935. +
  5936. +   INIT_RADIX_TREE(&uksm_vma_tree, GFP_KERNEL);
  5937. +
  5938. +   err = init_random_sampling();
  5939. +   if (err)
  5940. +       goto out_free2;
  5941. +
  5942. +   err = uksm_slab_init();
  5943. +   if (err)
  5944. +       goto out_free1;
  5945. +
  5946. +   err = init_zeropage_hash_table();
  5947. +   if (err)
  5948. +       goto out_free0;
  5949. +
  5950. +   uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
  5951. +   if (IS_ERR(uksm_thread)) {
  5952. +       printk(KERN_ERR "uksm: creating kthread failed\n");
  5953. +       err = PTR_ERR(uksm_thread);
  5954. +       goto out_free;
  5955. +   }
  5956. +
  5957. +#ifdef CONFIG_SYSFS
  5958. +   err = sysfs_create_group(mm_kobj, &uksm_attr_group);
  5959. +   if (err) {
  5960. +       printk(KERN_ERR "uksm: register sysfs failed\n");
  5961. +       kthread_stop(uksm_thread);
  5962. +       goto out_free;
  5963. +   }
  5964. +#else
  5965. +   uksm_run = UKSM_RUN_MERGE;  /* no way for user to start it */
  5966. +
  5967. +#endif /* CONFIG_SYSFS */
  5968. +
  5969. +#ifdef CONFIG_MEMORY_HOTREMOVE
  5970. +   /*
  5971. +    * Choose a high priority since the callback takes uksm_thread_mutex:
  5972. +    * later callbacks could only be taking locks which nest within that.
  5973. +    */
  5974. +   hotplug_memory_notifier(uksm_memory_callback, 100);
  5975. +#endif
  5976. +   return 0;
  5977. +
  5978. +out_free:
  5979. +   kfree(zero_hash_table);
  5980. +out_free0:
  5981. +   uksm_slab_free();
  5982. +out_free1:
  5983. +   kfree(random_nums);
  5984. +out_free2:
  5985. +   kfree(uksm_scan_ladder);
  5986. +out:
  5987. +   return err;
  5988. +}
  5989. +
  5990. +#ifdef MODULE
  5991. +module_init(uksm_init)
  5992. +#else
  5993. +late_initcall(uksm_init);
  5994. +#endif
  5995. +