
uksm-0.1.2.0-for-v3.6.patch

By: ImNtReal on Oct 15th, 2012  |  syntax: Diff  |  size: 144.12 KB
  1. diff -Naur linux-3.6-orig linux-3.6
  2. diff -Naur linux-3.6-orig/mm/uksm.c linux-3.6/mm/uksm.c
  3. --- linux-3.6-orig/mm/uksm.c    1969-12-31 19:00:00.000000000 -0500
  4. +++ linux-3.6/mm/uksm.c 2012-10-15 11:28:33.550230252 -0400
  5. @@ -0,0 +1,5616 @@
  6. +/*
  7. + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
  8. + *
  9. + * This is an improvement upon KSM. Some basic data structures and routines
  10. + * are borrowed from ksm.c .
  11. + *
  12. + * Its new features:
  13. + * 1. Full system scan:
  14. + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
  15. + *      interaction to submit a memory area to KSM is no longer needed.
  16. + *
  17. + * 2. Rich area detection:
  18. + *      It automatically detects rich areas containing abundant duplicated
  19. + *      pages. Rich areas are given a full scan speed. Poor areas are
  20. + *      sampled at a reasonable speed with very low CPU consumption.
  21. + *
  22. + * 3. Ultra Per-page scan speed improvement:
  23. + *      A new hash algorithm is proposed. As a result, on a machine with a
  24. + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
  25. + *      can scan memory areas that do not contain duplicated pages at a speed of
  26. + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
  27. + *      477MB/sec ~ 923MB/sec.
  28. + *
  29. + * 4. Thrashing area avoidance:
  30. + *      Thrashing areas (VMAs that have frequent KSM page break-outs) can be
  31. + *      filtered out. My benchmark shows it's more efficient than KSM's per-page
  32. + *      hash-value-based volatile page detection.
  33. + *
  34. + *
  35. + * 5. Misc changes upon KSM:
  36. + *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
  37. + *        comparison. It's much faster than the default C version on x86.
  38. + *      * rmap_item now has a struct page * member to loosely cache an
  39. + *        address-->page mapping, which avoids many costly calls to
  40. + *        follow_page().
  41. + *      * The VMA creation/exit procedures are hooked to keep Ultra KSM informed.
  42. + *      * try_to_merge_two_pages() can now revert a pte if it fails. No break_
  43. + *        ksm is needed for this case.
  44. + *
  45. + * 6. Full Zero Page consideration (contributed by Figo Zhang)
  46. + *    Now uksmd considers full zero pages as special pages and merges them into
  47. + *    a special unswappable uksm zero page.
  48. + */
  49. +
  50. +#include <linux/errno.h>
  51. +#include <linux/mm.h>
  52. +#include <linux/fs.h>
  53. +#include <linux/mman.h>
  54. +#include <linux/sched.h>
  55. +#include <linux/rwsem.h>
  56. +#include <linux/pagemap.h>
  57. +#include <linux/rmap.h>
  58. +#include <linux/spinlock.h>
  59. +#include <linux/jhash.h>
  60. +#include <linux/delay.h>
  61. +#include <linux/kthread.h>
  62. +#include <linux/wait.h>
  63. +#include <linux/slab.h>
  64. +#include <linux/rbtree.h>
  65. +#include <linux/memory.h>
  66. +#include <linux/mmu_notifier.h>
  67. +#include <linux/swap.h>
  68. +#include <linux/ksm.h>
  69. +#include <linux/crypto.h>
  70. +#include <linux/scatterlist.h>
  71. +#include <crypto/hash.h>
  72. +#include <linux/random.h>
  73. +#include <linux/math64.h>
  74. +#include <linux/gcd.h>
  75. +#include <linux/freezer.h>
  76. +#include <linux/sradix-tree.h>
  77. +
  78. +#include <asm/tlbflush.h>
  79. +#include "internal.h"
  80. +
  81. +#ifdef CONFIG_X86
  82. +#undef memcmp
  83. +
  84. +#ifdef CONFIG_X86_32
  85. +#define memcmp memcmpx86_32
  86. +/*
  87. + * Compare the 4-byte-aligned addresses s1 and s2, with length n bytes
  88. + */
  89. +int memcmpx86_32(void *s1, void *s2, size_t n)
  90. +{
  91. +       size_t num = n / 4;
  92. +       register int res;
  93. +
  94. +       __asm__ __volatile__
  95. +       (
  96. +        "testl %3,%3\n\t"
  97. +        "repe; cmpsd\n\t"
  98. +        "je        1f\n\t"
  99. +        "sbbl      %0,%0\n\t"
  100. +        "orl       $1,%0\n"
  101. +        "1:"
  102. +        : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
  103. +        : "0" (0)
  104. +        : "cc");
  105. +
  106. +       return res;
  107. +}
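/*
 * For illustration only (not part of the patch): a portable C sketch of what
 * the assembly above computes. "repe; cmpsd" walks the two buffers one 32-bit
 * word at a time, and the sbbl/orl pair turns the final borrow into a
 * negative/zero/positive result, so pages are ordered by their first
 * differing u32 word (as unsigned little-endian values) rather than byte by
 * byte as the standard memcmp would.
 */
static inline int memcmp_u32_sketch(const void *s1, const void *s2, size_t n)
{
       const u32 *a = s1, *b = s2;
       size_t i;

       for (i = 0; i < n / 4; i++) {
               if (a[i] != b[i])
                       return a[i] < b[i] ? -1 : 1;
       }
       return 0;
}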
  108. +
  109. +/*
  110. + * Check whether the page is all zeros
  111. + */
  112. +static int is_full_zero(const void *s1, size_t len)
  113. +{
  114. +       unsigned char same;
  115. +
  116. +       len /= 4;
  117. +
  118. +       __asm__ __volatile__
  119. +       ("repe; scasl;"
  120. +        "sete %0"
  121. +        : "=qm" (same), "+D" (s1), "+c" (len)
  122. +        : "a" (0)
  123. +        : "cc");
  124. +
  125. +       return same;
  126. +}
  127. +
  128. +
  129. +#elif defined(CONFIG_X86_64)
  130. +#define memcmp memcmpx86_64
  131. +/*
  132. + * Compare the 8-byte-aligned addresses s1 and s2, with length n bytes
  133. + */
  134. +int memcmpx86_64(void *s1, void *s2, size_t n)
  135. +{
  136. +       size_t num = n / 8;
  137. +       register int res;
  138. +
  139. +       __asm__ __volatile__
  140. +       (
  141. +        "testq %q3,%q3\n\t"
  142. +        "repe; cmpsq\n\t"
  143. +        "je        1f\n\t"
  144. +        "sbbq      %q0,%q0\n\t"
  145. +        "orq       $1,%q0\n"
  146. +        "1:"
  147. +        : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
  148. +        : "0" (0)
  149. +        : "cc");
  150. +
  151. +       return res;
  152. +}
  153. +
  154. +static int is_full_zero(const void *s1, size_t len)
  155. +{
  156. +       unsigned char same;
  157. +
  158. +       len /= 8;
  159. +
  160. +       __asm__ __volatile__
  161. +       ("repe; scasq;"
  162. +        "sete %0"
  163. +        : "=qm" (same), "+D" (s1), "+c" (len)
  164. +        : "a" (0)
  165. +        : "cc");
  166. +
  167. +       return same;
  168. +}
  169. +
  170. +#endif
  171. +#else
  172. +static int is_full_zero(const void *s1, size_t len)
  173. +{
  174. +       const unsigned long *src = s1;
  175. +       int i;
  176. +
  177. +       len /= sizeof(*src);
  178. +
  179. +       for (i = 0; i < len; i++) {
  180. +               if (src[i])
  181. +                       return 0;
  182. +       }
  183. +
  184. +       return 1;
  185. +}
  186. +#endif
  187. +
  188. +#define U64_MAX                (~((u64)0))
  189. +#define UKSM_RUNG_ROUND_FINISHED  (1 << 0)
  190. +#define TIME_RATIO_SCALE       10000
  191. +
  192. +#define SLOT_TREE_NODE_SHIFT   8
  193. +#define SLOT_TREE_NODE_STORE_SIZE      (1UL << SLOT_TREE_NODE_SHIFT)
  194. +struct slot_tree_node {
  195. +       unsigned long size;
  196. +       struct sradix_tree_node snode;
  197. +       void *stores[SLOT_TREE_NODE_STORE_SIZE];
  198. +};
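/*
 * Illustrative note (not part of the patch): each slot_tree_node stores up to
 * 1 << SLOT_TREE_NODE_SHIFT == 256 children, and its "size" field caches the
 * total number of pages stored beneath it (maintained by the assign/rm/extend
 * callbacks below), so a random page index can later be mapped to a vma_slot
 * by a top-down descent (see slot_iter() below).
 */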
  199. +
  200. +static struct kmem_cache *slot_tree_node_cachep;
  201. +
  202. +static struct sradix_tree_node *slot_tree_node_alloc(void)
  203. +{
  204. +       struct slot_tree_node *p;
  205. +       p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL);
  206. +       if (!p)
  207. +               return NULL;
  208. +
  209. +       return &p->snode;
  210. +}
  211. +
  212. +static void slot_tree_node_free(struct sradix_tree_node *node)
  213. +{
  214. +       struct slot_tree_node *p;
  215. +
  216. +       p = container_of(node, struct slot_tree_node, snode);
  217. +       kmem_cache_free(slot_tree_node_cachep, p);
  218. +}
  219. +
  220. +static void slot_tree_node_extend(struct sradix_tree_node *parent,
  221. +                                 struct sradix_tree_node *child)
  222. +{
  223. +       struct slot_tree_node *p, *c;
  224. +
  225. +       p = container_of(parent, struct slot_tree_node, snode);
  226. +       c = container_of(child, struct slot_tree_node, snode);
  227. +
  228. +       p->size += c->size;
  229. +}
  230. +
  231. +void slot_tree_node_assign(struct sradix_tree_node *node,
  232. +                          unsigned index, void *item)
  233. +{
  234. +       struct vma_slot *slot = item;
  235. +       struct slot_tree_node *cur;
  236. +
  237. +       slot->snode = node;
  238. +       slot->sindex = index;
  239. +
  240. +       while (node) {
  241. +               cur = container_of(node, struct slot_tree_node, snode);
  242. +               cur->size += slot->pages;
  243. +               node = node->parent;
  244. +       }
  245. +}
  246. +
  247. +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
  248. +{
  249. +       struct vma_slot *slot;
  250. +       struct slot_tree_node *cur;
  251. +       unsigned long pages;
  252. +
  253. +       if (node->height == 1) {
  254. +               slot = node->stores[offset];
  255. +               pages = slot->pages;
  256. +       } else {
  257. +               cur = container_of(node->stores[offset],
  258. +                                  struct slot_tree_node, snode);
  259. +               pages = cur->size;
  260. +       }
  261. +
  262. +       while (node) {
  263. +               cur = container_of(node, struct slot_tree_node, snode);
  264. +               cur->size -= pages;
  265. +               node = node->parent;
  266. +       }
  267. +}
  268. +
  269. +unsigned long slot_iter_index;
  270. +int slot_iter(void *item,  unsigned long height)
  271. +{
  272. +       struct slot_tree_node *node;
  273. +       struct vma_slot *slot;
  274. +
  275. +       if (height == 1) {
  276. +               slot = item;
  277. +               if (slot_iter_index < slot->pages) {
  278. +                       /*in this one*/
  279. +                       return 1;
  280. +               } else {
  281. +                       slot_iter_index -= slot->pages;
  282. +                       return 0;
  283. +               }
  284. +
  285. +       } else {
  286. +               node = container_of(item, struct slot_tree_node, snode);
  287. +               if (slot_iter_index < node->size) {
  288. +                       /*in this one*/
  289. +                       return 1;
  290. +               } else {
  291. +                       slot_iter_index -= node->size;
  292. +                       return 0;
  293. +               }
  294. +       }
  295. +}
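/*
 * Illustrative walk-through (not part of the patch): slot_iter() is the
 * descent callback that picks the child covering the slot_iter_index-th page.
 * Suppose a node has three children holding 100, 50 and 200 pages and
 * slot_iter_index is 120: the first child is skipped (120 -= 100 leaves 20),
 * and since 20 < 50 the descent continues into the second child with
 * slot_iter_index == 20.
 */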
  296. +
  297. +
  298. +static inline void slot_tree_init_root(struct sradix_tree_root *root)
  299. +{
  300. +       init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
  301. +       root->alloc = slot_tree_node_alloc;
  302. +       root->free = slot_tree_node_free;
  303. +       root->extend = slot_tree_node_extend;
  304. +       root->assign = slot_tree_node_assign;
  305. +       root->rm = slot_tree_node_rm;
  306. +}
  307. +
  308. +void slot_tree_init(void)
  309. +{
  310. +       slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
  311. +                               sizeof(struct slot_tree_node), 0,
  312. +                               SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
  313. +                               NULL);
  314. +}
  315. +
  316. +
  317. +/* Each rung of this ladder is a list of VMAs having the same scan ratio */
  318. +struct scan_rung {
  319. +       //struct list_head scanned_list;
  320. +       struct sradix_tree_root vma_root;
  321. +       struct sradix_tree_root vma_root2;
  322. +
  323. +       struct vma_slot *current_scan;
  324. +       unsigned long current_offset;
  325. +
  326. +       /*
  327. +        * The initial value for current_offset; it should loop over
  328. +        * [0, step - 1] so that every slot gets its chance to be scanned.
  329. +        */
  330. +       unsigned long offset_init;
  331. +       unsigned long step; /* dynamic step for current_offset */
  332. +       unsigned int flags;
  333. +       unsigned long pages_to_scan;
  334. +       //unsigned long fully_scanned_slots;
  335. +       /*
  336. +        * a little bit tricky - if cpu_ratio > 0, then the value is the
  337. +        * cpu time ratio this rung can spend in every scan
  338. +        * period. If < 0, then it is the cpu time ratio relative to the
  339. +        * max cpu percentage the user specified. Both are in units of
  340. +        * 1/TIME_RATIO_SCALE
  341. +        */
  342. +       int cpu_ratio;
  343. +
  344. +       /*
  345. +        * How long will it take for all slots in this rung to be fully
  346. +        * scanned? If it's zero, we don't care about the cover time:
  347. +        * the rung is treated as fully scanned.
  348. +        */
  349. +       unsigned int cover_msecs;
  350. +       //unsigned long vma_num;
  351. +       //unsigned long pages; /* Sum of all slot's pages in rung */
  352. +};
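/*
 * Worked example of the cpu_ratio units (illustrative, not part of the
 * patch): with TIME_RATIO_SCALE == 10000, cpu_ratio == 40 lets a rung use
 * 40/10000 == 0.4% of CPU time per scan period, while cpu_ratio == -2500
 * means 2500/10000 == 25% of whatever maximum CPU percentage the user has
 * configured for uksmd.
 */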
  353. +
  354. +/**
  355. + * node of either the stable or unstable rbtree
  356. + *
  357. + */
  358. +struct tree_node {
  359. +       struct rb_node node; /* link in the main (un)stable rbtree */
  360. +       struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
  361. +       u32 hash;
  362. +       unsigned long count; /* TODO: merged with sub_root */
  363. +       struct list_head all_list; /* all tree nodes in stable/unstable tree */
  364. +};
  365. +
  366. +/**
  367. + * struct stable_node - node of the stable rbtree
  368. + * @node: rb node of this ksm page in the stable tree
  369. + * @hlist: hlist head of rmap_items using this ksm page
  370. + * @kpfn: page frame number of this ksm page
  371. + */
  372. +struct stable_node {
  373. +       struct rb_node node; /* link in sub-rbtree */
  374. +       struct tree_node *tree_node; /* its tree node root in the stable tree, NULL if it's in the hell list */
  375. +       struct hlist_head hlist;
  376. +       unsigned long kpfn;
  377. +       u32 hash_max; /* if ==0 then it's not been calculated yet */
  378. +       struct list_head all_list; /* in a list for all stable nodes */
  379. +};
  380. +
  381. +/**
  382. + * struct node_vma - groups rmap_items linked into the same stable
  383. + * node together.
  384. + */
  385. +struct node_vma {
  386. +       union {
  387. +               struct vma_slot *slot;
  388. +               unsigned long key;  /* slot is used as key sorted on hlist */
  389. +       };
  390. +       struct hlist_node hlist;
  391. +       struct hlist_head rmap_hlist;
  392. +       struct stable_node *head;
  393. +};
  394. +
  395. +/**
  396. + * struct rmap_item - reverse mapping item for virtual addresses
  397. + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
  398. + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
  399. + * @mm: the memory structure this rmap_item is pointing into
  400. + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
  401. + * @node: rb node of this rmap_item in the unstable tree
  402. + * @head: pointer to stable_node heading this list in the stable tree
  403. + * @hlist: link into hlist of rmap_items hanging off that stable_node
  404. + */
  405. +struct rmap_item {
  406. +       struct vma_slot *slot;
  407. +       struct page *page;
  408. +       unsigned long address;  /* + low bits used for flags below */
  409. +       unsigned long hash_round;
  410. +       unsigned long entry_index;
  411. +       union {
  412. +               struct {/* when in unstable tree */
  413. +                       struct rb_node node;
  414. +                       struct tree_node *tree_node;
  415. +                       u32 hash_max;
  416. +               };
  417. +               struct { /* when in stable tree */
  418. +                       struct node_vma *head;
  419. +                       struct hlist_node hlist;
  420. +                       struct anon_vma *anon_vma;
  421. +               };
  422. +       };
  423. +} __attribute__((aligned(4)));
  424. +
  425. +struct rmap_list_entry {
  426. +       union {
  427. +               struct rmap_item *item;
  428. +               unsigned long addr;
  429. +       };
  430. +       /* lowest bit is used for is_addr tag */
  431. +} __attribute__((aligned(4))); /* 4-byte aligned to fit into pages */
  432. +
  433. +
  434. +/* Basic data structure definition ends */
  435. +
  436. +
  437. +/*
  438. + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
  439. + * The flags use the low bits of rmap_item.address
  440. + */
  441. +#define UNSTABLE_FLAG  0x1
  442. +#define STABLE_FLAG    0x2
  443. +#define get_rmap_addr(x)       ((x)->address & PAGE_MASK)
  444. +
  445. +/*
  446. + * rmap_list_entry helpers
  447. + */
  448. +#define IS_ADDR_FLAG   1
  449. +#define is_addr(ptr)           ((unsigned long)(ptr) & IS_ADDR_FLAG)
  450. +#define set_is_addr(ptr)       ((ptr) |= IS_ADDR_FLAG)
  451. +#define get_clean_addr(ptr)    (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
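/*
 * Illustrative use of the tag helpers above (hypothetical, not from the
 * patch): user addresses stored in an rmap_list_entry are page aligned, so
 * bit 0 is free to distinguish "this entry holds a raw address" from "this
 * entry holds an rmap_item pointer".
 */
static inline unsigned long rmap_entry_tag_sketch(struct rmap_list_entry *entry,
                                                  unsigned long addr)
{
       entry->addr = addr;
       set_is_addr(entry->addr);       /* mark the entry as a raw address */

       BUG_ON(!is_addr(entry->addr));
       return get_clean_addr(entry->addr);     /* strip the tag back off */
}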
  452. +
  453. +
  454. +/*
  455. + * High speed caches for frequently allocated and freed structs
  456. + */
  457. +static struct kmem_cache *rmap_item_cache;
  458. +static struct kmem_cache *stable_node_cache;
  459. +static struct kmem_cache *node_vma_cache;
  460. +static struct kmem_cache *vma_slot_cache;
  461. +static struct kmem_cache *tree_node_cache;
  462. +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
  463. +               sizeof(struct __struct), __alignof__(struct __struct),\
  464. +               (__flags), NULL)
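/*
 * Hypothetical usage of the helper macro above (illustrative, not taken from
 * this hunk): creating one of the slab caches named with the "uksm_" prefix.
 *
 *     rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
 *     if (!rmap_item_cache)
 *             return -ENOMEM;
 */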
  465. +
  466. +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
  467. +#define SCAN_LADDER_SIZE 4
  468. +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
  469. +
  470. +/* The evaluation rounds uksmd has finished */
  471. +static unsigned long long uksm_eval_round = 1;
  472. +
  473. +/*
  474. + * We add 1 to this var when we decide that the whole unstable tree
  475. + * should be rebuilt.
  476. + */
  477. +static unsigned long uksm_hash_round = 1;
  478. +
  479. +/*
  480. + * How many times the whole of memory has been scanned.
  481. + */
  482. +static unsigned long long fully_scanned_round = 1;
  483. +
  484. +/* The total number of virtual pages of all vma slots */
  485. +static u64 uksm_pages_total;
  486. +
  487. +/* The number of pages that have been scanned since startup */
  488. +static u64 uksm_pages_scanned;
  489. +
  490. +static u64 scanned_virtual_pages;
  491. +
  492. +/* The number of pages that have been scanned since the last encode_benefit call */
  493. +static u64 uksm_pages_scanned_last;
  494. +
  495. +/* If the scanned number is too large, we encode it here */
  496. +static u64 pages_scanned_stored;
  497. +
  498. +static unsigned long pages_scanned_base;
  499. +
  500. +/* The number of nodes in the stable tree */
  501. +static unsigned long uksm_pages_shared;
  502. +
  503. +/* The number of page slots additionally sharing those nodes */
  504. +static unsigned long uksm_pages_sharing;
  505. +
  506. +/* The number of nodes in the unstable tree */
  507. +static unsigned long uksm_pages_unshared;
  508. +
  509. +/*
  510. + * Milliseconds ksmd should sleep between scans,
  511. + * >= 100ms to be consistent with
  512. + * scan_time_to_sleep_msec()
  513. + */
  514. +static unsigned int uksm_sleep_jiffies;
  515. +
  516. +/* The real value for the uksmd next sleep */
  517. +static unsigned int uksm_sleep_real;
  518. +
  519. +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
  520. +static unsigned int uksm_sleep_saved;
  521. +
  522. +/* Max percentage of cpu utilization ksmd can take to scan in one batch */
  523. +static unsigned int uksm_max_cpu_percentage;
  524. +
  525. +static int uksm_cpu_governor;
  526. +
  527. +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
  528. +
  529. +struct uksm_cpu_preset_s {
  530. +       int cpu_ratio[SCAN_LADDER_SIZE];
  531. +       unsigned int cover_msecs[SCAN_LADDER_SIZE];
  532. +       unsigned int max_cpu; /* percentage */
  533. +};
  534. +
  535. +struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
  536. +       { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
  537. +       { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
  538. +       { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
  539. +       { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
  540. +};
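/*
 * Reading the preset table above (illustrative note, not part of the patch):
 * row 0 (presumably the "full" governor, matching uksm_cpu_governor_str[0])
 * gives its rungs cpu_ratio {20, 40, -2500, -10000}: the two slowest rungs
 * may use 0.2% and 0.4% of CPU time, while the two fastest may use 25% and
 * 100% of the 95% max_cpu budget. cover_msecs {1000, 500, 200, 50} are the
 * per-rung target cover times in milliseconds.
 */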
  541. +
  542. +/* The default value for uksm_ema_page_time if it's not initialized */
  543. +#define UKSM_PAGE_TIME_DEFAULT 500
  544. +
  545. +/* Cost to scan one page, as an exponential moving average, in nsecs */
  546. +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
  547. +
  548. +/* The exponential moving average alpha weight, in percentage. */
  549. +#define EMA_ALPHA      20
  550. +
  551. +/*
  552. + * The threshold used to filter out thrashing areas.
  553. + * If it == 0, filtering is disabled; otherwise it's the percentage upper bound
  554. + * on the thrashing ratio of an area. Any area with a bigger thrashing ratio
  555. + * will be considered as having a zero duplication ratio.
  556. + */
  557. +static unsigned int uksm_thrash_threshold = 50;
  558. +
  559. +/* What dedup ratio is considered to be abundant */
  560. +static unsigned int uksm_abundant_threshold = 10;
  561. +
  562. +/* All slots having merged pages in this eval round. */
  563. +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
  564. +
  565. +/* How many times the ksmd has slept since startup */
  566. +static unsigned long long uksm_sleep_times;
  567. +
  568. +#define UKSM_RUN_STOP  0
  569. +#define UKSM_RUN_MERGE 1
  570. +static unsigned int uksm_run = 1;
  571. +
  572. +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
  573. +static DEFINE_MUTEX(uksm_thread_mutex);
  574. +
  575. +/*
  576. + * List vma_slot_new is for newly created vma_slots waiting to be added by
  577. + * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
  578. + * vma_slot_noadd. vma_slot_del is the list for vma_slots whose corresponding
  579. + * VMA has been removed/freed.
  580. + */
  581. +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
  582. +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
  583. +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
  584. +static DEFINE_SPINLOCK(vma_slot_list_lock);
  585. +
  586. +/* The unstable tree heads */
  587. +static struct rb_root root_unstable_tree = RB_ROOT;
  588. +
  589. +/*
  590. + * All tree_nodes are in a list to be freed at once when unstable tree is
  591. + * freed after each scan round.
  592. + */
  593. +static struct list_head unstable_tree_node_list =
  594. +                               LIST_HEAD_INIT(unstable_tree_node_list);
  595. +
  596. +/* List contains all stable nodes */
  597. +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
  598. +
  599. +/*
  600. + * When the hash strength is changed, the stable tree must be delta-hashed and
  601. + * re-structured. We use the two sets of structs below to speed up the
  602. + * re-structuring of the stable tree.
  603. + */
  604. +static struct list_head
  605. +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
  606. +                           LIST_HEAD_INIT(stable_tree_node_list[1])};
  607. +
  608. +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
  609. +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
  610. +static struct rb_root *root_stable_treep = &root_stable_tree[0];
  611. +static unsigned long stable_tree_index;
  612. +
  613. +/* The hash strength needed to hash a full page */
  614. +#define HASH_STRENGTH_FULL             (PAGE_SIZE / sizeof(u32))
  615. +
  616. +/* The hash strength needed for loop-back hashing */
  617. +#define HASH_STRENGTH_MAX              (HASH_STRENGTH_FULL + 10)
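/*
 * Concretely (illustrative note): with 4KB pages, HASH_STRENGTH_FULL is
 * 4096 / sizeof(u32) == 1024 sampled positions, and HASH_STRENGTH_MAX is 1034.
 */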
  618. +
  619. +/* The random offsets in a page */
  620. +static u32 *random_nums;
  621. +
  622. +/* The hash strength */
  623. +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
  624. +
  625. +/* The delta value each time the hash strength increases or decreases */
  626. +static unsigned long hash_strength_delta;
  627. +#define HASH_STRENGTH_DELTA_MAX        5
  628. +
  629. +/* The time we have saved due to random_sample_hash */
  630. +static u64 rshash_pos;
  631. +
  632. +/* The time we have wasted due to hash collision */
  633. +static u64 rshash_neg;
  634. +
  635. +struct uksm_benefit {
  636. +       u64 pos;
  637. +       u64 neg;
  638. +       u64 scanned;
  639. +       unsigned long base;
  640. +} benefit;
  641. +
  642. +/*
  643. + * The relative cost of memcmp, compared to 1 time unit of random sample
  644. + * hash; this value is measured when the ksm module is initialized
  645. + */
  646. +static unsigned long memcmp_cost;
  647. +
  648. +static unsigned long  rshash_neg_cont_zero;
  649. +static unsigned long  rshash_cont_obscure;
  650. +
  651. +/* The possible states of hash strength adjustment heuristic */
  652. +enum rshash_states {
  653. +               RSHASH_STILL,
  654. +               RSHASH_TRYUP,
  655. +               RSHASH_TRYDOWN,
  656. +               RSHASH_NEW,
  657. +               RSHASH_PRE_STILL,
  658. +};
  659. +
  660. +/* The possible direction we are about to adjust hash strength */
  661. +enum rshash_direct {
  662. +       GO_UP,
  663. +       GO_DOWN,
  664. +       OBSCURE,
  665. +       STILL,
  666. +};
  667. +
  668. +/* random sampling hash state machine */
  669. +static struct {
  670. +       enum rshash_states state;
  671. +       enum rshash_direct pre_direct;
  672. +       u8 below_count;
  673. +       /* Keep a lookup window of size 5; if above_count/below_count > 3
  674. +        * in this window, we stop trying.
  675. +        */
  676. +       u8 lookup_window_index;
  677. +       u64 stable_benefit;
  678. +       unsigned long turn_point_down;
  679. +       unsigned long turn_benefit_down;
  680. +       unsigned long turn_point_up;
  681. +       unsigned long turn_benefit_up;
  682. +       unsigned long stable_point;
  683. +} rshash_state;
  684. +
  685. +/* zero page hash table, hash_strength in [0, HASH_STRENGTH_MAX] */
  686. +static u32 *zero_hash_table;
  687. +
  688. +static inline struct node_vma *alloc_node_vma(void)
  689. +{
  690. +       struct node_vma *node_vma;
  691. +       node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
  692. +       if (node_vma) {
  693. +               INIT_HLIST_HEAD(&node_vma->rmap_hlist);
  694. +               INIT_HLIST_NODE(&node_vma->hlist);
  695. +       }
  696. +       return node_vma;
  697. +}
  698. +
  699. +static inline void free_node_vma(struct node_vma *node_vma)
  700. +{
  701. +       kmem_cache_free(node_vma_cache, node_vma);
  702. +}
  703. +
  704. +
  705. +static inline struct vma_slot *alloc_vma_slot(void)
  706. +{
  707. +       struct vma_slot *slot;
  708. +
  709. +       /*
  710. +        * In case uksm has not been initialized by now.
  711. +        * Oops, we need to consider the call site of uksm_init() in the future.
  712. +        */
  713. +       if (!vma_slot_cache)
  714. +               return NULL;
  715. +
  716. +       slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
  717. +       if (slot) {
  718. +               INIT_LIST_HEAD(&slot->slot_list);
  719. +               INIT_LIST_HEAD(&slot->dedup_list);
  720. +               slot->flags |= UKSM_SLOT_NEED_RERAND;
  721. +       }
  722. +       return slot;
  723. +}
  724. +
  725. +static inline void free_vma_slot(struct vma_slot *vma_slot)
  726. +{
  727. +       kmem_cache_free(vma_slot_cache, vma_slot);
  728. +}
  729. +
  730. +
  731. +
  732. +static inline struct rmap_item *alloc_rmap_item(void)
  733. +{
  734. +       struct rmap_item *rmap_item;
  735. +
  736. +       rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
  737. +       if (rmap_item) {
  738. +               /* BUG if the lowest bit is not clear; it's reserved for flag use */
  739. +               BUG_ON(is_addr(rmap_item));
  740. +       }
  741. +       return rmap_item;
  742. +}
  743. +
  744. +static inline void free_rmap_item(struct rmap_item *rmap_item)
  745. +{
  746. +       rmap_item->slot = NULL; /* debug safety */
  747. +       kmem_cache_free(rmap_item_cache, rmap_item);
  748. +}
  749. +
  750. +static inline struct stable_node *alloc_stable_node(void)
  751. +{
  752. +       struct stable_node *node;
  753. +       node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
  754. +       if (!node)
  755. +               return NULL;
  756. +
  757. +       INIT_HLIST_HEAD(&node->hlist);
  758. +       list_add(&node->all_list, &stable_node_list);
  759. +       return node;
  760. +}
  761. +
  762. +static inline void free_stable_node(struct stable_node *stable_node)
  763. +{
  764. +       list_del(&stable_node->all_list);
  765. +       kmem_cache_free(stable_node_cache, stable_node);
  766. +}
  767. +
  768. +static inline struct tree_node *alloc_tree_node(struct list_head *list)
  769. +{
  770. +       struct tree_node *node;
  771. +       node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
  772. +       if (!node)
  773. +               return NULL;
  774. +
  775. +       list_add(&node->all_list, list);
  776. +       return node;
  777. +}
  778. +
  779. +static inline void free_tree_node(struct tree_node *node)
  780. +{
  781. +       list_del(&node->all_list);
  782. +       kmem_cache_free(tree_node_cache, node);
  783. +}
  784. +
  785. +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
  786. +{
  787. +       struct anon_vma *anon_vma = rmap_item->anon_vma;
  788. +
  789. +       put_anon_vma(anon_vma);
  790. +}
  791. +
  792. +
  793. +/**
  794. + * Remove a stable node from the stable tree; it may be unlinked from its
  795. + * tree_node, and its parent tree_node may be removed if no other stable node is pending.
  796. + *
  797. + * @stable_node        The node to be removed
  798. + * @unlink_rb          Will this node be unlinked from the rbtree?
  799. + * @remove_tree_node   Will its tree_node be removed if empty?
  800. + */
  801. +static void remove_node_from_stable_tree(struct stable_node *stable_node,
  802. +                                        int unlink_rb,  int remove_tree_node)
  803. +{
  804. +       struct node_vma *node_vma;
  805. +       struct rmap_item *rmap_item;
  806. +       struct hlist_node *hlist, *rmap_hlist, *n;
  807. +
  808. +       if (!hlist_empty(&stable_node->hlist)) {
  809. +               hlist_for_each_entry_safe(node_vma, hlist, n,
  810. +                                         &stable_node->hlist, hlist) {
  811. +                       hlist_for_each_entry(rmap_item, rmap_hlist,
  812. +                                            &node_vma->rmap_hlist, hlist) {
  813. +                               uksm_pages_sharing--;
  814. +
  815. +                               uksm_drop_anon_vma(rmap_item);
  816. +                               rmap_item->address &= PAGE_MASK;
  817. +                       }
  818. +                       free_node_vma(node_vma);
  819. +                       cond_resched();
  820. +               }
  821. +
  822. +               /* the last one is counted as shared */
  823. +               uksm_pages_shared--;
  824. +               uksm_pages_sharing++;
  825. +       }
  826. +
  827. +       if (stable_node->tree_node && unlink_rb) {
  828. +               rb_erase(&stable_node->node,
  829. +                        &stable_node->tree_node->sub_root);
  830. +
  831. +               if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
  832. +                   remove_tree_node) {
  833. +                       rb_erase(&stable_node->tree_node->node,
  834. +                                root_stable_treep);
  835. +                       free_tree_node(stable_node->tree_node);
  836. +               } else {
  837. +                       stable_node->tree_node->count--;
  838. +               }
  839. +       }
  840. +
  841. +       free_stable_node(stable_node);
  842. +}
  843. +
  844. +
  845. +/*
  846. + * get_uksm_page: checks if the page indicated by the stable node
  847. + * is still its ksm page, despite having held no reference to it.
  848. + * In which case we can trust the content of the page, and it
  849. + * returns the gotten page; but if the page has now been zapped,
  850. + * remove the stale node from the stable tree and return NULL.
  851. + *
  852. + * You would expect the stable_node to hold a reference to the ksm page.
  853. + * But if it increments the page's count, swapping out has to wait for
  854. + * ksmd to come around again before it can free the page, which may take
  855. + * seconds or even minutes: much too unresponsive.  So instead we use a
  856. + * "keyhole reference": access to the ksm page from the stable node peeps
  857. + * out through its keyhole to see if that page still holds the right key,
  858. + * pointing back to this stable node.  This relies on freeing a PageAnon
  859. + * page to reset its page->mapping to NULL, and relies on no other use of
  860. + * a page to put something that might look like our key in page->mapping.
  861. + *
  862. + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
  863. + * but this is different - made simpler by uksm_thread_mutex being held, but
  864. + * interesting for assuming that no other use of the struct page could ever
  865. + * put our expected_mapping into page->mapping (or a field of the union which
  866. + * coincides with page->mapping).  The RCU calls are not for KSM at all, but
  867. + * to keep the page_count protocol described with page_cache_get_speculative.
  868. + *
  869. + * Note: it is possible that get_uksm_page() will return NULL one moment,
  870. + * then page the next, if the page is in between page_freeze_refs() and
  871. + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
  872. + * is on its way to being freed; but it is an anomaly to bear in mind.
  873. + *
  874. + * @unlink_rb:                 whether the removal of this node should first unlink it from
  875. + * its rbtree. stable_node_reinsert will prevent this when restructuring the
  876. + * node from its old tree.
  877. + *
  878. + * @remove_tree_node:  if this is the last one of its tree_node, will the
  879. + * tree_node be freed? If we are inserting a stable node, this tree_node may
  880. + * be reused, so don't free it.
  881. + */
  882. +static struct page *get_uksm_page(struct stable_node *stable_node,
  883. +                                int unlink_rb, int remove_tree_node)
  884. +{
  885. +       struct page *page;
  886. +       void *expected_mapping;
  887. +
  888. +       page = pfn_to_page(stable_node->kpfn);
  889. +       expected_mapping = (void *)stable_node +
  890. +                               (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
  891. +       rcu_read_lock();
  892. +       if (page->mapping != expected_mapping)
  893. +               goto stale;
  894. +       if (!get_page_unless_zero(page))
  895. +               goto stale;
  896. +       if (page->mapping != expected_mapping) {
  897. +               put_page(page);
  898. +               goto stale;
  899. +       }
  900. +       rcu_read_unlock();
  901. +       return page;
  902. +stale:
  903. +       rcu_read_unlock();
  904. +       remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
  905. +
  906. +       return NULL;
  907. +}
  908. +
  909. +/*
  910. + * Remove rmap_item from the stable or unstable tree.
  911. + * This function will clean its information from the stable/unstable tree.
  912. + */
  913. +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
  914. +{
  915. +       if (rmap_item->address & STABLE_FLAG) {
  916. +               struct stable_node *stable_node;
  917. +               struct node_vma *node_vma;
  918. +               struct page *page;
  919. +
  920. +               node_vma = rmap_item->head;
  921. +               stable_node = node_vma->head;
  922. +               page = get_uksm_page(stable_node, 1, 1);
  923. +               if (!page)
  924. +                       goto out;
  925. +
  926. +               /*
  927. +                * page lock is needed because it's racing with
  928. +                * try_to_unmap_ksm(), etc.
  929. +                */
  930. +               lock_page(page);
  931. +               hlist_del(&rmap_item->hlist);
  932. +
  933. +               if (hlist_empty(&node_vma->rmap_hlist)) {
  934. +                       hlist_del(&node_vma->hlist);
  935. +                       free_node_vma(node_vma);
  936. +               }
  937. +               unlock_page(page);
  938. +
  939. +               put_page(page);
  940. +               if (hlist_empty(&stable_node->hlist)) {
  941. +                       /* do NOT call remove_node_from_stable_tree() here;
  942. +                        * it's possible for a forked rmap_item not to be in
  943. +                        * the stable tree while the in-tree rmap_items were
  944. +                        * deleted.
  945. +                        */
  946. +                       uksm_pages_shared--;
  947. +               } else
  948. +                       uksm_pages_sharing--;
  949. +
  950. +
  951. +               uksm_drop_anon_vma(rmap_item);
  952. +       } else if (rmap_item->address & UNSTABLE_FLAG) {
  953. +               if (rmap_item->hash_round == uksm_hash_round) {
  954. +
  955. +                       rb_erase(&rmap_item->node,
  956. +                                &rmap_item->tree_node->sub_root);
  957. +                       if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
  958. +                               rb_erase(&rmap_item->tree_node->node,
  959. +                                        &root_unstable_tree);
  960. +
  961. +                               free_tree_node(rmap_item->tree_node);
  962. +                       } else
  963. +                               rmap_item->tree_node->count--;
  964. +               }
  965. +               uksm_pages_unshared--;
  966. +       }
  967. +
  968. +       rmap_item->address &= PAGE_MASK;
  969. +       rmap_item->hash_max = 0;
  970. +
  971. +out:
  972. +       cond_resched();         /* we're called from many long loops */
  973. +}
  974. +
  975. +static inline int slot_in_uksm(struct vma_slot *slot)
  976. +{
  977. +       return list_empty(&slot->slot_list);
  978. +}
  979. +
  980. +/*
  981. + * Test if the mm is exiting
  982. + */
  983. +static inline bool uksm_test_exit(struct mm_struct *mm)
  984. +{
  985. +       return atomic_read(&mm->mm_users) == 0;
  986. +}
  987. +
  988. +/**
  989. + * Need to do two things:
  990. + * 1. check if slot was moved to del list
  991. + * 2. make sure the mmap_sem is manipulated under valid vma.
  992. + *
  993. + * My concern here is that in some cases, this may make
  994. + * vma_slot_list_lock() waiters be serialized further by some
  995. + * sem->wait_lock; can this really be expensive?
  996. + *
  997. + *
  998. + * @return
  999. + * 0: if successfully locked mmap_sem
  1000. + * -ENOENT: this slot was moved to del list
  1001. + * -EBUSY: vma lock failed
  1002. + */
  1003. +static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
  1004. +{
  1005. +       struct vm_area_struct *vma;
  1006. +       struct mm_struct *mm;
  1007. +       struct rw_semaphore *sem;
  1008. +
  1009. +       spin_lock(&vma_slot_list_lock);
  1010. +
  1011. +       /* the slot_list was removed and re-inited from the new list when it entered
  1012. +        * the uksm list. If it's not empty now, it must have been moved to the del list
  1013. +        */
  1014. +       if (!slot_in_uksm(slot)) {
  1015. +               spin_unlock(&vma_slot_list_lock);
  1016. +               return -ENOENT;
  1017. +       }
  1018. +
  1019. +       BUG_ON(slot->pages != vma_pages(slot->vma));
  1020. +       /* Ok, vma still valid */
  1021. +       vma = slot->vma;
  1022. +       mm = vma->vm_mm;
  1023. +       sem = &mm->mmap_sem;
  1024. +
  1025. +       if (uksm_test_exit(mm)) {
  1026. +               spin_unlock(&vma_slot_list_lock);
  1027. +               return -ENOENT;
  1028. +       }
  1029. +
  1030. +       if (down_read_trylock(sem)) {
  1031. +               spin_unlock(&vma_slot_list_lock);
  1032. +               return 0;
  1033. +       }
  1034. +
  1035. +       spin_unlock(&vma_slot_list_lock);
  1036. +       return -EBUSY;
  1037. +}
  1038. +
  1039. +static inline unsigned long
  1040. +vma_page_address(struct page *page, struct vm_area_struct *vma)
  1041. +{
  1042. +       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  1043. +       unsigned long address;
  1044. +
  1045. +       address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  1046. +       if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
  1047. +               /* page should be within @vma mapping range */
  1048. +               return -EFAULT;
  1049. +       }
  1050. +       return address;
  1051. +}
  1052. +
  1053. +
  1054. +/* return 0 on success with the item's mmap_sem locked */
  1055. +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
  1056. +{
  1057. +       struct mm_struct *mm;
  1058. +       struct vma_slot *slot = item->slot;
  1059. +       int err = -EINVAL;
  1060. +
  1061. +       struct page *page;
  1062. +
  1063. +       /*
  1064. +        * try_down_read_slot_mmap_sem() returns non-zero if the slot
  1065. +        * has been removed by uksm_remove_vma().
  1066. +        */
  1067. +       if (try_down_read_slot_mmap_sem(slot))
  1068. +               return -EBUSY;
  1069. +
  1070. +       mm = slot->vma->vm_mm;
  1071. +
  1072. +       if (uksm_test_exit(mm))
  1073. +               goto failout_up;
  1074. +
  1075. +       page = item->page;
  1076. +       rcu_read_lock();
  1077. +       if (!get_page_unless_zero(page)) {
  1078. +               rcu_read_unlock();
  1079. +               goto failout_up;
  1080. +       }
  1081. +
  1082. +       /* No need to consider huge page here. */
  1083. +       if (item->slot->vma->anon_vma != page_anon_vma(page) ||
  1084. +           vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
  1085. +               /*
  1086. +                * TODO:
  1087. +                * should we release this item because of its stale page
  1088. +                * mapping?
  1089. +                */
  1090. +               put_page(page);
  1091. +               rcu_read_unlock();
  1092. +               goto failout_up;
  1093. +       }
  1094. +       rcu_read_unlock();
  1095. +       return 0;
  1096. +
  1097. +failout_up:
  1098. +       up_read(&mm->mmap_sem);
  1099. +       return err;
  1100. +}
  1101. +
  1102. +/*
  1103. + * What kind of VMA is considered?
  1104. + */
  1105. +static inline int vma_can_enter(struct vm_area_struct *vma)
  1106. +{
  1107. +       return uksm_flags_can_scan(vma->vm_flags);
  1108. +}
  1109. +
  1110. +/*
  1111. + * Called whenever a fresh new vma is created. A new vma_slot
  1112. + * is created and inserted into a global list. Must be called
  1113. + * after the vma is inserted into its mm.
  1114. + */
  1115. +void uksm_vma_add_new(struct vm_area_struct *vma)
  1116. +{
  1117. +       struct vma_slot *slot;
  1118. +
  1119. +       if (!vma_can_enter(vma)) {
  1120. +               vma->uksm_vma_slot = NULL;
  1121. +               return;
  1122. +       }
  1123. +
  1124. +       slot = alloc_vma_slot();
  1125. +       if (!slot) {
  1126. +               vma->uksm_vma_slot = NULL;
  1127. +               return;
  1128. +       }
  1129. +
  1130. +       vma->uksm_vma_slot = slot;
  1131. +       vma->vm_flags |= VM_MERGEABLE;
  1132. +       slot->vma = vma;
  1133. +       slot->mm = vma->vm_mm;
  1134. +       slot->ctime_j = jiffies;
  1135. +       slot->pages = vma_pages(vma);
  1136. +       spin_lock(&vma_slot_list_lock);
  1137. +       list_add_tail(&slot->slot_list, &vma_slot_new);
  1138. +       spin_unlock(&vma_slot_list_lock);
  1139. +}
  1140. +
  1141. +/*
  1142. + * Called after vma is unlinked from its mm
  1143. + */
  1144. +void uksm_remove_vma(struct vm_area_struct *vma)
  1145. +{
  1146. +       struct vma_slot *slot;
  1147. +
  1148. +       if (!vma->uksm_vma_slot)
  1149. +               return;
  1150. +
  1151. +       slot = vma->uksm_vma_slot;
  1152. +       spin_lock(&vma_slot_list_lock);
  1153. +       if (slot_in_uksm(slot)) {
  1154. +               /**
  1155. +                * This slot has been added by ksmd, so move it to the del list
  1156. +                * and wait for ksmd to free it.
  1157. +                */
  1158. +               list_add_tail(&slot->slot_list, &vma_slot_del);
  1159. +       } else {
  1160. +               /**
  1161. +                * It's still on the new list. It's OK to free the slot directly.
  1162. +                */
  1163. +               list_del(&slot->slot_list);
  1164. +               free_vma_slot(slot);
  1165. +       }
  1166. +       spin_unlock(&vma_slot_list_lock);
  1167. +       vma->uksm_vma_slot = NULL;
  1168. +}
  1169. +
  1170. +/*   32/3 < they < 32/2 */
  1171. +#define shiftl 8
  1172. +#define shiftr 12
  1173. +
  1174. +#define HASH_FROM_TO(from, to)                                 \
  1175. +for (index = from; index < to; index++) {              \
  1176. +       pos = random_nums[index];                       \
  1177. +       hash += key[pos];                               \
  1178. +       hash += (hash << shiftl);                       \
  1179. +       hash ^= (hash >> shiftr);                       \
  1180. +}
  1181. +
  1182. +
  1183. +#define HASH_FROM_DOWN_TO(from, to)                    \
  1184. +for (index = from - 1; index >= to; index--) {         \
  1185. +       hash ^= (hash >> shiftr);                       \
  1186. +       hash ^= (hash >> (shiftr*2));                   \
  1187. +       hash -= (hash << shiftl);                       \
  1188. +       hash += (hash << (shiftl*2));                   \
  1189. +       pos = random_nums[index];                       \
  1190. +       hash -= key[pos];                               \
  1191. +}
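/*
 * Note (illustrative, not part of the patch): HASH_FROM_DOWN_TO exactly
 * undoes HASH_FROM_TO, one sampled position at a time. The forward step is,
 * on 32-bit values,
 *
 *     hash += key[pos];
 *     hash += hash << 8;          (i.e. hash *= 257 mod 2^32)
 *     hash ^= hash >> 12;
 *
 * and the reverse step inverts each operation in reverse order:
 * "hash ^= hash >> 12; hash ^= hash >> 24" undoes the xor-shift, and
 * "hash -= hash << 8; hash += hash << 16" multiplies by -255 * 65537, the
 * inverse of 257 modulo 2^32 (since 257 * -255 * 65537 == 1 - 2^32), before
 * key[pos] is subtracted again. This invertibility is what lets delta_hash()
 * below lower the hash strength without rehashing the whole page.
 */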
  1192. +
  1193. +/*
  1194. + * The main random sample hash function.
  1195. + */
  1196. +static u32 random_sample_hash(void *addr, u32 hash_strength)
  1197. +{
  1198. +       u32 hash = 0xdeadbeef;
  1199. +       int index, pos, loop = hash_strength;
  1200. +       u32 *key = (u32 *)addr;
  1201. +
  1202. +       if (loop > HASH_STRENGTH_FULL)
  1203. +               loop = HASH_STRENGTH_FULL;
  1204. +
  1205. +       HASH_FROM_TO(0, loop);
  1206. +
  1207. +       if (hash_strength > HASH_STRENGTH_FULL) {
  1208. +               loop = hash_strength - HASH_STRENGTH_FULL;
  1209. +               HASH_FROM_TO(0, loop);
  1210. +       }
  1211. +
  1212. +       return hash;
  1213. +}
  1214. +
  1215. +
  1216. +/**
  1217. + * It's used when hash strength is adjusted
  1218. + *
  1219. + * @addr The page's virtual address
  1220. + * @from The original hash strength
  1221. + * @to   The hash strength changed to
  1222. + * @hash The hash value generated with the "from" hash strength
  1223. + *
  1224. + * return the hash value
  1225. + */
  1226. +static u32 delta_hash(void *addr, int from, int to, u32 hash)
  1227. +{
  1228. +       u32 *key = (u32 *)addr;
  1229. +       int index, pos; /* make sure they are int type */
  1230. +
  1231. +       if (to > from) {
  1232. +               if (from >= HASH_STRENGTH_FULL) {
  1233. +                       from -= HASH_STRENGTH_FULL;
  1234. +                       to -= HASH_STRENGTH_FULL;
  1235. +                       HASH_FROM_TO(from, to);
  1236. +               } else if (to <= HASH_STRENGTH_FULL) {
  1237. +                       HASH_FROM_TO(from, to);
  1238. +               } else {
  1239. +                       HASH_FROM_TO(from, HASH_STRENGTH_FULL);
  1240. +                       HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
  1241. +               }
  1242. +       } else {
  1243. +               if (from <= HASH_STRENGTH_FULL) {
  1244. +                       HASH_FROM_DOWN_TO(from, to);
  1245. +               } else if (to >= HASH_STRENGTH_FULL) {
  1246. +                       from -= HASH_STRENGTH_FULL;
  1247. +                       to -= HASH_STRENGTH_FULL;
  1248. +                       HASH_FROM_DOWN_TO(from, to);
  1249. +               } else {
  1250. +                       HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
  1251. +                       HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
  1252. +               }
  1253. +       }
  1254. +
  1255. +       return hash;
  1256. +}
  1257. +
  1258. +
  1259. +
  1260. +
  1261. +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
  1262. +
  1263. +/**
  1264. + *
  1265. + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
  1266. + * has finished.
  1267. + *
  1268. + * return 0 if no page has been scanned since last call, 1 otherwise.
  1269. + */
  1270. +static inline int encode_benefit(void)
  1271. +{
  1272. +       u64 scanned_delta, pos_delta, neg_delta;
  1273. +       unsigned long base = benefit.base;
  1274. +
  1275. +       scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
  1276. +
  1277. +       if (!scanned_delta)
  1278. +               return 0;
  1279. +
  1280. +       scanned_delta >>= base;
  1281. +       pos_delta = rshash_pos >> base;
  1282. +       neg_delta = rshash_neg >> base;
  1283. +
  1284. +       if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
  1285. +           CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
  1286. +           CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
  1287. +               benefit.scanned >>= 1;
  1288. +               benefit.neg >>= 1;
  1289. +               benefit.pos >>= 1;
  1290. +               benefit.base++;
  1291. +               scanned_delta >>= 1;
  1292. +               pos_delta >>= 1;
  1293. +               neg_delta >>= 1;
  1294. +       }
  1295. +
  1296. +       benefit.pos += pos_delta;
  1297. +       benefit.neg += neg_delta;
  1298. +       benefit.scanned += scanned_delta;
  1299. +
  1300. +       BUG_ON(!benefit.scanned);
  1301. +
  1302. +       rshash_pos = rshash_neg = 0;
  1303. +       uksm_pages_scanned_last = uksm_pages_scanned;
  1304. +
  1305. +       return 1;
  1306. +}
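/*
 * Worked example (illustrative, not part of the patch): the benefit counters
 * are kept scaled down by 2^base. If adding a delta would overflow a u64,
 * every accumulator and delta is halved and base is incremented, which
 * preserves the ratios between pos, neg and scanned that the heuristics care
 * about. For instance, with base == 2 a stored benefit.scanned of 100 stands
 * for roughly 400 pages actually scanned.
 */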
  1307. +
  1308. +static inline void reset_benefit(void)
  1309. +{
  1310. +       benefit.pos = 0;
  1311. +       benefit.neg = 0;
  1312. +       benefit.base = 0;
  1313. +       benefit.scanned = 0;
  1314. +}
  1315. +
  1316. +static inline void inc_rshash_pos(unsigned long delta)
  1317. +{
  1318. +       if (CAN_OVERFLOW_U64(rshash_pos, delta))
  1319. +               encode_benefit();
  1320. +
  1321. +       rshash_pos += delta;
  1322. +}
  1323. +
  1324. +static inline void inc_rshash_neg(unsigned long delta)
  1325. +{
  1326. +       if (CAN_OVERFLOW_U64(rshash_neg, delta))
  1327. +               encode_benefit();
  1328. +
  1329. +       rshash_neg += delta;
  1330. +}
  1331. +
  1332. +
  1333. +static inline u32 page_hash(struct page *page, unsigned long hash_strength,
  1334. +                           int cost_accounting)
  1335. +{
  1336. +       u32 val;
  1337. +       unsigned long delta;
  1338. +
  1339. +       void *addr = kmap_atomic(page);
  1340. +
  1341. +       val = random_sample_hash(addr, hash_strength);
  1342. +       kunmap_atomic(addr);
  1343. +
  1344. +       if (cost_accounting) {
  1345. +               if (HASH_STRENGTH_FULL > hash_strength)
  1346. +                       delta = HASH_STRENGTH_FULL - hash_strength;
  1347. +               else
  1348. +                       delta = 0;
  1349. +
  1350. +               inc_rshash_pos(delta);
  1351. +       }
  1352. +
  1353. +       return val;
  1354. +}
  1355. +
  1356. +static int memcmp_pages(struct page *page1, struct page *page2,
  1357. +                       int cost_accounting)
  1358. +{
  1359. +       char *addr1, *addr2;
  1360. +       int ret;
  1361. +
  1362. +       addr1 = kmap_atomic(page1);
  1363. +       addr2 = kmap_atomic(page2);
  1364. +       ret = memcmp(addr1, addr2, PAGE_SIZE);
  1365. +       kunmap_atomic(addr2);
  1366. +       kunmap_atomic(addr1);
  1367. +
  1368. +       if (cost_accounting)
  1369. +               inc_rshash_neg(memcmp_cost);
  1370. +
  1371. +       return ret;
  1372. +}
  1373. +
  1374. +static inline int pages_identical(struct page *page1, struct page *page2)
  1375. +{
  1376. +       return !memcmp_pages(page1, page2, 0);
  1377. +}
  1378. +
  1379. +static inline int is_page_full_zero(struct page *page)
  1380. +{
  1381. +       char *addr;
  1382. +       int ret;
  1383. +
  1384. +       addr = kmap_atomic(page);
  1385. +       ret = is_full_zero(addr, PAGE_SIZE);
  1386. +       kunmap_atomic(addr);
  1387. +
  1388. +       return ret;
  1389. +}
  1390. +
  1391. +static int write_protect_page(struct vm_area_struct *vma, struct page *page,
  1392. +                             pte_t *orig_pte, pte_t *old_pte)
  1393. +{
  1394. +       struct mm_struct *mm = vma->vm_mm;
  1395. +       unsigned long addr;
  1396. +       pte_t *ptep;
  1397. +       spinlock_t *ptl;
  1398. +       int swapped;
  1399. +       int err = -EFAULT;
  1400. +
  1401. +       addr = page_address_in_vma(page, vma);
  1402. +       if (addr == -EFAULT)
  1403. +               goto out;
  1404. +
  1405. +       BUG_ON(PageTransCompound(page));
  1406. +       ptep = page_check_address(page, mm, addr, &ptl, 0);
  1407. +       if (!ptep)
  1408. +               goto out;
  1409. +
  1410. +       if (old_pte)
  1411. +               *old_pte = *ptep;
  1412. +
  1413. +       if (pte_write(*ptep) || pte_dirty(*ptep)) {
  1414. +               pte_t entry;
  1415. +
  1416. +               swapped = PageSwapCache(page);
  1417. +               flush_cache_page(vma, addr, page_to_pfn(page));
  1418. +               /*
  1419. +                * Ok this is tricky: when get_user_pages_fast() runs it doesn't
  1420. +                * take any lock, therefore the check that we are going to make
  1421. +                * with the page count against the map count is racy and
  1422. +                * O_DIRECT can happen right after the check.
  1423. +                * So we clear the pte and flush the tlb before the check;
  1424. +                * this assures us that no O_DIRECT can happen after the check
  1425. +                * or in the middle of the check.
  1426. +                */
  1427. +               entry = ptep_clear_flush(vma, addr, ptep);
  1428. +               /*
  1429. +                * Check that no O_DIRECT or similar I/O is in progress on the
  1430. +                * page
  1431. +                */
  1432. +               if (page_mapcount(page) + 1 + swapped != page_count(page)) {
  1433. +                       set_pte_at(mm, addr, ptep, entry);
  1434. +                       goto out_unlock;
  1435. +               }
  1436. +               if (pte_dirty(entry))
  1437. +                       set_page_dirty(page);
  1438. +               entry = pte_mkclean(pte_wrprotect(entry));
  1439. +               set_pte_at_notify(mm, addr, ptep, entry);
  1440. +       }
  1441. +       *orig_pte = *ptep;
  1442. +       err = 0;
  1443. +
  1444. +out_unlock:
  1445. +       pte_unmap_unlock(ptep, ptl);
  1446. +out:
  1447. +       return err;
  1448. +}
  1449. +
  1450. +#define MERGE_ERR_PGERR                1 /* the page is invalid, cannot continue */
  1451. +#define MERGE_ERR_COLLI                2 /* there is a collision */
  1452. +#define MERGE_ERR_COLLI_MAX    3 /* collision at the max hash strength */
  1453. +#define MERGE_ERR_CHANGED      4 /* the page has changed since last hash */
  1454. +
  1455. +
  1456. +/**
  1457. + * replace_page - replace page in vma by new ksm page
  1458. + * @vma:      vma that holds the pte pointing to page
  1459. + * @page:     the page we are replacing by kpage
  1460. + * @kpage:    the ksm page we replace page by
  1461. + * @orig_pte: the original value of the pte
  1462. + *
  1463. + * Returns 0 on success, MERGE_ERR_PGERR on failure.
  1464. + */
  1465. +static int replace_page(struct vm_area_struct *vma, struct page *page,
  1466. +                       struct page *kpage, pte_t orig_pte)
  1467. +{
  1468. +       struct mm_struct *mm = vma->vm_mm;
  1469. +       pgd_t *pgd;
  1470. +       pud_t *pud;
  1471. +       pmd_t *pmd;
  1472. +       pte_t *ptep;
  1473. +       spinlock_t *ptl;
  1474. +       pte_t entry;
  1475. +
  1476. +       unsigned long addr;
  1477. +       int err = MERGE_ERR_PGERR;
  1478. +
  1479. +       addr = page_address_in_vma(page, vma);
  1480. +       if (addr == -EFAULT)
  1481. +               goto out;
  1482. +
  1483. +       pgd = pgd_offset(mm, addr);
  1484. +       if (!pgd_present(*pgd))
  1485. +               goto out;
  1486. +
  1487. +       pud = pud_offset(pgd, addr);
  1488. +       if (!pud_present(*pud))
  1489. +               goto out;
  1490. +
  1491. +       pmd = pmd_offset(pud, addr);
  1492. +       BUG_ON(pmd_trans_huge(*pmd));
  1493. +       if (!pmd_present(*pmd))
  1494. +               goto out;
  1495. +
  1496. +       ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
  1497. +       if (!pte_same(*ptep, orig_pte)) {
  1498. +               pte_unmap_unlock(ptep, ptl);
  1499. +               goto out;
  1500. +       }
  1501. +
  1502. +       flush_cache_page(vma, addr, pte_pfn(*ptep));
  1503. +       ptep_clear_flush(vma, addr, ptep);
  1504. +       entry = mk_pte(kpage, vma->vm_page_prot);
  1505. +
  1506. +       /* special treatment is needed for zero_page */
  1507. +       if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
  1508. +                               (page_to_pfn(kpage) == zero_pfn))
  1509. +               entry = pte_mkspecial(entry);
  1510. +       else {
  1511. +               get_page(kpage);
  1512. +               page_add_anon_rmap(kpage, vma, addr);
  1513. +       }
  1514. +
  1515. +       set_pte_at_notify(mm, addr, ptep, entry);
  1516. +
  1517. +       page_remove_rmap(page);
  1518. +       if (!page_mapped(page))
  1519. +               try_to_free_swap(page);
  1520. +       put_page(page);
  1521. +
  1522. +       pte_unmap_unlock(ptep, ptl);
  1523. +       err = 0;
  1524. +out:
  1525. +       return err;
  1526. +}
  1527. +
  1528. +
  1529. +/**
  1530. + *  Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
  1531. + *  A zero hash value at HASH_STRENGTH_MAX is reserved to indicate that the
  1532. + *  hash_max member has not been calculated yet.
  1533. + *
  1534. + * @page: the page that needs to be hashed
  1535. + * @hash_old: the hash value calculated with the current hash strength
  1536. + *
  1537. + * Return: the new hash value calculated at HASH_STRENGTH_MAX
  1538. + */
  1539. +static inline u32 page_hash_max(struct page *page, u32 hash_old)
  1540. +{
  1541. +       u32 hash_max = 0;
  1542. +       void *addr;
  1543. +
  1544. +       addr = kmap_atomic(page);
  1545. +       hash_max = delta_hash(addr, hash_strength,
  1546. +                             HASH_STRENGTH_MAX, hash_old);
  1547. +
  1548. +       kunmap_atomic(addr);
  1549. +
  1550. +       if (!hash_max)
  1551. +               hash_max = 1;
  1552. +
  1553. +       inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
  1554. +       return hash_max;
  1555. +}
  1556. +
  1557. +/*
  1558. + * We compare the hash again, to ensure that it is really a hash collision
  1559. + * instead of being caused by a page write.
  1560. + */
  1561. +static inline int check_collision(struct rmap_item *rmap_item,
  1562. +                                 u32 hash)
  1563. +{
  1564. +       int err;
  1565. +       struct page *page = rmap_item->page;
  1566. +
  1567. +       /* if this rmap_item has already been hash_maxed, then the collision
  1568. +        * must appear in the second-level rbtree search. In this case we check
  1569. +        * if its hash_max value has been changed. Otherwise, the collision
  1570. +        * happens in the first-level rbtree search, so we check against its
  1571. +        * current hash value.
  1572. +        */
  1573. +       if (rmap_item->hash_max) {
  1574. +               inc_rshash_neg(memcmp_cost);
  1575. +               inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
  1576. +
  1577. +               if (rmap_item->hash_max == page_hash_max(page, hash))
  1578. +                       err = MERGE_ERR_COLLI;
  1579. +               else
  1580. +                       err = MERGE_ERR_CHANGED;
  1581. +       } else {
  1582. +               inc_rshash_neg(memcmp_cost + hash_strength);
  1583. +
  1584. +               if (page_hash(page, hash_strength, 0) == hash)
  1585. +                       err = MERGE_ERR_COLLI;
  1586. +               else
  1587. +                       err = MERGE_ERR_CHANGED;
  1588. +       }
  1589. +
  1590. +       return err;
  1591. +}
  1592. +
  1593. +static struct page *page_trans_compound_anon(struct page *page)
  1594. +{
  1595. +       if (PageTransCompound(page)) {
  1596. +               struct page *head = compound_trans_head(page);
  1597. +               /*
  1598. +                * head may actually be split and freed from under
  1599. +                * us but it's ok here.
  1600. +                */
  1601. +               if (PageAnon(head))
  1602. +                       return head;
  1603. +       }
  1604. +       return NULL;
  1605. +}
  1606. +
  1607. +static int page_trans_compound_anon_split(struct page *page)
  1608. +{
  1609. +       int ret = 0;
  1610. +       struct page *transhuge_head = page_trans_compound_anon(page);
  1611. +       if (transhuge_head) {
  1612. +               /* Get the reference on the head to split it. */
  1613. +               if (get_page_unless_zero(transhuge_head)) {
  1614. +                       /*
  1615. +                        * Recheck we got the reference while the head
  1616. +                        * was still anonymous.
  1617. +                        */
  1618. +                       if (PageAnon(transhuge_head))
  1619. +                               ret = split_huge_page(transhuge_head);
  1620. +                       else
  1621. +                               /*
  1622. +                                * Retry later if split_huge_page ran
  1623. +                                * from under us.
  1624. +                                */
  1625. +                               ret = 1;
  1626. +                       put_page(transhuge_head);
  1627. +               } else
  1628. +                       /* Retry later if split_huge_page ran from under us. */
  1629. +                       ret = 1;
  1630. +       }
  1631. +       return ret;
  1632. +}
  1633. +
  1634. +/**
  1635. + * Try to merge an rmap_item->page with a kpage in a stable node. kpage must
  1636. + * already be a ksm page.
  1637. + *
  1638. + * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
  1639. + */
  1640. +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
  1641. +                                     struct page *kpage, u32 hash)
  1642. +{
  1643. +       struct vm_area_struct *vma = rmap_item->slot->vma;
  1644. +       struct mm_struct *mm = vma->vm_mm;
  1645. +       pte_t orig_pte = __pte(0);
  1646. +       int err = MERGE_ERR_PGERR;
  1647. +       struct page *page;
  1648. +
  1649. +       if (uksm_test_exit(mm))
  1650. +               goto out;
  1651. +
  1652. +       page = rmap_item->page;
  1653. +
  1654. +       if (page == kpage) { /* ksm page forked */
  1655. +               err = 0;
  1656. +               goto out;
  1657. +       }
  1658. +
  1659. +       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  1660. +               goto out;
  1661. +       BUG_ON(PageTransCompound(page));
  1662. +
  1663. +       if (!PageAnon(page) || !PageKsm(kpage))
  1664. +               goto out;
  1665. +
  1666. +       /*
  1667. +        * We need the page lock to read a stable PageSwapCache in
  1668. +        * write_protect_page().  We use trylock_page() instead of
  1669. +        * lock_page() because we don't want to wait here - we
  1670. +        * prefer to continue scanning and merging different pages,
  1671. +        * then come back to this page when it is unlocked.
  1672. +        */
  1673. +       if (!trylock_page(page))
  1674. +               goto out;
  1675. +       /*
  1676. +        * If this anonymous page is mapped only here, its pte may need
  1677. +        * to be write-protected.  If it's mapped elsewhere, all of its
  1678. +        * ptes are necessarily already write-protected.  But in either
  1679. +        * case, we need to lock and check page_count is not raised.
  1680. +        */
  1681. +       if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
  1682. +               if (pages_identical(page, kpage))
  1683. +                       err = replace_page(vma, page, kpage, orig_pte);
  1684. +               else
  1685. +                       err = check_collision(rmap_item, hash);
  1686. +       }
  1687. +
  1688. +       if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
  1689. +               munlock_vma_page(page);
  1690. +               if (!PageMlocked(kpage)) {
  1691. +                       unlock_page(page);
  1692. +                       lock_page(kpage);
  1693. +                       mlock_vma_page(kpage);
  1694. +                       page = kpage;           /* for final unlock */
  1695. +               }
  1696. +       }
  1697. +
  1698. +       unlock_page(page);
  1699. +out:
  1700. +       return err;
  1701. +}
  1702. +
  1703. +
  1704. +
  1705. +/**
  1706. + * If two pages fail to merge in try_to_merge_two_pages(), we have a chance
  1707. + * to restore a page mapping that was changed there.
  1708. + *
  1709. + * @return 0 on success.
  1710. + */
  1711. +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
  1712. +                            pte_t orig_pte, pte_t wprt_pte)
  1713. +{
  1714. +       struct mm_struct *mm = vma->vm_mm;
  1715. +       pgd_t *pgd;
  1716. +       pud_t *pud;
  1717. +       pmd_t *pmd;
  1718. +       pte_t *ptep;
  1719. +       spinlock_t *ptl;
  1720. +
  1721. +       int err = -EFAULT;
  1722. +
  1723. +       pgd = pgd_offset(mm, addr);
  1724. +       if (!pgd_present(*pgd))
  1725. +               goto out;
  1726. +
  1727. +       pud = pud_offset(pgd, addr);
  1728. +       if (!pud_present(*pud))
  1729. +               goto out;
  1730. +
  1731. +       pmd = pmd_offset(pud, addr);
  1732. +       if (!pmd_present(*pmd))
  1733. +               goto out;
  1734. +
  1735. +       ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
  1736. +       if (!pte_same(*ptep, wprt_pte)) {
  1737. +               /* already copied, let it be */
  1738. +               pte_unmap_unlock(ptep, ptl);
  1739. +               goto out;
  1740. +       }
  1741. +
  1742. +       /*
  1743. +        * Good, the pte is still here. As long as we hold the ksm page, it does
  1744. +        * not return to the free page pool, so there is no way a pte could have
  1745. +        * been changed to another page and then back to this one. Also remember
  1746. +        * that a ksm page is never reused in do_wp_page(). So it is safe to
  1747. +        * restore the original pte.
  1748. +        */
  1749. +       flush_cache_page(vma, addr, pte_pfn(*ptep));
  1750. +       ptep_clear_flush(vma, addr, ptep);
  1751. +       set_pte_at_notify(mm, addr, ptep, orig_pte);
  1752. +
  1753. +       pte_unmap_unlock(ptep, ptl);
  1754. +       err = 0;
  1755. +out:
  1756. +       return err;
  1757. +}
  1758. +
  1759. +/**
  1760. + * try_to_merge_two_pages() - take two identical pages and prepare
  1761. + * them to be merged into one page (rmap_item->page)
  1762. + *
  1763. + * @return 0 if we successfully merged two identical pages into
  1764. + *         one ksm page. MERGE_ERR_COLLI if it is only a hash collision
  1765. + *         found in the rbtree search. MERGE_ERR_CHANGED if the page has
  1766. + *         been changed since it was hashed. MERGE_ERR_PGERR otherwise.
  1767. + *
  1768. + */
  1769. +static int try_to_merge_two_pages(struct rmap_item *rmap_item,
  1770. +                                 struct rmap_item *tree_rmap_item,
  1771. +                                 u32 hash)
  1772. +{
  1773. +       pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
  1774. +       pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
  1775. +       struct vm_area_struct *vma1 = rmap_item->slot->vma;
  1776. +       struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
  1777. +       struct page *page = rmap_item->page;
  1778. +       struct page *tree_page = tree_rmap_item->page;
  1779. +       int err = MERGE_ERR_PGERR;
  1780. +       struct address_space *saved_mapping;
  1781. +
  1782. +
  1783. +       if (rmap_item->page == tree_rmap_item->page)
  1784. +               goto out;
  1785. +
  1786. +       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  1787. +               goto out;
  1788. +       BUG_ON(PageTransCompound(page));
  1789. +
  1790. +       if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
  1791. +               goto out;
  1792. +       BUG_ON(PageTransCompound(tree_page));
  1793. +
  1794. +       if (!PageAnon(page) || !PageAnon(tree_page))
  1795. +               goto out;
  1796. +
  1797. +       if (!trylock_page(page))
  1798. +               goto out;
  1799. +
  1800. +
  1801. +       if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
  1802. +               unlock_page(page);
  1803. +               goto out;
  1804. +       }
  1805. +
  1806. +       /*
  1807. +        * While we hold page lock, upgrade page from
  1808. +        * PageAnon+anon_vma to PageKsm+NULL stable_node:
  1809. +        * stable_tree_insert() will update stable_node.
  1810. +        */
  1811. +       saved_mapping = page->mapping;
  1812. +       set_page_stable_node(page, NULL);
  1813. +       mark_page_accessed(page);
  1814. +       unlock_page(page);
  1815. +
  1816. +       if (!trylock_page(tree_page))
  1817. +               goto restore_out;
  1818. +
  1819. +       if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
  1820. +               unlock_page(tree_page);
  1821. +               goto restore_out;
  1822. +       }
  1823. +
  1824. +       if (pages_identical(page, tree_page)) {
  1825. +               err = replace_page(vma2, tree_page, page, wprt_pte2);
  1826. +               if (err) {
  1827. +                       unlock_page(tree_page);
  1828. +                       goto restore_out;
  1829. +               }
  1830. +
  1831. +               if ((vma2->vm_flags & VM_LOCKED)) {
  1832. +                       munlock_vma_page(tree_page);
  1833. +                       if (!PageMlocked(page)) {
  1834. +                               unlock_page(tree_page);
  1835. +                               lock_page(page);
  1836. +                               mlock_vma_page(page);
  1837. +                               tree_page = page; /* for final unlock */
  1838. +                       }
  1839. +               }
  1840. +
  1841. +               unlock_page(tree_page);
  1842. +
  1843. +               goto out; /* success */
  1844. +
  1845. +       } else {
  1846. +               if (tree_rmap_item->hash_max &&
  1847. +                   tree_rmap_item->hash_max == rmap_item->hash_max) {
  1848. +                       err = MERGE_ERR_COLLI_MAX;
  1849. +               } else if (page_hash(page, hash_strength, 0) ==
  1850. +                   page_hash(tree_page, hash_strength, 0)) {
  1851. +                       inc_rshash_neg(memcmp_cost + hash_strength * 2);
  1852. +                       err = MERGE_ERR_COLLI;
  1853. +               } else {
  1854. +                       err = MERGE_ERR_CHANGED;
  1855. +               }
  1856. +
  1857. +               unlock_page(tree_page);
  1858. +       }
  1859. +
  1860. +restore_out:
  1861. +       lock_page(page);
  1862. +       if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
  1863. +                                 orig_pte1, wprt_pte1))
  1864. +               page->mapping = saved_mapping;
  1865. +
  1866. +       unlock_page(page);
  1867. +out:
  1868. +       return err;
  1869. +}
  1870. +
  1871. +static inline int hash_cmp(u32 new_val, u32 node_val)
  1872. +{
  1873. +       if (new_val > node_val)
  1874. +               return 1;
  1875. +       else if (new_val < node_val)
  1876. +               return -1;
  1877. +       else
  1878. +               return 0;
  1879. +}
  1880. +
  1881. +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
  1882. +{
  1883. +       u32 hash_max = item->hash_max;
  1884. +
  1885. +       if (!hash_max) {
  1886. +               hash_max = page_hash_max(item->page, hash);
  1887. +
  1888. +               item->hash_max = hash_max;
  1889. +       }
  1890. +
  1891. +       return hash_max;
  1892. +}
  1893. +
  1894. +
  1895. +
  1896. +/**
  1897. + * stable_tree_search() - search the stable tree for a page
  1898. + *
  1899. + * @item:      the rmap_item we are comparing with
  1900. + * @hash:      the hash value of this item->page already calculated
  1901. + *
  1902. + * @return     the page we have found, NULL otherwise. The returned page has
  1903. + *             had its reference count raised (it has been "gotten").
  1904. + */
  1905. +static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
  1906. +{
  1907. +       struct rb_node *node = root_stable_treep->rb_node;
  1908. +       struct tree_node *tree_node;
  1909. +       unsigned long hash_max;
  1910. +       struct page *page = item->page;
  1911. +       struct stable_node *stable_node;
  1912. +
  1913. +       stable_node = page_stable_node(page);
  1914. +       if (stable_node) {
  1915. +               /* ksm page forked, that is
  1916. +                * if (PageKsm(page) && !in_stable_tree(rmap_item))
  1917. +                * it's actually gotten once outside.
  1918. +                */
  1919. +               get_page(page);
  1920. +               return page;
  1921. +       }
  1922. +
  1923. +       while (node) {
  1924. +               int cmp;
  1925. +
  1926. +               tree_node = rb_entry(node, struct tree_node, node);
  1927. +
  1928. +               cmp = hash_cmp(hash, tree_node->hash);
  1929. +
  1930. +               if (cmp < 0)
  1931. +                       node = node->rb_left;
  1932. +               else if (cmp > 0)
  1933. +                       node = node->rb_right;
  1934. +               else
  1935. +                       break;
  1936. +       }
  1937. +
  1938. +       if (!node)
  1939. +               return NULL;
  1940. +
  1941. +       if (tree_node->count == 1) {
  1942. +               stable_node = rb_entry(tree_node->sub_root.rb_node,
  1943. +                                      struct stable_node, node);
  1944. +               BUG_ON(!stable_node);
  1945. +
  1946. +               goto get_page_out;
  1947. +       }
  1948. +
  1949. +       /*
  1950. +        * ok, we have to search the second-level
  1951. +        * subtree; hash the page to full
  1952. +        * strength.
  1953. +        */
  1954. +       node = tree_node->sub_root.rb_node;
  1955. +       BUG_ON(!node);
  1956. +       hash_max = rmap_item_hash_max(item, hash);
  1957. +
  1958. +       while (node) {
  1959. +               int cmp;
  1960. +
  1961. +               stable_node = rb_entry(node, struct stable_node, node);
  1962. +
  1963. +               cmp = hash_cmp(hash_max, stable_node->hash_max);
  1964. +
  1965. +               if (cmp < 0)
  1966. +                       node = node->rb_left;
  1967. +               else if (cmp > 0)
  1968. +                       node = node->rb_right;
  1969. +               else
  1970. +                       goto get_page_out;
  1971. +       }
  1972. +
  1973. +       return NULL;
  1974. +
  1975. +get_page_out:
  1976. +       page = get_uksm_page(stable_node, 1, 1);
  1977. +       return page;
  1978. +}
  1979. +
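        +/*
        + * A rough sketch of the two-level layout walked by stable_tree_search()
        + * above, showing only the members whose use is visible in this file (the
        + * real definitions appear earlier in the patch and may carry more fields).
        + * Guarded out, for illustration only:
        + */
        +#if 0
        +struct tree_node {                      /* first level, keyed by hash */
        +       struct rb_node node;             /* linked into root_stable_treep */
        +       u32 hash;
        +       unsigned long count;             /* number of stable_nodes below */
        +       struct rb_root sub_root;         /* second level, keyed by hash_max */
        +};
        +
        +struct stable_node {                    /* one merged (ksm) page */
        +       struct rb_node node;             /* linked into tree_node->sub_root */
        +       struct hlist_head hlist;         /* node_vmas of the vmas sharing it */
        +       unsigned long kpfn;              /* pfn of the ksm page */
        +       u32 hash_max;                    /* 0 means not calculated yet */
        +       struct tree_node *tree_node;     /* back pointer to the first level */
        +};
        +#endif
        +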
  1980. +static int try_merge_rmap_item(struct rmap_item *item,
  1981. +                              struct page *kpage,
  1982. +                              struct page *tree_page)
  1983. +{
  1984. +       spinlock_t *ptl;
  1985. +       pte_t *ptep;
  1986. +       unsigned long addr;
  1987. +       struct vm_area_struct *vma = item->slot->vma;
  1988. +
  1989. +       addr = get_rmap_addr(item);
  1990. +       ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
  1991. +       if (!ptep)
  1992. +               return 0;
  1993. +
  1994. +       if (pte_write(*ptep)) {
  1995. +               /* has changed, abort! */
  1996. +               pte_unmap_unlock(ptep, ptl);
  1997. +               return 0;
  1998. +       }
  1999. +
  2000. +       get_page(tree_page);
  2001. +       page_add_anon_rmap(tree_page, vma, addr);
  2002. +
  2003. +       flush_cache_page(vma, addr, pte_pfn(*ptep));
  2004. +       ptep_clear_flush(vma, addr, ptep);
  2005. +       set_pte_at_notify(vma->vm_mm, addr, ptep,
  2006. +                         mk_pte(tree_page, vma->vm_page_prot));
  2007. +
  2008. +       page_remove_rmap(kpage);
  2009. +       put_page(kpage);
  2010. +
  2011. +       pte_unmap_unlock(ptep, ptl);
  2012. +
  2013. +       return 1;
  2014. +}
  2015. +
  2016. +/**
  2017. + * try_merge_with_stable() - when two rmap_items need to be inserted into the
  2018. + * stable tree and their page is found to be identical to a stable ksm page,
  2019. + * this is the last chance to merge them into one.
  2020. + *
  2021. + * @item1:     the rmap_item holding the page which we wanted to insert
  2022. + *             into the stable tree.
  2023. + * @item2:     the other rmap_item we found during the unstable tree search
  2024. + * @kpage:     the page currently mapped by the two rmap_items
  2025. + * @tree_page: the page we found identical in the stable tree node
  2026. + * @success1:  return if item1 is successfully merged
  2027. + * @success2:  return if item2 is successfully merged
  2028. + */
  2029. +static void try_merge_with_stable(struct rmap_item *item1,
  2030. +                                 struct rmap_item *item2,
  2031. +                                 struct page **kpage,
  2032. +                                 struct page *tree_page,
  2033. +                                 int *success1, int *success2)
  2034. +{
  2035. +       struct vm_area_struct *vma1 = item1->slot->vma;
  2036. +       struct vm_area_struct *vma2 = item2->slot->vma;
  2037. +       *success1 = 0;
  2038. +       *success2 = 0;
  2039. +
  2040. +       if (unlikely(*kpage == tree_page)) {
  2041. +               /* I don't think this can really happen */
  2042. +               printk(KERN_WARNING "UKSM: unexpected condition detected in "
  2043. +                       "try_merge_with_stable() -- *kpage == tree_page !\n");
  2044. +               *success1 = 1;
  2045. +               *success2 = 1;
  2046. +               return;
  2047. +       }
  2048. +
  2049. +       if (!PageAnon(*kpage) || !PageKsm(*kpage))
  2050. +               goto failed;
  2051. +
  2052. +       if (!trylock_page(tree_page))
  2053. +               goto failed;
  2054. +
  2055. +       /* If the old page is still ksm, still pointed
  2056. +        * to in the right place, and still write-protected,
  2057. +        * we are confident it has not changed; no need to
  2058. +        * memcmp anymore.
  2059. +        * Beware: we cannot take nested pte locks, there
  2060. +        * is a deadlock risk.
  2061. +        */
  2062. +       if (!try_merge_rmap_item(item1, *kpage, tree_page))
  2063. +               goto unlock_failed;
  2064. +
  2065. +       /* ok, then vma2; remember that pte1 has already been set */
  2066. +       if (!try_merge_rmap_item(item2, *kpage, tree_page))
  2067. +               goto success_1;
  2068. +
  2069. +       *success2 = 1;
  2070. +success_1:
  2071. +       *success1 = 1;
  2072. +
  2073. +
  2074. +       if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
  2075. +           (*success2 && vma2->vm_flags & VM_LOCKED)) {
  2076. +               munlock_vma_page(*kpage);
  2077. +               if (!PageMlocked(tree_page))
  2078. +                       mlock_vma_page(tree_page);
  2079. +       }
  2080. +
  2081. +       /*
  2082. +        * We do not need the old page any more in the caller, so we can drop its lock
  2083. +        * now.
  2084. +        */
  2085. +       unlock_page(*kpage);
  2086. +       *kpage = tree_page; /* Get unlocked outside. */
  2087. +       return;
  2088. +
  2089. +unlock_failed:
  2090. +       unlock_page(tree_page);
  2091. +failed:
  2092. +       return;
  2093. +}
  2094. +
  2095. +static inline void stable_node_hash_max(struct stable_node *node,
  2096. +                                        struct page *page, u32 hash)
  2097. +{
  2098. +       u32 hash_max = node->hash_max;
  2099. +
  2100. +       if (!hash_max) {
  2101. +               hash_max = page_hash_max(page, hash);
  2102. +               node->hash_max = hash_max;
  2103. +       }
  2104. +}
  2105. +
  2106. +static inline
  2107. +struct stable_node *new_stable_node(struct tree_node *tree_node,
  2108. +                                   struct page *kpage, u32 hash_max)
  2109. +{
  2110. +       struct stable_node *new_stable_node;
  2111. +
  2112. +       new_stable_node = alloc_stable_node();
  2113. +       if (!new_stable_node)
  2114. +               return NULL;
  2115. +
  2116. +       new_stable_node->kpfn = page_to_pfn(kpage);
  2117. +       new_stable_node->hash_max = hash_max;
  2118. +       new_stable_node->tree_node = tree_node;
  2119. +       set_page_stable_node(kpage, new_stable_node);
  2120. +
  2121. +       return new_stable_node;
  2122. +}
  2123. +
  2124. +static inline
  2125. +struct stable_node *first_level_insert(struct tree_node *tree_node,
  2126. +                                      struct rmap_item *rmap_item,
  2127. +                                      struct rmap_item *tree_rmap_item,
  2128. +                                      struct page **kpage, u32 hash,
  2129. +                                      int *success1, int *success2)
  2130. +{
  2131. +       int cmp;
  2132. +       struct page *tree_page;
  2133. +       u32 hash_max = 0;
  2134. +       struct stable_node *stable_node, *new_snode;
  2135. +       struct rb_node *parent = NULL, **new;
  2136. +
  2137. +       /* this tree node contains no sub-tree yet */
  2138. +       stable_node = rb_entry(tree_node->sub_root.rb_node,
  2139. +                              struct stable_node, node);
  2140. +
  2141. +       tree_page = get_uksm_page(stable_node, 1, 0);
  2142. +       if (tree_page) {
  2143. +               cmp = memcmp_pages(*kpage, tree_page, 1);
  2144. +               if (!cmp) {
  2145. +                       try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
  2146. +                                             tree_page, success1, success2);
  2147. +                       put_page(tree_page);
  2148. +                       if (!*success1 && !*success2)
  2149. +                               goto failed;
  2150. +
  2151. +                       return stable_node;
  2152. +
  2153. +               } else {
  2154. +                       /*
  2155. +                        * collision at the first level, try to create a subtree.
  2156. +                        * A new node needs to be created.
  2157. +                        */
  2158. +                       put_page(tree_page);
  2159. +
  2160. +                       stable_node_hash_max(stable_node, tree_page,
  2161. +                                            tree_node->hash);
  2162. +                       hash_max = rmap_item_hash_max(rmap_item, hash);
  2163. +                       cmp = hash_cmp(hash_max, stable_node->hash_max);
  2164. +
  2165. +                       parent = &stable_node->node;
  2166. +                       if (cmp < 0) {
  2167. +                               new = &parent->rb_left;
  2168. +                       } else if (cmp > 0) {
  2169. +                               new = &parent->rb_right;
  2170. +                       } else {
  2171. +                               goto failed;
  2172. +                       }
  2173. +               }
  2174. +
  2175. +       } else {
  2176. +               /* the only stable_node has been deleted, we reuse its tree_node.
  2177. +                */
  2178. +               parent = NULL;
  2179. +               new = &tree_node->sub_root.rb_node;
  2180. +       }
  2181. +
  2182. +       new_snode = new_stable_node(tree_node, *kpage, hash_max);
  2183. +       if (!new_snode)
  2184. +               goto failed;
  2185. +
  2186. +       rb_link_node(&new_snode->node, parent, new);
  2187. +       rb_insert_color(&new_snode->node, &tree_node->sub_root);
  2188. +       tree_node->count++;
  2189. +       *success1 = *success2 = 1;
  2190. +
  2191. +       return new_snode;
  2192. +
  2193. +failed:
  2194. +       return NULL;
  2195. +}
  2196. +
  2197. +static inline
  2198. +struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
  2199. +                                         struct rmap_item *rmap_item,
  2200. +                                         struct rmap_item *tree_rmap_item,
  2201. +                                         struct page **kpage, u32 hash,
  2202. +                                         int *success1, int *success2)
  2203. +{
  2204. +       struct page *tree_page;
  2205. +       u32 hash_max;
  2206. +       struct stable_node *stable_node, *new_snode;
  2207. +       struct rb_node *parent, **new;
  2208. +
  2209. +research:
  2210. +       parent = NULL;
  2211. +       new = &tree_node->sub_root.rb_node;
  2212. +       BUG_ON(!*new);
  2213. +       hash_max = rmap_item_hash_max(rmap_item, hash);
  2214. +       while (*new) {
  2215. +               int cmp;
  2216. +
  2217. +               stable_node = rb_entry(*new, struct stable_node, node);
  2218. +
  2219. +               cmp = hash_cmp(hash_max, stable_node->hash_max);
  2220. +
  2221. +               if (cmp < 0) {
  2222. +                       parent = *new;
  2223. +                       new = &parent->rb_left;
  2224. +               } else if (cmp > 0) {
  2225. +                       parent = *new;
  2226. +                       new = &parent->rb_right;
  2227. +               } else {
  2228. +                       tree_page = get_uksm_page(stable_node, 1, 0);
  2229. +                       if (tree_page) {
  2230. +                               cmp = memcmp_pages(*kpage, tree_page, 1);
  2231. +                               if (!cmp) {
  2232. +                                       try_merge_with_stable(rmap_item,
  2233. +                                               tree_rmap_item, kpage,
  2234. +                                               tree_page, success1, success2);
  2235. +
  2236. +                                       put_page(tree_page);
  2237. +                                       if (!*success1 && !*success2)
  2238. +                                               goto failed;
  2239. +                                       /*
  2240. +                                        * successfully merged with a stable
  2241. +                                        * node
  2242. +                                        */
  2243. +                                       return stable_node;
  2244. +                               } else {
  2245. +                                       put_page(tree_page);
  2246. +                                       goto failed;
  2247. +                               }
  2248. +                       } else {
  2249. +                               /*
  2250. +                                * the stable node may have been
  2251. +                                * deleted and the subtree may have
  2252. +                                * been restructured; we cannot
  2253. +                                * continue, so re-search it.
  2254. +                                */
  2255. +                               if (tree_node->count) {
  2256. +                                       goto research;
  2257. +                               } else {
  2258. +                                       /* reuse the tree node */
  2259. +                                       parent = NULL;
  2260. +                                       new = &tree_node->sub_root.rb_node;
  2261. +                               }
  2262. +                       }
  2263. +               }
  2264. +       }
  2265. +
  2266. +       new_snode = new_stable_node(tree_node, *kpage, hash_max);
  2267. +       if (!new_snode)
  2268. +               goto failed;
  2269. +
  2270. +       rb_link_node(&new_snode->node, parent, new);
  2271. +       rb_insert_color(&new_snode->node, &tree_node->sub_root);
  2272. +       tree_node->count++;
  2273. +       *success1 = *success2 = 1;
  2274. +
  2275. +       return new_snode;
  2276. +
  2277. +failed:
  2278. +       return NULL;
  2279. +}
  2280. +
  2281. +
  2282. +/**
  2283. + * stable_tree_insert() - try to insert a page merged from the unstable tree
  2284. + * into the stable tree
  2285. + *
  2286. + * @kpage:             the page that needs to be inserted
  2287. + * @hash:              the current hash of this page
  2288. + * @rmap_item:         the rmap_item being scanned
  2289. + * @tree_rmap_item:    the rmap_item found on unstable tree
  2290. + * @success1:          return if rmap_item is merged
  2291. + * @success2:          return if tree_rmap_item is merged
  2292. + *
  2293. + * @return             the stable_node in the stable tree if at least one
  2294. + *                     rmap_item was inserted into the stable tree, NULL
  2295. + *                     otherwise.
  2296. + */
  2297. +static struct stable_node *
  2298. +stable_tree_insert(struct page **kpage, u32 hash,
  2299. +                  struct rmap_item *rmap_item,
  2300. +                  struct rmap_item *tree_rmap_item,
  2301. +                  int *success1, int *success2)
  2302. +{
  2303. +       struct rb_node **new = &root_stable_treep->rb_node;
  2304. +       struct rb_node *parent = NULL;
  2305. +       struct stable_node *stable_node;
  2306. +       struct tree_node *tree_node;
  2307. +       u32 hash_max = 0;
  2308. +
  2309. +       *success1 = *success2 = 0;
  2310. +
  2311. +       while (*new) {
  2312. +               int cmp;
  2313. +
  2314. +               tree_node = rb_entry(*new, struct tree_node, node);
  2315. +
  2316. +               cmp = hash_cmp(hash, tree_node->hash);
  2317. +
  2318. +               if (cmp < 0) {
  2319. +                       parent = *new;
  2320. +                       new = &parent->rb_left;
  2321. +               } else if (cmp > 0) {
  2322. +                       parent = *new;
  2323. +                       new = &parent->rb_right;
  2324. +               } else
  2325. +                       break;
  2326. +       }
  2327. +
  2328. +       if (*new) {
  2329. +               if (tree_node->count == 1) {
  2330. +                       stable_node = first_level_insert(tree_node, rmap_item,
  2331. +                                               tree_rmap_item, kpage,
  2332. +                                               hash, success1, success2);
  2333. +               } else {
  2334. +                       stable_node = stable_subtree_insert(tree_node,
  2335. +                                       rmap_item, tree_rmap_item, kpage,
  2336. +                                       hash, success1, success2);
  2337. +               }
  2338. +       } else {
  2339. +
  2340. +               /* no tree node found */
  2341. +               tree_node = alloc_tree_node(stable_tree_node_listp);
  2342. +               if (!tree_node) {
  2343. +                       stable_node = NULL;
  2344. +                       goto out;
  2345. +               }
  2346. +
  2347. +               stable_node = new_stable_node(tree_node, *kpage, hash_max);
  2348. +               if (!stable_node) {
  2349. +                       free_tree_node(tree_node);
  2350. +                       goto out;
  2351. +               }
  2352. +
  2353. +               tree_node->hash = hash;
  2354. +               rb_link_node(&tree_node->node, parent, new);
  2355. +               rb_insert_color(&tree_node->node, root_stable_treep);
  2356. +               parent = NULL;
  2357. +               new = &tree_node->sub_root.rb_node;
  2358. +
  2359. +               rb_link_node(&stable_node->node, parent, new);
  2360. +               rb_insert_color(&stable_node->node, &tree_node->sub_root);
  2361. +               tree_node->count++;
  2362. +               *success1 = *success2 = 1;
  2363. +       }
  2364. +
  2365. +out:
  2366. +       return stable_node;
  2367. +}
  2368. +
  2369. +
  2370. +/**
  2371. + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
  2372. + *
  2373. + * @return     0 on success, -EBUSY if unable to lock the mmap_sem,
  2374. + *             -EINVAL if the page mapping has been changed.
  2375. + */
  2376. +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
  2377. +{
  2378. +       int err;
  2379. +
  2380. +       err = get_mergeable_page_lock_mmap(tree_rmap_item);
  2381. +
  2382. +       if (err == -EINVAL) {
  2383. +               /* its page map has been changed, remove it */
  2384. +               remove_rmap_item_from_tree(tree_rmap_item);
  2385. +       }
  2386. +
  2387. +       /* On success, the page has been gotten and the mmap_sem is locked. */
  2388. +       return err;
  2389. +}
  2390. +
  2391. +
  2392. +/**
  2393. + * unstable_tree_search_insert() - search the unstable tree for an rmap_item
  2394. + * with the same hash value. Get its page and trylock the mmap_sem.
  2395. + */
  2396. +static inline
  2397. +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
  2398. +                                             u32 hash)
  2399. +
  2400. +{
  2401. +       struct rb_node **new = &root_unstable_tree.rb_node;
  2402. +       struct rb_node *parent = NULL;
  2403. +       struct tree_node *tree_node;
  2404. +       u32 hash_max;
  2405. +       struct rmap_item *tree_rmap_item;
  2406. +
  2407. +       while (*new) {
  2408. +               int cmp;
  2409. +
  2410. +               tree_node = rb_entry(*new, struct tree_node, node);
  2411. +
  2412. +               cmp = hash_cmp(hash, tree_node->hash);
  2413. +
  2414. +               if (cmp < 0) {
  2415. +                       parent = *new;
  2416. +                       new = &parent->rb_left;
  2417. +               } else if (cmp > 0) {
  2418. +                       parent = *new;
  2419. +                       new = &parent->rb_right;
  2420. +               } else
  2421. +                       break;
  2422. +       }
  2423. +
  2424. +       if (*new) {
  2425. +               /* got the tree_node */
  2426. +               if (tree_node->count == 1) {
  2427. +                       tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
  2428. +                                                 struct rmap_item, node);
  2429. +                       BUG_ON(!tree_rmap_item);
  2430. +
  2431. +                       goto get_page_out;
  2432. +               }
  2433. +
  2434. +               /* well, search the collision subtree */
  2435. +               new = &tree_node->sub_root.rb_node;
  2436. +               BUG_ON(!*new);
  2437. +               hash_max = rmap_item_hash_max(rmap_item, hash);
  2438. +
  2439. +               while (*new) {
  2440. +                       int cmp;
  2441. +
  2442. +                       tree_rmap_item = rb_entry(*new, struct rmap_item,
  2443. +                                                 node);
  2444. +
  2445. +                       cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
  2446. +                       parent = *new;
  2447. +                       if (cmp < 0)
  2448. +                               new = &parent->rb_left;
  2449. +                       else if (cmp > 0)
  2450. +                               new = &parent->rb_right;
  2451. +                       else
  2452. +                               goto get_page_out;
  2453. +               }
  2454. +       } else {
  2455. +               /* alloc a new tree_node */
  2456. +               tree_node = alloc_tree_node(&unstable_tree_node_list);
  2457. +               if (!tree_node)
  2458. +                       return NULL;
  2459. +
  2460. +               tree_node->hash = hash;
  2461. +               rb_link_node(&tree_node->node, parent, new);
  2462. +               rb_insert_color(&tree_node->node, &root_unstable_tree);
  2463. +               parent = NULL;
  2464. +               new = &tree_node->sub_root.rb_node;
  2465. +       }
  2466. +
  2467. +       /* not found even in the sub-tree */
  2468. +       rmap_item->tree_node = tree_node;
  2469. +       rmap_item->address |= UNSTABLE_FLAG;
  2470. +       rmap_item->hash_round = uksm_hash_round;
  2471. +       rb_link_node(&rmap_item->node, parent, new);
  2472. +       rb_insert_color(&rmap_item->node, &tree_node->sub_root);
  2473. +
  2474. +       uksm_pages_unshared++;
  2475. +       return NULL;
  2476. +
  2477. +get_page_out:
  2478. +       if (tree_rmap_item->page == rmap_item->page)
  2479. +               return NULL;
  2480. +
  2481. +       if (get_tree_rmap_item_page(tree_rmap_item))
  2482. +               return NULL;
  2483. +
  2484. +       return tree_rmap_item;
  2485. +}
  2486. +
  2487. +static void hold_anon_vma(struct rmap_item *rmap_item,
  2488. +                         struct anon_vma *anon_vma)
  2489. +{
  2490. +       rmap_item->anon_vma = anon_vma;
  2491. +       get_anon_vma(anon_vma);
  2492. +}
  2493. +
  2494. +
  2495. +/**
  2496. + * stable_tree_append() - append an rmap_item to a stable node. Deduplication
  2497. + * ratio statistics are updated in this function.
  2498. + *
  2499. + */
  2500. +static void stable_tree_append(struct rmap_item *rmap_item,
  2501. +                              struct stable_node *stable_node, int logdedup)
  2502. +{
  2503. +       struct node_vma *node_vma = NULL, *new_node_vma;
  2504. +       struct hlist_node *hlist = NULL, *cont_p = NULL;
  2505. +       unsigned long key = (unsigned long)rmap_item->slot;
  2506. +       unsigned long factor = rmap_item->slot->rung->step;
  2507. +
  2508. +       BUG_ON(!stable_node);
  2509. +       rmap_item->address |= STABLE_FLAG;
  2510. +
  2511. +       if (hlist_empty(&stable_node->hlist)) {
  2512. +               uksm_pages_shared++;
  2513. +               goto node_vma_new;
  2514. +       } else {
  2515. +               uksm_pages_sharing++;
  2516. +       }
  2517. +
  2518. +       hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  2519. +               if (node_vma->key >= key)
  2520. +                       break;
  2521. +
  2522. +               if (logdedup) {
  2523. +                       node_vma->slot->pages_bemerged += factor;
  2524. +                       if (list_empty(&node_vma->slot->dedup_list))
  2525. +                               list_add(&node_vma->slot->dedup_list,
  2526. +                                        &vma_slot_dedup);
  2527. +               }
  2528. +       }
  2529. +
  2530. +       if (node_vma) {
  2531. +               if (node_vma->key == key) {
  2532. +                       cont_p = hlist->next;
  2533. +                       goto node_vma_ok;
  2534. +               } else if (node_vma->key > key) {
  2535. +                       cont_p = hlist;
  2536. +               }
  2537. +       }
  2538. +
  2539. +node_vma_new:
  2540. +       /* no node_vma for this vma slot in the node yet, alloc a new node_vma */
  2541. +       new_node_vma = alloc_node_vma();
  2542. +       BUG_ON(!new_node_vma);
  2543. +       new_node_vma->head = stable_node;
  2544. +       new_node_vma->slot = rmap_item->slot;
  2545. +
  2546. +       if (!node_vma) {
  2547. +               hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
  2548. +       } else if (node_vma->key != key) {
  2549. +               if (node_vma->key < key)
  2550. +                       hlist_add_after(&node_vma->hlist, &new_node_vma->hlist);
  2551. +               else {
  2552. +                       hlist_add_before(&new_node_vma->hlist,
  2553. +                                        &node_vma->hlist);
  2554. +               }
  2555. +
  2556. +       }
  2557. +       node_vma = new_node_vma;
  2558. +
  2559. +node_vma_ok: /* ok, ready to add to the list */
  2560. +       rmap_item->head = node_vma;
  2561. +       hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
  2562. +       hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
  2563. +       if (logdedup) {
  2564. +               rmap_item->slot->pages_merged++;
  2565. +               if (cont_p) {
  2566. +                       hlist_for_each_entry_continue(node_vma,
  2567. +                                                     cont_p, hlist) {
  2568. +                               node_vma->slot->pages_bemerged += factor;
  2569. +                               if (list_empty(&node_vma->slot->dedup_list))
  2570. +                                       list_add(&node_vma->slot->dedup_list,
  2571. +                                                &vma_slot_dedup);
  2572. +                       }
  2573. +               }
  2574. +       }
  2575. +}
  2576. +
  2577. +/*
  2578. + * We use break_ksm to break COW on a ksm page: it's a stripped down
  2579. + *
  2580. + *     if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
  2581. + *             put_page(page);
  2582. + *
  2583. + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
  2584. + * in case the application has unmapped and remapped mm,addr meanwhile.
  2585. + * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
  2586. + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
  2587. + */
  2588. +static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
  2589. +{
  2590. +       struct page *page;
  2591. +       int ret = 0;
  2592. +
  2593. +       do {
  2594. +               cond_resched();
  2595. +               page = follow_page(vma, addr, FOLL_GET);
  2596. +               if (IS_ERR_OR_NULL(page))
  2597. +                       break;
  2598. +               if (PageKsm(page)) {
  2599. +                       ret = handle_mm_fault(vma->vm_mm, vma, addr,
  2600. +                                             FAULT_FLAG_WRITE);
  2601. +               } else
  2602. +                       ret = VM_FAULT_WRITE;
  2603. +               put_page(page);
  2604. +       } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
  2605. +       /*
  2606. +        * We must loop because handle_mm_fault() may back out if there's
  2607. +        * any difficulty e.g. if pte accessed bit gets updated concurrently.
  2608. +        *
  2609. +        * VM_FAULT_WRITE is what we have been hoping for: it indicates that
  2610. +        * COW has been broken, even if the vma does not permit VM_WRITE;
  2611. +        * but note that a concurrent fault might break PageKsm for us.
  2612. +        *
  2613. +        * VM_FAULT_SIGBUS could occur if we race with truncation of the
  2614. +        * backing file, which also invalidates anonymous pages: that's
  2615. +        * okay, that truncation will have unmapped the PageKsm for us.
  2616. +        *
  2617. +        * VM_FAULT_OOM: at the time of writing (late July 2009), setting
  2618. +        * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
  2619. +        * current task has TIF_MEMDIE set, and will be OOM killed on return
  2620. +        * to user; and ksmd, having no mm, would never be chosen for that.
  2621. +        *
  2622. +        * But if the mm is in a limited mem_cgroup, then the fault may fail
  2623. +        * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
  2624. +        * even ksmd can fail in this way - though it's usually breaking ksm
  2625. +        * just to undo a merge it made a moment before, so unlikely to oom.
  2626. +        *
  2627. +        * That's a pity: we might therefore have more kernel pages allocated
  2628. +        * than we're counting as nodes in the stable tree; but uksm_do_scan
  2629. +        * will retry to break_cow on each pass, so should recover the page
  2630. +        * in due course.  The important thing is to not let VM_MERGEABLE
  2631. +        * be cleared while any such pages might remain in the area.
  2632. +        */
  2633. +       return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
  2634. +}
  2635. +
  2636. +static void break_cow(struct rmap_item *rmap_item)
  2637. +{
  2638. +       struct vm_area_struct *vma = rmap_item->slot->vma;
  2639. +       struct mm_struct *mm = vma->vm_mm;
  2640. +       unsigned long addr = get_rmap_addr(rmap_item);
  2641. +
  2642. +       if (uksm_test_exit(mm))
  2643. +               goto out;
  2644. +
  2645. +       break_ksm(vma, addr);
  2646. +out:
  2647. +       return;
  2648. +}
  2649. +
  2650. +/*
  2651. + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
  2652. + * than check every pte of a given vma, the locking doesn't quite work for
  2653. + * that - an rmap_item is assigned to the stable tree after inserting ksm
  2654. + * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
  2655. + * rmap_items from parent to child at fork time (so as not to waste time
  2656. + * if exit comes before the next scan reaches it).
  2657. + *
  2658. + * Similarly, although we'd like to remove rmap_items (so updating counts
  2659. + * and freeing memory) when unmerging an area, it's easier to leave that
  2660. + * to the next pass of ksmd - consider, for example, how ksmd might be
  2661. + * in cmp_and_merge_page on one of the rmap_items we would be removing.
  2662. + */
  2663. +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
  2664. +                     unsigned long start, unsigned long end)
  2665. +{
  2666. +       unsigned long addr;
  2667. +       int err = 0;
  2668. +
  2669. +       for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
  2670. +               if (uksm_test_exit(vma->vm_mm))
  2671. +                       break;
  2672. +               if (signal_pending(current))
  2673. +                       err = -ERESTARTSYS;
  2674. +               else
  2675. +                       err = break_ksm(vma, addr);
  2676. +       }
  2677. +       return err;
  2678. +}
  2679. +
  2680. +static inline void inc_uksm_pages_scanned(void)
  2681. +{
  2682. +       u64 delta;
  2683. +
  2684. +
  2685. +       if (uksm_pages_scanned == U64_MAX) {
  2686. +               encode_benefit();
  2687. +
  2688. +               delta = uksm_pages_scanned >> pages_scanned_base;
  2689. +
  2690. +               if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
  2691. +                       pages_scanned_stored >>= 1;
  2692. +                       delta >>= 1;
  2693. +                       pages_scanned_base++;
  2694. +               }
  2695. +
  2696. +               pages_scanned_stored += delta;
  2697. +
  2698. +               uksm_pages_scanned = uksm_pages_scanned_last = 0;
  2699. +       }
  2700. +
  2701. +       uksm_pages_scanned++;
  2702. +}
  2703. +
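        +/*
        + * Worked note on the scaled counter above: pages_scanned_stored keeps the
        + * already-overflowed part of the count in units of 2^pages_scanned_base,
        + * so an approximate lifetime total can be folded back as in the sketch
        + * below.  The helper is illustrative and not part of the original patch.
        + */
        +static inline u64 uksm_pages_scanned_total(void)
        +{
        +       return (pages_scanned_stored << pages_scanned_base) +
        +               uksm_pages_scanned;
        +}
        +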
  2704. +static inline int find_zero_page_hash(int strength, u32 hash)
  2705. +{
  2706. +       return (zero_hash_table[strength] == hash);
  2707. +}
  2708. +
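        +/*
        + * A minimal sketch (an assumption, not taken from this hunk) of the
        + * invariant behind find_zero_page_hash() above: zero_hash_table[] is
        + * expected to hold, for each hash strength, the hash of an all-zero page.
        + * Filling one slot could look roughly like the hypothetical helper below;
        + * the real initialisation lives elsewhere in the patch.
        + */
        +static inline void sketch_fill_zero_hash_slot(int strength)
        +{
        +       struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        +
        +       if (!page)
        +               return;
        +
        +       zero_hash_table[strength] = page_hash(page, strength, 0);
        +       __free_page(page);
        +}
        +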
  2709. +static
  2710. +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
  2711. +{
  2712. +       struct page *zero_page = empty_uksm_zero_page;
  2713. +       struct mm_struct *mm = vma->vm_mm;
  2714. +       pte_t orig_pte = __pte(0);
  2715. +       int err = -EFAULT;
  2716. +
  2717. +       if (uksm_test_exit(mm))
  2718. +               goto out;
  2719. +
  2720. +       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
  2721. +               goto out;
  2722. +       BUG_ON(PageTransCompound(page));
  2723. +
  2724. +       if (!PageAnon(page))
  2725. +               goto out;
  2726. +
  2727. +       if (!trylock_page(page))
  2728. +               goto out;
  2729. +
  2730. +       if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
  2731. +               if (is_page_full_zero(page))
  2732. +                       err = replace_page(vma, page, zero_page, orig_pte);
  2733. +       }
  2734. +
  2735. +       unlock_page(page);
  2736. +out:
  2737. +       return err;
  2738. +}
  2739. +
  2740. +/*
  2741. + * cmp_and_merge_page() - first see if page can be merged into the stable
  2742. + * tree; if not, compare hash to previous and if it's the same, see if page
  2743. + * can be inserted into the unstable tree, or merged with a page already there
  2744. + * and both transferred to the stable tree.
  2745. + *
  2746. + * @rmap_item: the reverse mapping item of the page we want to deduplicate
  2747. + * @hash: the hash value of rmap_item->page, already calculated
  2748. + */
  2749. +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
  2750. +{
  2751. +       struct rmap_item *tree_rmap_item;
  2752. +       struct page *page;
  2753. +       struct page *kpage = NULL;
  2754. +       u32 hash_max;
  2755. +       int err;
  2756. +       unsigned int success1, success2;
  2757. +       struct stable_node *snode;
  2758. +       int cmp;
  2759. +       struct rb_node *parent = NULL, **new;
  2760. +
  2761. +       remove_rmap_item_from_tree(rmap_item);
  2762. +       page = rmap_item->page;
  2763. +
  2764. +       /* We first start with searching the page inside the stable tree */
  2765. +       kpage = stable_tree_search(rmap_item, hash);
  2766. +       if (kpage) {
  2767. +               err = try_to_merge_with_uksm_page(rmap_item, kpage,
  2768. +                                                hash);
  2769. +               if (!err) {
  2770. +                       /*
  2771. +                        * The page was successfully merged, add
  2772. +                        * its rmap_item to the stable tree.
  2773. +                        * page lock is needed because it's
  2774. +                        * racing with try_to_unmap_ksm(), etc.
  2775. +                        */
  2776. +                       lock_page(kpage);
  2777. +                       snode = page_stable_node(kpage);
  2778. +                       stable_tree_append(rmap_item, snode, 1);
  2779. +                       unlock_page(kpage);
  2780. +                       put_page(kpage);
  2781. +                       return; /* success */
  2782. +               }
  2783. +               put_page(kpage);
  2784. +
  2785. +               /*
  2786. +                * if it is a collision and it has already been searched in the
  2787. +                * sub-rbtree (hash_max != 0), we want to abort, because if it is
  2788. +                * later merged in the unstable tree, the collision tends to
  2789. +                * happen again.
  2790. +                */
  2791. +               if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
  2792. +                       return;
  2793. +       }
  2794. +
  2795. +       tree_rmap_item =
  2796. +               unstable_tree_search_insert(rmap_item, hash);
  2797. +       if (tree_rmap_item) {
  2798. +               err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
  2799. +               /*
  2800. +                * As soon as we merge this page, we want to remove the
  2801. +                * rmap_item of the page we have merged with from the unstable
  2802. +                * tree, and insert it instead as new node in the stable tree.
  2803. +                */
  2804. +               if (!err) {
  2805. +                       kpage = page;
  2806. +                       remove_rmap_item_from_tree(tree_rmap_item);
  2807. +                       lock_page(kpage);
  2808. +                       snode = stable_tree_insert(&kpage, hash,
  2809. +                                                  rmap_item, tree_rmap_item,
  2810. +                                                  &success1, &success2);
  2811. +
  2812. +                       /*
  2813. +                        * Do not log dedup for the tree item; it's not
  2814. +                        * counted as scanned in this round.
  2815. +                        */
  2816. +                       if (success2)
  2817. +                               stable_tree_append(tree_rmap_item, snode, 0);
  2818. +
  2819. +                       /*
  2820. +                        * The order of these two stable_tree_append()
  2821. +                        * calls matters: rmap_item is the one being scanned.
  2822. +                        */
  2823. +                       if (success1)
  2824. +                               stable_tree_append(rmap_item, snode, 1);
  2825. +
  2826. +                       /*
  2827. +                        * The original kpage may be unlocked inside
  2828. +                        * stable_tree_insert() already. This page
  2829. +                        * should be unlocked before doing
  2830. +                        * break_cow().
  2831. +                        */
  2832. +                       unlock_page(kpage);
  2833. +
  2834. +                       if (!success1)
  2835. +                               break_cow(rmap_item);
  2836. +
  2837. +                       if (!success2)
  2838. +                               break_cow(tree_rmap_item);
  2839. +
  2840. +               } else if (err == MERGE_ERR_COLLI) {
  2841. +                       BUG_ON(tree_rmap_item->tree_node->count > 1);
  2842. +
  2843. +                       rmap_item_hash_max(tree_rmap_item,
  2844. +                                          tree_rmap_item->tree_node->hash);
  2845. +
  2846. +                       hash_max = rmap_item_hash_max(rmap_item, hash);
  2847. +                       cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
  2848. +                       parent = &tree_rmap_item->node;
  2849. +                       if (cmp < 0)
  2850. +                               new = &parent->rb_left;
  2851. +                       else if (cmp > 0)
  2852. +                               new = &parent->rb_right;
  2853. +                       else
  2854. +                               goto put_up_out;
  2855. +
  2856. +                       rmap_item->tree_node = tree_rmap_item->tree_node;
  2857. +                       rmap_item->address |= UNSTABLE_FLAG;
  2858. +                       rmap_item->hash_round = uksm_hash_round;
  2859. +                       rb_link_node(&rmap_item->node, parent, new);
  2860. +                       rb_insert_color(&rmap_item->node,
  2861. +                                       &tree_rmap_item->tree_node->sub_root);
  2862. +                       rmap_item->tree_node->count++;
  2863. +               } else {
  2864. +                       /*
  2865. +                        * Either one of the pages has changed or they
  2866. +                        * collide at the max hash; consider them ill items.
  2867. +                        */
  2868. +                       remove_rmap_item_from_tree(tree_rmap_item);
  2869. +               }
  2870. +put_up_out:
  2871. +               put_page(tree_rmap_item->page);
  2872. +               up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
  2873. +       }
  2874. +}
  2875. +
  2876. +
  2877. +
  2878. +
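        +/*
        + * The per-VMA rmap_list is a virtual array of struct rmap_list_entry
        + * spread over a pool of separately allocated pages
        + * (slot->rmap_list_pool[]).  Each pool page holds
        + * PAGE_SIZE / sizeof(struct rmap_list_entry *) entries, so entry
        + * `index' lives in the pool page and page offset computed by
        + * get_pool_index() and index_page_offset() below.
        + */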
  2879. +static inline unsigned long get_pool_index(struct vma_slot *slot,
  2880. +                                          unsigned long index)
  2881. +{
  2882. +       unsigned long pool_index;
  2883. +
  2884. +       pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
  2885. +       if (pool_index >= slot->pool_size)
  2886. +               BUG();
  2887. +       return pool_index;
  2888. +}
  2889. +
  2890. +static inline unsigned long index_page_offset(unsigned long index)
  2891. +{
  2892. +       return offset_in_page(sizeof(struct rmap_list_entry *) * index);
  2893. +}
  2894. +
  2895. +static inline
  2896. +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
  2897. +                                           unsigned long index, int need_alloc)
  2898. +{
  2899. +       unsigned long pool_index;
  2900. +       struct page *page;
  2901. +       void *addr;
  2902. +
  2903. +
  2904. +       pool_index = get_pool_index(slot, index);
  2905. +       if (!slot->rmap_list_pool[pool_index]) {
  2906. +               if (!need_alloc)
  2907. +                       return NULL;
  2908. +
  2909. +               page = alloc_page(GFP_KERNEL | __GFP_ZERO);
  2910. +               if (!page)
  2911. +                       return NULL;
  2912. +
  2913. +               slot->rmap_list_pool[pool_index] = page;
  2914. +       }
  2915. +
  2916. +       addr = kmap(slot->rmap_list_pool[pool_index]);
  2917. +       addr += index_page_offset(index);
  2918. +
  2919. +       return addr;
  2920. +}
  2921. +
  2922. +static inline void put_rmap_list_entry(struct vma_slot *slot,
  2923. +                                      unsigned long index)
  2924. +{
  2925. +       unsigned long pool_index;
  2926. +
  2927. +       pool_index = get_pool_index(slot, index);
  2928. +       BUG_ON(!slot->rmap_list_pool[pool_index]);
  2929. +       kunmap(slot->rmap_list_pool[pool_index]);
  2930. +}
  2931. +
  2932. +static inline int entry_is_new(struct rmap_list_entry *entry)
  2933. +{
  2934. +       return !entry->item;
  2935. +}
  2936. +
  2937. +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
  2938. +                                               unsigned long index)
  2939. +{
  2940. +       return slot->vma->vm_start + (index << PAGE_SHIFT);
  2941. +}
  2942. +
  2943. +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
  2944. +{
  2945. +       unsigned long addr;
  2946. +
  2947. +       if (is_addr(entry->addr))
  2948. +               addr = get_clean_addr(entry->addr);
  2949. +       else if (entry->item)
  2950. +               addr = get_rmap_addr(entry->item);
  2951. +       else
  2952. +               BUG();
  2953. +
  2954. +       return addr;
  2955. +}
  2956. +
  2957. +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
  2958. +{
  2959. +       if (is_addr(entry->addr))
  2960. +               return NULL;
  2961. +
  2962. +       return entry->item;
  2963. +}
  2964. +
  2965. +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
  2966. +                                           unsigned long index)
  2967. +{
  2968. +       unsigned long pool_index;
  2969. +
  2970. +       pool_index = get_pool_index(slot, index);
  2971. +       BUG_ON(!slot->rmap_list_pool[pool_index]);
  2972. +       slot->pool_counts[pool_index]++;
  2973. +}
  2974. +
  2975. +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
  2976. +                                           unsigned long index)
  2977. +{
  2978. +       unsigned long pool_index;
  2979. +
  2980. +       pool_index = get_pool_index(slot, index);
  2981. +       BUG_ON(!slot->rmap_list_pool[pool_index]);
  2982. +       BUG_ON(!slot->pool_counts[pool_index]);
  2983. +       slot->pool_counts[pool_index]--;
  2984. +}
  2985. +
  2986. +static inline int entry_has_rmap(struct rmap_list_entry *entry)
  2987. +{
  2988. +       return !is_addr(entry->addr) && entry->item;
  2989. +}
  2990. +
  2991. +static inline void swap_entries(struct rmap_list_entry *entry1,
  2992. +                               unsigned long index1,
  2993. +                               struct rmap_list_entry *entry2,
  2994. +                               unsigned long index2)
  2995. +{
  2996. +       struct rmap_list_entry tmp;
  2997. +
  2998. +       /* swapping two new entries is meaningless */
  2999. +       BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
  3000. +
  3001. +       tmp = *entry1;
  3002. +       *entry1 = *entry2;
  3003. +       *entry2 = tmp;
  3004. +
  3005. +       if (entry_has_rmap(entry1))
  3006. +               entry1->item->entry_index = index1;
  3007. +
  3008. +       if (entry_has_rmap(entry2))
  3009. +               entry2->item->entry_index = index2;
  3010. +
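        +       /*
        +        * Entries holding an rmap_item are what pin their pool page
        +        * (see pool_counts).  If the swap moved an item across pool
        +        * pages, move the reference count along with it.
        +        */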
  3011. +       if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
  3012. +               inc_rmap_list_pool_count(entry1->item->slot, index1);
  3013. +               dec_rmap_list_pool_count(entry1->item->slot, index2);
  3014. +       } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
  3015. +               inc_rmap_list_pool_count(entry2->item->slot, index2);
  3016. +               dec_rmap_list_pool_count(entry2->item->slot, index1);
  3017. +       }
  3018. +}
  3019. +
  3020. +static inline void free_entry_item(struct rmap_list_entry *entry)
  3021. +{
  3022. +       unsigned long index;
  3023. +       struct rmap_item *item;
  3024. +
  3025. +       if (!is_addr(entry->addr)) {
  3026. +               BUG_ON(!entry->item);
  3027. +               item = entry->item;
  3028. +               entry->addr = get_rmap_addr(item);
  3029. +               set_is_addr(entry->addr);
  3030. +               index = item->entry_index;
  3031. +               remove_rmap_item_from_tree(item);
  3032. +               dec_rmap_list_pool_count(item->slot, index);
  3033. +               free_rmap_item(item);
  3034. +       }
  3035. +}
  3036. +
  3037. +static inline int pool_entry_boundary(unsigned long index)
  3038. +{
  3039. +       unsigned long linear_addr;
  3040. +
  3041. +       linear_addr = sizeof(struct rmap_list_entry *) * index;
  3042. +       return index && !offset_in_page(linear_addr);
  3043. +}
  3044. +
  3045. +static inline void try_free_last_pool(struct vma_slot *slot,
  3046. +                                     unsigned long index)
  3047. +{
  3048. +       unsigned long pool_index;
  3049. +
  3050. +       pool_index = get_pool_index(slot, index);
  3051. +       if (slot->rmap_list_pool[pool_index] &&
  3052. +           !slot->pool_counts[pool_index]) {
  3053. +               __free_page(slot->rmap_list_pool[pool_index]);
  3054. +               slot->rmap_list_pool[pool_index] = NULL;
  3055. +               slot->flags |= UKSM_SLOT_NEED_SORT;
  3056. +       }
  3057. +
  3058. +}
  3059. +
  3060. +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
  3061. +                                          struct rmap_item *item)
  3062. +{
  3063. +       return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
  3064. +}
  3065. +
  3066. +static int within_same_pool(struct vma_slot *slot,
  3067. +                           unsigned long i, unsigned long j)
  3068. +{
  3069. +       unsigned long pool_i, pool_j;
  3070. +
  3071. +       pool_i = get_pool_index(slot, i);
  3072. +       pool_j = get_pool_index(slot, j);
  3073. +
  3074. +       return (pool_i == pool_j);
  3075. +}
  3076. +
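        +/*
        + * Sort the rmap_list back into address order in place: for each index i
        + * holding an rmap_item whose address really belongs at index j, swap
        + * entries i and j and retry until the entry at i is home.  Plain address
        + * entries are reset to "new".  Pool pages left without any rmap_item
        + * are freed in the second pass.
        + */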
  3077. +static void sort_rmap_entry_list(struct vma_slot *slot)
  3078. +{
  3079. +       unsigned long i, j;
  3080. +       struct rmap_list_entry *entry, *swap_entry;
  3081. +
  3082. +       entry = get_rmap_list_entry(slot, 0, 0);
  3083. +       for (i = 0; i < slot->pages; ) {
  3084. +
  3085. +               if (!entry)
  3086. +                       goto skip_whole_pool;
  3087. +
  3088. +               if (entry_is_new(entry))
  3089. +                       goto next_entry;
  3090. +
  3091. +               if (is_addr(entry->addr)) {
  3092. +                       entry->addr = 0;
  3093. +                       goto next_entry;
  3094. +               }
  3095. +
  3096. +               j = vma_item_index(slot->vma, entry->item);
  3097. +               if (j == i)
  3098. +                       goto next_entry;
  3099. +
  3100. +               if (within_same_pool(slot, i, j))
  3101. +                       swap_entry = entry + j - i;
  3102. +               else
  3103. +                       swap_entry = get_rmap_list_entry(slot, j, 1);
  3104. +
  3105. +               swap_entries(entry, i, swap_entry, j);
  3106. +               if (!within_same_pool(slot, i, j))
  3107. +                       put_rmap_list_entry(slot, j);
  3108. +               continue;
  3109. +
  3110. +skip_whole_pool:
  3111. +               i += PAGE_SIZE / sizeof(*entry);
  3112. +               if (i < slot->pages)
  3113. +                       entry = get_rmap_list_entry(slot, i, 0);
  3114. +               continue;
  3115. +
  3116. +next_entry:
  3117. +               if (i >= slot->pages - 1 ||
  3118. +                   !within_same_pool(slot, i, i + 1)) {
  3119. +                       put_rmap_list_entry(slot, i);
  3120. +                       if (i + 1 < slot->pages)
  3121. +                               entry = get_rmap_list_entry(slot, i + 1, 0);
  3122. +               } else
  3123. +                       entry++;
  3124. +               i++;
  3125. +               continue;
  3126. +       }
  3127. +
  3128. +       /* free empty pool entries which contain no rmap_item */
  3129. +       /* CAN be simplified to rely on pool_counts alone once bug-free !!!!! */
  3130. +       for (i = 0; i < slot->pool_size; i++) {
  3131. +               unsigned char has_rmap;
  3132. +               void *addr;
  3133. +
  3134. +               if (!slot->rmap_list_pool[i])
  3135. +                       continue;
  3136. +
  3137. +               has_rmap = 0;
  3138. +               addr = kmap(slot->rmap_list_pool[i]);
  3139. +               BUG_ON(!addr);
  3140. +               for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
  3141. +                       entry = (struct rmap_list_entry *)addr + j;
  3142. +                       if (is_addr(entry->addr))
  3143. +                               continue;
  3144. +                       if (!entry->item)
  3145. +                               continue;
  3146. +                       has_rmap = 1;
  3147. +               }
  3148. +               kunmap(slot->rmap_list_pool[i]);
  3149. +               if (!has_rmap) {
  3150. +                       BUG_ON(slot->pool_counts[i]);
  3151. +                       __free_page(slot->rmap_list_pool[i]);
  3152. +                       slot->rmap_list_pool[i] = NULL;
  3153. +               }
  3154. +       }
  3155. +
  3156. +       slot->flags &= ~UKSM_SLOT_NEED_SORT;
  3157. +}
  3158. +
  3159. +/*
  3160. + * vma_fully_scanned() - return true if all pages in this slot were scanned.
  3161. + */
  3162. +static inline int vma_fully_scanned(struct vma_slot *slot)
  3163. +{
  3164. +       return slot->pages_scanned == slot->pages;
  3165. +}
  3166. +
  3167. +/**
  3168. + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
  3169. + * its random permutation. The random permutation index management is
  3170. + * embedded in this function.
  3171. + */
  3172. +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
  3173. +{
  3174. +       unsigned long rand_range, addr, swap_index, scan_index;
  3175. +       struct rmap_item *item = NULL;
  3176. +       struct rmap_list_entry *scan_entry, *swap_entry = NULL;
  3177. +       struct page *page;
  3178. +
  3179. +       scan_index = swap_index = slot->pages_scanned % slot->pages;
  3180. +
  3181. +       if (pool_entry_boundary(scan_index))
  3182. +               try_free_last_pool(slot, scan_index - 1);
  3183. +
  3184. +       if (vma_fully_scanned(slot)) {
  3185. +               if (slot->flags & UKSM_SLOT_NEED_SORT)
  3186. +                       slot->flags |= UKSM_SLOT_NEED_RERAND;
  3187. +               else
  3188. +                       slot->flags &= ~UKSM_SLOT_NEED_RERAND;
  3189. +               if (slot->flags & UKSM_SLOT_NEED_SORT)
  3190. +                       sort_rmap_entry_list(slot);
  3191. +       }
  3192. +
  3193. +       scan_entry = get_rmap_list_entry(slot, scan_index, 1);
  3194. +       if (!scan_entry)
  3195. +               return NULL;
  3196. +
  3197. +       if (entry_is_new(scan_entry)) {
  3198. +               scan_entry->addr = get_index_orig_addr(slot, scan_index);
  3199. +               set_is_addr(scan_entry->addr);
  3200. +       }
  3201. +
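        +       /*
        +        * Fisher-Yates style shuffle step: pick a random index in
        +        * [scan_index, pages) and swap it with the current scan position,
        +        * so a full scan round visits the pages in a random permutation.
        +        */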
  3202. +       if (slot->flags & UKSM_SLOT_NEED_RERAND) {
  3203. +               rand_range = slot->pages - scan_index;
  3204. +               BUG_ON(!rand_range);
  3205. +               swap_index = scan_index + (random32() % rand_range);
  3206. +       }
  3207. +
  3208. +       if (swap_index != scan_index) {
  3209. +               swap_entry = get_rmap_list_entry(slot, swap_index, 1);
  3210. +               if (entry_is_new(swap_entry)) {
  3211. +                       swap_entry->addr = get_index_orig_addr(slot,
  3212. +                                                              swap_index);
  3213. +                       set_is_addr(swap_entry->addr);
  3214. +               }
  3215. +               swap_entries(scan_entry, scan_index, swap_entry, swap_index);
  3216. +       }
  3217. +
  3218. +       addr = get_entry_address(scan_entry);
  3219. +       item = get_entry_item(scan_entry);
  3220. +       BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
  3221. +
  3222. +       page = follow_page(slot->vma, addr, FOLL_GET);
  3223. +       if (IS_ERR_OR_NULL(page))
  3224. +               goto nopage;
  3225. +
  3226. +       if (!PageAnon(page) && !page_trans_compound_anon(page))
  3227. +               goto putpage;
  3228. +
  3229. +       /* check whether this is the zero_page pfn or the uksm_zero_page */
  3230. +       if ((page_to_pfn(page) == zero_pfn)
  3231. +                       || (page_to_pfn(page) == uksm_zero_pfn))
  3232. +               goto putpage;
  3233. +
  3234. +       flush_anon_page(slot->vma, page, addr);
  3235. +       flush_dcache_page(page);
  3236. +
  3237. +
  3238. +       *hash = page_hash(page, hash_strength, 1);
  3239. +       inc_uksm_pages_scanned();
  3240. +       /* if the page content is all zero, re-map it to the zero page */
  3241. +       if (find_zero_page_hash(hash_strength, *hash)) {
  3242. +               if (!cmp_and_merge_zero_page(slot->vma, page)) {
  3243. +                       slot->pages_merged++;
  3244. +                       __inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
  3245. +                       dec_mm_counter(slot->mm, MM_ANONPAGES);
  3246. +
  3247. +                       /* For full-zero pages, no need to create rmap item */
  3248. +                       goto putpage;
  3249. +               } else {
  3250. +                       inc_rshash_neg(memcmp_cost / 2);
  3251. +               }
  3252. +       }
  3253. +
  3254. +       if (!item) {
  3255. +               item = alloc_rmap_item();
  3256. +               if (item) {
  3257. +                       /* It has already been zeroed */
  3258. +                       item->slot = slot;
  3259. +                       item->address = addr;
  3260. +                       item->entry_index = scan_index;
  3261. +                       scan_entry->item = item;
  3262. +                       inc_rmap_list_pool_count(slot, scan_index);
  3263. +               } else
  3264. +                       goto putpage;
  3265. +       }
  3266. +
  3267. +       BUG_ON(item->slot != slot);
  3268. +       /* the page may have changed */
  3269. +       item->page = page;
  3270. +       put_rmap_list_entry(slot, scan_index);
  3271. +       if (swap_entry)
  3272. +               put_rmap_list_entry(slot, swap_index);
  3273. +       return item;
  3274. +
  3275. +putpage:
  3276. +       put_page(page);
  3277. +       page = NULL;
  3278. +nopage:
  3279. +       /* no page, store addr back and free rmap_item if possible */
  3280. +       free_entry_item(scan_entry);
  3281. +       put_rmap_list_entry(slot, scan_index);
  3282. +       if (swap_entry)
  3283. +               put_rmap_list_entry(slot, swap_index);
  3284. +       return NULL;
  3285. +}
  3286. +
  3287. +static inline int in_stable_tree(struct rmap_item *rmap_item)
  3288. +{
  3289. +       return rmap_item->address & STABLE_FLAG;
  3290. +}
  3291. +
  3292. +/**
  3293. + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
  3294. + * mmap_sem locked.
  3295. + */
  3296. +static noinline void scan_vma_one_page(struct vma_slot *slot)
  3297. +{
  3298. +       u32 hash;
  3299. +       struct mm_struct *mm;
  3300. +       struct rmap_item *rmap_item = NULL;
  3301. +       struct vm_area_struct *vma = slot->vma;
  3302. +
  3303. +       mm = vma->vm_mm;
  3304. +       BUG_ON(!mm);
  3305. +       BUG_ON(!slot);
  3306. +
  3307. +       rmap_item = get_next_rmap_item(slot, &hash);
  3308. +       if (!rmap_item)
  3309. +               goto out1;
  3310. +
  3311. +       if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
  3312. +               goto out2;
  3313. +
  3314. +       cmp_and_merge_page(rmap_item, hash);
  3315. +out2:
  3316. +       put_page(rmap_item->page);
  3317. +out1:
  3318. +       slot->pages_scanned++;
  3319. +       if (slot->fully_scanned_round != fully_scanned_round)
  3320. +               scanned_virtual_pages++;
  3321. +
  3322. +       if (vma_fully_scanned(slot))
  3323. +               slot->fully_scanned_round = fully_scanned_round;
  3324. +}
  3325. +
  3326. +static inline unsigned long rung_get_pages(struct scan_rung *rung)
  3327. +{
  3328. +       struct slot_tree_node *node;
  3329. +
  3330. +       if (!rung->vma_root.rnode)
  3331. +               return 0;
  3332. +
  3333. +       node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
  3334. +
  3335. +       return node->size;
  3336. +}
  3337. +
  3338. +#define RUNG_SAMPLED_MIN       3
  3339. +
  3340. +static inline
  3341. +void uksm_calc_rung_step(struct scan_rung *rung,
  3342. +                        unsigned long page_time, unsigned long ratio)
  3343. +{
  3344. +       unsigned long sampled, pages;
  3345. +
  3346. +       /* will be fully scanned ? */
  3347. +       if (!rung->cover_msecs) {
  3348. +               rung->step = 1;
  3349. +               return;
  3350. +       }
  3351. +
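        +       /*
        +        * Pages we can afford to sample within cover_msecs at this rung's
        +        * CPU ratio: the time budget in ns divided by the estimated
        +        * per-page scan time.
        +        */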
  3352. +       sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
  3353. +                 * ratio / page_time;
  3354. +
  3355. +       /*
  3356. +        *  Before we finish a scan round and its expensive per-round jobs,
  3357. +        *  we need to have a chance to estimate the per-page time. So
  3358. +        *  the sampled number cannot be too small.
  3359. +        */
  3360. +       if (sampled < RUNG_SAMPLED_MIN)
  3361. +               sampled = RUNG_SAMPLED_MIN;
  3362. +
  3363. +       pages = rung_get_pages(rung);
  3364. +       if (likely(pages > sampled))
  3365. +               rung->step = pages / sampled;
  3366. +       else
  3367. +               rung->step = 1;
  3368. +}
  3369. +
  3370. +static inline int step_need_recalc(struct scan_rung *rung)
  3371. +{
  3372. +       unsigned long pages, stepmax;
  3373. +
  3374. +       pages = rung_get_pages(rung);
  3375. +       stepmax = pages / RUNG_SAMPLED_MIN;
  3376. +
  3377. +       return pages && (rung->step > pages ||
  3378. +                        (stepmax && rung->step > stepmax));
  3379. +}
  3380. +
  3381. +static inline
  3382. +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
  3383. +{
  3384. +       struct vma_slot *slot;
  3385. +
  3386. +       if (finished)
  3387. +               rung->flags |= UKSM_RUNG_ROUND_FINISHED;
  3388. +
  3389. +       if (step_recalc || step_need_recalc(rung)) {
  3390. +               uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
  3391. +               BUG_ON(step_need_recalc(rung));
  3392. +       }
  3393. +
  3394. +       slot_iter_index = random32() % rung->step;
  3395. +       BUG_ON(!rung->vma_root.rnode);
  3396. +       slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
  3397. +       BUG_ON(!slot);
  3398. +
  3399. +       rung->current_scan = slot;
  3400. +       rung->current_offset = slot_iter_index;
  3401. +}
  3402. +
  3403. +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
  3404. +{
  3405. +       return &slot->rung->vma_root;
  3406. +}
  3407. +
  3408. +/*
  3409. + * Return 1 if the current scan position was reset, 0 otherwise.
  3410. + */
  3411. +static int advance_current_scan(struct scan_rung *rung)
  3412. +{
  3413. +       unsigned short n;
  3414. +       struct vma_slot *slot, *next = NULL;
  3415. +
  3416. +       BUG_ON(!rung->vma_root.num);
  3417. +
  3418. +       slot = rung->current_scan;
  3419. +       n = (slot->pages - rung->current_offset) % rung->step;
  3420. +       slot_iter_index = rung->step - n;
  3421. +       next = sradix_tree_next(&rung->vma_root, slot->snode,
  3422. +                               slot->sindex, slot_iter);
  3423. +
  3424. +       if (next) {
  3425. +               rung->current_offset = slot_iter_index;
  3426. +               rung->current_scan = next;
  3427. +               return 0;
  3428. +       } else {
  3429. +               reset_current_scan(rung, 1, 0);
  3430. +               return 1;
  3431. +       }
  3432. +}
  3433. +
  3434. +static inline void rung_rm_slot(struct vma_slot *slot)
  3435. +{
  3436. +       struct scan_rung *rung = slot->rung;
  3437. +       struct sradix_tree_root *root;
  3438. +
  3439. +       if (rung->current_scan == slot)
  3440. +               advance_current_scan(rung);
  3441. +
  3442. +       root = slot_get_root(slot);
  3443. +       sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
  3444. +       slot->snode = NULL;
  3445. +       if (step_need_recalc(rung)) {
  3446. +               uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
  3447. +               BUG_ON(step_need_recalc(rung));
  3448. +       }
  3449. +
  3450. +       /* In case advance_current_scan looped back to this slot again */
  3451. +       if (rung->vma_root.num && rung->current_scan == slot)
  3452. +               reset_current_scan(slot->rung, 1, 0);
  3453. +}
  3454. +
  3455. +static inline void rung_add_new_slots(struct scan_rung *rung,
  3456. +                       struct vma_slot **slots, unsigned long num)
  3457. +{
  3458. +       int err;
  3459. +       struct vma_slot *slot;
  3460. +       unsigned long i;
  3461. +       struct sradix_tree_root *root = &rung->vma_root;
  3462. +
  3463. +       err = sradix_tree_enter(root, (void **)slots, num);
  3464. +       BUG_ON(err);
  3465. +
  3466. +       for (i = 0; i < num; i++) {
  3467. +               slot = slots[i];
  3468. +               slot->rung = rung;
  3469. +               BUG_ON(vma_fully_scanned(slot));
  3470. +       }
  3471. +
  3472. +       if (rung->vma_root.num == num)
  3473. +               reset_current_scan(rung, 0, 1);
  3474. +}
  3475. +
  3476. +static inline int rung_add_one_slot(struct scan_rung *rung,
  3477. +                                    struct vma_slot *slot)
  3478. +{
  3479. +       int err;
  3480. +
  3481. +       err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
  3482. +       if (err)
  3483. +               return err;
  3484. +
  3485. +       slot->rung = rung;
  3486. +       if (rung->vma_root.num == 1)
  3487. +               reset_current_scan(rung, 0, 1);
  3488. +
  3489. +       return 0;
  3490. +}
  3491. +
  3492. +/*
  3493. + * Return true if the slot is deleted from its rung.
  3494. + */
  3495. +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
  3496. +{
  3497. +       struct scan_rung *old_rung = slot->rung;
  3498. +       int err;
  3499. +
  3500. +       if (old_rung == rung)
  3501. +               return 0;
  3502. +
  3503. +       rung_rm_slot(slot);
  3504. +       err = rung_add_one_slot(rung, slot);
  3505. +       if (err) {
  3506. +               err = rung_add_one_slot(old_rung, slot);
  3507. +               WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
  3508. +       }
  3509. +
  3510. +       return 1;
  3511. +}
  3512. +
  3513. +static inline int vma_rung_up(struct vma_slot *slot)
  3514. +{
  3515. +       struct scan_rung *rung;
  3516. +
  3517. +       rung = slot->rung;
  3518. +       if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
  3519. +               rung++;
  3520. +
  3521. +       return vma_rung_enter(slot, rung);
  3522. +}
  3523. +
  3524. +static inline int vma_rung_down(struct vma_slot *slot)
  3525. +{
  3526. +       struct scan_rung *rung;
  3527. +
  3528. +       rung = slot->rung;
  3529. +       if (slot->rung != &uksm_scan_ladder[0])
  3530. +               rung--;
  3531. +
  3532. +       return vma_rung_enter(slot, rung);
  3533. +}
  3534. +
  3535. +/**
  3536. + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
  3537. + */
  3538. +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
  3539. +{
  3540. +       unsigned long ret;
  3541. +
  3542. +       BUG_ON(slot->pages_scanned == slot->last_scanned);
  3543. +
  3544. +       ret = slot->pages_merged;
  3545. +
  3546. +       /* Thrashing area filtering */
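        +       /*
        +        * If more than uksm_thrash_threshold percent of the merged pages
        +        * were COWed back, treat the area as thrashing and report no
        +        * benefit; otherwise only count merged minus COWed pages.
        +        */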
  3547. +       if (ret && uksm_thrash_threshold) {
  3548. +               if (slot->pages_cowed * 100 / slot->pages_merged
  3549. +                   > uksm_thrash_threshold) {
  3550. +                       ret = 0;
  3551. +               } else {
  3552. +                       ret = slot->pages_merged - slot->pages_cowed;
  3553. +               }
  3554. +       }
  3555. +
  3556. +       return ret;
  3557. +}
  3558. +
  3559. +/**
  3560. + * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot.
  3561. + */
  3562. +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
  3563. +{
  3564. +       unsigned long ret;
  3565. +       unsigned long pages_scanned;
  3566. +
  3567. +       pages_scanned = slot->pages_scanned;
  3568. +       if (!pages_scanned) {
  3569. +               if (uksm_thrash_threshold)
  3570. +                       return 0;
  3571. +               else
  3572. +                       pages_scanned = slot->pages_scanned;
  3573. +       }
  3574. +
  3575. +       ret = slot->pages_bemerged * 100 / pages_scanned;
  3576. +
  3577. +       /* Thrashing area filtering */
  3578. +       if (ret && uksm_thrash_threshold) {
  3579. +               if (slot->pages_cowed * 100 / slot->pages_bemerged
  3580. +                   > uksm_thrash_threshold) {
  3581. +                       ret = 0;
  3582. +               } else {
  3583. +                       ret = slot->pages_bemerged - slot->pages_cowed;
  3584. +               }
  3585. +       }
  3586. +
  3587. +       return ret;
  3588. +}
  3589. +
  3590. +/**
  3591. + * stable_node_reinsert() - When the hash_strength has been adjusted, the
  3592. + * stable tree needs to be restructured; this is the function that re-inserts
  3593. + * a stable node.
  3594. + */
  3595. +static inline void stable_node_reinsert(struct stable_node *new_node,
  3596. +                                       struct page *page,
  3597. +                                       struct rb_root *root_treep,
  3598. +                                       struct list_head *tree_node_listp,
  3599. +                                       u32 hash)
  3600. +{
  3601. +       struct rb_node **new = &root_treep->rb_node;
  3602. +       struct rb_node *parent = NULL;
  3603. +       struct stable_node *stable_node;
  3604. +       struct tree_node *tree_node;
  3605. +       struct page *tree_page;
  3606. +       int cmp;
  3607. +
  3608. +       while (*new) {
  3609. +               int cmp;
  3610. +
  3611. +               tree_node = rb_entry(*new, struct tree_node, node);
  3612. +
  3613. +               cmp = hash_cmp(hash, tree_node->hash);
  3614. +
  3615. +               if (cmp < 0) {
  3616. +                       parent = *new;
  3617. +                       new = &parent->rb_left;
  3618. +               } else if (cmp > 0) {
  3619. +                       parent = *new;
  3620. +                       new = &parent->rb_right;
  3621. +               } else
  3622. +                       break;
  3623. +       }
  3624. +
  3625. +       if (*new) {
  3626. +               /* find a stable tree node with same first level hash value */
  3627. +               stable_node_hash_max(new_node, page, hash);
  3628. +               if (tree_node->count == 1) {
  3629. +                       stable_node = rb_entry(tree_node->sub_root.rb_node,
  3630. +                                              struct stable_node, node);
  3631. +                       tree_page = get_uksm_page(stable_node, 1, 0);
  3632. +                       if (tree_page) {
  3633. +                               stable_node_hash_max(stable_node,
  3634. +                                                     tree_page, hash);
  3635. +                               put_page(tree_page);
  3636. +
  3637. +                               /* prepare for stable node insertion */
  3638. +
  3639. +                               cmp = hash_cmp(new_node->hash_max,
  3640. +                                                  stable_node->hash_max);
  3641. +                               parent = &stable_node->node;
  3642. +                               if (cmp < 0)
  3643. +                                       new = &parent->rb_left;
  3644. +                               else if (cmp > 0)
  3645. +                                       new = &parent->rb_right;
  3646. +                               else
  3647. +                                       goto failed;
  3648. +
  3649. +                               goto add_node;
  3650. +                       } else {
  3651. +                               /* the only stable_node was deleted,
  3652. +                                * but the tree node was not.
  3653. +                                */
  3654. +                               goto tree_node_reuse;
  3655. +                       }
  3656. +               }
  3657. +
  3658. +               /* well, search the collision subtree */
  3659. +               new = &tree_node->sub_root.rb_node;
  3660. +               parent = NULL;
  3661. +               BUG_ON(!*new);
  3662. +               while (*new) {
  3663. +                       int cmp;
  3664. +
  3665. +                       stable_node = rb_entry(*new, struct stable_node, node);
  3666. +
  3667. +                       cmp = hash_cmp(new_node->hash_max,
  3668. +                                          stable_node->hash_max);
  3669. +
  3670. +                       if (cmp < 0) {
  3671. +                               parent = *new;
  3672. +                               new = &parent->rb_left;
  3673. +                       } else if (cmp > 0) {
  3674. +                               parent = *new;
  3675. +                               new = &parent->rb_right;
  3676. +                       } else {
  3677. +                               /* oh, no, still a collision */
  3678. +                               goto failed;
  3679. +                       }
  3680. +               }
  3681. +
  3682. +               goto add_node;
  3683. +       }
  3684. +
  3685. +       /* no tree node found */
  3686. +       tree_node = alloc_tree_node(tree_node_listp);
  3687. +       if (!tree_node) {
  3688. +               printk(KERN_ERR "UKSM: memory allocation error!\n");
  3689. +               goto failed;
  3690. +       } else {
  3691. +               tree_node->hash = hash;
  3692. +               rb_link_node(&tree_node->node, parent, new);
  3693. +               rb_insert_color(&tree_node->node, root_treep);
  3694. +
  3695. +tree_node_reuse:
  3696. +               /* prepare for stable node insertion */
  3697. +               parent = NULL;
  3698. +               new = &tree_node->sub_root.rb_node;
  3699. +       }
  3700. +
  3701. +add_node:
  3702. +       rb_link_node(&new_node->node, parent, new);
  3703. +       rb_insert_color(&new_node->node, &tree_node->sub_root);
  3704. +       new_node->tree_node = tree_node;
  3705. +       tree_node->count++;
  3706. +       return;
  3707. +
  3708. +failed:
  3709. +       /* This can only happen when two nodes have collided
  3710. +        * at both hash levels.
  3711. +        */
  3712. +       new_node->tree_node = NULL;
  3713. +       return;
  3714. +}
  3715. +
  3716. +static inline void free_all_tree_nodes(struct list_head *list)
  3717. +{
  3718. +       struct tree_node *node, *tmp;
  3719. +
  3720. +       list_for_each_entry_safe(node, tmp, list, all_list) {
  3721. +               free_tree_node(node);
  3722. +       }
  3723. +}
  3724. +
  3725. +/**
  3726. + * stable_tree_delta_hash() - Delta hash the stable tree from the previous
  3727. + * hash strength to the current hash_strength. It re-structures the whole tree.
  3728. + */
  3729. +static inline void stable_tree_delta_hash(u32 prev_hash_strength)
  3730. +{
  3731. +       struct stable_node *node, *tmp;
  3732. +       struct rb_root *root_new_treep;
  3733. +       struct list_head *new_tree_node_listp;
  3734. +
  3735. +       stable_tree_index = (stable_tree_index + 1) % 2;
  3736. +       root_new_treep = &root_stable_tree[stable_tree_index];
  3737. +       new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
  3738. +       *root_new_treep = RB_ROOT;
  3739. +       BUG_ON(!list_empty(new_tree_node_listp));
  3740. +
  3741. +       /*
  3742. +        * We need to be safe: the node could be removed by get_uksm_page()
  3743. +        */
  3744. +       list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
  3745. +               void *addr;
  3746. +               struct page *node_page;
  3747. +               u32 hash;
  3748. +
  3749. +               /*
  3750. +                * We are completely re-structuring the stable nodes into a new
  3751. +                * stable tree. We don't bother unlinking from the old tree or
  3752. +                * touching the old tree_nodes; they will be freed all at once.
  3753. +                */
  3754. +               node_page = get_uksm_page(node, 0, 0);
  3755. +               if (!node_page)
  3756. +                       continue;
  3757. +
  3758. +               if (node->tree_node) {
  3759. +                       hash = node->tree_node->hash;
  3760. +
  3761. +                       addr = kmap_atomic(node_page);
  3762. +
  3763. +                       hash = delta_hash(addr, prev_hash_strength,
  3764. +                                         hash_strength, hash);
  3765. +                       kunmap_atomic(addr);
  3766. +               } else {
  3767. +                       /*
  3768. +                        * It was not inserted into the rbtree due to a
  3769. +                        * collision in the last scan round.
  3770. +                        */
  3771. +                       hash = page_hash(node_page, hash_strength, 0);
  3772. +               }
  3773. +
  3774. +               stable_node_reinsert(node, node_page, root_new_treep,
  3775. +                                    new_tree_node_listp, hash);
  3776. +               put_page(node_page);
  3777. +       }
  3778. +
  3779. +       root_stable_treep = root_new_treep;
  3780. +       free_all_tree_nodes(stable_tree_node_listp);
  3781. +       BUG_ON(!list_empty(stable_tree_node_listp));
  3782. +       stable_tree_node_listp = new_tree_node_listp;
  3783. +}
  3784. +
  3785. +static inline void inc_hash_strength(unsigned long delta)
  3786. +{
  3787. +       hash_strength += 1 << delta;
  3788. +       if (hash_strength > HASH_STRENGTH_MAX)
  3789. +               hash_strength = HASH_STRENGTH_MAX;
  3790. +}
  3791. +
  3792. +static inline void dec_hash_strength(unsigned long delta)
  3793. +{
  3794. +       unsigned long change = 1 << delta;
  3795. +
  3796. +       if (hash_strength <= change + 1)
  3797. +               hash_strength = 1;
  3798. +       else
  3799. +               hash_strength -= change;
  3800. +}
  3801. +
  3802. +static inline void inc_hash_strength_delta(void)
  3803. +{
  3804. +       hash_strength_delta++;
  3805. +       if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
  3806. +               hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
  3807. +}
  3808. +
  3809. +/*
  3810. +static inline unsigned long get_current_neg_ratio(void)
  3811. +{
  3812. +       if (!rshash_pos || rshash_neg > rshash_pos)
  3813. +               return 100;
  3814. +
  3815. +       return div64_u64(100 * rshash_neg , rshash_pos);
  3816. +}
  3817. +*/
  3818. +
  3819. +static inline unsigned long get_current_neg_ratio(void)
  3820. +{
  3821. +       u64 pos = benefit.pos;
  3822. +       u64 neg = benefit.neg;
  3823. +
  3824. +       if (!neg)
  3825. +               return 0;
  3826. +
  3827. +       if (!pos || neg > pos)
  3828. +               return 100;
  3829. +
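        +       /* compute neg * 100 / pos without overflowing 64 bits */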
  3830. +       if (neg > div64_u64(U64_MAX, 100))
  3831. +               pos = div64_u64(pos, 100);
  3832. +       else
  3833. +               neg *= 100;
  3834. +
  3835. +       return div64_u64(neg, pos);
  3836. +}
  3837. +
  3838. +static inline unsigned long get_current_benefit(void)
  3839. +{
  3840. +       u64 pos = benefit.pos;
  3841. +       u64 neg = benefit.neg;
  3842. +       u64 scanned = benefit.scanned;
  3843. +
  3844. +       if (neg > pos)
  3845. +               return 0;
  3846. +
  3847. +       return div64_u64((pos - neg), scanned);
  3848. +}
  3849. +
  3850. +static inline int judge_rshash_direction(void)
  3851. +{
  3852. +       u64 current_neg_ratio, stable_benefit;
  3853. +       u64 current_benefit, delta = 0;
  3854. +       int ret = STILL;
  3855. +
  3856. +       /* Try to probe a value shortly after boot, and also in case the
  3857. +          system stays still for a long time. */
  3858. +       if ((fully_scanned_round & 0xFFULL) == 10) {
  3859. +               ret = OBSCURE;
  3860. +               goto out;
  3861. +       }
  3862. +
  3863. +       current_neg_ratio = get_current_neg_ratio();
  3864. +
  3865. +       if (current_neg_ratio == 0) {
  3866. +               rshash_neg_cont_zero++;
  3867. +               if (rshash_neg_cont_zero > 2)
  3868. +                       return GO_DOWN;
  3869. +               else
  3870. +                       return STILL;
  3871. +       }
  3872. +       rshash_neg_cont_zero = 0;
  3873. +
  3874. +       if (current_neg_ratio > 90) {
  3875. +               ret = GO_UP;
  3876. +               goto out;
  3877. +       }
  3878. +
  3879. +       current_benefit = get_current_benefit();
  3880. +       stable_benefit = rshash_state.stable_benefit;
  3881. +
  3882. +       if (!stable_benefit) {
  3883. +               ret = OBSCURE;
  3884. +               goto out;
  3885. +       }
  3886. +
  3887. +       if (current_benefit > stable_benefit)
  3888. +               delta = current_benefit - stable_benefit;
  3889. +       else if (current_benefit < stable_benefit)
  3890. +               delta = stable_benefit - current_benefit;
  3891. +
  3892. +       delta = div64_u64(100 * delta , stable_benefit);
  3893. +
  3894. +       if (delta > 50) {
  3895. +               rshash_cont_obscure++;
  3896. +               if (rshash_cont_obscure > 2)
  3897. +                       return OBSCURE;
  3898. +               else
  3899. +                       return STILL;
  3900. +       }
  3901. +
  3902. +out:
  3903. +       rshash_cont_obscure = 0;
  3904. +       return ret;
  3905. +}
  3906. +
  3907. +/**
  3908. + * rshash_adjust() - The main function controlling the random sampling state
  3909. + * machine for hash strength adaptation.
  3910. + *
  3911. + * Return true if hash_strength has changed.
  3912. + */
  3913. +static inline int rshash_adjust(void)
  3914. +{
  3915. +       unsigned long prev_hash_strength = hash_strength;
  3916. +
  3917. +       if (!encode_benefit())
  3918. +               return 0;
  3919. +
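        +       /*
        +        * State machine summary: RSHASH_STILL holds the current strength
        +        * and watches the benefit; on an OBSCURE verdict it records a
        +        * stable point and probes downwards (RSHASH_TRYDOWN), then
        +        * upwards (RSHASH_TRYUP), and finally settles on whichever
        +        * turning point gave the better benefit
        +        * (RSHASH_PRE_STILL -> RSHASH_STILL).
        +        */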
  3920. +       switch (rshash_state.state) {
  3921. +       case RSHASH_STILL:
  3922. +               switch (judge_rshash_direction()) {
  3923. +               case GO_UP:
  3924. +                       if (rshash_state.pre_direct == GO_DOWN)
  3925. +                               hash_strength_delta = 0;
  3926. +
  3927. +                       inc_hash_strength(hash_strength_delta);
  3928. +                       inc_hash_strength_delta();
  3929. +                       rshash_state.stable_benefit = get_current_benefit();
  3930. +                       rshash_state.pre_direct = GO_UP;
  3931. +                       break;
  3932. +
  3933. +               case GO_DOWN:
  3934. +                       if (rshash_state.pre_direct == GO_UP)
  3935. +                               hash_strength_delta = 0;
  3936. +
  3937. +                       dec_hash_strength(hash_strength_delta);
  3938. +                       inc_hash_strength_delta();
  3939. +                       rshash_state.stable_benefit = get_current_benefit();
  3940. +                       rshash_state.pre_direct = GO_DOWN;
  3941. +                       break;
  3942. +
  3943. +               case OBSCURE:
  3944. +                       rshash_state.stable_point = hash_strength;
  3945. +                       rshash_state.turn_point_down = hash_strength;
  3946. +                       rshash_state.turn_point_up = hash_strength;
  3947. +                       rshash_state.turn_benefit_down = get_current_benefit();
  3948. +                       rshash_state.turn_benefit_up = get_current_benefit();
  3949. +                       rshash_state.lookup_window_index = 0;
  3950. +                       rshash_state.state = RSHASH_TRYDOWN;
  3951. +                       dec_hash_strength(hash_strength_delta);
  3952. +                       inc_hash_strength_delta();
  3953. +                       break;
  3954. +
  3955. +               case STILL:
  3956. +                       break;
  3957. +               default:
  3958. +                       BUG();
  3959. +               }
  3960. +               break;
  3961. +
  3962. +       case RSHASH_TRYDOWN:
  3963. +               if (rshash_state.lookup_window_index++ % 5 == 0)
  3964. +                       rshash_state.below_count = 0;
  3965. +
  3966. +               if (get_current_benefit() < rshash_state.stable_benefit)
  3967. +                       rshash_state.below_count++;
  3968. +               else if (get_current_benefit() >
  3969. +                        rshash_state.turn_benefit_down) {
  3970. +                       rshash_state.turn_point_down = hash_strength;
  3971. +                       rshash_state.turn_benefit_down = get_current_benefit();
  3972. +               }
  3973. +
  3974. +               if (rshash_state.below_count >= 3 ||
  3975. +                   judge_rshash_direction() == GO_UP ||
  3976. +                   hash_strength == 1) {
  3977. +                       hash_strength = rshash_state.stable_point;
  3978. +                       hash_strength_delta = 0;
  3979. +                       inc_hash_strength(hash_strength_delta);
  3980. +                       inc_hash_strength_delta();
  3981. +                       rshash_state.lookup_window_index = 0;
  3982. +                       rshash_state.state = RSHASH_TRYUP;
  3983. +                       hash_strength_delta = 0;
  3984. +               } else {
  3985. +                       dec_hash_strength(hash_strength_delta);
  3986. +                       inc_hash_strength_delta();
  3987. +               }
  3988. +               break;
  3989. +
  3990. +       case RSHASH_TRYUP:
  3991. +               if (rshash_state.lookup_window_index++ % 5 == 0)
  3992. +                       rshash_state.below_count = 0;
  3993. +
  3994. +               if (get_current_benefit() < rshash_state.turn_benefit_down)
  3995. +                       rshash_state.below_count++;
  3996. +               else if (get_current_benefit() > rshash_state.turn_benefit_up) {
  3997. +                       rshash_state.turn_point_up = hash_strength;
  3998. +                       rshash_state.turn_benefit_up = get_current_benefit();
  3999. +               }
  4000. +
  4001. +               if (rshash_state.below_count >= 3 ||
  4002. +                   judge_rshash_direction() == GO_DOWN ||
  4003. +                   hash_strength == HASH_STRENGTH_MAX) {
  4004. +                       hash_strength = rshash_state.turn_benefit_up >
  4005. +                               rshash_state.turn_benefit_down ?
  4006. +                               rshash_state.turn_point_up :
  4007. +                               rshash_state.turn_point_down;
  4008. +
  4009. +                       rshash_state.state = RSHASH_PRE_STILL;
  4010. +               } else {
  4011. +                       inc_hash_strength(hash_strength_delta);
  4012. +                       inc_hash_strength_delta();
  4013. +               }
  4014. +
  4015. +               break;
  4016. +
  4017. +       case RSHASH_NEW:
  4018. +       case RSHASH_PRE_STILL:
  4019. +               rshash_state.stable_benefit = get_current_benefit();
  4020. +               rshash_state.state = RSHASH_STILL;
  4021. +               hash_strength_delta = 0;
  4022. +               break;
  4023. +       default:
  4024. +               BUG();
  4025. +       }
  4026. +
  4027. +       /* rshash_neg = rshash_pos = 0; */
  4028. +       reset_benefit();
  4029. +
  4030. +       if (prev_hash_strength != hash_strength)
  4031. +               stable_tree_delta_hash(prev_hash_strength);
  4032. +
  4033. +       return prev_hash_strength != hash_strength;
  4034. +}
  4035. +
  4036. +/**
  4037. + * round_update_ladder() - The main function that updates all the
  4038. + * adjustments whenever a scan round is finished.
  4039. + */
  4040. +static noinline void round_update_ladder(void)
  4041. +{
  4042. +       int i;
  4043. +       unsigned long dedup;
  4044. +       struct vma_slot *slot, *tmp_slot;
  4045. +
  4046. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  4047. +               uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
  4048. +       }
  4049. +
  4050. +       list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
  4051. +
  4052. +               /* slot may have been removed by rung_rm_slot() when its mm exited */
  4053. +               if (slot->snode) {
  4054. +                       dedup = cal_dedup_ratio_old(slot);
  4055. +                       if (dedup && dedup >= uksm_abundant_threshold)
  4056. +                               vma_rung_up(slot);
  4057. +               }
  4058. +
  4059. +               slot->pages_bemerged = 0;
  4060. +               slot->pages_cowed = 0;
  4061. +
  4062. +               list_del_init(&slot->dedup_list);
  4063. +       }
  4064. +}
  4065. +
  4066. +static void uksm_del_vma_slot(struct vma_slot *slot)
  4067. +{
  4068. +       int i, j;
  4069. +       struct rmap_list_entry *entry;
  4070. +
  4071. +       if (slot->snode) {
  4072. +               /*
  4073. +                * Only needed if the slot actually entered a rung; it may
  4074. +                * have failed on entry and have no snode.
  4075. +                */
  4076. +               rung_rm_slot(slot);
  4077. +       }
  4078. +
  4079. +       if (!list_empty(&slot->dedup_list))
  4080. +               list_del(&slot->dedup_list);
  4081. +
  4082. +       if (!slot->rmap_list_pool || !slot->pool_counts) {
  4083. +               /* In case it OOMed in uksm_vma_enter() */
  4084. +               goto out;
  4085. +       }
  4086. +
  4087. +       for (i = 0; i < slot->pool_size; i++) {
  4088. +               void *addr;
  4089. +
  4090. +               if (!slot->rmap_list_pool[i])
  4091. +                       continue;
  4092. +
  4093. +               addr = kmap(slot->rmap_list_pool[i]);
  4094. +               for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
  4095. +                       entry = (struct rmap_list_entry *)addr + j;
  4096. +                       if (is_addr(entry->addr))
  4097. +                               continue;
  4098. +                       if (!entry->item)
  4099. +                               continue;
  4100. +
  4101. +                       remove_rmap_item_from_tree(entry->item);
  4102. +                       free_rmap_item(entry->item);
  4103. +                       slot->pool_counts[i]--;
  4104. +               }
  4105. +               BUG_ON(slot->pool_counts[i]);
  4106. +               kunmap(slot->rmap_list_pool[i]);
  4107. +               __free_page(slot->rmap_list_pool[i]);
  4108. +       }
  4109. +       kfree(slot->rmap_list_pool);
  4110. +       kfree(slot->pool_counts);
  4111. +
  4112. +out:
  4113. +       slot->rung = NULL;
  4114. +       BUG_ON(uksm_pages_total < slot->pages);
  4115. +       if (slot->flags & UKSM_SLOT_IN_UKSM)
  4116. +               uksm_pages_total -= slot->pages;
  4117. +
  4118. +       if (slot->fully_scanned_round == fully_scanned_round)
  4119. +               scanned_virtual_pages -= slot->pages;
  4120. +       else
  4121. +               scanned_virtual_pages -= slot->pages_scanned;
  4122. +       free_vma_slot(slot);
  4123. +}
  4124. +
  4125. +
  4126. +#define SPIN_LOCK_PERIOD       32
  4127. +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
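        +/*
        + * Drain vma_slot_del in batches of SPIN_LOCK_PERIOD, dropping
        + * vma_slot_list_lock between batches so the lock is never held across
        + * the (possibly slow) uksm_del_vma_slot() calls.
        + */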
  4128. +static inline void cleanup_vma_slots(void)
  4129. +{
  4130. +       struct vma_slot *slot;
  4131. +       int i;
  4132. +
  4133. +       i = 0;
  4134. +       spin_lock(&vma_slot_list_lock);
  4135. +       while (!list_empty(&vma_slot_del)) {
  4136. +               slot = list_entry(vma_slot_del.next,
  4137. +                                 struct vma_slot, slot_list);
  4138. +               list_del(&slot->slot_list);
  4139. +               cleanup_slots[i++] = slot;
  4140. +               if (i == SPIN_LOCK_PERIOD) {
  4141. +                       spin_unlock(&vma_slot_list_lock);
  4142. +                       while (--i >= 0)
  4143. +                               uksm_del_vma_slot(cleanup_slots[i]);
  4144. +                       i = 0;
  4145. +                       spin_lock(&vma_slot_list_lock);
  4146. +               }
  4147. +       }
  4148. +       spin_unlock(&vma_slot_list_lock);
  4149. +
  4150. +       while (--i >= 0)
  4151. +               uksm_del_vma_slot(cleanup_slots[i]);
  4152. +}
  4153. +
  4154. +/*
  4155. + * exponential moving average formula
  4156. + */
  4157. +static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
  4158. +{
  4159. +       /*
  4160. +        * For a very high burst, even the EMA cannot work well: a falsely
  4161. +        * very high per-page time estimate can feed back into very high
  4162. +        * context switch and rung update overhead, which in turn leads to
  4163. +        * an even higher per-page time, so it may not converge.
  4164. +        *
  4165. +        * Instead, we try to approach such a value in a binary manner.
  4166. +        */
  4167. +       if (curr > last_ema * 10)
  4168. +               return last_ema * 2;
  4169. +
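        +       /*
        +        * Plain EMA otherwise; e.g. if EMA_ALPHA were 20, then
        +        * ema(3000, 1000) = (20 * 3000 + 80 * 1000) / 100 = 1400.
        +        */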
  4170. +       return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
  4171. +}
  4172. +
  4173. +/*
  4174. + * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to
  4175. + * nanoseconds based on current uksm_sleep_jiffies.
  4176. + */
  4177. +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
  4178. +{
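        +       /*
        +        * scan_time / (scan_time + sleep_time) = ratio / TIME_RATIO_SCALE,
        +        * hence scan_time = sleep_time * ratio / (TIME_RATIO_SCALE - ratio).
        +        */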
  4179. +       return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
  4180. +               (TIME_RATIO_SCALE - ratio) * ratio;
  4181. +}
  4182. +
  4183. +
  4184. +static inline unsigned long rung_real_ratio(int cpu_time_ratio)
  4185. +{
  4186. +       unsigned long ret;
  4187. +
  4188. +       BUG_ON(!cpu_time_ratio);
  4189. +
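        +       /*
        +        * A positive cpu_time_ratio is used as-is; a negative value means
        +        * -cpu_time_ratio percent of uksm_max_cpu_percentage.
        +        */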
  4190. +       if (cpu_time_ratio > 0)
  4191. +               ret = cpu_time_ratio;
  4192. +       else
  4193. +               ret = (unsigned long)(-cpu_time_ratio) *
  4194. +                       uksm_max_cpu_percentage / 100UL;
  4195. +
  4196. +       return ret ? ret : 1;
  4197. +}
  4198. +
  4199. +static noinline void uksm_calc_scan_pages(void)
  4200. +{
  4201. +       struct scan_rung *ladder = uksm_scan_ladder;
  4202. +       unsigned long sleep_usecs, nsecs;
  4203. +       unsigned long ratio;
  4204. +       int i;
  4205. +       unsigned long per_page;
  4206. +
  4207. +       if (uksm_ema_page_time > 100000 ||
  4208. +           (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
  4209. +               uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
  4210. +
  4211. +       per_page = uksm_ema_page_time;
  4212. +       BUG_ON(!per_page);
  4213. +
  4214. +       /*
  4215. +        * Every 8 eval rounds, re-probe uksm_sleep_jiffies based on the
  4216. +        * saved user input.
  4217. +        */
  4218. +       if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
  4219. +               uksm_sleep_jiffies = uksm_sleep_saved;
  4220. +
  4221. +       /* We require each rung to scan at least one page per period. */
  4222. +       nsecs = per_page;
  4223. +       ratio = rung_real_ratio(ladder[0].cpu_ratio);
  4224. +       if (cpu_ratio_to_nsec(ratio) < nsecs) {
  4225. +               sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
  4226. +                               / NSEC_PER_USEC;
  4227. +               uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
  4228. +       }
  4229. +
  4230. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  4231. +               ratio = rung_real_ratio(ladder[i].cpu_ratio);
  4232. +               ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
  4233. +                                       per_page;
  4234. +               BUG_ON(!ladder[i].pages_to_scan);
  4235. +               uksm_calc_rung_step(&ladder[i], per_page, ratio);
  4236. +       }
  4237. +}
  4238. +
  4239. +/*
  4240. + * Convert the scan time of this round (ns) to the next expected minimum
  4241. + * sleep time (ms), taking care to avoid overflow. ratio is taken from
  4242. + * rung_real_ratio().
  4243. + */
  4244. +static inline
  4245. +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
  4246. +{
  4247. +       scan_time >>= 20; /* to msec level now */
  4248. +       BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
  4249. +
  4250. +       return (unsigned int) ((unsigned long) scan_time *
  4251. +                              (TIME_RATIO_SCALE - ratio) / ratio);
  4252. +}
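This is the inverse of cpu_ratio_to_nsec() above: scan_time >> 20 approximates a nanosecond count as milliseconds (2^20 is roughly 10^6), and the returned value scan_ms * (TIME_RATIO_SCALE - ratio) / ratio is the minimum sleep needed so that scan / (scan + sleep) stays at or below ratio / TIME_RATIO_SCALE. Continuing the example above, 25 ms of scan time at a ratio of 200 out of 1000 maps back to a 100 ms sleep.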
  4253. +
  4254. +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
  4255. +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
  4256. +
  4257. +static inline unsigned long vma_pool_size(struct vma_slot *slot)
  4258. +{
  4259. +       return round_up(sizeof(struct rmap_list_entry) * slot->pages,
  4260. +                       PAGE_SIZE) >> PAGE_SHIFT;
  4261. +}
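For illustration only, assuming sizeof(struct rmap_list_entry) is 16 bytes (the structure is defined earlier in this file) and 4 KiB pages: a slot spanning 1000 pages needs 16,000 bytes of rmap_list_entry storage, which round_up() pads to 16,384 bytes, so vma_pool_size() returns 4 pool pages.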
  4262. +
  4263. +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
  4264. +{
  4265. +       struct scan_rung *rung;
  4266. +       unsigned long pool_size, i;
  4267. +       struct vma_slot *slot;
  4268. +       int failed;
  4269. +
  4270. +       rung = &uksm_scan_ladder[0];
  4271. +
  4272. +       failed = 0;
  4273. +       for (i = 0; i < num; i++) {
  4274. +               slot = slots[i];
  4275. +
  4276. +               pool_size = vma_pool_size(slot);
  4277. +               slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
  4278. +                                              pool_size, GFP_KERNEL);
  4279. +               if (!slot->rmap_list_pool)
  4280. +                       break;
  4281. +
  4282. +               slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
  4283. +                                           GFP_KERNEL);
  4284. +               if (!slot->pool_counts) {
  4285. +                       kfree(slot->rmap_list_pool);
  4286. +                       break;
  4287. +               }
  4288. +
  4289. +               slot->pool_size = pool_size;
  4290. +               BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
  4291. +               slot->flags |= UKSM_SLOT_IN_UKSM;
  4292. +               uksm_pages_total += slot->pages;
  4293. +       }
  4294. +
  4295. +       if (i)
  4296. +               rung_add_new_slots(rung, slots, i);
  4297. +
  4298. +       return;
  4299. +}
  4300. +
  4301. +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
  4302. +
  4303. +static void uksm_enter_all_slots(void)
  4304. +{
  4305. +       struct vma_slot *slot;
  4306. +       unsigned long index;
  4307. +       struct list_head empty_vma_list;
  4308. +       int i;
  4309. +
  4310. +       i = 0;
  4311. +       index = 0;
  4312. +       INIT_LIST_HEAD(&empty_vma_list);
  4313. +
  4314. +       spin_lock(&vma_slot_list_lock);
  4315. +       while (!list_empty(&vma_slot_new)) {
  4316. +               slot = list_entry(vma_slot_new.next,
  4317. +                                 struct vma_slot, slot_list);
  4318. +
  4319. +               if (!slot->vma->anon_vma) {
  4320. +                       list_move(&slot->slot_list, &empty_vma_list);
  4321. +               } else if (vma_can_enter(slot->vma)) {
  4322. +                       batch_slots[index++] = slot;
  4323. +                       list_del_init(&slot->slot_list);
  4324. +               } else {
  4325. +                       list_move(&slot->slot_list, &vma_slot_noadd);
  4326. +               }
  4327. +
  4328. +               if (++i == SPIN_LOCK_PERIOD ||
  4329. +                   (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
  4330. +                       spin_unlock(&vma_slot_list_lock);
  4331. +
  4332. +                       if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
  4333. +                               uksm_vma_enter(batch_slots, index);
  4334. +                               index = 0;
  4335. +                       }
  4336. +                       i = 0;
  4337. +                       cond_resched();
  4338. +                       spin_lock(&vma_slot_list_lock);
  4339. +               }
  4340. +       }
  4341. +
  4342. +       list_splice(&empty_vma_list, &vma_slot_new);
  4343. +
  4344. +       spin_unlock(&vma_slot_list_lock);
  4345. +
  4346. +       if (index)
  4347. +               uksm_vma_enter(batch_slots, index);
  4348. +
  4349. +}
  4350. +
  4351. +static inline int rung_round_finished(struct scan_rung *rung)
  4352. +{
  4353. +       return rung->flags & UKSM_RUNG_ROUND_FINISHED;
  4354. +}
  4355. +
  4356. +static inline void judge_slot(struct vma_slot *slot)
  4357. +{
  4358. +       struct scan_rung *rung = slot->rung;
  4359. +       unsigned long dedup;
  4360. +       int deleted;
  4361. +
  4362. +       dedup = cal_dedup_ratio(slot);
  4363. +       if (vma_fully_scanned(slot) && uksm_thrash_threshold)
  4364. +               deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
  4365. +       else if (dedup && dedup >= uksm_abundant_threshold)
  4366. +               deleted = vma_rung_up(slot);
  4367. +       else
  4368. +               deleted = vma_rung_down(slot);
  4369. +
  4370. +       slot->pages_merged = 0;
  4371. +       slot->pages_cowed = 0;
  4372. +
  4373. +       if (vma_fully_scanned(slot))
  4374. +               slot->pages_scanned = 0;
  4375. +
  4376. +       slot->last_scanned = slot->pages_scanned;
  4377. +
  4378. +       /* If it was deleted above, the rung has already been advanced. */
  4379. +       if (!deleted)
  4380. +               advance_current_scan(rung);
  4381. +}
  4382. +
  4383. +
  4384. +static inline int hash_round_finished(void)
  4385. +{
  4386. +       if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
  4387. +               scanned_virtual_pages = 0;
  4388. +               if (uksm_pages_scanned)
  4389. +                       fully_scanned_round++;
  4390. +
  4391. +               return 1;
  4392. +       } else {
  4393. +               return 0;
  4394. +       }
  4395. +}
  4396. +
  4397. +#define UKSM_MMSEM_BATCH       5
  4398. +/**
  4399. + * uksm_do_scan() - the main worker function.
  4400. + */
  4401. +static noinline void uksm_do_scan(void)
  4402. +{
  4403. +       struct vma_slot *slot, *iter;
  4404. +       struct mm_struct *busy_mm;
  4405. +       unsigned char round_finished, all_rungs_empty;
  4406. +       int i, err, mmsem_batch;
  4407. +       unsigned long pcost;
  4408. +       long long delta_exec;
  4409. +       unsigned long vpages, max_cpu_ratio;
  4410. +       unsigned long long start_time, end_time, scan_time;
  4411. +       unsigned int expected_jiffies;
  4412. +
  4413. +       might_sleep();
  4414. +
  4415. +       vpages = 0;
  4416. +
  4417. +       start_time = task_sched_runtime(current);
  4418. +       max_cpu_ratio = 0;
  4419. +       mmsem_batch = 0;
  4420. +
  4421. +       for (i = 0; i < SCAN_LADDER_SIZE;) {
  4422. +               struct scan_rung *rung = &uksm_scan_ladder[i];
  4423. +               unsigned long ratio;
  4424. +
  4425. +               if (!rung->pages_to_scan) {
  4426. +                       i++;
  4427. +                       continue;
  4428. +               }
  4429. +
  4430. +               if (!rung->vma_root.num) {
  4431. +                       rung->pages_to_scan = 0;
  4432. +                       i++;
  4433. +                       continue;
  4434. +               }
  4435. +
  4436. +               ratio = rung_real_ratio(rung->cpu_ratio);
  4437. +               if (ratio > max_cpu_ratio)
  4438. +                       max_cpu_ratio = ratio;
  4439. +
  4440. +               /*
  4441. +                * Do not consider rung_round_finished() here, just use up
  4442. +                * the rung->pages_to_scan quota.
  4443. +                */
  4444. +               while (rung->pages_to_scan && rung->vma_root.num &&
  4445. +                      likely(!freezing(current))) {
  4446. +                       int reset = 0;
  4447. +
  4448. +                       slot = rung->current_scan;
  4449. +
  4450. +                       BUG_ON(vma_fully_scanned(slot));
  4451. +
  4452. +                       if (mmsem_batch) {
  4453. +                               err = 0;
  4454. +                       } else {
  4455. +                               err = try_down_read_slot_mmap_sem(slot);
  4456. +                       }
  4457. +
  4458. +                       if (err == -ENOENT) {
  4459. +rm_slot:
  4460. +                               rung_rm_slot(slot);
  4461. +                               continue;
  4462. +                       }
  4463. +
  4464. +                       busy_mm = slot->mm;
  4465. +
  4466. +                       if (err == -EBUSY) {
  4467. +                               /* skip other vmas on the same mm */
  4468. +                               do {
  4469. +                                       reset = advance_current_scan(rung);
  4470. +                                       iter = rung->current_scan;
  4471. +                                       if (iter->vma->vm_mm != busy_mm)
  4472. +                                               break;
  4473. +                               } while (!reset);
  4474. +
  4475. +                               if (iter->vma->vm_mm != busy_mm) {
  4476. +                                       continue;
  4477. +                               } else {
  4478. +                                       /* scan round finished */
  4479. +                                       break;
  4480. +                               }
  4481. +                       }
  4482. +
  4483. +                       BUG_ON(!vma_can_enter(slot->vma));
  4484. +                       if (uksm_test_exit(slot->vma->vm_mm)) {
  4485. +                               mmsem_batch = 0;
  4486. +                               up_read(&slot->vma->vm_mm->mmap_sem);
  4487. +                               goto rm_slot;
  4488. +                       }
  4489. +
  4490. +                       if (mmsem_batch)
  4491. +                               mmsem_batch--;
  4492. +                       else
  4493. +                               mmsem_batch = UKSM_MMSEM_BATCH;
  4494. +
  4495. +                       /* OK, we have taken the mmap_sem, ready to scan */
  4496. +                       scan_vma_one_page(slot);
  4497. +                       rung->pages_to_scan--;
  4498. +                       vpages++;
  4499. +
  4500. +                       if (rung->current_offset + rung->step > slot->pages - 1
  4501. +                           || vma_fully_scanned(slot)) {
  4502. +                               up_read(&slot->vma->vm_mm->mmap_sem);
  4503. +                               judge_slot(slot);
  4504. +                               mmsem_batch = 0;
  4505. +                       } else {
  4506. +                               rung->current_offset += rung->step;
  4507. +                               if (!mmsem_batch)
  4508. +                                       up_read(&slot->vma->vm_mm->mmap_sem);
  4509. +                       }
  4510. +
  4511. +                       cond_resched();
  4512. +               }
  4513. +
  4514. +               if (mmsem_batch) {
  4515. +                       up_read(&slot->vma->vm_mm->mmap_sem);
  4516. +                       mmsem_batch = 0;
  4517. +               }
  4518. +
  4519. +               if (freezing(current))
  4520. +                       break;
  4521. +
  4522. +               cond_resched();
  4523. +       }
  4524. +       end_time = task_sched_runtime(current);
  4525. +       delta_exec = end_time - start_time;
  4526. +
  4527. +       if (freezing(current))
  4528. +               return;
  4529. +
  4530. +       cleanup_vma_slots();
  4531. +       uksm_enter_all_slots();
  4532. +
  4533. +       round_finished = 1;
  4534. +       all_rungs_empty = 1;
  4535. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  4536. +               struct scan_rung *rung = &uksm_scan_ladder[i];
  4537. +
  4538. +               if (rung->vma_root.num) {
  4539. +                       all_rungs_empty = 0;
  4540. +                       if (!rung_round_finished(rung))
  4541. +                               round_finished = 0;
  4542. +               }
  4543. +       }
  4544. +
  4545. +       if (all_rungs_empty)
  4546. +               round_finished = 0;
  4547. +
  4548. +       if (round_finished) {
  4549. +               round_update_ladder();
  4550. +               uksm_eval_round++;
  4551. +
  4552. +               if (hash_round_finished() && rshash_adjust()) {
  4553. +                       /* Reset the unstable root iff hash strength changed */
  4554. +                       uksm_hash_round++;
  4555. +                       root_unstable_tree = RB_ROOT;
  4556. +                       free_all_tree_nodes(&unstable_tree_node_list);
  4557. +               }
  4558. +
  4559. +               /*
  4560. +                * A number of pages can hang around indefinitely on per-cpu
  4561. +                * pagevecs, raised page count preventing write_protect_page
  4562. +                * from merging them.  Though it doesn't really matter much,
  4563. +                * it is puzzling to see some stuck in pages_volatile until
  4564. +                * other activity jostles them out, and they also prevented
  4565. +                * LTP's KSM test from succeeding deterministically; so drain
  4566. +                * them here (here rather than on entry to uksm_do_scan(),
  4567. +                * so we don't IPI too often when pages_to_scan is set low).
  4568. +                */
  4569. +               lru_add_drain_all();
  4570. +       }
  4571. +
  4572. +
  4573. +       if (vpages && delta_exec > 0) {
  4574. +               pcost = (unsigned long) delta_exec / vpages;
  4575. +               if (likely(uksm_ema_page_time))
  4576. +                       uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
  4577. +               else
  4578. +                       uksm_ema_page_time = pcost;
  4579. +       }
  4580. +
  4581. +       uksm_calc_scan_pages();
  4582. +       uksm_sleep_real = uksm_sleep_jiffies;
  4583. +       /* in case of radical cpu bursts, apply the upper bound */
  4584. +       end_time = task_sched_runtime(current);
  4585. +       if (max_cpu_ratio && end_time > start_time) {
  4586. +               scan_time = end_time - start_time;
  4587. +               expected_jiffies = msecs_to_jiffies(
  4588. +                       scan_time_to_sleep(scan_time, max_cpu_ratio));
  4589. +
  4590. +               if (expected_jiffies > uksm_sleep_real)
  4591. +                       uksm_sleep_real = expected_jiffies;
  4592. +
  4593. +               /* We have a 1 second upper bound for responsiveness. */
  4594. +               if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
  4595. +                       uksm_sleep_real = msecs_to_jiffies(1000);
  4596. +       }
  4597. +
  4598. +       return;
  4599. +}
  4600. +
  4601. +static int ksmd_should_run(void)
  4602. +{
  4603. +       return uksm_run & UKSM_RUN_MERGE;
  4604. +}
  4605. +
  4606. +static int uksm_scan_thread(void *nothing)
  4607. +{
  4608. +       set_freezable();
  4609. +       set_user_nice(current, 5);
  4610. +
  4611. +       while (!kthread_should_stop()) {
  4612. +               mutex_lock(&uksm_thread_mutex);
  4613. +               if (ksmd_should_run()) {
  4614. +                       uksm_do_scan();
  4615. +               }
  4616. +               mutex_unlock(&uksm_thread_mutex);
  4617. +
  4618. +               try_to_freeze();
  4619. +
  4620. +               if (ksmd_should_run()) {
  4621. +                       schedule_timeout_interruptible(uksm_sleep_real);
  4622. +                       uksm_sleep_times++;
  4623. +               } else {
  4624. +                       wait_event_freezable(uksm_thread_wait,
  4625. +                               ksmd_should_run() || kthread_should_stop());
  4626. +               }
  4627. +       }
  4628. +       return 0;
  4629. +}
  4630. +
  4631. +int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
  4632. +                       unsigned long *vm_flags)
  4633. +{
  4634. +       struct stable_node *stable_node;
  4635. +       struct node_vma *node_vma;
  4636. +       struct rmap_item *rmap_item;
  4637. +       struct hlist_node *hlist, *rmap_hlist;
  4638. +       unsigned int mapcount = page_mapcount(page);
  4639. +       int referenced = 0;
  4640. +       int search_new_forks = 0;
  4641. +       unsigned long address;
  4642. +
  4643. +       VM_BUG_ON(!PageKsm(page));
  4644. +       VM_BUG_ON(!PageLocked(page));
  4645. +
  4646. +       stable_node = page_stable_node(page);
  4647. +       if (!stable_node)
  4648. +               return 0;
  4649. +
  4650. +
  4651. +again:
  4652. +       hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  4653. +               hlist_for_each_entry(rmap_item, rmap_hlist,
  4654. +                                    &node_vma->rmap_hlist, hlist) {
  4655. +                       struct anon_vma *anon_vma = rmap_item->anon_vma;
  4656. +                       struct anon_vma_chain *vmac;
  4657. +                       struct vm_area_struct *vma;
  4658. +
  4659. +                       anon_vma_lock(anon_vma);
  4660. +                       list_for_each_entry(vmac, &anon_vma->head,
  4661. +                                           same_anon_vma) {
  4662. +                               vma = vmac->vma;
  4663. +                               address = get_rmap_addr(rmap_item);
  4664. +
  4665. +                               if (address < vma->vm_start ||
  4666. +                                   address >= vma->vm_end)
  4667. +                                       continue;
  4668. +                               /*
  4669. +                                * Initially we examine only the vma which
  4670. +                                * covers this rmap_item; but later, if there
  4671. +                                * is still work to do, we examine covering
  4672. +                                * vmas in other mms: in case they were forked
  4673. +                                * from the original since ksmd passed.
  4674. +                                */
  4675. +                               if ((rmap_item->slot->vma == vma) ==
  4676. +                                   search_new_forks)
  4677. +                                       continue;
  4678. +
  4679. +                               if (memcg &&
  4680. +                                   !mm_match_cgroup(vma->vm_mm, memcg))
  4681. +                                       continue;
  4682. +
  4683. +                               referenced +=
  4684. +                                       page_referenced_one(page, vma,
  4685. +                                               address, &mapcount, vm_flags);
  4686. +                               if (!search_new_forks || !mapcount)
  4687. +                                       break;
  4688. +                       }
  4689. +
  4690. +                       anon_vma_unlock(anon_vma);
  4691. +                       if (!mapcount)
  4692. +                               goto out;
  4693. +               }
  4694. +       }
  4695. +       if (!search_new_forks++)
  4696. +               goto again;
  4697. +out:
  4698. +       return referenced;
  4699. +}
  4700. +
  4701. +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
  4702. +{
  4703. +       struct stable_node *stable_node;
  4704. +       struct node_vma *node_vma;
  4705. +       struct hlist_node *hlist, *rmap_hlist;
  4706. +       struct rmap_item *rmap_item;
  4707. +       int ret = SWAP_AGAIN;
  4708. +       int search_new_forks = 0;
  4709. +       unsigned long address;
  4710. +
  4711. +       VM_BUG_ON(!PageKsm(page));
  4712. +       VM_BUG_ON(!PageLocked(page));
  4713. +
  4714. +       stable_node = page_stable_node(page);
  4715. +       if (!stable_node)
  4716. +               return SWAP_FAIL;
  4717. +again:
  4718. +       hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  4719. +               hlist_for_each_entry(rmap_item, rmap_hlist,
  4720. +                                    &node_vma->rmap_hlist, hlist) {
  4721. +                       struct anon_vma *anon_vma = rmap_item->anon_vma;
  4722. +                       struct anon_vma_chain *vmac;
  4723. +                       struct vm_area_struct *vma;
  4724. +
  4725. +                       anon_vma_lock(anon_vma);
  4726. +                       list_for_each_entry(vmac, &anon_vma->head,
  4727. +                                           same_anon_vma) {
  4728. +                               vma = vmac->vma;
  4729. +                               address = get_rmap_addr(rmap_item);
  4730. +
  4731. +                               if (address < vma->vm_start ||
  4732. +                                   address >= vma->vm_end)
  4733. +                                       continue;
  4734. +                               /*
  4735. +                                * Initially we examine only the vma which
  4736. +                                * covers this rmap_item; but later, if there
  4737. +                                * is still work to do, we examine covering
  4738. +                                * vmas in other mms: in case they were forked
  4739. +                                * from the original since ksmd passed.
  4740. +                                */
  4741. +                               if ((rmap_item->slot->vma == vma) ==
  4742. +                                   search_new_forks)
  4743. +                                       continue;
  4744. +
  4745. +                               ret = try_to_unmap_one(page, vma,
  4746. +                                                      address, flags);
  4747. +                               if (ret != SWAP_AGAIN || !page_mapped(page)) {
  4748. +                                       anon_vma_unlock(anon_vma);
  4749. +                                       goto out;
  4750. +                               }
  4751. +                       }
  4752. +                       anon_vma_unlock(anon_vma);
  4753. +               }
  4754. +       }
  4755. +       if (!search_new_forks++)
  4756. +               goto again;
  4757. +out:
  4758. +       return ret;
  4759. +}
  4760. +
  4761. +#ifdef CONFIG_MIGRATION
  4762. +int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
  4763. +                 struct vm_area_struct *, unsigned long, void *), void *arg)
  4764. +{
  4765. +       struct stable_node *stable_node;
  4766. +       struct node_vma *node_vma;
  4767. +       struct hlist_node *hlist, *rmap_hlist;
  4768. +       struct rmap_item *rmap_item;
  4769. +       int ret = SWAP_AGAIN;
  4770. +       int search_new_forks = 0;
  4771. +       unsigned long address;
  4772. +
  4773. +       VM_BUG_ON(!PageKsm(page));
  4774. +       VM_BUG_ON(!PageLocked(page));
  4775. +
  4776. +       stable_node = page_stable_node(page);
  4777. +       if (!stable_node)
  4778. +               return ret;
  4779. +again:
  4780. +       hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
  4781. +               hlist_for_each_entry(rmap_item, rmap_hlist,
  4782. +                                    &node_vma->rmap_hlist, hlist) {
  4783. +                       struct anon_vma *anon_vma = rmap_item->anon_vma;
  4784. +                       struct anon_vma_chain *vmac;
  4785. +                       struct vm_area_struct *vma;
  4786. +
  4787. +                       anon_vma_lock(anon_vma);
  4788. +                       list_for_each_entry(vmac, &anon_vma->head,
  4789. +                                           same_anon_vma) {
  4790. +                               vma = vmac->vma;
  4791. +                               address = get_rmap_addr(rmap_item);
  4792. +
  4793. +                               if (address < vma->vm_start ||
  4794. +                                   address >= vma->vm_end)
  4795. +                                       continue;
  4796. +
  4797. +                               if ((rmap_item->slot->vma == vma) ==
  4798. +                                   search_new_forks)
  4799. +                                       continue;
  4800. +
  4801. +                               ret = rmap_one(page, vma, address, arg);
  4802. +                               if (ret != SWAP_AGAIN) {
  4803. +                                       anon_vma_unlock(anon_vma);
  4804. +                                       goto out;
  4805. +                               }
  4806. +                       }
  4807. +                       anon_vma_unlock(anon_vma);
  4808. +               }
  4809. +       }
  4810. +       if (!search_new_forks++)
  4811. +               goto again;
  4812. +out:
  4813. +       return ret;
  4814. +}
  4815. +
  4816. +/* Common ksm interface but may be specific to uksm */
  4817. +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
  4818. +{
  4819. +       struct stable_node *stable_node;
  4820. +
  4821. +       VM_BUG_ON(!PageLocked(oldpage));
  4822. +       VM_BUG_ON(!PageLocked(newpage));
  4823. +       VM_BUG_ON(newpage->mapping != oldpage->mapping);
  4824. +
  4825. +       stable_node = page_stable_node(newpage);
  4826. +       if (stable_node) {
  4827. +               VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
  4828. +               stable_node->kpfn = page_to_pfn(newpage);
  4829. +       }
  4830. +}
  4831. +#endif /* CONFIG_MIGRATION */
  4832. +
  4833. +#ifdef CONFIG_MEMORY_HOTREMOVE
  4834. +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
  4835. +                                                unsigned long end_pfn)
  4836. +{
  4837. +       struct rb_node *node;
  4838. +
  4839. +       for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
  4840. +               struct stable_node *stable_node;
  4841. +
  4842. +               stable_node = rb_entry(node, struct stable_node, node);
  4843. +               if (stable_node->kpfn >= start_pfn &&
  4844. +                   stable_node->kpfn < end_pfn)
  4845. +                       return stable_node;
  4846. +       }
  4847. +       return NULL;
  4848. +}
  4849. +
  4850. +static int uksm_memory_callback(struct notifier_block *self,
  4851. +                              unsigned long action, void *arg)
  4852. +{
  4853. +       struct memory_notify *mn = arg;
  4854. +       struct stable_node *stable_node;
  4855. +
  4856. +       switch (action) {
  4857. +       case MEM_GOING_OFFLINE:
  4858. +               /*
  4859. +                * Keep it very simple for now: just lock out ksmd and
  4860. +                * MADV_UNMERGEABLE while any memory is going offline.
  4861. +                * mutex_lock_nested() is necessary because lockdep was alarmed
  4862. +                * that here we take uksm_thread_mutex inside notifier chain
  4863. +                * mutex, and later take notifier chain mutex inside
  4864. +                * uksm_thread_mutex to unlock it.   But that's safe because both
  4865. +                * are inside mem_hotplug_mutex.
  4866. +                */
  4867. +               mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
  4868. +               break;
  4869. +
  4870. +       case MEM_OFFLINE:
  4871. +               /*
  4872. +                * Most of the work is done by page migration; but there might
  4873. +                * be a few stable_nodes left over, still pointing to struct
  4874. +                * pages which have been offlined: prune those from the tree.
  4875. +                */
  4876. +               while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
  4877. +                                       mn->start_pfn + mn->nr_pages)) != NULL)
  4878. +                       remove_node_from_stable_tree(stable_node, 1, 1);
  4879. +               /* fallthrough */
  4880. +
  4881. +       case MEM_CANCEL_OFFLINE:
  4882. +               mutex_unlock(&uksm_thread_mutex);
  4883. +               break;
  4884. +       }
  4885. +       return NOTIFY_OK;
  4886. +}
  4887. +#endif /* CONFIG_MEMORY_HOTREMOVE */
  4888. +
  4889. +#ifdef CONFIG_SYSFS
  4890. +/*
  4891. + * This all compiles without CONFIG_SYSFS, but is a waste of space.
  4892. + */
  4893. +
  4894. +#define UKSM_ATTR_RO(_name) \
  4895. +       static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  4896. +#define UKSM_ATTR(_name) \
  4897. +       static struct kobj_attribute _name##_attr = \
  4898. +               __ATTR(_name, 0644, _name##_show, _name##_store)
  4899. +
  4900. +static ssize_t max_cpu_percentage_show(struct kobject *kobj,
  4901. +                                   struct kobj_attribute *attr, char *buf)
  4902. +{
  4903. +       return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
  4904. +}
  4905. +
  4906. +static ssize_t max_cpu_percentage_store(struct kobject *kobj,
  4907. +                                    struct kobj_attribute *attr,
  4908. +                                    const char *buf, size_t count)
  4909. +{
  4910. +       unsigned long max_cpu_percentage;
  4911. +       int err;
  4912. +
  4913. +       err = strict_strtoul(buf, 10, &max_cpu_percentage);
  4914. +       if (err || max_cpu_percentage > 100)
  4915. +               return -EINVAL;
  4916. +
  4917. +       if (max_cpu_percentage == 100)
  4918. +               max_cpu_percentage = 99;
  4919. +       else if (max_cpu_percentage < 10)
  4920. +               max_cpu_percentage = 10;
  4921. +
  4922. +       uksm_max_cpu_percentage = max_cpu_percentage;
  4923. +
  4924. +       return count;
  4925. +}
  4926. +UKSM_ATTR(max_cpu_percentage);
  4927. +
  4928. +static ssize_t sleep_millisecs_show(struct kobject *kobj,
  4929. +                                   struct kobj_attribute *attr, char *buf)
  4930. +{
  4931. +       return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
  4932. +}
  4933. +
  4934. +static ssize_t sleep_millisecs_store(struct kobject *kobj,
  4935. +                                    struct kobj_attribute *attr,
  4936. +                                    const char *buf, size_t count)
  4937. +{
  4938. +       unsigned long msecs;
  4939. +       int err;
  4940. +
  4941. +       err = strict_strtoul(buf, 10, &msecs);
  4942. +       if (err || msecs > MSEC_PER_SEC)
  4943. +               return -EINVAL;
  4944. +
  4945. +       uksm_sleep_jiffies = msecs_to_jiffies(msecs);
  4946. +       uksm_sleep_saved = uksm_sleep_jiffies;
  4947. +
  4948. +       return count;
  4949. +}
  4950. +UKSM_ATTR(sleep_millisecs);
  4951. +
  4952. +
  4953. +static ssize_t cpu_governor_show(struct kobject *kobj,
  4954. +                                 struct kobj_attribute *attr, char *buf)
  4955. +{
  4956. +       int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
  4957. +       int i;
  4958. +
  4959. +       buf[0] = '\0';
  4960. +       for (i = 0; i < n ; i++) {
  4961. +               if (uksm_cpu_governor == i)
  4962. +                       strcat(buf, "[");
  4963. +
  4964. +               strcat(buf, uksm_cpu_governor_str[i]);
  4965. +
  4966. +               if (uksm_cpu_governor == i)
  4967. +                       strcat(buf, "]");
  4968. +
  4969. +               strcat(buf, " ");
  4970. +       }
  4971. +       strcat(buf, "\n");
  4972. +
  4973. +       return strlen(buf);
  4974. +}
  4975. +
  4976. +static inline void init_performance_values(void)
  4977. +{
  4978. +       int i;
  4979. +       struct scan_rung *rung;
  4980. +       struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
  4981. +
  4982. +
  4983. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  4984. +               rung = uksm_scan_ladder + i;
  4985. +               rung->cpu_ratio = preset->cpu_ratio[i];
  4986. +               rung->cover_msecs = preset->cover_msecs[i];
  4987. +       }
  4988. +
  4989. +       uksm_max_cpu_percentage = preset->max_cpu;
  4990. +}
  4991. +
  4992. +static ssize_t cpu_governor_store(struct kobject *kobj,
  4993. +                                  struct kobj_attribute *attr,
  4994. +                                  const char *buf, size_t count)
  4995. +{
  4996. +       int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
  4997. +
  4998. +       for (n--; n >= 0; n--) {
  4999. +               if (!strncmp(buf, uksm_cpu_governor_str[n],
  5000. +                            strlen(uksm_cpu_governor_str[n])))
  5001. +                       break;
  5002. +       }
  5003. +
  5004. +       if (n < 0)
  5005. +               return -EINVAL;
  5006. +       else
  5007. +               uksm_cpu_governor = n;
  5008. +
  5009. +       init_performance_values();
  5010. +
  5011. +       return count;
  5012. +}
  5013. +UKSM_ATTR(cpu_governor);
  5014. +
  5015. +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
  5016. +                       char *buf)
  5017. +{
  5018. +       return sprintf(buf, "%u\n", uksm_run);
  5019. +}
  5020. +
  5021. +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
  5022. +                        const char *buf, size_t count)
  5023. +{
  5024. +       int err;
  5025. +       unsigned long flags;
  5026. +
  5027. +       err = strict_strtoul(buf, 10, &flags);
  5028. +       if (err || flags > UINT_MAX)
  5029. +               return -EINVAL;
  5030. +       if (flags > UKSM_RUN_MERGE)
  5031. +               return -EINVAL;
  5032. +
  5033. +       mutex_lock(&uksm_thread_mutex);
  5034. +       if (uksm_run != flags) {
  5035. +               uksm_run = flags;
  5036. +       }
  5037. +       mutex_unlock(&uksm_thread_mutex);
  5038. +
  5039. +       if (flags & UKSM_RUN_MERGE)
  5040. +               wake_up_interruptible(&uksm_thread_wait);
  5041. +
  5042. +       return count;
  5043. +}
  5044. +UKSM_ATTR(run);
  5045. +
  5046. +static ssize_t abundant_threshold_show(struct kobject *kobj,
  5047. +                                    struct kobj_attribute *attr, char *buf)
  5048. +{
  5049. +       return sprintf(buf, "%u\n", uksm_abundant_threshold);
  5050. +}
  5051. +
  5052. +static ssize_t abundant_threshold_store(struct kobject *kobj,
  5053. +                                     struct kobj_attribute *attr,
  5054. +                                     const char *buf, size_t count)
  5055. +{
  5056. +       int err;
  5057. +       unsigned long flags;
  5058. +
  5059. +       err = strict_strtoul(buf, 10, &flags);
  5060. +       if (err || flags > 99)
  5061. +               return -EINVAL;
  5062. +
  5063. +       uksm_abundant_threshold = flags;
  5064. +
  5065. +       return count;
  5066. +}
  5067. +UKSM_ATTR(abundant_threshold);
  5068. +
  5069. +static ssize_t thrash_threshold_show(struct kobject *kobj,
  5070. +                                    struct kobj_attribute *attr, char *buf)
  5071. +{
  5072. +       return sprintf(buf, "%u\n", uksm_thrash_threshold);
  5073. +}
  5074. +
  5075. +static ssize_t thrash_threshold_store(struct kobject *kobj,
  5076. +                                     struct kobj_attribute *attr,
  5077. +                                     const char *buf, size_t count)
  5078. +{
  5079. +       int err;
  5080. +       unsigned long flags;
  5081. +
  5082. +       err = strict_strtoul(buf, 10, &flags);
  5083. +       if (err || flags > 99)
  5084. +               return -EINVAL;
  5085. +
  5086. +       uksm_thrash_threshold = flags;
  5087. +
  5088. +       return count;
  5089. +}
  5090. +UKSM_ATTR(thrash_threshold);
  5091. +
  5092. +static ssize_t cpu_ratios_show(struct kobject *kobj,
  5093. +                              struct kobj_attribute *attr, char *buf)
  5094. +{
  5095. +       int i, size;
  5096. +       struct scan_rung *rung;
  5097. +       char *p = buf;
  5098. +
  5099. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5100. +               rung = &uksm_scan_ladder[i];
  5101. +
  5102. +               if (rung->cpu_ratio > 0)
  5103. +                       size = sprintf(p, "%d ", rung->cpu_ratio);
  5104. +               else
  5105. +                       size = sprintf(p, "MAX/%d ",
  5106. +                                       TIME_RATIO_SCALE / -rung->cpu_ratio);
  5107. +
  5108. +               p += size;
  5109. +       }
  5110. +
  5111. +       *p++ = '\n';
  5112. +       *p = '\0';
  5113. +
  5114. +       return p - buf;
  5115. +}
  5116. +
  5117. +static ssize_t cpu_ratios_store(struct kobject *kobj,
  5118. +                                     struct kobj_attribute *attr,
  5119. +                                     const char *buf, size_t count)
  5120. +{
  5121. +       int i, cpuratios[SCAN_LADDER_SIZE], err;
  5122. +       unsigned long value;
  5123. +       struct scan_rung *rung;
  5124. +       char *p, *end = NULL;
  5125. +
  5126. +       p = kzalloc(count, GFP_KERNEL);
  5127. +       if (!p)
  5128. +               return -ENOMEM;
  5129. +
  5130. +       memcpy(p, buf, count);
  5131. +
  5132. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5133. +               if (i != SCAN_LADDER_SIZE - 1) {
  5134. +                       end = strchr(p, ' ');
  5135. +                       if (!end)
  5136. +                               return -EINVAL;
  5137. +
  5138. +                       *end = '\0';
  5139. +               }
  5140. +
  5141. +               if (strstr(p, "MAX/")) {
  5142. +                       p = strchr(p, '/') + 1;
  5143. +                       err = strict_strtoul(p, 10, &value);
  5144. +                       if (err || value > TIME_RATIO_SCALE || !value)
  5145. +                               return -EINVAL;
  5146. +
  5147. +                       cpuratios[i] = - (int) (TIME_RATIO_SCALE / value);
  5148. +               } else {
  5149. +                       err = strict_strtoul(p, 10, &value);
  5150. +                       if (err || value > TIME_RATIO_SCALE || !value)
  5151. +                               return -EINVAL;
  5152. +
  5153. +                       cpuratios[i] = value;
  5154. +               }
  5155. +
  5156. +               p = end + 1;
  5157. +       }
  5158. +
  5159. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5160. +               rung = &uksm_scan_ladder[i];
  5161. +
  5162. +               rung->cpu_ratio = cpuratios[i];
  5163. +       }
  5164. +
  5165. +       return count;
  5166. +}
  5167. +UKSM_ATTR(cpu_ratios);
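The value written to cpu_ratios is one entry per scan rung, separated by spaces. Each entry is either a plain number, taken as a fixed ratio in 1/TIME_RATIO_SCALE units, or a string of the form "MAX/n", which is stored as a negative ratio and later scaled by rung_real_ratio() to roughly max_cpu_percentage / n of the CPU. Purely as an illustration, assuming four rungs, a line such as "20 40 80 MAX/2" would give the three lower rungs fixed ratios of 20, 40 and 80 out of TIME_RATIO_SCALE and tie the top rung to half of max_cpu_percentage.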
  5168. +
  5169. +static ssize_t eval_intervals_show(struct kobject *kobj,
  5170. +                              struct kobj_attribute *attr, char *buf)
  5171. +{
  5172. +       int i, size;
  5173. +       struct scan_rung *rung;
  5174. +       char *p = buf;
  5175. +
  5176. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5177. +               rung = &uksm_scan_ladder[i];
  5178. +               size = sprintf(p, "%u ", rung->cover_msecs);
  5179. +               p += size;
  5180. +       }
  5181. +
  5182. +       *p++ = '\n';
  5183. +       *p = '\0';
  5184. +
  5185. +       return p - buf;
  5186. +}
  5187. +
  5188. +static ssize_t eval_intervals_store(struct kobject *kobj,
  5189. +                                     struct kobj_attribute *attr,
  5190. +                                     const char *buf, size_t count)
  5191. +{
  5192. +       int i, err;
  5193. +       unsigned long values[SCAN_LADDER_SIZE];
  5194. +       struct scan_rung *rung;
  5195. +       char *p, *end = NULL;
  5196. +
  5197. +       p = kzalloc(count, GFP_KERNEL);
  5198. +       if (!p)
  5199. +               return -ENOMEM;
  5200. +
  5201. +       memcpy(p, buf, count);
  5202. +
  5203. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5204. +               if (i != SCAN_LADDER_SIZE - 1) {
  5205. +                       end = strchr(p, ' ');
  5206. +                       if (!end)
  5207. +                               return -EINVAL;
  5208. +
  5209. +                       *end = '\0';
  5210. +               }
  5211. +
  5212. +               err = strict_strtoul(p, 10, &values[i]);
  5213. +               if (err)
  5214. +                       return -EINVAL;
  5215. +
  5216. +               p = end + 1;
  5217. +       }
  5218. +
  5219. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5220. +               rung = &uksm_scan_ladder[i];
  5221. +
  5222. +               rung->cover_msecs = values[i];
  5223. +       }
  5224. +
  5225. +       return count;
  5226. +}
  5227. +UKSM_ATTR(eval_intervals);
  5228. +
  5229. +static ssize_t ema_per_page_time_show(struct kobject *kobj,
  5230. +                                struct kobj_attribute *attr, char *buf)
  5231. +{
  5232. +       return sprintf(buf, "%lu\n", uksm_ema_page_time);
  5233. +}
  5234. +UKSM_ATTR_RO(ema_per_page_time);
  5235. +
  5236. +static ssize_t pages_shared_show(struct kobject *kobj,
  5237. +                                struct kobj_attribute *attr, char *buf)
  5238. +{
  5239. +       return sprintf(buf, "%lu\n", uksm_pages_shared);
  5240. +}
  5241. +UKSM_ATTR_RO(pages_shared);
  5242. +
  5243. +static ssize_t pages_sharing_show(struct kobject *kobj,
  5244. +                                 struct kobj_attribute *attr, char *buf)
  5245. +{
  5246. +       return sprintf(buf, "%lu\n", uksm_pages_sharing);
  5247. +}
  5248. +UKSM_ATTR_RO(pages_sharing);
  5249. +
  5250. +static ssize_t pages_unshared_show(struct kobject *kobj,
  5251. +                                  struct kobj_attribute *attr, char *buf)
  5252. +{
  5253. +       return sprintf(buf, "%lu\n", uksm_pages_unshared);
  5254. +}
  5255. +UKSM_ATTR_RO(pages_unshared);
  5256. +
  5257. +static ssize_t full_scans_show(struct kobject *kobj,
  5258. +                              struct kobj_attribute *attr, char *buf)
  5259. +{
  5260. +       return sprintf(buf, "%llu\n", fully_scanned_round);
  5261. +}
  5262. +UKSM_ATTR_RO(full_scans);
  5263. +
  5264. +static ssize_t pages_scanned_show(struct kobject *kobj,
  5265. +                                 struct kobj_attribute *attr, char *buf)
  5266. +{
  5267. +       unsigned long base = 0;
  5268. +       u64 delta, ret;
  5269. +
  5270. +       if (pages_scanned_stored) {
  5271. +               base = pages_scanned_base;
  5272. +               ret = pages_scanned_stored;
  5273. +               delta = uksm_pages_scanned >> base;
  5274. +               if (CAN_OVERFLOW_U64(ret, delta)) {
  5275. +                       ret >>= 1;
  5276. +                       delta >>= 1;
  5277. +                       base++;
  5278. +                       ret += delta;
  5279. +               }
  5280. +       } else {
  5281. +               ret = uksm_pages_scanned;
  5282. +       }
  5283. +
  5284. +       while (ret > ULONG_MAX) {
  5285. +               ret >>= 1;
  5286. +               base++;
  5287. +       }
  5288. +
  5289. +       if (base)
  5290. +               return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
  5291. +       else
  5292. +               return sprintf(buf, "%lu\n", (unsigned long)ret);
  5293. +}
  5294. +UKSM_ATTR_RO(pages_scanned);
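pages_scanned is reported as "mantissa * 2^base" once the accumulated count can no longer be held exactly: roughly speaking, whenever adding the current delta would overflow, both mantissa and delta are halved and the exponent is bumped. An output of, say, "300000 * 2^2" therefore means about 1.2 million pages scanned in total.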
  5295. +
  5296. +static ssize_t hash_strength_show(struct kobject *kobj,
  5297. +                                 struct kobj_attribute *attr, char *buf)
  5298. +{
  5299. +       return sprintf(buf, "%lu\n", hash_strength);
  5300. +}
  5301. +UKSM_ATTR_RO(hash_strength);
  5302. +
  5303. +static ssize_t sleep_times_show(struct kobject *kobj,
  5304. +                                 struct kobj_attribute *attr, char *buf)
  5305. +{
  5306. +       return sprintf(buf, "%llu\n", uksm_sleep_times);
  5307. +}
  5308. +UKSM_ATTR_RO(sleep_times);
  5309. +
  5310. +
  5311. +static struct attribute *uksm_attrs[] = {
  5312. +       &max_cpu_percentage_attr.attr,
  5313. +       &sleep_millisecs_attr.attr,
  5314. +       &cpu_governor_attr.attr,
  5315. +       &run_attr.attr,
  5316. +       &ema_per_page_time_attr.attr,
  5317. +       &pages_shared_attr.attr,
  5318. +       &pages_sharing_attr.attr,
  5319. +       &pages_unshared_attr.attr,
  5320. +       &full_scans_attr.attr,
  5321. +       &pages_scanned_attr.attr,
  5322. +       &hash_strength_attr.attr,
  5323. +       &sleep_times_attr.attr,
  5324. +       &thrash_threshold_attr.attr,
  5325. +       &abundant_threshold_attr.attr,
  5326. +       &cpu_ratios_attr.attr,
  5327. +       &eval_intervals_attr.attr,
  5328. +       NULL,
  5329. +};
  5330. +
  5331. +static struct attribute_group uksm_attr_group = {
  5332. +       .attrs = uksm_attrs,
  5333. +       .name = "uksm",
  5334. +};
  5335. +#endif /* CONFIG_SYSFS */
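With CONFIG_SYSFS enabled, the attribute group above is registered on mm_kobj in uksm_init() below, so these knobs and counters should show up under /sys/kernel/mm/uksm/ (for example /sys/kernel/mm/uksm/run and /sys/kernel/mm/uksm/pages_sharing).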
  5336. +
  5337. +static inline void init_scan_ladder(void)
  5338. +{
  5339. +       int i;
  5340. +       struct scan_rung *rung;
  5341. +
  5342. +       for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5343. +               rung = uksm_scan_ladder + i;
  5344. +               slot_tree_init_root(&rung->vma_root);
  5345. +       }
  5346. +
  5347. +       init_performance_values();
  5348. +       uksm_calc_scan_pages();
  5349. +}
  5350. +
  5351. +static inline int cal_positive_negative_costs(void)
  5352. +{
  5353. +       struct page *p1, *p2;
  5354. +       unsigned char *addr1, *addr2;
  5355. +       unsigned long i, time_start, hash_cost;
  5356. +       unsigned long loopnum = 0;
  5357. +
  5358. +       /* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
  5359. +       volatile u32 hash;
  5360. +       volatile int ret;
  5361. +
  5362. +       p1 = alloc_page(GFP_KERNEL);
  5363. +       if (!p1)
  5364. +               return -ENOMEM;
  5365. +
  5366. +       p2 = alloc_page(GFP_KERNEL);
  5367. +       if (!p2)
  5368. +               return -ENOMEM;
  5369. +
  5370. +       addr1 = kmap_atomic(p1);
  5371. +       addr2 = kmap_atomic(p2);
  5372. +       memset(addr1, random32(), PAGE_SIZE);
  5373. +       memcpy(addr2, addr1, PAGE_SIZE);
  5374. +
  5375. +       /* make sure that the two pages differ in last byte */
  5376. +       addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
  5377. +       kunmap_atomic(addr2);
  5378. +       kunmap_atomic(addr1);
  5379. +
  5380. +       time_start = jiffies;
  5381. +       while (jiffies - time_start < 100) {
  5382. +               for (i = 0; i < 100; i++)
  5383. +                       hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
  5384. +               loopnum += 100;
  5385. +       }
  5386. +       hash_cost = (jiffies - time_start);
  5387. +
  5388. +       time_start = jiffies;
  5389. +       for (i = 0; i < loopnum; i++)
  5390. +               ret = pages_identical(p1, p2);
  5391. +       memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
  5392. +       memcmp_cost /= hash_cost;
  5393. +       printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
  5394. +                        "hash=%u cmp_ret=%d.\n",
  5395. +              memcmp_cost, hash, ret);
  5396. +
  5397. +       __free_page(p1);
  5398. +       __free_page(p2);
  5399. +       return 0;
  5400. +}
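The calibration above times loopnum full-strength page hashes against loopnum whole-page comparisons on a pair of pages that differ only in their last byte (the worst case for memcmp), and expresses the result as

        memcmp_cost = HASH_STRENGTH_FULL * t_memcmp / t_hash

i.e. the cost of one full page comparison measured in "hash one sample" units. Purely for illustration, if HASH_STRENGTH_FULL were 1024 and the two timing loops took 50 and 100 jiffies respectively, memcmp_cost would come out as 1024 * 50 / 100 = 512.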
  5401. +
  5402. +static int init_zeropage_hash_table(void)
  5403. +{
  5404. +       struct page *page;
  5405. +       char *addr;
  5406. +       int i;
  5407. +
  5408. +       page = alloc_page(GFP_KERNEL);
  5409. +       if (!page)
  5410. +               return -ENOMEM;
  5411. +
  5412. +       addr = kmap_atomic(page);
  5413. +       memset(addr, 0, PAGE_SIZE);
  5414. +       kunmap_atomic(addr);
  5415. +
  5416. +       zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
  5417. +               GFP_KERNEL);
  5418. +       if (!zero_hash_table)
  5419. +               return -ENOMEM;
  5420. +
  5421. +       for (i = 0; i < HASH_STRENGTH_MAX; i++)
  5422. +               zero_hash_table[i] = page_hash(page, i, 0);
  5423. +
  5424. +       __free_page(page);
  5425. +
  5426. +       return 0;
  5427. +}
  5428. +
  5429. +static inline int init_random_sampling(void)
  5430. +{
  5431. +       unsigned long i;
  5432. +       random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
  5433. +       if (!random_nums)
  5434. +               return -ENOMEM;
  5435. +
  5436. +       for (i = 0; i < HASH_STRENGTH_FULL; i++)
  5437. +               random_nums[i] = i;
  5438. +
  5439. +       for (i = 0; i < HASH_STRENGTH_FULL; i++) {
  5440. +               unsigned long rand_range, swap_index, tmp;
  5441. +
  5442. +               rand_range = HASH_STRENGTH_FULL - i;
  5443. +               swap_index = i + random32() % rand_range;
  5444. +               tmp = random_nums[i];
  5445. +               random_nums[i] =  random_nums[swap_index];
  5446. +               random_nums[swap_index] = tmp;
  5447. +       }
  5448. +
  5449. +       rshash_state.state = RSHASH_NEW;
  5450. +       rshash_state.below_count = 0;
  5451. +       rshash_state.lookup_window_index = 0;
  5452. +
  5453. +       return cal_positive_negative_costs();
  5454. +}
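The permutation loop above is a standard Fisher-Yates shuffle of the sampling offsets 0..HASH_STRENGTH_FULL-1. A minimal user-space sketch of the same idea, with rand() standing in for the kernel's random32() and a small array size chosen only for demonstration:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_OFFSETS 16  /* stand-in for HASH_STRENGTH_FULL */

/* Fill nums[] with the identity permutation, then swap each element
 * with a uniformly chosen element at or after its own position. */
static void shuffle_offsets(unsigned long *nums, unsigned long n)
{
        unsigned long i;

        for (i = 0; i < n; i++)
                nums[i] = i;

        for (i = 0; i < n; i++) {
                unsigned long rand_range = n - i;
                unsigned long swap_index = i + rand() % rand_range;
                unsigned long tmp = nums[i];

                nums[i] = nums[swap_index];
                nums[swap_index] = tmp;
        }
}

int main(void)
{
        unsigned long nums[NUM_OFFSETS];
        unsigned long i;

        srand((unsigned int)time(NULL));
        shuffle_offsets(nums, NUM_OFFSETS);

        for (i = 0; i < NUM_OFFSETS; i++)
                printf("%lu ", nums[i]);
        printf("\n");
        return 0;
}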
  5455. +
  5456. +static int __init uksm_slab_init(void)
  5457. +{
  5458. +       rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
  5459. +       if (!rmap_item_cache)
  5460. +               goto out;
  5461. +
  5462. +       stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
  5463. +       if (!stable_node_cache)
  5464. +               goto out_free1;
  5465. +
  5466. +       node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
  5467. +       if (!node_vma_cache)
  5468. +               goto out_free2;
  5469. +
  5470. +       vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
  5471. +       if (!vma_slot_cache)
  5472. +               goto out_free3;
  5473. +
  5474. +       tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
  5475. +       if (!tree_node_cache)
  5476. +               goto out_free4;
  5477. +
  5478. +       return 0;
  5479. +
  5480. +out_free4:
  5481. +       kmem_cache_destroy(vma_slot_cache);
  5482. +out_free3:
  5483. +       kmem_cache_destroy(node_vma_cache);
  5484. +out_free2:
  5485. +       kmem_cache_destroy(stable_node_cache);
  5486. +out_free1:
  5487. +       kmem_cache_destroy(rmap_item_cache);
  5488. +out:
  5489. +       return -ENOMEM;
  5490. +}
  5491. +
  5492. +static void __init uksm_slab_free(void)
  5493. +{
  5494. +       kmem_cache_destroy(stable_node_cache);
  5495. +       kmem_cache_destroy(rmap_item_cache);
  5496. +       kmem_cache_destroy(node_vma_cache);
  5497. +       kmem_cache_destroy(vma_slot_cache);
  5498. +       kmem_cache_destroy(tree_node_cache);
  5499. +}
  5500. +
  5501. +/* Common interface with ksm, but it behaves differently. */
  5502. +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
  5503. +               unsigned long end, int advice, unsigned long *vm_flags)
  5504. +{
  5505. +       int err;
  5506. +
  5507. +       switch (advice) {
  5508. +       case MADV_MERGEABLE:
  5509. +               return 0;               /* just ignore the advice */
  5510. +
  5511. +       case MADV_UNMERGEABLE:
  5512. +               if (!(*vm_flags & VM_MERGEABLE))
  5513. +                       return 0;               /* just ignore the advice */
  5514. +
  5515. +               if (vma->anon_vma) {
  5516. +                       err = unmerge_uksm_pages(vma, start, end);
  5517. +                       if (err)
  5518. +                               return err;
  5519. +               }
  5520. +
  5521. +               uksm_remove_vma(vma);
  5522. +               *vm_flags &= ~VM_MERGEABLE;
  5523. +               break;
  5524. +       }
  5525. +
  5526. +       return 0;
  5527. +}
  5528. +
  5529. +/* Common interface to ksm, actually the same. */
  5530. +struct page *ksm_does_need_to_copy(struct page *page,
  5531. +                       struct vm_area_struct *vma, unsigned long address)
  5532. +{
  5533. +       struct page *new_page;
  5534. +
  5535. +       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  5536. +       if (new_page) {
  5537. +               copy_user_highpage(new_page, page, address, vma);
  5538. +
  5539. +               SetPageDirty(new_page);
  5540. +               __SetPageUptodate(new_page);
  5541. +               SetPageSwapBacked(new_page);
  5542. +               __set_page_locked(new_page);
  5543. +
  5544. +               if (page_evictable(new_page, vma))
  5545. +                       lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
  5546. +               else
  5547. +                       add_page_to_unevictable_list(new_page);
  5548. +       }
  5549. +
  5550. +       return new_page;
  5551. +}
  5552. +
  5553. +static int __init uksm_init(void)
  5554. +{
  5555. +       struct task_struct *uksm_thread;
  5556. +       int err;
  5557. +
  5558. +       uksm_sleep_jiffies = msecs_to_jiffies(100);
  5559. +       uksm_sleep_saved = uksm_sleep_jiffies;
  5560. +
  5561. +       slot_tree_init();
  5562. +       init_scan_ladder();
  5563. +
  5564. +
  5565. +       err = init_random_sampling();
  5566. +       if (err)
  5567. +               goto out_free2;
  5568. +
  5569. +       err = uksm_slab_init();
  5570. +       if (err)
  5571. +               goto out_free1;
  5572. +
  5573. +       err = init_zeropage_hash_table();
  5574. +       if (err)
  5575. +               goto out_free0;
  5576. +
  5577. +       uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
  5578. +       if (IS_ERR(uksm_thread)) {
  5579. +               printk(KERN_ERR "uksm: creating kthread failed\n");
  5580. +               err = PTR_ERR(uksm_thread);
  5581. +               goto out_free;
  5582. +       }
  5583. +
  5584. +#ifdef CONFIG_SYSFS
  5585. +       err = sysfs_create_group(mm_kobj, &uksm_attr_group);
  5586. +       if (err) {
  5587. +               printk(KERN_ERR "uksm: register sysfs failed\n");
  5588. +               kthread_stop(uksm_thread);
  5589. +               goto out_free;
  5590. +       }
  5591. +#else
  5592. +       uksm_run = UKSM_RUN_MERGE;      /* no way for user to start it */
  5593. +
  5594. +#endif /* CONFIG_SYSFS */
  5595. +
  5596. +#ifdef CONFIG_MEMORY_HOTREMOVE
  5597. +       /*
  5598. +        * Choose a high priority since the callback takes uksm_thread_mutex:
  5599. +        * later callbacks could only be taking locks which nest within that.
  5600. +        */
  5601. +       hotplug_memory_notifier(uksm_memory_callback, 100);
  5602. +#endif
  5603. +       return 0;
  5604. +
  5605. +out_free:
  5606. +       kfree(zero_hash_table);
  5607. +out_free0:
  5608. +       uksm_slab_free();
  5609. +out_free1:
  5610. +       kfree(random_nums);
  5611. +out_free2:
  5612. +       kfree(uksm_scan_ladder);
  5613. +       return err;
  5614. +}
  5615. +
  5616. +#ifdef MODULE
  5617. +module_init(uksm_init)
  5618. +#else
  5619. +late_initcall(uksm_init);
  5620. +#endif
  5621. +