ImNtReal

uksm-0.1.2.0-for-v3.6.patch

Oct 15th, 2012
190
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. diff -Naur linux-3.6-orig linux-3.6
  2. diff -Naur linux-3.6-orig/mm/uksm.c linux-3.6/mm/uksm.c
  3. --- linux-3.6-orig/mm/uksm.c    1969-12-31 19:00:00.000000000 -0500
  4. +++ linux-3.6/mm/uksm.c 2012-10-15 11:28:33.550230252 -0400
  5. @@ -0,0 +1,5616 @@
  6. +/*
  7. + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
  8. + *
  9. + * This is an improvement upon KSM. Some basic data structures and routines
  10. + * are borrowed from ksm.c .
  11. + *
  12. + * Its new features:
  13. + * 1. Full system scan:
  14. + *      It automatically scans all user processes' anonymous VMAs. Kernel-user
  15. + *      interaction to submit a memory area to KSM is no longer needed.
  16. + *
  17. + * 2. Rich area detection:
  18. + *      It automatically detects rich areas containing abundant duplicated
  19. + *      pages. Rich areas are given a full scan speed. Poor areas are
  20. + *      sampled at a reasonable speed with very low CPU consumption.
  21. + *
  22. + * 3. Ultra Per-page scan speed improvement:
  23. + *      A new hash algorithm is proposed. As a result, on a machine with
  24. + *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
  25. + *      can scan memory areas that do not contain duplicated pages at speed of
  26. + *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
  27. + *      477MB/sec ~ 923MB/sec.
  28. + *
  29. + * 4. Thrashing area avoidance:
  30. + *      Thrashing area (a VMA that has frequent Ksm page break-out) can be
  31. + *      filtered out. My benchmark shows it's more efficient than KSM's per-page
  32. + *      hash value based volatile page detection.
  33. + *
  34. + *
  35. + * 5. Misc changes upon KSM:
  36. + *      * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
  37. + *        comparison. It's much faster than default C version on x86.
  38. + *      * rmap_item now has a struct *page member to loosely cache an
  39. + *        address-->page mapping, which avoids many time-costly
  40. + *        follow_page().
  41. + *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
  42. + *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
  43. + *        ksm is needed for this case.
  44. + *
  45. + * 6. Full Zero Page consideration(contributed by Figo Zhang)
  46. + *    Now uksmd considers full zero pages as special pages and merges them to a
  47. + *    special unswappable uksm zero page.
  48. + */
  49. +
  50. +#include <linux/errno.h>
  51. +#include <linux/mm.h>
  52. +#include <linux/fs.h>
  53. +#include <linux/mman.h>
  54. +#include <linux/sched.h>
  55. +#include <linux/rwsem.h>
  56. +#include <linux/pagemap.h>
  57. +#include <linux/rmap.h>
  58. +#include <linux/spinlock.h>
  59. +#include <linux/jhash.h>
  60. +#include <linux/delay.h>
  61. +#include <linux/kthread.h>
  62. +#include <linux/wait.h>
  63. +#include <linux/slab.h>
  64. +#include <linux/rbtree.h>
  65. +#include <linux/memory.h>
  66. +#include <linux/mmu_notifier.h>
  67. +#include <linux/swap.h>
  68. +#include <linux/ksm.h>
  69. +#include <linux/crypto.h>
  70. +#include <linux/scatterlist.h>
  71. +#include <crypto/hash.h>
  72. +#include <linux/random.h>
  73. +#include <linux/math64.h>
  74. +#include <linux/gcd.h>
  75. +#include <linux/freezer.h>
  76. +#include <linux/sradix-tree.h>
  77. +
  78. +#include <asm/tlbflush.h>
  79. +#include "internal.h"
  80. +
  81. +#ifdef CONFIG_X86
  82. +#undef memcmp
  83. +
  84. +#ifdef CONFIG_X86_32
  85. +#define memcmp memcmpx86_32
  86. +/*
  87. + * Compare 4-byte-aligned address s1 and s2, with length n
  88. + */
  89. +int memcmpx86_32(void *s1, void *s2, size_t n)
  90. +{
  91. +   size_t num = n / 4;
  92. +   register int res;
  93. +
  94. +   __asm__ __volatile__
  95. +   (
  96. +    "testl %3,%3\n\t"
  97. +    "repe; cmpsd\n\t"
  98. +    "je        1f\n\t"
  99. +    "sbbl      %0,%0\n\t"
  100. +    "orl       $1,%0\n"
  101. +    "1:"
  102. +    : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
  103. +    : "0" (0)
  104. +    : "cc");
  105. +
  106. +   return res;
  107. +}
  108. +
  109. +/*
  110. + * Check whether the page is all zero.
  111. + */
  112. +static int is_full_zero(const void *s1, size_t len)
  113. +{
  114. +   unsigned char same;
  115. +
  116. +   len /= 4;
  117. +
  118. +   __asm__ __volatile__
  119. +   ("repe; scasl;"
  120. +    "sete %0"
  121. +    : "=qm" (same), "+D" (s1), "+c" (len)
  122. +    : "a" (0)
  123. +    : "cc");
  124. +
  125. +   return same;
  126. +}
  127. +
  128. +
  129. +#elif defined(CONFIG_X86_64)
  130. +#define memcmp memcmpx86_64
  131. +/*
  132. + * Compare 8-byte-aligned address s1 and s2, with length n
  133. + */
  134. +int memcmpx86_64(void *s1, void *s2, size_t n)
  135. +{
  136. +   size_t num = n / 8;
  137. +   register int res;
  138. +
  139. +   __asm__ __volatile__
  140. +   (
  141. +    "testq %q3,%q3\n\t"
  142. +    "repe; cmpsq\n\t"
  143. +    "je        1f\n\t"
  144. +    "sbbq      %q0,%q0\n\t"
  145. +    "orq       $1,%q0\n"
  146. +    "1:"
  147. +    : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
  148. +    : "0" (0)
  149. +    : "cc");
  150. +
  151. +   return res;
  152. +}
  153. +
  154. +static int is_full_zero(const void *s1, size_t len)
  155. +{
  156. +   unsigned char same;
  157. +
  158. +   len /= 8;
  159. +
  160. +   __asm__ __volatile__
  161. +   ("repe; scasq;"
  162. +    "sete %0"
  163. +    : "=qm" (same), "+D" (s1), "+c" (len)
  164. +    : "a" (0)
  165. +    : "cc");
  166. +
  167. +   return same;
  168. +}
  169. +
  170. +#endif
  171. +#else
  172. +static int is_full_zero(const void *s1, size_t len)
  173. +{
  174. +   unsigned long *src = s1;
  175. +   int i;
  176. +
  177. +   len /= sizeof(*src);
  178. +
  179. +   for (i = 0; i < len; i++) {
  180. +       if (src[i])
  181. +           return 0;
  182. +   }
  183. +
  184. +   return 1;
  185. +}
  186. +#endif
  187. +
  188. +#define U64_MAX        (~((u64)0))
  189. +#define UKSM_RUNG_ROUND_FINISHED  (1 << 0)
  190. +#define TIME_RATIO_SCALE   10000
  191. +
  192. +#define SLOT_TREE_NODE_SHIFT   8
  193. +#define SLOT_TREE_NODE_STORE_SIZE  (1UL << SLOT_TREE_NODE_SHIFT)
  194. +struct slot_tree_node {
  195. +   unsigned long size;
  196. +   struct sradix_tree_node snode;
  197. +   void *stores[SLOT_TREE_NODE_STORE_SIZE];
  198. +};
  199. +
  200. +static struct kmem_cache *slot_tree_node_cachep;
  201. +
  202. +static struct sradix_tree_node *slot_tree_node_alloc(void)
  203. +{
  204. +   struct slot_tree_node *p;
  205. +   p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL);
  206. +   if (!p)
  207. +       return NULL;
  208. +
  209. +   return &p->snode;
  210. +}
  211. +
  212. +static void slot_tree_node_free(struct sradix_tree_node *node)
  213. +{
  214. +   struct slot_tree_node *p;
  215. +
  216. +   p = container_of(node, struct slot_tree_node, snode);
  217. +   kmem_cache_free(slot_tree_node_cachep, p);
  218. +}
  219. +
  220. +static void slot_tree_node_extend(struct sradix_tree_node *parent,
  221. +                 struct sradix_tree_node *child)
  222. +{
  223. +   struct slot_tree_node *p, *c;
  224. +
  225. +   p = container_of(parent, struct slot_tree_node, snode);
  226. +   c = container_of(child, struct slot_tree_node, snode);
  227. +
  228. +   p->size += c->size;
  229. +}
  230. +
  231. +void slot_tree_node_assign(struct sradix_tree_node *node,
  232. +              unsigned index, void *item)
  233. +{
  234. +   struct vma_slot *slot = item;
  235. +   struct slot_tree_node *cur;
  236. +
  237. +   slot->snode = node;
  238. +   slot->sindex = index;
  239. +
  240. +   while (node) {
  241. +       cur = container_of(node, struct slot_tree_node, snode);
  242. +       cur->size += slot->pages;
  243. +       node = node->parent;
  244. +   }
  245. +}
  246. +
  247. +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
  248. +{
  249. +   struct vma_slot *slot;
  250. +   struct slot_tree_node *cur;
  251. +   unsigned long pages;
  252. +
  253. +   if (node->height == 1) {
  254. +       slot = node->stores[offset];
  255. +       pages = slot->pages;
  256. +   } else {
  257. +       cur = container_of(node->stores[offset],
  258. +                  struct slot_tree_node, snode);
  259. +       pages = cur->size;
  260. +   }
  261. +
  262. +   while (node) {
  263. +       cur = container_of(node, struct slot_tree_node, snode);
  264. +       cur->size -= pages;
  265. +       node = node->parent;
  266. +   }
  267. +}
  268. +
  269. +unsigned long slot_iter_index;
  270. +int slot_iter(void *item,  unsigned long height)
  271. +{
  272. +   struct slot_tree_node *node;
  273. +   struct vma_slot *slot;
  274. +
  275. +   if (height == 1) {
  276. +       slot = item;
  277. +       if (slot_iter_index < slot->pages) {
  278. +           /*in this one*/
  279. +           return 1;
  280. +       } else {
  281. +           slot_iter_index -= slot->pages;
  282. +           return 0;
  283. +       }
  284. +
  285. +   } else {
  286. +       node = container_of(item, struct slot_tree_node, snode);
  287. +       if (slot_iter_index < node->size) {
  288. +           /*in this one*/
  289. +           return 1;
  290. +       } else {
  291. +           slot_iter_index -= node->size;
  292. +           return 0;
  293. +       }
  294. +   }
  295. +}
  296. +
  297. +
  298. +static inline void slot_tree_init_root(struct sradix_tree_root *root)
  299. +{
  300. +   init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
  301. +   root->alloc = slot_tree_node_alloc;
  302. +   root->free = slot_tree_node_free;
  303. +   root->extend = slot_tree_node_extend;
  304. +   root->assign = slot_tree_node_assign;
  305. +   root->rm = slot_tree_node_rm;
  306. +}
  307. +
  308. +void slot_tree_init(void)
  309. +{
  310. +   slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
  311. +               sizeof(struct slot_tree_node), 0,
  312. +               SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
  313. +               NULL);
  314. +}
  315. +
  316. +
  317. +/* Each rung of this ladder is a list of VMAs having a same scan ratio */
  318. +struct scan_rung {
  319. +   //struct list_head scanned_list;
  320. +   struct sradix_tree_root vma_root;
  321. +   struct sradix_tree_root vma_root2;
  322. +
  323. +   struct vma_slot *current_scan;
  324. +   unsigned long current_offset;
  325. +
  326. +   /*
  327. +    * The initial value for current_offset, it should loop over
  328. + *    [0 ~ step - 1] to let all slots have their chance to be scanned.
  329. +    */
  330. +   unsigned long offset_init;
  331. +   unsigned long step; /* dynamic step for current_offset */
  332. +   unsigned int flags;
  333. +   unsigned long pages_to_scan;
  334. +   //unsigned long fully_scanned_slots;
  335. +   /*
  336. +    * a little bit tricky - if cpu_time_ratio > 0, then the value is the
  337. +    * the cpu time ratio it can spend in rung_i for every scan
  338. +    * period. if < 0, then it is the cpu time ratio relative to the
  339. +    * max cpu percentage user specified. Both in unit of
  340. +    * 1/TIME_RATIO_SCALE
  341. +    */
  342. +   int cpu_ratio;
  343. +
  344. +   /*
  345. +    * How long it will take for all slots in this rung to be fully
  346. +    * scanned? If it's zero, we don't care about the cover time:
  347. +    * it's fully scanned.
  348. +    */
  349. +   unsigned int cover_msecs;
  350. +   //unsigned long vma_num;
  351. +   //unsigned long pages; /* Sum of all slot's pages in rung */
  352. +};
  353. +
  354. +/**
  355. + * node of either the stable or unstable rbtree
  356. + *
  357. + */
  358. +struct tree_node {
  359. +   struct rb_node node; /* link in the main (un)stable rbtree */
  360. +   struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
  361. +   u32 hash;
  362. +   unsigned long count; /* TODO: merged with sub_root */
  363. +   struct list_head all_list; /* all tree nodes in stable/unstable tree */
  364. +};
  365. +
  366. +/**
  367. + * struct stable_node - node of the stable rbtree
  368. + * @node: rb node of this ksm page in the stable tree
  369. + * @hlist: hlist head of rmap_items using this ksm page
  370. + * @kpfn: page frame number of this ksm page
  371. + */
  372. +struct stable_node {
  373. +   struct rb_node node; /* link in sub-rbtree */
  374. +   struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
  375. +   struct hlist_head hlist;
  376. +   unsigned long kpfn;
  377. +   u32 hash_max; /* if ==0 then it's not been calculated yet */
  378. +   struct list_head all_list; /* in a list for all stable nodes */
  379. +};
  380. +
  381. +/**
  382. + * struct node_vma - group rmap_items linked in a same stable
  383. + * node together.
  384. + */
  385. +struct node_vma {
  386. +   union {
  387. +       struct vma_slot *slot;
  388. +       unsigned long key;  /* slot is used as key sorted on hlist */
  389. +   };
  390. +   struct hlist_node hlist;
  391. +   struct hlist_head rmap_hlist;
  392. +   struct stable_node *head;
  393. +};
  394. +
  395. +/**
  396. + * struct rmap_item - reverse mapping item for virtual addresses
  397. + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
  398. + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
  399. + * @mm: the memory structure this rmap_item is pointing into
  400. + * @address: the virtual address this rmap_item tracks (+ flags in low bits)
  401. + * @node: rb node of this rmap_item in the unstable tree
  402. + * @head: pointer to stable_node heading this list in the stable tree
  403. + * @hlist: link into hlist of rmap_items hanging off that stable_node
  404. + */
  405. +struct rmap_item {
  406. +   struct vma_slot *slot;
  407. +   struct page *page;
  408. +   unsigned long address;  /* + low bits used for flags below */
  409. +   unsigned long hash_round;
  410. +   unsigned long entry_index;
  411. +   union {
  412. +       struct {/* when in unstable tree */
  413. +           struct rb_node node;
  414. +           struct tree_node *tree_node;
  415. +           u32 hash_max;
  416. +       };
  417. +       struct { /* when in stable tree */
  418. +           struct node_vma *head;
  419. +           struct hlist_node hlist;
  420. +           struct anon_vma *anon_vma;
  421. +       };
  422. +   };
  423. +} __attribute__((aligned(4)));
  424. +
  425. +struct rmap_list_entry {
  426. +   union {
  427. +       struct rmap_item *item;
  428. +       unsigned long addr;
  429. +   };
  430. +   /* lowest bit is used for is_addr tag */
  431. +} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/
  432. +
  433. +
  434. +/* Basic data structure definition ends */
  435. +
  436. +
  437. +/*
  438. + * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
  439. + * The flags use the low bits of rmap_item.address
  440. + */
  441. +#define UNSTABLE_FLAG  0x1
  442. +#define STABLE_FLAG    0x2
  443. +#define get_rmap_addr(x)   ((x)->address & PAGE_MASK)
  444. +
  445. +/*
  446. + * rmap_list_entry helpers
  447. + */
  448. +#define IS_ADDR_FLAG   1
  449. +#define is_addr(ptr)       ((unsigned long)(ptr) & IS_ADDR_FLAG)
  450. +#define set_is_addr(ptr)   ((ptr) |= IS_ADDR_FLAG)
  451. +#define get_clean_addr(ptr)    (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
  452. +
  453. +
  454. +/*
  455. + * High speed caches for frequently allocated and freed structs
  456. + */
  457. +static struct kmem_cache *rmap_item_cache;
  458. +static struct kmem_cache *stable_node_cache;
  459. +static struct kmem_cache *node_vma_cache;
  460. +static struct kmem_cache *vma_slot_cache;
  461. +static struct kmem_cache *tree_node_cache;
  462. +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
  463. +       sizeof(struct __struct), __alignof__(struct __struct),\
  464. +       (__flags), NULL)
  465. +
  466. +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
  467. +#define SCAN_LADDER_SIZE 4
  468. +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
  469. +
  470. +/* The evaluation rounds uksmd has finished */
  471. +static unsigned long long uksm_eval_round = 1;
  472. +
  473. +/*
  474. + * we add 1 to this var when we consider we should rebuild the whole
  475. + * unstable tree.
  476. + */
  477. +static unsigned long uksm_hash_round = 1;
  478. +
  479. +/*
  480. + * How many times the whole memory is scanned.
  481. + */
  482. +static unsigned long long fully_scanned_round = 1;
  483. +
  484. +/* The total number of virtual pages of all vma slots */
  485. +static u64 uksm_pages_total;
  486. +
  487. +/* The number of pages has been scanned since the start up */
  488. +static u64 uksm_pages_scanned;
  489. +
  490. +static u64 scanned_virtual_pages;
  491. +
  492. +/* The number of pages has been scanned since last encode_benefit call */
  493. +static u64 uksm_pages_scanned_last;
  494. +
  495. +/* If the scanned number is too large, we encode it here */
  496. +static u64 pages_scanned_stored;
  497. +
  498. +static unsigned long pages_scanned_base;
  499. +
  500. +/* The number of nodes in the stable tree */
  501. +static unsigned long uksm_pages_shared;
  502. +
  503. +/* The number of page slots additionally sharing those nodes */
  504. +static unsigned long uksm_pages_sharing;
  505. +
  506. +/* The number of nodes in the unstable tree */
  507. +static unsigned long uksm_pages_unshared;
  508. +
  509. +/*
  510. + * Milliseconds ksmd should sleep between scans,
  511. + * >= 100ms to be consistent with
  512. + * scan_time_to_sleep_msec()
  513. + */
  514. +static unsigned int uksm_sleep_jiffies;
  515. +
  516. +/* The real value for the uksmd next sleep */
  517. +static unsigned int uksm_sleep_real;
  518. +
  519. +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
  520. +static unsigned int uksm_sleep_saved;
  521. +
  522. +/* Max percentage of cpu utilization ksmd can take to scan in one batch */
  523. +static unsigned int uksm_max_cpu_percentage;
  524. +
  525. +static int uksm_cpu_governor;
  526. +
  527. +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
  528. +
  529. +struct uksm_cpu_preset_s {
  530. +   int cpu_ratio[SCAN_LADDER_SIZE];
  531. +   unsigned int cover_msecs[SCAN_LADDER_SIZE];
  532. +   unsigned int max_cpu; /* percentage */
  533. +};
  534. +
  535. +struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
  536. +   { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
  537. +   { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
  538. +   { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
  539. +   { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
  540. +};
  541. +
  542. +/* The default value for uksm_ema_page_time if it's not initialized */
  543. +#define UKSM_PAGE_TIME_DEFAULT 500
  544. +
  545. +/* cost to scan one page by exponential moving average, in nsecs */
  546. +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
  547. +
  548. +/* The exponential moving average alpha weight, in percentage. */
  549. +#define EMA_ALPHA  20
  550. +
  551. +/*
  552. + * The threshold used to filter out thrashing areas,
  553. + * If it == 0, filtering is disabled, otherwise it's the percentage up-bound
  554. + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
  555. + * will be considered as having a zero duplication ratio.
  556. + */
  557. +static unsigned int uksm_thrash_threshold = 50;
  558. +
  559. +/* How much dedup ratio is considered to be abundant*/
  560. +static unsigned int uksm_abundant_threshold = 10;
  561. +
  562. +/* All slots having merged pages in this eval round. */
  563. +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
  564. +
  565. +/* How many times the ksmd has slept since startup */
  566. +static unsigned long long uksm_sleep_times;
  567. +
  568. +#define UKSM_RUN_STOP  0
  569. +#define UKSM_RUN_MERGE 1
  570. +static unsigned int uksm_run = 1;
  571. +
  572. +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
  573. +static DEFINE_MUTEX(uksm_thread_mutex);
  574. +
  575. +/*
  576. + * List vma_slot_new is for newly created vma_slot waiting to be added by
  577. + * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
  578. + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
  579. + * VMA has been removed/freed.
  580. + */
  581. +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
  582. +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
  583. +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
  584. +static DEFINE_SPINLOCK(vma_slot_list_lock);
  585. +
  586. +/* The unstable tree heads */
  587. +static struct rb_root root_unstable_tree = RB_ROOT;
  588. +
  589. +/*
  590. + * All tree_nodes are in a list to be freed at once when unstable tree is
  591. + * freed after each scan round.
  592. + */
  593. +static struct list_head unstable_tree_node_list =
  594. +               LIST_HEAD_INIT(unstable_tree_node_list);
  595. +
  596. +/* List contains all stable nodes */
  597. +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
  598. +
  599. +/*
  600. + * When the hash strength is changed, the stable tree must be delta_hashed and
  601. + * re-structured. We use two set of below structs to speed up the
  602. + * re-structuring of stable tree.
  603. + */
  604. +static struct list_head
  605. +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
  606. +               LIST_HEAD_INIT(stable_tree_node_list[1])};
  607. +
  608. +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
  609. +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
  610. +static struct rb_root *root_stable_treep = &root_stable_tree[0];
  611. +static unsigned long stable_tree_index;
  612. +
  613. +/* The hash strength needed to hash a full page */
  614. +#define HASH_STRENGTH_FULL     (PAGE_SIZE / sizeof(u32))
  615. +
  616. +/* The hash strength needed for loop-back hashing */
  617. +#define HASH_STRENGTH_MAX      (HASH_STRENGTH_FULL + 10)
  618. +
  619. +/* The random offsets in a page */
  620. +static u32 *random_nums;
  621. +
  622. +/* The hash strength */
  623. +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
  624. +
  625. +/* The delta value each time the hash strength increases or decreases */
  626. +static unsigned long hash_strength_delta;
  627. +#define HASH_STRENGTH_DELTA_MAX    5
  628. +
  629. +/* The time we have saved due to random_sample_hash */
  630. +static u64 rshash_pos;
  631. +
  632. +/* The time we have wasted due to hash collision */
  633. +static u64 rshash_neg;
  634. +
  635. +struct uksm_benefit {
  636. +   u64 pos;
  637. +   u64 neg;
  638. +   u64 scanned;
  639. +   unsigned long base;
  640. +} benefit;
  641. +
  642. +/*
  643. + * The relative cost of memcmp, compared to 1 time unit of random sample
  644. + * hash, this value is tested when ksm module is initialized
  645. + */
  646. +static unsigned long memcmp_cost;
  647. +
  648. +static unsigned long  rshash_neg_cont_zero;
  649. +static unsigned long  rshash_cont_obscure;
  650. +
  651. +/* The possible states of hash strength adjustment heuristic */
  652. +enum rshash_states {
  653. +       RSHASH_STILL,
  654. +       RSHASH_TRYUP,
  655. +       RSHASH_TRYDOWN,
  656. +       RSHASH_NEW,
  657. +       RSHASH_PRE_STILL,
  658. +};
  659. +
  660. +/* The possible direction we are about to adjust hash strength */
  661. +enum rshash_direct {
  662. +   GO_UP,
  663. +   GO_DOWN,
  664. +   OBSCURE,
  665. +   STILL,
  666. +};
  667. +
  668. +/* random sampling hash state machine */
  669. +static struct {
  670. +   enum rshash_states state;
  671. +   enum rshash_direct pre_direct;
  672. +   u8 below_count;
  673. +   /* Keep a lookup window of size 5, iff above_count/below_count > 3
  674. +    * in this window we stop trying.
  675. +    */
  676. +   u8 lookup_window_index;
  677. +   u64 stable_benefit;
  678. +   unsigned long turn_point_down;
  679. +   unsigned long turn_benefit_down;
  680. +   unsigned long turn_point_up;
  681. +   unsigned long turn_benefit_up;
  682. +   unsigned long stable_point;
  683. +} rshash_state;
  684. +
  685. +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
  686. +static u32 *zero_hash_table;
  687. +
  688. +static inline struct node_vma *alloc_node_vma(void)
  689. +{
  690. +   struct node_vma *node_vma;
  691. +   node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
  692. +   if (node_vma) {
  693. +       INIT_HLIST_HEAD(&node_vma->rmap_hlist);
  694. +       INIT_HLIST_NODE(&node_vma->hlist);
  695. +   }
  696. +   return node_vma;
  697. +}
  698. +
  699. +static inline void free_node_vma(struct node_vma *node_vma)
  700. +{
  701. +   kmem_cache_free(node_vma_cache, node_vma);
  702. +}
  703. +
  704. +
  705. +static inline struct vma_slot *alloc_vma_slot(void)
  706. +{
  707. +   struct vma_slot *slot;
  708. +
  709. +   /*
  710. +    * In case ksm is not initialized by now.
  711. +    * Oops, we need to consider the call site of uksm_init() in the future.
  712. +    */
  713. +   if (!vma_slot_cache)
  714. +       return NULL;
  715. +
  716. +   slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
  717. +   if (slot) {
  718. +       INIT_LIST_HEAD(&slot->slot_list);
  719. +       INIT_LIST_HEAD(&slot->dedup_list);
  720. +       slot->flags |= UKSM_SLOT_NEED_RERAND;
  721. +   }
  722. +   return slot;
  723. +}
  724. +
  725. +static inline void free_vma_slot(struct vma_slot *vma_slot)
  726. +{
  727. +   kmem_cache_free(vma_slot_cache, vma_slot);
  728. +}
  729. +
  730. +
  731. +
  732. +static inline struct rmap_item *alloc_rmap_item(void)
  733. +{
  734. +   struct rmap_item *rmap_item;
  735. +
  736. +   rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
  737. +   if (rmap_item) {
  738. +       /* bug on lowest bit is not clear for flag use */
  739. +       BUG_ON(is_addr(rmap_item));
  740. +   }
  741. +   return rmap_item;
  742. +}
  743. +
  744. +static inline void free_rmap_item(struct rmap_item *rmap_item)
  745. +{
  746. +   rmap_item->slot = NULL; /* debug safety */
  747. +   kmem_cache_free(rmap_item_cache, rmap_item);
  748. +}
  749. +
  750. +static inline struct stable_node *alloc_stable_node(void)
  751. +{
  752. +   struct stable_node *node;
  753. +   node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
  754. +   if (!node)
  755. +       return NULL;
  756. +
  757. +   INIT_HLIST_HEAD(&node->hlist);
  758. +   list_add(&node->all_list, &stable_node_list);
  759. +   return node;
  760. +}
  761. +
  762. +static inline void free_stable_node(struct stable_node *stable_node)
  763. +{
  764. +   list_del(&stable_node->all_list);
  765. +   kmem_cache_free(stable_node_cache, stable_node);
  766. +}
  767. +
  768. +static inline struct tree_node *alloc_tree_node(struct list_head *list)
  769. +{
  770. +   struct tree_node *node;
  771. +   node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
  772. +   if (!node)
  773. +       return NULL;
  774. +
  775. +   list_add(&node->all_list, list);
  776. +   return node;
  777. +}
  778. +
  779. +static inline void free_tree_node(struct tree_node *node)
  780. +{
  781. +   list_del(&node->all_list);
  782. +   kmem_cache_free(tree_node_cache, node);
  783. +}
  784. +
  785. +static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
  786. +{
  787. +   struct anon_vma *anon_vma = rmap_item->anon_vma;
  788. +
  789. +   put_anon_vma(anon_vma);
  790. +}
  791. +
  792. +
  793. +/**
  794. + * Remove a stable node from stable_tree, may unlink from its tree_node and
  795. + * may remove its parent tree_node if no other stable node is pending.
  796. + *
  797. + * @stable_node    The node need to be removed
  798. + * @unlink_rb      Will this node be unlinked from the rbtree?
  799. + * @remove_tree_node  Will its tree_node be removed if empty?
  800. + */
  801. +static void remove_node_from_stable_tree(struct stable_node *stable_node,
  802. +                    int unlink_rb,  int remove_tree_node)
  803. +{
  804. +   struct node_vma *node_vma;
  805. +   struct rmap_item *rmap_item;
  806. +   struct hlist_node *hlist, *rmap_hlist, *n;
  807. +
  808. +   if (!hlist_empty(&stable_node->hlist)) {
  809. +       hlist_for_each_entry_safe(node_vma, hlist, n,
  810. +                     &stable_node->hlist, hlist) {
  811. +           hlist_for_each_entry(rmap_item, rmap_hlist,
  812. +                        &node_vma->rmap_hlist, hlist) {
  813. +               uksm_pages_sharing--;
  814. +
  815. +               uksm_drop_anon_vma(rmap_item);
  816. +               rmap_item->address &= PAGE_MASK;
  817. +           }
  818. +           free_node_vma(node_vma);
  819. +           cond_resched();
  820. +       }
  821. +
  822. +       /* the last one is counted as shared */
  823. +       uksm_pages_shared--;
  824. +       uksm_pages_sharing++;
  825. +   }
  826. +
  827. +   if (stable_node->tree_node && unlink_rb) {
  828. +       rb_erase(&stable_node->node,
  829. +            &stable_node->tree_node->sub_root);
  830. +
  831. +       if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
  832. +           remove_tree_node) {
  833. +           rb_erase(&stable_node->tree_node->node,
  834. +                root_stable_treep);
  835. +           free_tree_node(stable_node->tree_node);
  836. +       } else {
  837. +           stable_node->tree_node->count--;
  838. +       }
  839. +   }
  840. +
  841. +   free_stable_node(stable_node);
  842. +}
  843. +
  844. +
  845. +/*
  846. + * get_uksm_page: checks if the page indicated by the stable node
  847. + * is still its ksm page, despite having held no reference to it.
  848. + * In which case we can trust the content of the page, and it
  849. + * returns the gotten page; but if the page has now been zapped,
  850. + * remove the stale node from the stable tree and return NULL.
  851. + *
  852. + * You would expect the stable_node to hold a reference to the ksm page.
  853. + * But if it increments the page's count, swapping out has to wait for
  854. + * ksmd to come around again before it can free the page, which may take
  855. + * seconds or even minutes: much too unresponsive.  So instead we use a
  856. + * "keyhole reference": access to the ksm page from the stable node peeps
  857. + * out through its keyhole to see if that page still holds the right key,
  858. + * pointing back to this stable node.  This relies on freeing a PageAnon
  859. + * page to reset its page->mapping to NULL, and relies on no other use of
  860. + * a page to put something that might look like our key in page->mapping.
  861. + *
  862. + * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
  863. + * but this is different - made simpler by uksm_thread_mutex being held, but
  864. + * interesting for assuming that no other use of the struct page could ever
  865. + * put our expected_mapping into page->mapping (or a field of the union which
  866. + * coincides with page->mapping).  The RCU calls are not for KSM at all, but
  867. + * to keep the page_count protocol described with page_cache_get_speculative.
  868. + *
  869. + * Note: it is possible that get_uksm_page() will return NULL one moment,
  870. + * then page the next, if the page is in between page_freeze_refs() and
  871. + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
  872. + * is on its way to being freed; but it is an anomaly to bear in mind.
  873. + *
  874. + * @unlink_rb:         if the removal of this node will firstly unlink from
  875. + * its rbtree. stable_node_reinsert will prevent this when restructuring the
  876. + * node from its old tree.
  877. + *
  878. + * @remove_tree_node:  if this is the last one of its tree_node, will the
  879. + * tree_node be freed ? If we are inserting stable node, this tree_node may
  880. + * be reused, so don't free it.
  881. + */
static struct page *get_uksm_page(struct stable_node *stable_node,
		 int unlink_rb, int remove_tree_node)
{
	struct page *page;
	void *expected_mapping;

	page = pfn_to_page(stable_node->kpfn);
	/* A live KSM page's ->mapping is the stable_node pointer tagged
	 * with the ANON and KSM bits; reconstruct that value to compare. */
	expected_mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
	rcu_read_lock();
	if (page->mapping != expected_mapping)
		goto stale;
	if (!get_page_unless_zero(page))
		goto stale;
	/* Re-check after taking the reference: the page may have been
	 * freed and reused between the first peek and get_page_unless_zero(). */
	if (page->mapping != expected_mapping) {
		put_page(page);
		goto stale;
	}
	rcu_read_unlock();
	return page;
stale:
	rcu_read_unlock();
	/* Node no longer points at a live ksm page: unlink it per flags. */
	remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);

	return NULL;
}
  908. +
/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 *
 * NOTE(review): when the item is on the stable tree but its ksm page is
 * already stale (get_uksm_page() returns NULL), the `goto out` skips
 * clearing the address flags and hash_max below — presumably intentional,
 * but worth confirming against the callers.
 */
static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct node_vma *node_vma;
		struct page *page;

		node_vma = rmap_item->head;
		stable_node = node_vma->head;
		/* Take a reference; also validates the node is not stale. */
		page = get_uksm_page(stable_node, 1, 1);
		if (!page)
			goto out;

		/*
		 * page lock is needed because it's racing with
		 * try_to_unmap_ksm(), etc.
		 */
		lock_page(page);
		hlist_del(&rmap_item->hlist);

		/* Last rmap_item for this vma on the node: drop the node_vma. */
		if (hlist_empty(&node_vma->rmap_hlist)) {
			hlist_del(&node_vma->hlist);
			free_node_vma(node_vma);
		}
		unlock_page(page);

		put_page(page);
		if (hlist_empty(&stable_node->hlist)) {
			/* do NOT call remove_node_from_stable_tree() here,
			 * it's possible for a forked rmap_item not in
			 * stable tree while the in-tree rmap_items were
			 * deleted.
			 */
			uksm_pages_shared--;
		} else
			uksm_pages_sharing--;


		uksm_drop_anon_vma(rmap_item);
	} else if (rmap_item->address & UNSTABLE_FLAG) {
		/* Unstable tree entries are only valid within one hash round. */
		if (rmap_item->hash_round == uksm_hash_round) {

			rb_erase(&rmap_item->node,
				 &rmap_item->tree_node->sub_root);
			if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
				rb_erase(&rmap_item->tree_node->node,
					 &root_unstable_tree);

				free_tree_node(rmap_item->tree_node);
			} else
				rmap_item->tree_node->count--;
		}
		uksm_pages_unshared--;
	}

	rmap_item->address &= PAGE_MASK;
	rmap_item->hash_max = 0;

out:
	cond_resched();		/* we're called from many long loops */
}
  974. +
  975. +static inline int slot_in_uksm(struct vma_slot *slot)
  976. +{
  977. +   return list_empty(&slot->slot_list);
  978. +}
  979. +
  980. +/*
  981. + * Test if the mm is exiting
  982. + */
  983. +static inline bool uksm_test_exit(struct mm_struct *mm)
  984. +{
  985. +   return atomic_read(&mm->mm_users) == 0;
  986. +}
  987. +
/**
 * Try to take the mmap_sem of the mm owning @slot's vma, for reading.
 *
 * Needs to do two things:
 * 1. check whether the slot was moved to the del list;
 * 2. make sure the mmap_sem is only manipulated while the vma is valid.
 *
 * Concern: in some cases this may serialize vma_slot_list_lock waiters
 * further behind sem->wait_lock — can this really be expensive?
 *
 * @return
 * 0: if successfully locked mmap_sem
 * -ENOENT: this slot was moved to del list
 * -EBUSY: vma lock failed
 */
static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	struct rw_semaphore *sem;

	spin_lock(&vma_slot_list_lock);

	/* the slot_list was removed and inited from new list, when it enters
	 * uksm_list. If now it's not empty, then it must be moved to del list
	 */
	if (!slot_in_uksm(slot)) {
		spin_unlock(&vma_slot_list_lock);
		return -ENOENT;
	}

	BUG_ON(slot->pages != vma_pages(slot->vma));
	/* Ok, vma still valid */
	vma = slot->vma;
	mm = vma->vm_mm;
	sem = &mm->mmap_sem;

	if (uksm_test_exit(mm)) {
		spin_unlock(&vma_slot_list_lock);
		return -ENOENT;
	}

	/* trylock only: must never sleep while holding the spinlock */
	if (down_read_trylock(sem)) {
		spin_unlock(&vma_slot_list_lock);
		return 0;
	}

	spin_unlock(&vma_slot_list_lock);
	return -EBUSY;
}
  1038. +
  1039. +static inline unsigned long
  1040. +vma_page_address(struct page *page, struct vm_area_struct *vma)
  1041. +{
  1042. +   pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  1043. +   unsigned long address;
  1044. +
  1045. +   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  1046. +   if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
  1047. +       /* page should be within @vma mapping range */
  1048. +       return -EFAULT;
  1049. +   }
  1050. +   return address;
  1051. +}
  1052. +
  1053. +
/* return 0 on success with the item's mmap_sem locked.
 * On failure the mmap_sem is released (or was never taken) and a
 * negative errno is returned. */
static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
{
	struct mm_struct *mm;
	struct vma_slot *slot = item->slot;
	int err = -EINVAL;

	struct page *page;

	/*
	 * try_down_read_slot_mmap_sem() returns non-zero if the slot
	 * has been removed by uksm_remove_vma().
	 */
	if (try_down_read_slot_mmap_sem(slot))
		return -EBUSY;

	mm = slot->vma->vm_mm;

	if (uksm_test_exit(mm))
		goto failout_up;

	page = item->page;
	rcu_read_lock();
	/* Pin the page so its identity checks below are meaningful. */
	if (!get_page_unless_zero(page)) {
		rcu_read_unlock();
		goto failout_up;
	}

	/* No need to consider huge page here. */
	/* Verify the page is still anon-mapped at the address this
	 * rmap_item was created for. */
	if (item->slot->vma->anon_vma != page_anon_vma(page) ||
	    vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
		/*
		 * TODO:
		 * should we release this item because of its stale page
		 * mapping?
		 */
		put_page(page);
		rcu_read_unlock();
		goto failout_up;
	}
	rcu_read_unlock();
	return 0;

failout_up:
	up_read(&mm->mmap_sem);
	return err;
}
  1101. +
  1102. +/*
  1103. + * What kind of VMA is considered ?
  1104. + */
  1105. +static inline int vma_can_enter(struct vm_area_struct *vma)
  1106. +{
  1107. +   return uksm_flags_can_scan(vma->vm_flags);
  1108. +}
  1109. +
/*
 * Called whenever a fresh new vma is created.  A new vma_slot is
 * created and inserted into a global list.  Must be called after the
 * vma is inserted into its mm.
 */
  1115. +void uksm_vma_add_new(struct vm_area_struct *vma)
  1116. +{
  1117. +   struct vma_slot *slot;
  1118. +
  1119. +   if (!vma_can_enter(vma)) {
  1120. +       vma->uksm_vma_slot = NULL;
  1121. +       return;
  1122. +   }
  1123. +
  1124. +   slot = alloc_vma_slot();
  1125. +   if (!slot) {
  1126. +       vma->uksm_vma_slot = NULL;
  1127. +       return;
  1128. +   }
  1129. +
  1130. +   vma->uksm_vma_slot = slot;
  1131. +   vma->vm_flags |= VM_MERGEABLE;
  1132. +   slot->vma = vma;
  1133. +   slot->mm = vma->vm_mm;
  1134. +   slot->ctime_j = jiffies;
  1135. +   slot->pages = vma_pages(vma);
  1136. +   spin_lock(&vma_slot_list_lock);
  1137. +   list_add_tail(&slot->slot_list, &vma_slot_new);
  1138. +   spin_unlock(&vma_slot_list_lock);
  1139. +}
  1140. +
/*
 * Called after vma is unlinked from its mm.
 * Hands the slot back: either defers freeing to ksmd (if ksmd already
 * owns it) or frees it immediately (if it was never picked up).
 */
void uksm_remove_vma(struct vm_area_struct *vma)
{
	struct vma_slot *slot;

	if (!vma->uksm_vma_slot)
		return;

	slot = vma->uksm_vma_slot;
	spin_lock(&vma_slot_list_lock);
	if (slot_in_uksm(slot)) {
		/**
		 * This slot has been added by ksmd, so move to the del list
		 * waiting ksmd to free it.
		 */
		list_add_tail(&slot->slot_list, &vma_slot_del);
	} else {
		/**
		 * It's still on new list. It's ok to free slot directly.
		 */
		list_del(&slot->slot_list);
		free_vma_slot(slot);
	}
	spin_unlock(&vma_slot_list_lock);
	vma->uksm_vma_slot = NULL;
}
  1169. +
/*   32/3 < they < 32/2 */
/* Avalanche shift amounts for the sampling hash, chosen between one
 * third and one half of the 32-bit word size (see note above). */
#define shiftl 8
#define shiftr 12

/* Mix key[random_nums[index]] for index in [from, to) into 'hash'.
 * NOTE: deliberately not a do{}while(0) block — it relies on 'index',
 * 'pos', 'hash' and 'key' being declared in the caller's scope. */
#define HASH_FROM_TO(from, to)                 \
for (index = from; index < to; index++) {      \
	pos = random_nums[index];           \
	hash += key[pos];               \
	hash += (hash << shiftl);           \
	hash ^= (hash >> shiftr);           \
}


/* Exact inverse of HASH_FROM_TO, unmixing indices (to-1 .. from-1] in
 * reverse order: the two xor-shifts invert `h ^= h >> shiftr` and the
 * sub/add pair is the modular inverse of `h += h << shiftl` (mod 2^32).
 * Same caller-scope requirements as HASH_FROM_TO. */
#define HASH_FROM_DOWN_TO(from, to)            \
for (index = from - 1; index >= to; index--) {     \
	hash ^= (hash >> shiftr);           \
	hash ^= (hash >> (shiftr*2));           \
	hash -= (hash << shiftl);           \
	hash += (hash << (shiftl*2));           \
	pos = random_nums[index];           \
	hash -= key[pos];               \
}
  1192. +
/*
 * The main random sample hash function: hashes 'hash_strength' randomly
 * chosen 32-bit words of the page at @addr (word order given by the
 * random_nums[] permutation).  Strengths above HASH_STRENGTH_FULL wrap
 * around and re-sample from index 0.
 */
static u32 random_sample_hash(void *addr, u32 hash_strength)
{
	u32 hash = 0xdeadbeef;
	int index, pos, loop = hash_strength;	/* used by HASH_FROM_TO */
	u32 *key = (u32 *)addr;

	if (loop > HASH_STRENGTH_FULL)
		loop = HASH_STRENGTH_FULL;

	HASH_FROM_TO(0, loop);

	/* second pass for strengths beyond one full page sampling */
	if (hash_strength > HASH_STRENGTH_FULL) {
		loop = hash_strength - HASH_STRENGTH_FULL;
		HASH_FROM_TO(0, loop);
	}

	return hash;
}
  1214. +
  1215. +
/**
 * It's used when hash strength is adjusted: incrementally converts a
 * hash computed at strength @from into the hash at strength @to, instead
 * of rehashing the whole page.
 *
 * @addr The page's mapped (kernel virtual) address
 * @from The original hash strength
 * @to   The hash strength changed to
 * @hash The hash value generated with "from" hash strength
 *
 * return the hash value
 *
 * Strengths above HASH_STRENGTH_FULL wrap around and re-sample the page
 * from index 0 (mirroring random_sample_hash()), hence the FULL-relative
 * case analysis below.
 */
static u32 delta_hash(void *addr, int from, int to, u32 hash)
{
	u32 *key = (u32 *)addr;
	int index, pos; /* make sure they are int type */

	if (to > from) {
		/* strengthening: mix in the extra samples */
		if (from >= HASH_STRENGTH_FULL) {
			/* both in the wrapped (second) pass */
			from -= HASH_STRENGTH_FULL;
			to -= HASH_STRENGTH_FULL;
			HASH_FROM_TO(from, to);
		} else if (to <= HASH_STRENGTH_FULL) {
			/* both in the first pass */
			HASH_FROM_TO(from, to);
		} else {
			/* crosses FULL: finish first pass, then wrap */
			HASH_FROM_TO(from, HASH_STRENGTH_FULL);
			HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
		}
	} else {
		/* weakening: unmix samples in reverse order */
		if (from <= HASH_STRENGTH_FULL) {
			HASH_FROM_DOWN_TO(from, to);
		} else if (to >= HASH_STRENGTH_FULL) {
			from -= HASH_STRENGTH_FULL;
			to -= HASH_STRENGTH_FULL;
			HASH_FROM_DOWN_TO(from, to);
		} else {
			/* crosses FULL: undo wrapped part, then first pass */
			HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
			HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
		}
	}

	return hash;
}
  1257. +
  1258. +
  1259. +
  1260. +
/* True if adding delta to x would overflow a u64. */
#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))

/**
 * Fold the per-interval counters (rshash_pos, rshash_neg, pages scanned
 * since last call) into the long-term 'benefit' accumulator.  All benefit
 * fields are kept scaled down by 2^benefit.base so they never overflow.
 *
 * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
 * has finished.
 *
 * return 0 if no page has been scanned since last call, 1 otherwise.
 */
static inline int encode_benefit(void)
{
	u64 scanned_delta, pos_delta, neg_delta;
	unsigned long base = benefit.base;

	scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;

	if (!scanned_delta)
		return 0;

	/* bring the deltas to the accumulator's current scale */
	scanned_delta >>= base;
	pos_delta = rshash_pos >> base;
	neg_delta = rshash_neg >> base;

	if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
	    CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
	    CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
		/* halve everything and bump the scale to make room */
		benefit.scanned >>= 1;
		benefit.neg >>= 1;
		benefit.pos >>= 1;
		benefit.base++;
		scanned_delta >>= 1;
		pos_delta >>= 1;
		neg_delta >>= 1;
	}

	benefit.pos += pos_delta;
	benefit.neg += neg_delta;
	benefit.scanned += scanned_delta;

	BUG_ON(!benefit.scanned);

	/* interval counters have been consumed; restart the interval */
	rshash_pos = rshash_neg = 0;
	uksm_pages_scanned_last = uksm_pages_scanned;

	return 1;
}
  1307. +
  1308. +static inline void reset_benefit(void)
  1309. +{
  1310. +   benefit.pos = 0;
  1311. +   benefit.neg = 0;
  1312. +   benefit.base = 0;
  1313. +   benefit.scanned = 0;
  1314. +}
  1315. +
  1316. +static inline void inc_rshash_pos(unsigned long delta)
  1317. +{
  1318. +   if (CAN_OVERFLOW_U64(rshash_pos, delta))
  1319. +       encode_benefit();
  1320. +
  1321. +   rshash_pos += delta;
  1322. +}
  1323. +
  1324. +static inline void inc_rshash_neg(unsigned long delta)
  1325. +{
  1326. +   if (CAN_OVERFLOW_U64(rshash_neg, delta))
  1327. +       encode_benefit();
  1328. +
  1329. +   rshash_neg += delta;
  1330. +}
  1331. +
  1332. +
/*
 * Hash one page at the given strength.  When @cost_accounting is set,
 * the work saved relative to a full-strength hash is credited to
 * rshash_pos (used to tune the adaptive hash strength).
 */
static inline u32 page_hash(struct page *page, unsigned long hash_strength,
			int cost_accounting)
{
	u32 val;
	unsigned long delta;

	void *addr = kmap_atomic(page);

	val = random_sample_hash(addr, hash_strength);
	kunmap_atomic(addr);

	if (cost_accounting) {
		/* savings only exist below full strength */
		if (HASH_STRENGTH_FULL > hash_strength)
			delta = HASH_STRENGTH_FULL - hash_strength;
		else
			delta = 0;

		inc_rshash_pos(delta);
	}

	return val;
}
  1355. +
  1356. +static int memcmp_pages(struct page *page1, struct page *page2,
  1357. +           int cost_accounting)
  1358. +{
  1359. +   char *addr1, *addr2;
  1360. +   int ret;
  1361. +
  1362. +   addr1 = kmap_atomic(page1);
  1363. +   addr2 = kmap_atomic(page2);
  1364. +   ret = memcmp(addr1, addr2, PAGE_SIZE);
  1365. +   kunmap_atomic(addr2);
  1366. +   kunmap_atomic(addr1);
  1367. +
  1368. +   if (cost_accounting)
  1369. +       inc_rshash_neg(memcmp_cost);
  1370. +
  1371. +   return ret;
  1372. +}
  1373. +
  1374. +static inline int pages_identical(struct page *page1, struct page *page2)
  1375. +{
  1376. +   return !memcmp_pages(page1, page2, 0);
  1377. +}
  1378. +
  1379. +static inline int is_page_full_zero(struct page *page)
  1380. +{
  1381. +   char *addr;
  1382. +   int ret;
  1383. +
  1384. +   addr = kmap_atomic(page);
  1385. +   ret = is_full_zero(addr, PAGE_SIZE);
  1386. +   kunmap_atomic(addr);
  1387. +
  1388. +   return ret;
  1389. +}
  1390. +
/*
 * write_protect_page - make the pte mapping @page in @vma read-only and
 * clean so the page content cannot change under the merge that follows.
 *
 * @orig_pte receives the pte value after write-protection; @old_pte, if
 * non-NULL, receives the pte value before any change.
 * Returns 0 on success, -EFAULT if the pte cannot be located or the page
 * looks to be under concurrent I/O.
 *
 * NOTE(review): callers in this file take the page lock first (see
 * try_to_merge_*), which keeps PageSwapCache stable here.
 */
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			  pte_t *orig_pte, pte_t *old_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));
	/* find and lock the pte that maps this page */
	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out;

	if (old_pte)
		*old_pte = *ptep;

	if (pte_write(*ptep) || pte_dirty(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racy and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			/* extra references exist: restore the pte and bail */
			set_pte_at(mm, addr, ptep, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return err;
}
  1449. +
/* Error codes returned by the merge routines below. */
#define MERGE_ERR_PGERR		1 /* the page is invalid, cannot continue */
#define MERGE_ERR_COLLI		2 /* there is a hash collision */
#define MERGE_ERR_COLLI_MAX	3 /* collision at the max hash strength */
#define MERGE_ERR_CHANGED	4 /* the page has changed since last hash */
  1454. +
  1455. +
/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, MERGE_ERR_PGERR on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	pte_t entry;

	unsigned long addr;
	int err = MERGE_ERR_PGERR;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	/* walk down to the pte by hand */
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	BUG_ON(pmd_trans_huge(*pmd));
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	/* pte changed since write_protect_page(): give up */
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	entry = mk_pte(kpage, vma->vm_page_prot);

	/* special treatment is needed for zero_page: it takes no extra
	 * reference and no anon rmap entry */
	if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
				(page_to_pfn(kpage) == zero_pfn))
		entry = pte_mkspecial(entry);
	else {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr);
	}

	set_pte_at_notify(mm, addr, ptep, entry);

	/* drop the old page's mapping and our reference */
	page_remove_rmap(page);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}
  1527. +
  1528. +
/**
 *  Fully hash a page with HASH_STRENGTH_MAX, returning a non-zero hash
 *  value.  A zero hash_max is reserved to indicate that the hash_max
 *  member has not been calculated yet, so zero results are bumped to 1.
 *
 * @page The page needs to be hashed
 * @hash_old The hash value calculated with current hash strength
 *
 * return the new hash value calculated at HASH_STRENGTH_MAX
 */
static inline u32 page_hash_max(struct page *page, u32 hash_old)
{
	u32 hash_max = 0;
	void *addr;

	addr = kmap_atomic(page);
	/* incrementally extend the existing hash instead of rehashing */
	hash_max = delta_hash(addr, hash_strength,
			  HASH_STRENGTH_MAX, hash_old);

	kunmap_atomic(addr);

	if (!hash_max)
		hash_max = 1;

	/* charge the extra sampling work to the negative side */
	inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
	return hash_max;
}
  1556. +
/*
 * We compare the hash again, to ensure that it is really a hash collision
 * instead of being caused by page write.
 * Returns MERGE_ERR_COLLI when the hash still matches (true collision)
 * or MERGE_ERR_CHANGED when the page content has changed since hashing.
 */
static inline int check_collision(struct rmap_item *rmap_item,
			  u32 hash)
{
	int err;
	struct page *page = rmap_item->page;

	/* if this rmap_item has already been hash_maxed, then the collision
	 * must appear in the second-level rbtree search. In this case we check
	 * if its hash_max value has been changed. Otherwise, the collision
	 * happens in the first-level rbtree search, so we check against its
	 * current hash value.
	 */
	if (rmap_item->hash_max) {
		inc_rshash_neg(memcmp_cost);
		inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);

		if (rmap_item->hash_max == page_hash_max(page, hash))
			err = MERGE_ERR_COLLI;
		else
			err = MERGE_ERR_CHANGED;
	} else {
		inc_rshash_neg(memcmp_cost + hash_strength);

		if (page_hash(page, hash_strength, 0) == hash)
			err = MERGE_ERR_COLLI;
		else
			err = MERGE_ERR_CHANGED;
	}

	return err;
}
  1592. +
  1593. +static struct page *page_trans_compound_anon(struct page *page)
  1594. +{
  1595. +   if (PageTransCompound(page)) {
  1596. +       struct page *head = compound_trans_head(page);
  1597. +       /*
  1598. +        * head may actually be splitted and freed from under
  1599. +        * us but it's ok here.
  1600. +        */
  1601. +       if (PageAnon(head))
  1602. +           return head;
  1603. +   }
  1604. +   return NULL;
  1605. +}
  1606. +
  1607. +static int page_trans_compound_anon_split(struct page *page)
  1608. +{
  1609. +   int ret = 0;
  1610. +   struct page *transhuge_head = page_trans_compound_anon(page);
  1611. +   if (transhuge_head) {
  1612. +       /* Get the reference on the head to split it. */
  1613. +       if (get_page_unless_zero(transhuge_head)) {
  1614. +           /*
  1615. +            * Recheck we got the reference while the head
  1616. +            * was still anonymous.
  1617. +            */
  1618. +           if (PageAnon(transhuge_head))
  1619. +               ret = split_huge_page(transhuge_head);
  1620. +           else
  1621. +               /*
  1622. +                * Retry later if split_huge_page run
  1623. +                * from under us.
  1624. +                */
  1625. +               ret = 1;
  1626. +           put_page(transhuge_head);
  1627. +       } else
  1628. +           /* Retry later if split_huge_page run from under us. */
  1629. +           ret = 1;
  1630. +   }
  1631. +   return ret;
  1632. +}
  1633. +
/**
 * Try to merge a rmap_item.page with a kpage in stable node. kpage must
 * already be a ksm page.
 *
 * @return 0 if the pages were merged; otherwise one of the MERGE_ERR_*
 * codes (the header comment's "-EFAULT" predates the MERGE_ERR codes —
 * the code below returns MERGE_ERR_PGERR/COLLI/CHANGED).
 */
static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
				  struct page *kpage, u32 hash)
{
	struct vm_area_struct *vma = rmap_item->slot->vma;
	struct mm_struct *mm = vma->vm_mm;
	pte_t orig_pte = __pte(0);
	int err = MERGE_ERR_PGERR;
	struct page *page;

	if (uksm_test_exit(mm))
		goto out;

	page = rmap_item->page;

	if (page == kpage) { /* ksm page forked */
		err = 0;
		goto out;
	}

	/* huge pages must be split before they can be merged */
	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
		goto out;
	BUG_ON(PageTransCompound(page));

	if (!PageAnon(page) || !PageKsm(kpage))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
		if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
		else
			/* contents differ: collision or changed page? */
			err = check_collision(rmap_item, hash);
	}

	/* keep mlock state consistent when merging into an mlocked vma */
	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

	unlock_page(page);
out:
	return err;
}
  1702. +
  1703. +
  1704. +
/**
 * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
 * to restore a page mapping that has been changed in try_to_merge_two_pages.
 *
 * @orig_pte the pte value to restore
 * @wprt_pte the write-protected pte value we expect to find; if the pte
 *           no longer matches it, someone else already handled the page
 *           and we leave it alone.
 *
 * @return 0 on success.
 */
static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
				pte_t orig_pte, pte_t wprt_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;

	int err = -EFAULT;

	/* walk down to the pte by hand */
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, wprt_pte)) {
		/* already copied, let it be */
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	/*
	 * Still here: while we still hold the ksm page, it does not return
	 * to the free page pool, so there is no way a pte was changed to
	 * another page and came back to this one. And remember that a ksm
	 * page is not reused in do_wp_page(). So it is safe to restore the
	 * original pte.
	 */
	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, orig_pte);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}
  1758. +
/**
 * try_to_merge_two_pages() - take two identical pages and prepare
 * them to be merged into one page(rmap_item->page)
 *
 * @return 0 if we successfully merged two identical pages into
 *         one ksm page. MERGE_ERR_COLLI if it's only a hash collision
 *         search in rbtree. MERGE_ERR_COLLI_MAX if even the max-strength
 *         hashes collide while the pages differ. MERGE_ERR_CHANGED if
 *         rmap_item has been changed since it's hashed.
 *         MERGE_ERR_PGERR otherwise.
 *
 */
static int try_to_merge_two_pages(struct rmap_item *rmap_item,
				  struct rmap_item *tree_rmap_item,
				  u32 hash)
{
	pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
	pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
	struct vm_area_struct *vma1 = rmap_item->slot->vma;
	struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
	struct page *page = rmap_item->page;
	struct page *tree_page = tree_rmap_item->page;
	int err = MERGE_ERR_PGERR;
	struct address_space *saved_mapping;


	if (rmap_item->page == tree_rmap_item->page)
		goto out;

	/* Transparent huge pages cannot be merged as-is: split them first. */
	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
		goto out;
	BUG_ON(PageTransCompound(page));

	if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
		goto out;
	BUG_ON(PageTransCompound(tree_page));

	/* Only anonymous pages are merge candidates. */
	if (!PageAnon(page) || !PageAnon(tree_page))
		goto out;

	/* trylock only: never let the scanner block on a contended page lock */
	if (!trylock_page(page))
		goto out;


	if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
		unlock_page(page);
		goto out;
	}

	/*
	 * While we hold page lock, upgrade page from
	 * PageAnon+anon_vma to PageKsm+NULL stable_node:
	 * stable_tree_insert() will update stable_node.
	 */
	saved_mapping = page->mapping;
	set_page_stable_node(page, NULL);
	mark_page_accessed(page);
	unlock_page(page);

	if (!trylock_page(tree_page))
		goto restore_out;

	if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
		unlock_page(tree_page);
		goto restore_out;
	}

	if (pages_identical(page, tree_page)) {
		err = replace_page(vma2, tree_page, page, wprt_pte2);
		if (err) {
			unlock_page(tree_page);
			goto restore_out;
		}

		/*
		 * Keep mlock accounting consistent: the surviving page must
		 * end up mlocked if the replaced mapping was VM_LOCKED.
		 */
		if ((vma2->vm_flags & VM_LOCKED)) {
			munlock_vma_page(tree_page);
			if (!PageMlocked(page)) {
				unlock_page(tree_page);
				lock_page(page);
				mlock_vma_page(page);
				tree_page = page; /* for final unlock */
			}
		}

		unlock_page(tree_page);

		goto out; /* success */

	} else {
		/* Pages differ although hashes matched: classify why. */
		if (tree_rmap_item->hash_max &&
		    tree_rmap_item->hash_max == rmap_item->hash_max) {
			/* collision even at maximum hash strength */
			err = MERGE_ERR_COLLI_MAX;
		} else if (page_hash(page, hash_strength, 0) ==
		    page_hash(tree_page, hash_strength, 0)) {
			/* genuine collision at current strength: charge the
			 * adaptive-hash benefit accounting */
			inc_rshash_neg(memcmp_cost + hash_strength * 2);
			err = MERGE_ERR_COLLI;
		} else {
			/* page content changed since it was hashed */
			err = MERGE_ERR_CHANGED;
		}

		unlock_page(tree_page);
	}

restore_out:
	/* Merge failed: restore @page's original pte and mapping. */
	lock_page(page);
	if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
				   orig_pte1, wprt_pte1))
		page->mapping = saved_mapping;

	unlock_page(page);
out:
	return err;
}
  1870. +
  1871. +static inline int hash_cmp(u32 new_val, u32 node_val)
  1872. +{
  1873. +   if (new_val > node_val)
  1874. +       return 1;
  1875. +   else if (new_val < node_val)
  1876. +       return -1;
  1877. +   else
  1878. +       return 0;
  1879. +}
  1880. +
  1881. +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
  1882. +{
  1883. +   u32 hash_max = item->hash_max;
  1884. +
  1885. +   if (!hash_max) {
  1886. +       hash_max = page_hash_max(item->page, hash);
  1887. +
  1888. +       item->hash_max = hash_max;
  1889. +   }
  1890. +
  1891. +   return hash_max;
  1892. +}
  1893. +
  1894. +
  1895. +
/**
 * stable_tree_search() - search the stable tree for a page
 *
 * @item:	the rmap_item we are comparing with
 * @hash:	the hash value of this item->page already calculated
 *
 * @return	the page we have found, NULL otherwise. The page returned has
 *		been gotten.
 */
static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
{
	struct rb_node *node = root_stable_treep->rb_node;
	struct tree_node *tree_node;
	unsigned long hash_max;
	struct page *page = item->page;
	struct stable_node *stable_node;

	stable_node = page_stable_node(page);
	if (stable_node) {
		/* ksm page forked, that is
		 * if (PageKsm(page) && !in_stable_tree(rmap_item))
		 * it's actually gotten once outside.
		 */
		get_page(page);
		return page;
	}

	/* First level: walk the rbtree of tree_nodes keyed by @hash. */
	while (node) {
		int cmp;

		tree_node = rb_entry(node, struct tree_node, node);

		cmp = hash_cmp(hash, tree_node->hash);

		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			break;
	}

	if (!node)
		return NULL;

	/* Sole entry in this tree_node: no second-level hash needed. */
	if (tree_node->count == 1) {
		stable_node = rb_entry(tree_node->sub_root.rb_node,
				       struct stable_node, node);
		BUG_ON(!stable_node);

		goto get_page_out;
	}

	/*
	 * ok, we have to search the second
	 * level subtree, hash the page to a
	 * full strength.
	 */
	node = tree_node->sub_root.rb_node;
	BUG_ON(!node);
	hash_max = rmap_item_hash_max(item, hash);

	while (node) {
		int cmp;

		stable_node = rb_entry(node, struct stable_node, node);

		cmp = hash_cmp(hash_max, stable_node->hash_max);

		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			goto get_page_out;
	}

	return NULL;

get_page_out:
	/* Take a reference on the stable page; may return NULL if it is gone. */
	page = get_uksm_page(stable_node, 1, 1);
	return page;
}
  1979. +
/*
 * try_merge_rmap_item() - switch the pte that maps @kpage at @item's
 * address over to @tree_page, provided the pte is still present and
 * still write-protected (which proves the page was not written since
 * we write-protected it, so no memcmp is needed).
 *
 * Returns 1 on success, 0 if the pte was not found or has changed.
 */
static int try_merge_rmap_item(struct rmap_item *item,
			       struct page *kpage,
			       struct page *tree_page)
{
	spinlock_t *ptl;
	pte_t *ptep;
	unsigned long addr;
	struct vm_area_struct *vma = item->slot->vma;

	addr = get_rmap_addr(item);
	/* Find and lock the pte mapping kpage at addr (holds ptl on success). */
	ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
	if (!ptep)
		return 0;

	if (pte_write(*ptep)) {
		/* has changed, abort! */
		pte_unmap_unlock(ptep, ptl);
		return 0;
	}

	/* Move mapping and rmap accounting from kpage to tree_page. */
	get_page(tree_page);
	page_add_anon_rmap(tree_page, vma, addr);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(vma->vm_mm, addr, ptep,
			  mk_pte(tree_page, vma->vm_page_prot));

	page_remove_rmap(kpage);
	put_page(kpage);

	pte_unmap_unlock(ptep, ptl);

	return 1;
}
  2015. +
/**
 * try_merge_with_stable() - when two rmap_items need to be inserted
 * into stable tree, the page was found to be identical to a stable ksm page,
 * this is the last chance we can merge them into one.
 *
 * @item1:	the rmap_item holding the page which we wanted to insert
 *		into stable tree.
 * @item2:	the other rmap_item we found when unstable tree search
 * @kpage:	in/out: the page currently mapped by the two rmap_items;
 *		on success it is replaced by @tree_page (to be unlocked
 *		by the caller)
 * @tree_page:	the page we found identical in stable tree node
 * @success1:	return if item1 is successfully merged
 * @success2:	return if item2 is successfully merged
 */
static void try_merge_with_stable(struct rmap_item *item1,
				  struct rmap_item *item2,
				  struct page **kpage,
				  struct page *tree_page,
				  int *success1, int *success2)
{
	struct vm_area_struct *vma1 = item1->slot->vma;
	struct vm_area_struct *vma2 = item2->slot->vma;
	*success1 = 0;
	*success2 = 0;

	if (unlikely(*kpage == tree_page)) {
		/* I don't think this can really happen */
		printk(KERN_WARNING "UKSM: unexpected condition detected in "
			"try_merge_with_stable() -- *kpage == tree_page !\n");
		*success1 = 1;
		*success2 = 1;
		return;
	}

	if (!PageAnon(*kpage) || !PageKsm(*kpage))
		goto failed;

	if (!trylock_page(tree_page))
		goto failed;

	/* If the oldpage is still ksm and still pointed
	 * to in the right place, and still write protected,
	 * we are confident it's not changed, no need to
	 * memcmp anymore.
	 * be ware, we cannot take nested pte locks,
	 * deadlock risk.
	 */
	if (!try_merge_rmap_item(item1, *kpage, tree_page))
		goto unlock_failed;

	/* ok, then vma2, remind that pte1 already set */
	if (!try_merge_rmap_item(item2, *kpage, tree_page))
		goto success_1;

	*success2 = 1;
success_1:
	*success1 = 1;


	/* Transfer any mlock from the old page to the stable page. */
	if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
	    (*success2 && vma2->vm_flags & VM_LOCKED)) {
		munlock_vma_page(*kpage);
		if (!PageMlocked(tree_page))
			mlock_vma_page(tree_page);
	}

	/*
	 * We do not need oldpage any more in the caller, so can break the lock
	 * now.
	 */
	unlock_page(*kpage);
	*kpage = tree_page; /* Get unlocked outside. */
	return;

unlock_failed:
	unlock_page(tree_page);
failed:
	return;
}
  2094. +
  2095. +static inline void stable_node_hash_max(struct stable_node *node,
  2096. +                    struct page *page, u32 hash)
  2097. +{
  2098. +   u32 hash_max = node->hash_max;
  2099. +
  2100. +   if (!hash_max) {
  2101. +       hash_max = page_hash_max(page, hash);
  2102. +       node->hash_max = hash_max;
  2103. +   }
  2104. +}
  2105. +
  2106. +static inline
  2107. +struct stable_node *new_stable_node(struct tree_node *tree_node,
  2108. +                   struct page *kpage, u32 hash_max)
  2109. +{
  2110. +   struct stable_node *new_stable_node;
  2111. +
  2112. +   new_stable_node = alloc_stable_node();
  2113. +   if (!new_stable_node)
  2114. +       return NULL;
  2115. +
  2116. +   new_stable_node->kpfn = page_to_pfn(kpage);
  2117. +   new_stable_node->hash_max = hash_max;
  2118. +   new_stable_node->tree_node = tree_node;
  2119. +   set_page_stable_node(kpage, new_stable_node);
  2120. +
  2121. +   return new_stable_node;
  2122. +}
  2123. +
  2124. +static inline
  2125. +struct stable_node *first_level_insert(struct tree_node *tree_node,
  2126. +                      struct rmap_item *rmap_item,
  2127. +                      struct rmap_item *tree_rmap_item,
  2128. +                      struct page **kpage, u32 hash,
  2129. +                      int *success1, int *success2)
  2130. +{
  2131. +   int cmp;
  2132. +   struct page *tree_page;
  2133. +   u32 hash_max = 0;
  2134. +   struct stable_node *stable_node, *new_snode;
  2135. +   struct rb_node *parent = NULL, **new;
  2136. +
  2137. +   /* this tree node contains no sub-tree yet */
  2138. +   stable_node = rb_entry(tree_node->sub_root.rb_node,
  2139. +                  struct stable_node, node);
  2140. +
  2141. +   tree_page = get_uksm_page(stable_node, 1, 0);
  2142. +   if (tree_page) {
  2143. +       cmp = memcmp_pages(*kpage, tree_page, 1);
  2144. +       if (!cmp) {
  2145. +           try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
  2146. +                         tree_page, success1, success2);
  2147. +           put_page(tree_page);
  2148. +           if (!*success1 && !*success2)
  2149. +               goto failed;
  2150. +
  2151. +           return stable_node;
  2152. +
  2153. +       } else {
  2154. +           /*
  2155. +            * collision in first level try to create a subtree.
  2156. +            * A new node need to be created.
  2157. +            */
  2158. +           put_page(tree_page);
  2159. +
  2160. +           stable_node_hash_max(stable_node, tree_page,
  2161. +                        tree_node->hash);
  2162. +           hash_max = rmap_item_hash_max(rmap_item, hash);
  2163. +           cmp = hash_cmp(hash_max, stable_node->hash_max);
  2164. +
  2165. +           parent = &stable_node->node;
  2166. +           if (cmp < 0) {
  2167. +               new = &parent->rb_left;
  2168. +           } else if (cmp > 0) {
  2169. +               new = &parent->rb_right;
  2170. +           } else {
  2171. +               goto failed;
  2172. +           }
  2173. +       }
  2174. +
  2175. +   } else {
  2176. +       /* the only stable_node deleted, we reuse its tree_node.
  2177. +        */
  2178. +       parent = NULL;
  2179. +       new = &tree_node->sub_root.rb_node;
  2180. +   }
  2181. +
  2182. +   new_snode = new_stable_node(tree_node, *kpage, hash_max);
  2183. +   if (!new_snode)
  2184. +       goto failed;
  2185. +
  2186. +   rb_link_node(&new_snode->node, parent, new);
  2187. +   rb_insert_color(&new_snode->node, &tree_node->sub_root);
  2188. +   tree_node->count++;
  2189. +   *success1 = *success2 = 1;
  2190. +
  2191. +   return new_snode;
  2192. +
  2193. +failed:
  2194. +   return NULL;
  2195. +}
  2196. +
/*
 * stable_subtree_insert() - insert into the second-level subtree of
 * @tree_node, which is keyed by max-strength hash. Merge with an
 * existing stable_node when the page contents match, otherwise link a
 * new stable_node for *@kpage.
 *
 * Returns the stable_node merged with or newly created, NULL on failure
 * (*@success1/*@success2 report which rmap_items were merged).
 */
static inline
struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
					  struct rmap_item *rmap_item,
					  struct rmap_item *tree_rmap_item,
					  struct page **kpage, u32 hash,
					  int *success1, int *success2)
{
	struct page *tree_page;
	u32 hash_max;
	struct stable_node *stable_node, *new_snode;
	struct rb_node *parent, **new;

research:
	parent = NULL;
	new = &tree_node->sub_root.rb_node;
	BUG_ON(!*new);
	hash_max = rmap_item_hash_max(rmap_item, hash);
	while (*new) {
		int cmp;

		stable_node = rb_entry(*new, struct stable_node, node);

		cmp = hash_cmp(hash_max, stable_node->hash_max);

		if (cmp < 0) {
			parent = *new;
			new = &parent->rb_left;
		} else if (cmp > 0) {
			parent = *new;
			new = &parent->rb_right;
		} else {
			/* hash_max equal: confirm with a full page compare */
			tree_page = get_uksm_page(stable_node, 1, 0);
			if (tree_page) {
				cmp = memcmp_pages(*kpage, tree_page, 1);
				if (!cmp) {
					try_merge_with_stable(rmap_item,
						tree_rmap_item, kpage,
						tree_page, success1, success2);

					put_page(tree_page);
					if (!*success1 && !*success2)
						goto failed;
					/*
					 * successfully merged with a stable
					 * node
					 */
					return stable_node;
				} else {
					put_page(tree_page);
					goto failed;
				}
			} else {
				/*
				 * stable node may be deleted,
				 * and subtree maybe
				 * restructed, cannot
				 * continue, research it.
				 */
				if (tree_node->count) {
					goto research;
				} else {
					/* reuse the tree node*/
					parent = NULL;
					new = &tree_node->sub_root.rb_node;
				}
			}
		}
	}

	/* no identical page found: link a new stable_node for *kpage */
	new_snode = new_stable_node(tree_node, *kpage, hash_max);
	if (!new_snode)
		goto failed;

	rb_link_node(&new_snode->node, parent, new);
	rb_insert_color(&new_snode->node, &tree_node->sub_root);
	tree_node->count++;
	*success1 = *success2 = 1;

	return new_snode;

failed:
	return NULL;
}
  2280. +
  2281. +
/**
 * stable_tree_insert() - try to insert a merged page in unstable tree to
 * the stable tree
 *
 * @kpage:		the page need to be inserted
 * @hash:		the current hash of this page
 * @rmap_item:		the rmap_item being scanned
 * @tree_rmap_item:	the rmap_item found on unstable tree
 * @success1:		return if rmap_item is merged
 * @success2:		return if tree_rmap_item is merged
 *
 * @return		the stable_node on stable tree if at least one
 *			rmap_item is inserted into stable tree, NULL
 *			otherwise.
 */
static struct stable_node *
stable_tree_insert(struct page **kpage, u32 hash,
		   struct rmap_item *rmap_item,
		   struct rmap_item *tree_rmap_item,
		   int *success1, int *success2)
{
	struct rb_node **new = &root_stable_treep->rb_node;
	struct rb_node *parent = NULL;
	struct stable_node *stable_node;
	struct tree_node *tree_node;
	u32 hash_max = 0;

	*success1 = *success2 = 0;

	/* First level: find the tree_node for @hash, or the link slot
	 * where a new one should go. */
	while (*new) {
		int cmp;

		tree_node = rb_entry(*new, struct tree_node, node);

		cmp = hash_cmp(hash, tree_node->hash);

		if (cmp < 0) {
			parent = *new;
			new = &parent->rb_left;
		} else if (cmp > 0) {
			parent = *new;
			new = &parent->rb_right;
		} else
			break;
	}

	if (*new) {
		/* A tree_node with this hash already exists: descend. */
		if (tree_node->count == 1) {
			stable_node = first_level_insert(tree_node, rmap_item,
						tree_rmap_item, kpage,
						hash, success1, success2);
		} else {
			stable_node = stable_subtree_insert(tree_node,
					rmap_item, tree_rmap_item, kpage,
					hash, success1, success2);
		}
	} else {

		/* no tree node found */
		tree_node = alloc_tree_node(stable_tree_node_listp);
		if (!tree_node) {
			stable_node = NULL;
			goto out;
		}

		stable_node = new_stable_node(tree_node, *kpage, hash_max);
		if (!stable_node) {
			free_tree_node(tree_node);
			goto out;
		}

		tree_node->hash = hash;
		rb_link_node(&tree_node->node, parent, new);
		rb_insert_color(&tree_node->node, root_stable_treep);
		parent = NULL;
		new = &tree_node->sub_root.rb_node;

		/* Link the new stable_node as the sole entry of sub_root. */
		rb_link_node(&stable_node->node, parent, new);
		rb_insert_color(&stable_node->node, &tree_node->sub_root);
		tree_node->count++;
		*success1 = *success2 = 1;
	}

out:
	return stable_node;
}
  2368. +
  2369. +
  2370. +/**
  2371. + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
  2372. + *
  2373. + * @return     0 on success, -EBUSY if unable to lock the mmap_sem,
  2374. + *             -EINVAL if the page mapping has been changed.
  2375. + */
  2376. +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
  2377. +{
  2378. +   int err;
  2379. +
  2380. +   err = get_mergeable_page_lock_mmap(tree_rmap_item);
  2381. +
  2382. +   if (err == -EINVAL) {
  2383. +       /* its page map has been changed, remove it */
  2384. +       remove_rmap_item_from_tree(tree_rmap_item);
  2385. +   }
  2386. +
  2387. +   /* The page is gotten and mmap_sem is locked now. */
  2388. +   return err;
  2389. +}
  2390. +
  2391. +
/**
 * unstable_tree_search_insert() - search an unstable tree rmap_item with the
 * same hash value. Get its page and trylock the mmap_sem
 */
static inline
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      u32 hash)

{
	struct rb_node **new = &root_unstable_tree.rb_node;
	struct rb_node *parent = NULL;
	struct tree_node *tree_node;
	u32 hash_max;
	struct rmap_item *tree_rmap_item;

	/* First level: find the tree_node for @hash, or where to link one. */
	while (*new) {
		int cmp;

		tree_node = rb_entry(*new, struct tree_node, node);

		cmp = hash_cmp(hash, tree_node->hash);

		if (cmp < 0) {
			parent = *new;
			new = &parent->rb_left;
		} else if (cmp > 0) {
			parent = *new;
			new = &parent->rb_right;
		} else
			break;
	}

	if (*new) {
		/* got the tree_node */
		if (tree_node->count == 1) {
			tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
						  struct rmap_item, node);
			BUG_ON(!tree_rmap_item);

			goto get_page_out;
		}

		/* well, search the collision subtree */
		new = &tree_node->sub_root.rb_node;
		BUG_ON(!*new);
		hash_max = rmap_item_hash_max(rmap_item, hash);

		while (*new) {
			int cmp;

			tree_rmap_item = rb_entry(*new, struct rmap_item,
						  node);

			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
			parent = *new;
			if (cmp < 0)
				new = &parent->rb_left;
			else if (cmp > 0)
				new = &parent->rb_right;
			else
				goto get_page_out;
		}
	} else {
		/* alloc a new tree_node */
		tree_node = alloc_tree_node(&unstable_tree_node_list);
		if (!tree_node)
			return NULL;

		tree_node->hash = hash;
		rb_link_node(&tree_node->node, parent, new);
		rb_insert_color(&tree_node->node, &root_unstable_tree);
		parent = NULL;
		new = &tree_node->sub_root.rb_node;
	}

	/* did not found even in sub-tree */
	rmap_item->tree_node = tree_node;
	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->hash_round = uksm_hash_round;
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &tree_node->sub_root);

	uksm_pages_unshared++;
	return NULL;

get_page_out:
	/* Same page (forked ksm page): nothing to merge. */
	if (tree_rmap_item->page == rmap_item->page)
		return NULL;

	if (get_tree_rmap_item_page(tree_rmap_item))
		return NULL;

	return tree_rmap_item;
}
  2486. +
  2487. +static void hold_anon_vma(struct rmap_item *rmap_item,
  2488. +             struct anon_vma *anon_vma)
  2489. +{
  2490. +   rmap_item->anon_vma = anon_vma;
  2491. +   get_anon_vma(anon_vma);
  2492. +}
  2493. +
  2494. +
/**
 * stable_tree_append() - append a rmap_item to a stable node. Deduplication
 * ratio statistics is done in this function.
 *
 * The node_vmas on a stable_node's hlist are kept sorted by their slot
 * key, so the walk below can stop at the first key >= ours.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node, int logdedup)
{
	struct node_vma *node_vma = NULL, *new_node_vma;
	struct hlist_node *hlist = NULL, *cont_p = NULL;
	unsigned long key = (unsigned long)rmap_item->slot;
	unsigned long factor = rmap_item->slot->rung->step;

	BUG_ON(!stable_node);
	rmap_item->address |= STABLE_FLAG;

	if (hlist_empty(&stable_node->hlist)) {
		/* first sharer of this stable page */
		uksm_pages_shared++;
		goto node_vma_new;
	} else {
		uksm_pages_sharing++;
	}

	/* Find the insertion position; credit dedup stats to every slot
	 * we pass on the way. */
	hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
		if (node_vma->key >= key)
			break;

		if (logdedup) {
			node_vma->slot->pages_bemerged += factor;
			if (list_empty(&node_vma->slot->dedup_list))
				list_add(&node_vma->slot->dedup_list,
					 &vma_slot_dedup);
		}
	}

	if (node_vma) {
		if (node_vma->key == key) {
			/* this slot is already on the list; remember where
			 * to continue the stats walk from */
			cont_p = hlist->next;
			goto node_vma_ok;
		} else if (node_vma->key > key) {
			cont_p = hlist;
		}
	}

node_vma_new:
	/* no same vma already in node, alloc a new node_vma */
	new_node_vma = alloc_node_vma();
	/* NOTE(review): allocation failure BUGs here -- verify this is
	 * acceptable for the caller's context */
	BUG_ON(!new_node_vma);
	new_node_vma->head = stable_node;
	new_node_vma->slot = rmap_item->slot;

	if (!node_vma) {
		hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
	} else if (node_vma->key != key) {
		/* keep the hlist sorted by key */
		if (node_vma->key < key)
			hlist_add_after(&node_vma->hlist, &new_node_vma->hlist);
		else {
			hlist_add_before(&new_node_vma->hlist,
					 &node_vma->hlist);
		}

	}
	node_vma = new_node_vma;

node_vma_ok: /* ok, ready to add to the list */
	rmap_item->head = node_vma;
	hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
	hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
	if (logdedup) {
		rmap_item->slot->pages_merged++;
		if (cont_p) {
			/* credit the node_vmas that follow ours too */
			hlist_for_each_entry_continue(node_vma,
						      cont_p, hlist) {
				node_vma->slot->pages_bemerged += factor;
				if (list_empty(&node_vma->slot->dedup_list))
					list_add(&node_vma->slot->dedup_list,
						 &vma_slot_dedup);
			}
		}
	}
}
  2576. +
/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 *	put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr, FOLL_GET);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page)) {
			/* a write fault forces COW-break of the ksm page */
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
					      FAULT_FLAG_WRITE);
		} else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but uksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
  2635. +
  2636. +static void break_cow(struct rmap_item *rmap_item)
  2637. +{
  2638. +   struct vm_area_struct *vma = rmap_item->slot->vma;
  2639. +   struct mm_struct *mm = vma->vm_mm;
  2640. +   unsigned long addr = get_rmap_addr(rmap_item);
  2641. +
  2642. +   if (uksm_test_exit(mm))
  2643. +       goto out;
  2644. +
  2645. +   break_ksm(vma, addr);
  2646. +out:
  2647. +   return;
  2648. +}
  2649. +
  2650. +/*
  2651. + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
  2652. + * than check every pte of a given vma, the locking doesn't quite work for
  2653. + * that - an rmap_item is assigned to the stable tree after inserting ksm
  2654. + * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
  2655. + * rmap_items from parent to child at fork time (so as not to waste time
  2656. + * if exit comes before the next scan reaches it).
  2657. + *
  2658. + * Similarly, although we'd like to remove rmap_items (so updating counts
  2659. + * and freeing memory) when unmerging an area, it's easier to leave that
  2660. + * to the next pass of ksmd - consider, for example, how ksmd might be
  2661. + * in cmp_and_merge_page on one of the rmap_items we would be removing.
  2662. + */
  2663. +inline int unmerge_uksm_pages(struct vm_area_struct *vma,
  2664. +             unsigned long start, unsigned long end)
  2665. +{
  2666. +   unsigned long addr;
  2667. +   int err = 0;
  2668. +
  2669. +   for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
  2670. +       if (uksm_test_exit(vma->vm_mm))
  2671. +           break;
  2672. +       if (signal_pending(current))
  2673. +           err = -ERESTARTSYS;
  2674. +       else
  2675. +           err = break_ksm(vma, addr);
  2676. +   }
  2677. +   return err;
  2678. +}
  2679. +
/*
 * inc_uksm_pages_scanned() - bump the global scanned-pages counter.
 *
 * The grand total is kept as pages_scanned_stored << pages_scanned_base
 * plus the raw uksm_pages_scanned counter. When the raw counter
 * saturates at U64_MAX it is folded into the stored pair and restarted
 * from zero.
 */
static inline void inc_uksm_pages_scanned(void)
{
	u64 delta;

	if (uksm_pages_scanned == U64_MAX) {
		encode_benefit();

		/* Scale the raw count down to the stored base. */
		delta = uksm_pages_scanned >> pages_scanned_base;

		/* Halve both sides once so the addition cannot overflow. */
		if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
			pages_scanned_stored >>= 1;
			delta >>= 1;
			pages_scanned_base++;
		}

		pages_scanned_stored += delta;

		uksm_pages_scanned = uksm_pages_scanned_last = 0;
	}

	uksm_pages_scanned++;
}
  2703. +
  2704. +static inline int find_zero_page_hash(int strength, u32 hash)
  2705. +{
  2706. +   return (zero_hash_table[strength] == hash);
  2707. +}
  2708. +
/*
 * cmp_and_merge_zero_page() - try to remap @page, whose hash matched the
 * zero-page hash, to the UKSM zero page.
 *
 * Returns 0 on success, -EFAULT when the page cannot be merged (mm
 * exiting, not anonymous, page lock contention, content changed, ...).
 */
static
int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
{
	struct page *zero_page = empty_uksm_zero_page;
	struct mm_struct *mm = vma->vm_mm;
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (uksm_test_exit(mm))
		goto out;

	/* Split a transparent huge page before merging one subpage. */
	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
		goto out;
	BUG_ON(PageTransCompound(page));

	if (!PageAnon(page))
		goto out;

	if (!trylock_page(page))
		goto out;

	/*
	 * Write-protect first so concurrent writers are stopped; only
	 * then verify the content really is all zero before replacing.
	 */
	if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
		if (is_page_full_zero(page))
			err = replace_page(vma, page, zero_page, orig_pte);
	}

	unlock_page(page);
out:
	return err;
}
  2739. +
/*
 * cmp_and_merge_page() - first see if page can be merged into the stable
 * tree; if not, compare hash to previous and if it's the same, see if page
 * can be inserted into the unstable tree, or merged with a page already there
 * and both transferred to the stable tree.
 *
 * @rmap_item: the reverse mapping into the virtual address of this page
 * @hash: hash value of the page at the current hash strength
 */
static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
{
	struct rmap_item *tree_rmap_item;
	struct page *page;
	struct page *kpage = NULL;
	u32 hash_max;
	int err;
	unsigned int success1, success2;
	struct stable_node *snode;
	int cmp;
	struct rb_node *parent = NULL, **new;

	remove_rmap_item_from_tree(rmap_item);
	page = rmap_item->page;

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(rmap_item, hash);
	if (kpage) {
		err = try_to_merge_with_uksm_page(rmap_item, kpage,
						 hash);
		if (!err) {
			/*
			 * The page was successfully merged, add
			 * its rmap_item to the stable tree.
			 * page lock is needed because it's
			 * racing with try_to_unmap_ksm(), etc.
			 */
			lock_page(kpage);
			snode = page_stable_node(kpage);
			stable_tree_append(rmap_item, snode, 1);
			unlock_page(kpage);
			put_page(kpage);
			return; /* success */
		}
		put_page(kpage);

		/*
		 * If it's a collision and it has been searched in the
		 * sub-rbtree (hash_max != 0), we want to abort, because if
		 * it is successfully merged in the unstable tree, the
		 * collision tends to happen again.
		 */
		if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
			return;
	}

	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, hash);
	if (tree_rmap_item) {
		err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
		/*
		 * As soon as we merge this page, we want to remove the
		 * rmap_item of the page we have merged with from the unstable
		 * tree, and insert it instead as new node in the stable tree.
		 */
		if (!err) {
			kpage = page;
			remove_rmap_item_from_tree(tree_rmap_item);
			lock_page(kpage);
			snode = stable_tree_insert(&kpage, hash,
						   rmap_item, tree_rmap_item,
						   &success1, &success2);

			/*
			 * Do not log dedup for tree item, it's not counted as
			 * scanned in this round.
			 */
			if (success2)
				stable_tree_append(tree_rmap_item, snode, 0);

			/*
			 * The order of these two stable append is important:
			 * we are scanning rmap_item.
			 */
			if (success1)
				stable_tree_append(rmap_item, snode, 1);

			/*
			 * The original kpage may be unlocked inside
			 * stable_tree_insert() already. This page
			 * should be unlocked before doing
			 * break_cow().
			 */
			unlock_page(kpage);

			if (!success1)
				break_cow(rmap_item);

			if (!success2)
				break_cow(tree_rmap_item);

		} else if (err == MERGE_ERR_COLLI) {
			BUG_ON(tree_rmap_item->tree_node->count > 1);

			rmap_item_hash_max(tree_rmap_item,
					   tree_rmap_item->tree_node->hash);

			/*
			 * Hash collision at the primary strength: link this
			 * item into the tree node's max-hash sub-rbtree.
			 */
			hash_max = rmap_item_hash_max(rmap_item, hash);
			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
			parent = &tree_rmap_item->node;
			if (cmp < 0)
				new = &parent->rb_left;
			else if (cmp > 0)
				new = &parent->rb_right;
			else
				goto put_up_out;

			rmap_item->tree_node = tree_rmap_item->tree_node;
			rmap_item->address |= UNSTABLE_FLAG;
			rmap_item->hash_round = uksm_hash_round;
			rb_link_node(&rmap_item->node, parent, new);
			rb_insert_color(&rmap_item->node,
					&tree_rmap_item->tree_node->sub_root);
			rmap_item->tree_node->count++;
		} else {
			/*
			 * either one of the page has changed or they collide
			 * at the max hash, we consider them as ill items.
			 */
			remove_rmap_item_from_tree(tree_rmap_item);
		}
put_up_out:
		put_page(tree_rmap_item->page);
		up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
	}
}
  2875. +
  2876. +
  2877. +
  2878. +
  2879. +static inline unsigned long get_pool_index(struct vma_slot *slot,
  2880. +                      unsigned long index)
  2881. +{
  2882. +   unsigned long pool_index;
  2883. +
  2884. +   pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
  2885. +   if (pool_index >= slot->pool_size)
  2886. +       BUG();
  2887. +   return pool_index;
  2888. +}
  2889. +
  2890. +static inline unsigned long index_page_offset(unsigned long index)
  2891. +{
  2892. +   return offset_in_page(sizeof(struct rmap_list_entry *) * index);
  2893. +}
  2894. +
/*
 * get_rmap_list_entry() - return a kmapped pointer to the entry at
 * @index, allocating its (zeroed) pool page on demand when @need_alloc.
 *
 * On success the pool page is left kmapped; the caller must balance with
 * put_rmap_list_entry(). Returns NULL when the pool page is absent and
 * was not (or could not be) allocated.
 */
static inline
struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
					    unsigned long index, int need_alloc)
{
	unsigned long pool_index;
	struct page *page;
	void *addr;

	pool_index = get_pool_index(slot, index);
	if (!slot->rmap_list_pool[pool_index]) {
		if (!need_alloc)
			return NULL;

		/* Zeroed page, so fresh entries read as "new" (item == NULL). */
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return NULL;

		slot->rmap_list_pool[pool_index] = page;
	}

	addr = kmap(slot->rmap_list_pool[pool_index]);
	addr += index_page_offset(index);

	return addr;
}
  2921. +
  2922. +static inline void put_rmap_list_entry(struct vma_slot *slot,
  2923. +                      unsigned long index)
  2924. +{
  2925. +   unsigned long pool_index;
  2926. +
  2927. +   pool_index = get_pool_index(slot, index);
  2928. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  2929. +   kunmap(slot->rmap_list_pool[pool_index]);
  2930. +}
  2931. +
  2932. +static inline int entry_is_new(struct rmap_list_entry *entry)
  2933. +{
  2934. +   return !entry->item;
  2935. +}
  2936. +
  2937. +static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
  2938. +                       unsigned long index)
  2939. +{
  2940. +   return slot->vma->vm_start + (index << PAGE_SHIFT);
  2941. +}
  2942. +
  2943. +static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
  2944. +{
  2945. +   unsigned long addr;
  2946. +
  2947. +   if (is_addr(entry->addr))
  2948. +       addr = get_clean_addr(entry->addr);
  2949. +   else if (entry->item)
  2950. +       addr = get_rmap_addr(entry->item);
  2951. +   else
  2952. +       BUG();
  2953. +
  2954. +   return addr;
  2955. +}
  2956. +
  2957. +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
  2958. +{
  2959. +   if (is_addr(entry->addr))
  2960. +       return NULL;
  2961. +
  2962. +   return entry->item;
  2963. +}
  2964. +
  2965. +static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
  2966. +                       unsigned long index)
  2967. +{
  2968. +   unsigned long pool_index;
  2969. +
  2970. +   pool_index = get_pool_index(slot, index);
  2971. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  2972. +   slot->pool_counts[pool_index]++;
  2973. +}
  2974. +
  2975. +static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
  2976. +                       unsigned long index)
  2977. +{
  2978. +   unsigned long pool_index;
  2979. +
  2980. +   pool_index = get_pool_index(slot, index);
  2981. +   BUG_ON(!slot->rmap_list_pool[pool_index]);
  2982. +   BUG_ON(!slot->pool_counts[pool_index]);
  2983. +   slot->pool_counts[pool_index]--;
  2984. +}
  2985. +
  2986. +static inline int entry_has_rmap(struct rmap_list_entry *entry)
  2987. +{
  2988. +   return !is_addr(entry->addr) && entry->item;
  2989. +}
  2990. +
/*
 * swap_entries() - exchange two rmap_list entries, keeping their items'
 * entry_index fields and the per-pool-page item counts consistent.
 */
static inline void swap_entries(struct rmap_list_entry *entry1,
				unsigned long index1,
				struct rmap_list_entry *entry2,
				unsigned long index2)
{
	struct rmap_list_entry tmp;

	/* swapping two new entries is meaningless */
	BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));

	tmp = *entry1;
	*entry1 = *entry2;
	*entry2 = tmp;

	if (entry_has_rmap(entry1))
		entry1->item->entry_index = index1;

	if (entry_has_rmap(entry2))
		entry2->item->entry_index = index2;

	/* If an item moved between pool pages, migrate its count too. */
	if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
		inc_rmap_list_pool_count(entry1->item->slot, index1);
		dec_rmap_list_pool_count(entry1->item->slot, index2);
	} else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
		inc_rmap_list_pool_count(entry2->item->slot, index2);
		dec_rmap_list_pool_count(entry2->item->slot, index1);
	}
}
  3019. +
/*
 * free_entry_item() - release the entry's rmap_item (if any): unlink it
 * from the trees, drop the pool count, and store its raw address back
 * into the entry so the page position is not lost.
 */
static inline void free_entry_item(struct rmap_list_entry *entry)
{
	unsigned long index;
	struct rmap_item *item;

	if (!is_addr(entry->addr)) {
		BUG_ON(!entry->item);
		item = entry->item;
		/* Preserve the virtual address before freeing the item. */
		entry->addr = get_rmap_addr(item);
		set_is_addr(entry->addr);
		index = item->entry_index;
		remove_rmap_item_from_tree(item);
		dec_rmap_list_pool_count(item->slot, index);
		free_rmap_item(item);
	}
}
  3036. +
  3037. +static inline int pool_entry_boundary(unsigned long index)
  3038. +{
  3039. +   unsigned long linear_addr;
  3040. +
  3041. +   linear_addr = sizeof(struct rmap_list_entry *) * index;
  3042. +   return index && !offset_in_page(linear_addr);
  3043. +}
  3044. +
  3045. +static inline void try_free_last_pool(struct vma_slot *slot,
  3046. +                     unsigned long index)
  3047. +{
  3048. +   unsigned long pool_index;
  3049. +
  3050. +   pool_index = get_pool_index(slot, index);
  3051. +   if (slot->rmap_list_pool[pool_index] &&
  3052. +       !slot->pool_counts[pool_index]) {
  3053. +       __free_page(slot->rmap_list_pool[pool_index]);
  3054. +       slot->rmap_list_pool[pool_index] = NULL;
  3055. +       slot->flags |= UKSM_SLOT_NEED_SORT;
  3056. +   }
  3057. +
  3058. +}
  3059. +
  3060. +static inline unsigned long vma_item_index(struct vm_area_struct *vma,
  3061. +                      struct rmap_item *item)
  3062. +{
  3063. +   return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
  3064. +}
  3065. +
  3066. +static int within_same_pool(struct vma_slot *slot,
  3067. +               unsigned long i, unsigned long j)
  3068. +{
  3069. +   unsigned long pool_i, pool_j;
  3070. +
  3071. +   pool_i = get_pool_index(slot, i);
  3072. +   pool_j = get_pool_index(slot, j);
  3073. +
  3074. +   return (pool_i == pool_j);
  3075. +}
  3076. +
/*
 * sort_rmap_entry_list() - move every entry that carries an rmap_item to
 * the list index matching its page's index in the VMA, undoing the
 * random permutation of the scan. Pool pages left with no rmap_item are
 * freed at the end and the NEED_SORT flag is cleared.
 */
static void sort_rmap_entry_list(struct vma_slot *slot)
{
	unsigned long i, j;
	struct rmap_list_entry *entry, *swap_entry;

	entry = get_rmap_list_entry(slot, 0, 0);
	for (i = 0; i < slot->pages; ) {

		/* No pool page here: skip a whole page worth of entries. */
		if (!entry)
			goto skip_whole_pool;

		if (entry_is_new(entry))
			goto next_entry;

		/* A stored raw address is stale after sorting; clear it. */
		if (is_addr(entry->addr)) {
			entry->addr = 0;
			goto next_entry;
		}

		j = vma_item_index(slot->vma, entry->item);
		if (j == i)
			goto next_entry;

		/* Swap this item to its home index j, then re-examine i. */
		if (within_same_pool(slot, i, j))
			swap_entry = entry + j - i;
		else
			swap_entry = get_rmap_list_entry(slot, j, 1);

		swap_entries(entry, i, swap_entry, j);
		if (!within_same_pool(slot, i, j))
			put_rmap_list_entry(slot, j);
		continue;

skip_whole_pool:
		i += PAGE_SIZE / sizeof(*entry);
		if (i < slot->pages)
			entry = get_rmap_list_entry(slot, i, 0);
		continue;

next_entry:
		/* Unmap the pool page when stepping across its boundary. */
		if (i >= slot->pages - 1 ||
		    !within_same_pool(slot, i, i + 1)) {
			put_rmap_list_entry(slot, i);
			if (i + 1 < slot->pages)
				entry = get_rmap_list_entry(slot, i + 1, 0);
		} else
			entry++;
		i++;
		continue;
	}

	/* free empty pool entries which contain no rmap_item */
	/* Could be simplified to rely on pool_counts alone once trusted. */
	for (i = 0; i < slot->pool_size; i++) {
		unsigned char has_rmap;
		void *addr;

		if (!slot->rmap_list_pool[i])
			continue;

		has_rmap = 0;
		addr = kmap(slot->rmap_list_pool[i]);
		BUG_ON(!addr);
		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
			entry = (struct rmap_list_entry *)addr + j;
			if (is_addr(entry->addr))
				continue;
			if (!entry->item)
				continue;
			has_rmap = 1;
		}
		kunmap(slot->rmap_list_pool[i]);
		if (!has_rmap) {
			BUG_ON(slot->pool_counts[i]);
			__free_page(slot->rmap_list_pool[i]);
			slot->rmap_list_pool[i] = NULL;
		}
	}

	slot->flags &= ~UKSM_SLOT_NEED_SORT;
}
  3158. +
  3159. +/*
  3160. + * vma_fully_scanned() - if all the pages in this slot have been scanned.
  3161. + */
  3162. +static inline int vma_fully_scanned(struct vma_slot *slot)
  3163. +{
  3164. +   return slot->pages_scanned == slot->pages;
  3165. +}
  3166. +
/**
 * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
 * its random permutation. This function is embedded with the random
 * permutation index management code.
 *
 * Returns the rmap_item for the next page with *@hash set to the page's
 * hash, or NULL when the address holds no suitable anonymous page (the
 * list entry then reverts to storing a raw address).
 */
static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
{
	unsigned long rand_range, addr, swap_index, scan_index;
	struct rmap_item *item = NULL;
	struct rmap_list_entry *scan_entry, *swap_entry = NULL;
	struct page *page;

	scan_index = swap_index = slot->pages_scanned % slot->pages;

	/* Crossing into a new pool page: the previous may be freeable. */
	if (pool_entry_boundary(scan_index))
		try_free_last_pool(slot, scan_index - 1);

	/* At a round boundary, decide whether to re-randomize and sort. */
	if (vma_fully_scanned(slot)) {
		if (slot->flags & UKSM_SLOT_NEED_SORT)
			slot->flags |= UKSM_SLOT_NEED_RERAND;
		else
			slot->flags &= ~UKSM_SLOT_NEED_RERAND;
		if (slot->flags & UKSM_SLOT_NEED_SORT)
			sort_rmap_entry_list(slot);
	}

	scan_entry = get_rmap_list_entry(slot, scan_index, 1);
	if (!scan_entry)
		return NULL;

	if (entry_is_new(scan_entry)) {
		scan_entry->addr = get_index_orig_addr(slot, scan_index);
		set_is_addr(scan_entry->addr);
	}

	/* Pick a random partner index to extend the permutation. */
	if (slot->flags & UKSM_SLOT_NEED_RERAND) {
		rand_range = slot->pages - scan_index;
		BUG_ON(!rand_range);
		swap_index = scan_index + (random32() % rand_range);
	}

	if (swap_index != scan_index) {
		swap_entry = get_rmap_list_entry(slot, swap_index, 1);
		if (entry_is_new(swap_entry)) {
			swap_entry->addr = get_index_orig_addr(slot,
							       swap_index);
			set_is_addr(swap_entry->addr);
		}
		swap_entries(scan_entry, scan_index, swap_entry, swap_index);
	}

	addr = get_entry_address(scan_entry);
	item = get_entry_item(scan_entry);
	BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);

	page = follow_page(slot->vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto nopage;

	if (!PageAnon(page) && !page_trans_compound_anon(page))
		goto putpage;

	/*check is zero_page pfn or uksm_zero_page*/
	if ((page_to_pfn(page) == zero_pfn)
			|| (page_to_pfn(page) == uksm_zero_pfn))
		goto putpage;

	flush_anon_page(slot->vma, page, addr);
	flush_dcache_page(page);

	*hash = page_hash(page, hash_strength, 1);
	inc_uksm_pages_scanned();
	/*if the page content all zero, re-map to zero-page*/
	if (find_zero_page_hash(hash_strength, *hash)) {
		if (!cmp_and_merge_zero_page(slot->vma, page)) {
			slot->pages_merged++;
			__inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
			dec_mm_counter(slot->mm, MM_ANONPAGES);

			/* For full-zero pages, no need to create rmap item */
			goto putpage;
		} else {
			/* Merge failed despite zero hash: penalize hash. */
			inc_rshash_neg(memcmp_cost / 2);
		}
	}

	if (!item) {
		item = alloc_rmap_item();
		if (item) {
			/* It has already been zeroed */
			item->slot = slot;
			item->address = addr;
			item->entry_index = scan_index;
			scan_entry->item = item;
			inc_rmap_list_pool_count(slot, scan_index);
		} else
			goto putpage;
	}

	BUG_ON(item->slot != slot);
	/* the page may have changed */
	item->page = page;
	put_rmap_list_entry(slot, scan_index);
	if (swap_entry)
		put_rmap_list_entry(slot, swap_index);
	return item;

putpage:
	put_page(page);
	page = NULL;
nopage:
	/* no page, store addr back and free rmap_item if possible */
	free_entry_item(scan_entry);
	put_rmap_list_entry(slot, scan_index);
	if (swap_entry)
		put_rmap_list_entry(slot, swap_index);
	return NULL;
}
  3286. +
  3287. +static inline int in_stable_tree(struct rmap_item *rmap_item)
  3288. +{
  3289. +   return rmap_item->address & STABLE_FLAG;
  3290. +}
  3291. +
  3292. +/**
  3293. + * scan_vma_one_page() - scan the next page in a vma_slot. Called with
  3294. + * mmap_sem locked.
  3295. + */
  3296. +static noinline void scan_vma_one_page(struct vma_slot *slot)
  3297. +{
  3298. +   u32 hash;
  3299. +   struct mm_struct *mm;
  3300. +   struct rmap_item *rmap_item = NULL;
  3301. +   struct vm_area_struct *vma = slot->vma;
  3302. +
  3303. +   mm = vma->vm_mm;
  3304. +   BUG_ON(!mm);
  3305. +   BUG_ON(!slot);
  3306. +
  3307. +   rmap_item = get_next_rmap_item(slot, &hash);
  3308. +   if (!rmap_item)
  3309. +       goto out1;
  3310. +
  3311. +   if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
  3312. +       goto out2;
  3313. +
  3314. +   cmp_and_merge_page(rmap_item, hash);
  3315. +out2:
  3316. +   put_page(rmap_item->page);
  3317. +out1:
  3318. +   slot->pages_scanned++;
  3319. +   if (slot->fully_scanned_round != fully_scanned_round)
  3320. +       scanned_virtual_pages++;
  3321. +
  3322. +   if (vma_fully_scanned(slot))
  3323. +       slot->fully_scanned_round = fully_scanned_round;
  3324. +}
  3325. +
  3326. +static inline unsigned long rung_get_pages(struct scan_rung *rung)
  3327. +{
  3328. +   struct slot_tree_node *node;
  3329. +
  3330. +   if (!rung->vma_root.rnode)
  3331. +       return 0;
  3332. +
  3333. +   node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
  3334. +
  3335. +   return node->size;
  3336. +}
  3337. +
/* Lower bound on sampled pages, so per-page time can still be estimated. */
#define RUNG_SAMPLED_MIN	3

/*
 * uksm_calc_rung_step() - compute the sampling step for @rung from its
 * time budget (cover_msecs), CPU @ratio and the measured per-page scan
 * @page_time.
 */
static inline
void uksm_calc_rung_step(struct scan_rung *rung,
			 unsigned long page_time, unsigned long ratio)
{
	unsigned long sampled, pages;

	/* will be fully scanned ? */
	if (!rung->cover_msecs) {
		rung->step = 1;
		return;
	}

	sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
		  * ratio / page_time;

	/*
	 *  Before we finish a scan round and expensive per-round jobs,
	 *  we need to have a chance to estimate the per page time. So
	 *  the sampled number can not be too small.
	 */
	if (sampled < RUNG_SAMPLED_MIN)
		sampled = RUNG_SAMPLED_MIN;

	pages = rung_get_pages(rung);
	if (likely(pages > sampled))
		rung->step = pages / sampled;
	else
		rung->step = 1;
}
  3369. +
  3370. +static inline int step_need_recalc(struct scan_rung *rung)
  3371. +{
  3372. +   unsigned long pages, stepmax;
  3373. +
  3374. +   pages = rung_get_pages(rung);
  3375. +   stepmax = pages / RUNG_SAMPLED_MIN;
  3376. +
  3377. +   return pages && (rung->step > pages ||
  3378. +            (stepmax && rung->step > stepmax));
  3379. +}
  3380. +
/*
 * reset_current_scan() - restart @rung's scan cursor at a randomly
 * offset slot. @finished marks the rung's round as completed;
 * @step_recalc forces a step recalculation.
 */
static inline
void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
{
	struct vma_slot *slot;

	if (finished)
		rung->flags |= UKSM_RUNG_ROUND_FINISHED;

	if (step_recalc || step_need_recalc(rung)) {
		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
		BUG_ON(step_need_recalc(rung));
	}

	/* Random start offset so rounds don't sample the same pages. */
	slot_iter_index = random32() % rung->step;
	BUG_ON(!rung->vma_root.rnode);
	slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
	BUG_ON(!slot);

	rung->current_scan = slot;
	rung->current_offset = slot_iter_index;
}
  3402. +
  3403. +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
  3404. +{
  3405. +   return &slot->rung->vma_root;
  3406. +}
  3407. +
/*
 * advance_current_scan() - move @rung's cursor to the next slot.
 *
 * Return 1 if the cursor wrapped and was reset (round finished), 0
 * otherwise.
 */
static int advance_current_scan(struct scan_rung *rung)
{
	unsigned short n;
	struct vma_slot *slot, *next = NULL;

	BUG_ON(!rung->vma_root.num);

	slot = rung->current_scan;
	/* Carry the step distance remaining in this slot into the next. */
	n = (slot->pages - rung->current_offset) % rung->step;
	slot_iter_index = rung->step - n;
	next = sradix_tree_next(&rung->vma_root, slot->snode,
				slot->sindex, slot_iter);

	if (next) {
		rung->current_offset = slot_iter_index;
		rung->current_scan = next;
		return 0;
	} else {
		reset_current_scan(rung, 1, 0);
		return 1;
	}
}
  3433. +
/*
 * rung_rm_slot() - remove @slot from its rung, moving the rung's scan
 * cursor off the slot first and recalculating the step if needed.
 */
static inline void rung_rm_slot(struct vma_slot *slot)
{
	struct scan_rung *rung = slot->rung;
	struct sradix_tree_root *root;

	if (rung->current_scan == slot)
		advance_current_scan(rung);

	root = slot_get_root(slot);
	sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
	slot->snode = NULL;
	if (step_need_recalc(rung)) {
		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
		BUG_ON(step_need_recalc(rung));
	}

	/* In case advance_current_scan loop back to this slot again */
	if (rung->vma_root.num && rung->current_scan == slot)
		reset_current_scan(slot->rung, 1, 0);
}
  3454. +
/*
 * rung_add_new_slots() - bulk-insert @num fresh slots into @rung.
 * Insertion is expected to succeed (BUG otherwise). If these are the
 * rung's first slots, initialize its scan cursor.
 */
static inline void rung_add_new_slots(struct scan_rung *rung,
			struct vma_slot **slots, unsigned long num)
{
	int err;
	struct vma_slot *slot;
	unsigned long i;
	struct sradix_tree_root *root = &rung->vma_root;

	err = sradix_tree_enter(root, (void **)slots, num);
	BUG_ON(err);

	for (i = 0; i < num; i++) {
		slot = slots[i];
		slot->rung = rung;
		/* New slots must still have pages left to scan. */
		BUG_ON(vma_fully_scanned(slot));
	}

	if (rung->vma_root.num == num)
		reset_current_scan(rung, 0, 1);
}
  3475. +
  3476. +static inline int rung_add_one_slot(struct scan_rung *rung,
  3477. +                    struct vma_slot *slot)
  3478. +{
  3479. +   int err;
  3480. +
  3481. +   err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
  3482. +   if (err)
  3483. +       return err;
  3484. +
  3485. +   slot->rung = rung;
  3486. +   if (rung->vma_root.num == 1)
  3487. +       reset_current_scan(rung, 0, 1);
  3488. +
  3489. +   return 0;
  3490. +}
  3491. +
/*
 * vma_rung_enter() - move @slot into @rung.
 *
 * Return 1 if the slot was deleted from its old rung (i.e. actually
 * moved), 0 if it already belonged to @rung. On insertion failure the
 * slot is put back into its old rung.
 */
static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
{
	struct scan_rung *old_rung = slot->rung;
	int err;

	if (old_rung == rung)
		return 0;

	rung_rm_slot(slot);
	err = rung_add_one_slot(rung, slot);
	if (err) {
		/* Roll back into the old rung rather than lose the slot. */
		err = rung_add_one_slot(old_rung, slot);
		WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
	}

	return 1;
}
  3512. +
  3513. +static inline int vma_rung_up(struct vma_slot *slot)
  3514. +{
  3515. +   struct scan_rung *rung;
  3516. +
  3517. +   rung = slot->rung;
  3518. +   if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
  3519. +       rung++;
  3520. +
  3521. +   return vma_rung_enter(slot, rung);
  3522. +}
  3523. +
  3524. +static inline int vma_rung_down(struct vma_slot *slot)
  3525. +{
  3526. +   struct scan_rung *rung;
  3527. +
  3528. +   rung = slot->rung;
  3529. +   if (slot->rung != &uksm_scan_ladder[0])
  3530. +       rung--;
  3531. +
  3532. +   return vma_rung_enter(slot, rung);
  3533. +}
  3534. +
  3535. +/**
  3536. + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
  3537. + */
  3538. +static unsigned long cal_dedup_ratio(struct vma_slot *slot)
  3539. +{
  3540. +   unsigned long ret;
  3541. +
  3542. +   BUG_ON(slot->pages_scanned == slot->last_scanned);
  3543. +
  3544. +   ret = slot->pages_merged;
  3545. +
  3546. +   /* Thrashing area filtering */
  3547. +   if (ret && uksm_thrash_threshold) {
  3548. +       if (slot->pages_cowed * 100 / slot->pages_merged
  3549. +           > uksm_thrash_threshold) {
  3550. +           ret = 0;
  3551. +       } else {
  3552. +           ret = slot->pages_merged - slot->pages_cowed;
  3553. +       }
  3554. +   }
  3555. +
  3556. +   return ret;
  3557. +}
  3558. +
  3559. +/**
  3560. + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
  3561. + */
  3562. +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
  3563. +{
  3564. +   unsigned long ret;
  3565. +   unsigned long pages_scanned;
  3566. +
  3567. +   pages_scanned = slot->pages_scanned;
  3568. +   if (!pages_scanned) {
  3569. +       if (uksm_thrash_threshold)
  3570. +           return 0;
  3571. +       else
  3572. +           pages_scanned = slot->pages_scanned;
  3573. +   }
  3574. +
  3575. +   ret = slot->pages_bemerged * 100 / pages_scanned;
  3576. +
  3577. +   /* Thrashing area filtering */
  3578. +   if (ret && uksm_thrash_threshold) {
  3579. +       if (slot->pages_cowed * 100 / slot->pages_bemerged
  3580. +           > uksm_thrash_threshold) {
  3581. +           ret = 0;
  3582. +       } else {
  3583. +           ret = slot->pages_bemerged - slot->pages_cowed;
  3584. +       }
  3585. +   }
  3586. +
  3587. +   return ret;
  3588. +}
  3589. +
/**
 * stable_node_reinsert() - When the hash_strength has been adjusted, the
 * stable tree need to be restructured, this is the function re-inserting the
 * stable node.
 *
 * @new_node:        the stable node to (re)insert, hashed with the new
 *                   hash strength
 * @page:            the page @new_node represents
 * @root_treep:      root of the new first-level rb-tree, keyed by @hash
 * @tree_node_listp: list collecting all newly allocated tree_nodes
 * @hash:            first-level hash of @page under the new hash strength
 *
 * On an unresolvable collision (same first-level hash AND same hash_max as
 * an existing node) or on allocation failure, @new_node->tree_node is set
 * to NULL and the node is simply not linked into the new tree.
 */
static inline void stable_node_reinsert(struct stable_node *new_node,
					struct page *page,
					struct rb_root *root_treep,
					struct list_head *tree_node_listp,
					u32 hash)
{
	struct rb_node **new = &root_treep->rb_node;
	struct rb_node *parent = NULL;
	struct stable_node *stable_node;
	struct tree_node *tree_node;
	struct page *tree_page;
	int cmp;

	/* First level: locate the tree_node with this first-level hash. */
	while (*new) {
		int cmp;

		tree_node = rb_entry(*new, struct tree_node, node);

		cmp = hash_cmp(hash, tree_node->hash);

		if (cmp < 0) {
			parent = *new;
			new = &parent->rb_left;
		} else if (cmp > 0) {
			parent = *new;
			new = &parent->rb_right;
		} else
			break;
	}

	if (*new) {
		/* find a stable tree node with same first level hash value */
		stable_node_hash_max(new_node, page, hash);
		if (tree_node->count == 1) {
			/*
			 * Single entry in the sub-tree: its hash_max may not
			 * have been computed yet, so do it now to order the
			 * two nodes.
			 */
			stable_node = rb_entry(tree_node->sub_root.rb_node,
					       struct stable_node, node);
			tree_page = get_uksm_page(stable_node, 1, 0);
			if (tree_page) {
				stable_node_hash_max(stable_node,
						      tree_page, hash);
				put_page(tree_page);

				/* prepare for stable node insertion */

				cmp = hash_cmp(new_node->hash_max,
					       stable_node->hash_max);
				parent = &stable_node->node;
				if (cmp < 0)
					new = &parent->rb_left;
				else if (cmp > 0)
					new = &parent->rb_right;
				else
					goto failed;

				goto add_node;
			} else {
				/* the only stable_node deleted, the tree node
				 * was not deleted.
				 */
				goto tree_node_reuse;
			}
		}

		/* well, search the collision subtree */
		new = &tree_node->sub_root.rb_node;
		parent = NULL;
		BUG_ON(!*new);
		while (*new) {
			int cmp;

			stable_node = rb_entry(*new, struct stable_node, node);

			cmp = hash_cmp(new_node->hash_max,
				       stable_node->hash_max);

			if (cmp < 0) {
				parent = *new;
				new = &parent->rb_left;
			} else if (cmp > 0) {
				parent = *new;
				new = &parent->rb_right;
			} else {
				/* oh, no, still a collision */
				goto failed;
			}
		}

		goto add_node;
	}

	/* no tree node found */
	tree_node = alloc_tree_node(tree_node_listp);
	if (!tree_node) {
		printk(KERN_ERR "UKSM: memory allocation error!\n");
		goto failed;
	} else {
		tree_node->hash = hash;
		rb_link_node(&tree_node->node, parent, new);
		rb_insert_color(&tree_node->node, root_treep);

tree_node_reuse:
		/* prepare for stable node insertion */
		parent = NULL;
		new = &tree_node->sub_root.rb_node;
	}

add_node:
	/* Link @new_node into the chosen sub-tree slot. */
	rb_link_node(&new_node->node, parent, new);
	rb_insert_color(&new_node->node, &tree_node->sub_root);
	new_node->tree_node = tree_node;
	tree_node->count++;
	return;

failed:
	/* This can only happen when two nodes have collided
	 * in two levels.
	 */
	new_node->tree_node = NULL;
	return;
}
  3715. +
  3716. +static inline void free_all_tree_nodes(struct list_head *list)
  3717. +{
  3718. +   struct tree_node *node, *tmp;
  3719. +
  3720. +   list_for_each_entry_safe(node, tmp, list, all_list) {
  3721. +       free_tree_node(node);
  3722. +   }
  3723. +}
  3724. +
/**
 * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
 * strength to the current hash_strength. It re-structures the whole tree.
 *
 * @prev_hash_strength: the strength the existing tree was hashed with
 *
 * Every stable node is re-hashed (incrementally via delta_hash() when its
 * old first-level hash is known) and inserted into the currently inactive
 * one of the two stable-tree roots, which then becomes the active root.
 * All old tree_nodes are freed in one sweep afterwards.
 */
static inline void stable_tree_delta_hash(u32 prev_hash_strength)
{
	struct stable_node *node, *tmp;
	struct rb_root *root_new_treep;
	struct list_head *new_tree_node_listp;

	/* Flip to the currently unused tree root / tree_node list. */
	stable_tree_index = (stable_tree_index + 1) % 2;
	root_new_treep = &root_stable_tree[stable_tree_index];
	new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
	*root_new_treep = RB_ROOT;
	BUG_ON(!list_empty(new_tree_node_listp));

	/*
	 * we need to be safe, the node could be removed by get_uksm_page()
	 */
	list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
		void *addr;
		struct page *node_page;
		u32 hash;

		/*
		 * We are completely re-structuring the stable nodes to a new
		 * stable tree. We don't want to touch the old tree unlinks and
		 * old tree_nodes. The old tree_nodes will be freed at once.
		 */
		node_page = get_uksm_page(node, 0, 0);
		if (!node_page)
			continue;

		if (node->tree_node) {
			hash = node->tree_node->hash;

			addr = kmap_atomic(node_page);

			/* Re-hash incrementally from the old strength. */
			hash = delta_hash(addr, prev_hash_strength,
					  hash_strength, hash);
			kunmap_atomic(addr);
		} else {
			/*
			 * it was not inserted to rbtree due to collision in
			 * last round scan, so there is no old hash to start
			 * from -- compute one from scratch.
			 */
			hash = page_hash(node_page, hash_strength, 0);
		}

		stable_node_reinsert(node, node_page, root_new_treep,
				     new_tree_node_listp, hash);
		put_page(node_page);
	}

	/* Switch the active root/list and drop all old tree_nodes. */
	root_stable_treep = root_new_treep;
	free_all_tree_nodes(stable_tree_node_listp);
	BUG_ON(!list_empty(stable_tree_node_listp));
	stable_tree_node_listp = new_tree_node_listp;
}
  3784. +
  3785. +static inline void inc_hash_strength(unsigned long delta)
  3786. +{
  3787. +   hash_strength += 1 << delta;
  3788. +   if (hash_strength > HASH_STRENGTH_MAX)
  3789. +       hash_strength = HASH_STRENGTH_MAX;
  3790. +}
  3791. +
  3792. +static inline void dec_hash_strength(unsigned long delta)
  3793. +{
  3794. +   unsigned long change = 1 << delta;
  3795. +
  3796. +   if (hash_strength <= change + 1)
  3797. +       hash_strength = 1;
  3798. +   else
  3799. +       hash_strength -= change;
  3800. +}
  3801. +
  3802. +static inline void inc_hash_strength_delta(void)
  3803. +{
  3804. +   hash_strength_delta++;
  3805. +   if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
  3806. +       hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
  3807. +}
  3808. +
  3809. +/*
  3810. +static inline unsigned long get_current_neg_ratio(void)
  3811. +{
  3812. +   if (!rshash_pos || rshash_neg > rshash_pos)
  3813. +       return 100;
  3814. +
  3815. +   return div64_u64(100 * rshash_neg , rshash_pos);
  3816. +}
  3817. +*/
  3818. +
  3819. +static inline unsigned long get_current_neg_ratio(void)
  3820. +{
  3821. +   u64 pos = benefit.pos;
  3822. +   u64 neg = benefit.neg;
  3823. +
  3824. +   if (!neg)
  3825. +       return 0;
  3826. +
  3827. +   if (!pos || neg > pos)
  3828. +       return 100;
  3829. +
  3830. +   if (neg > div64_u64(U64_MAX, 100))
  3831. +       pos = div64_u64(pos, 100);
  3832. +   else
  3833. +       neg *= 100;
  3834. +
  3835. +   return div64_u64(neg, pos);
  3836. +}
  3837. +
  3838. +static inline unsigned long get_current_benefit(void)
  3839. +{
  3840. +   u64 pos = benefit.pos;
  3841. +   u64 neg = benefit.neg;
  3842. +   u64 scanned = benefit.scanned;
  3843. +
  3844. +   if (neg > pos)
  3845. +       return 0;
  3846. +
  3847. +   return div64_u64((pos - neg), scanned);
  3848. +}
  3849. +
/*
 * Judge which way hash_strength should move based on the sampled
 * negative-ratio and benefit statistics. Returns GO_UP, GO_DOWN, OBSCURE
 * (benefit too noisy to judge) or STILL.
 */
static inline int judge_rshash_direction(void)
{
	u64 current_neg_ratio, stable_benefit;
	u64 current_benefit, delta = 0;
	int ret = STILL;

	/* Try to probe a value after the boot, and in case the system
	   are still for a long time. */
	if ((fully_scanned_round & 0xFFULL) == 10) {
		ret = OBSCURE;
		goto out;
	}

	current_neg_ratio = get_current_neg_ratio();

	if (current_neg_ratio == 0) {
		/* Only go down after several consecutive zero-neg rounds. */
		rshash_neg_cont_zero++;
		if (rshash_neg_cont_zero > 2)
			return GO_DOWN;
		else
			return STILL;
	}
	rshash_neg_cont_zero = 0;

	/* Mostly wasted effort: the hash is too weak, strengthen it. */
	if (current_neg_ratio > 90) {
		ret = GO_UP;
		goto out;
	}

	current_benefit = get_current_benefit();
	stable_benefit = rshash_state.stable_benefit;

	/* No baseline to compare against yet. */
	if (!stable_benefit) {
		ret = OBSCURE;
		goto out;
	}

	/* Relative change of benefit vs. the stable baseline, in percent. */
	if (current_benefit > stable_benefit)
		delta = current_benefit - stable_benefit;
	else if (current_benefit < stable_benefit)
		delta = stable_benefit - current_benefit;

	delta = div64_u64(100 * delta , stable_benefit);

	if (delta > 50) {
		/* Large swing: only report OBSCURE when it persists. */
		rshash_cont_obscure++;
		if (rshash_cont_obscure > 2)
			return OBSCURE;
		else
			return STILL;
	}

out:
	rshash_cont_obscure = 0;
	return ret;
}
  3906. +
/**
 * rshash_adjust() - The main function to control the random sampling state
 * machine for hash strength adapting.
 *
 * return true if hash_strength has changed.
 */
static inline int rshash_adjust(void)
{
	unsigned long prev_hash_strength = hash_strength;

	/* Nothing new sampled this round: keep everything as is. */
	if (!encode_benefit())
		return 0;

	switch (rshash_state.state) {
	case RSHASH_STILL:
		/* Stable state: react to the judged benefit direction. */
		switch (judge_rshash_direction()) {
		case GO_UP:
			/* Direction flipped: restart with smallest step. */
			if (rshash_state.pre_direct == GO_DOWN)
				hash_strength_delta = 0;

			inc_hash_strength(hash_strength_delta);
			inc_hash_strength_delta();
			rshash_state.stable_benefit = get_current_benefit();
			rshash_state.pre_direct = GO_UP;
			break;

		case GO_DOWN:
			/* Direction flipped: restart with smallest step. */
			if (rshash_state.pre_direct == GO_UP)
				hash_strength_delta = 0;

			dec_hash_strength(hash_strength_delta);
			inc_hash_strength_delta();
			rshash_state.stable_benefit = get_current_benefit();
			rshash_state.pre_direct = GO_DOWN;
			break;

		case OBSCURE:
			/*
			 * Benefit is inconclusive: remember the current
			 * point and probe downwards first (RSHASH_TRYDOWN).
			 */
			rshash_state.stable_point = hash_strength;
			rshash_state.turn_point_down = hash_strength;
			rshash_state.turn_point_up = hash_strength;
			rshash_state.turn_benefit_down = get_current_benefit();
			rshash_state.turn_benefit_up = get_current_benefit();
			rshash_state.lookup_window_index = 0;
			rshash_state.state = RSHASH_TRYDOWN;
			dec_hash_strength(hash_strength_delta);
			inc_hash_strength_delta();
			break;

		case STILL:
			break;
		default:
			BUG();
		}
		break;

	case RSHASH_TRYDOWN:
		/* Restart the below-baseline counter every 5 rounds. */
		if (rshash_state.lookup_window_index++ % 5 == 0)
			rshash_state.below_count = 0;

		if (get_current_benefit() < rshash_state.stable_benefit)
			rshash_state.below_count++;
		else if (get_current_benefit() >
			 rshash_state.turn_benefit_down) {
			/* New best point found while probing down. */
			rshash_state.turn_point_down = hash_strength;
			rshash_state.turn_benefit_down = get_current_benefit();
		}

		if (rshash_state.below_count >= 3 ||
		    judge_rshash_direction() == GO_UP ||
		    hash_strength == 1) {
			/*
			 * Probing down stopped paying off: return to the
			 * stable point and start probing upwards.
			 */
			hash_strength = rshash_state.stable_point;
			hash_strength_delta = 0;
			inc_hash_strength(hash_strength_delta);
			inc_hash_strength_delta();
			rshash_state.lookup_window_index = 0;
			rshash_state.state = RSHASH_TRYUP;
			hash_strength_delta = 0;
		} else {
			dec_hash_strength(hash_strength_delta);
			inc_hash_strength_delta();
		}
		break;

	case RSHASH_TRYUP:
		/* Restart the below-baseline counter every 5 rounds. */
		if (rshash_state.lookup_window_index++ % 5 == 0)
			rshash_state.below_count = 0;

		if (get_current_benefit() < rshash_state.turn_benefit_down)
			rshash_state.below_count++;
		else if (get_current_benefit() > rshash_state.turn_benefit_up) {
			/* New best point found while probing up. */
			rshash_state.turn_point_up = hash_strength;
			rshash_state.turn_benefit_up = get_current_benefit();
		}

		if (rshash_state.below_count >= 3 ||
		    judge_rshash_direction() == GO_DOWN ||
		    hash_strength == HASH_STRENGTH_MAX) {
			/*
			 * Probing finished: settle on whichever turn point
			 * (up or down) gave the larger benefit.
			 */
			hash_strength = rshash_state.turn_benefit_up >
				rshash_state.turn_benefit_down ?
				rshash_state.turn_point_up :
				rshash_state.turn_point_down;

			rshash_state.state = RSHASH_PRE_STILL;
		} else {
			inc_hash_strength(hash_strength_delta);
			inc_hash_strength_delta();
		}

		break;

	case RSHASH_NEW:
	case RSHASH_PRE_STILL:
		/* (Re)enter the stable state with a fresh baseline. */
		rshash_state.stable_benefit = get_current_benefit();
		rshash_state.state = RSHASH_STILL;
		hash_strength_delta = 0;
		break;
	default:
		BUG();
	}

	/* rshash_neg = rshash_pos = 0; */
	reset_benefit();

	/* A changed strength forces a delta re-hash of the stable tree. */
	if (prev_hash_strength != hash_strength)
		stable_tree_delta_hash(prev_hash_strength);

	return prev_hash_strength != hash_strength;
}
  4035. +
  4036. +/**
  4037. + * round_update_ladder() - The main function to do update of all the
  4038. + * adjustments whenever a scan round is finished.
  4039. + */
  4040. +static noinline void round_update_ladder(void)
  4041. +{
  4042. +   int i;
  4043. +   unsigned long dedup;
  4044. +   struct vma_slot *slot, *tmp_slot;
  4045. +
  4046. +   for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  4047. +       uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
  4048. +   }
  4049. +
  4050. +   list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
  4051. +
  4052. +       /* slot may be rung_rm_slot() when mm exits */
  4053. +       if (slot->snode) {
  4054. +           dedup = cal_dedup_ratio_old(slot);
  4055. +           if (dedup && dedup >= uksm_abundant_threshold)
  4056. +               vma_rung_up(slot);
  4057. +       }
  4058. +
  4059. +       slot->pages_bemerged = 0;
  4060. +       slot->pages_cowed = 0;
  4061. +
  4062. +       list_del_init(&slot->dedup_list);
  4063. +   }
  4064. +}
  4065. +
/*
 * Tear down a vma_slot: detach it from its rung, drop it from the dedup
 * list, free every rmap_item still referenced from its rmap pool pages,
 * free the pool itself, fix up the global page accounting and finally free
 * the slot.
 */
static void uksm_del_vma_slot(struct vma_slot *slot)
{
	int i, j;
	struct rmap_list_entry *entry;

	if (slot->snode) {
		/*
		 * In case it just failed when entering the rung, it's not
		 * necessary.
		 */
		rung_rm_slot(slot);
	}

	if (!list_empty(&slot->dedup_list))
		list_del(&slot->dedup_list);

	if (!slot->rmap_list_pool || !slot->pool_counts) {
		/* In case it OOMed in uksm_vma_enter() */
		goto out;
	}

	/* Walk every pool page and release the rmap items it holds. */
	for (i = 0; i < slot->pool_size; i++) {
		void *addr;

		if (!slot->rmap_list_pool[i])
			continue;

		addr = kmap(slot->rmap_list_pool[i]);
		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
			entry = (struct rmap_list_entry *)addr + j;
			/* Skip encoded-address entries and empty slots. */
			if (is_addr(entry->addr))
				continue;
			if (!entry->item)
				continue;

			remove_rmap_item_from_tree(entry->item);
			free_rmap_item(entry->item);
			slot->pool_counts[i]--;
		}
		/* All live entries of this pool page must be gone now. */
		BUG_ON(slot->pool_counts[i]);
		kunmap(slot->rmap_list_pool[i]);
		__free_page(slot->rmap_list_pool[i]);
	}
	kfree(slot->rmap_list_pool);
	kfree(slot->pool_counts);

out:
	slot->rung = NULL;
	BUG_ON(uksm_pages_total < slot->pages);
	if (slot->flags & UKSM_SLOT_IN_UKSM)
		uksm_pages_total -= slot->pages;

	/*
	 * A fully scanned slot contributed all of its pages to
	 * scanned_virtual_pages; otherwise only the pages actually scanned.
	 */
	if (slot->fully_scanned_round == fully_scanned_round)
		scanned_virtual_pages -= slot->pages;
	else
		scanned_virtual_pages -= slot->pages_scanned;
	free_vma_slot(slot);
}
  4124. +
  4125. +
  4126. +#define SPIN_LOCK_PERIOD   32
  4127. +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
/*
 * Free all slots queued on vma_slot_del. Slots are pulled off the list
 * under vma_slot_list_lock but freed with the lock dropped, in batches of
 * SPIN_LOCK_PERIOD, to bound the spinlock hold time.
 */
static inline void cleanup_vma_slots(void)
{
	struct vma_slot *slot;
	int i;

	i = 0;
	spin_lock(&vma_slot_list_lock);
	while (!list_empty(&vma_slot_del)) {
		slot = list_entry(vma_slot_del.next,
				  struct vma_slot, slot_list);
		list_del(&slot->slot_list);
		cleanup_slots[i++] = slot;
		if (i == SPIN_LOCK_PERIOD) {
			/* Batch full: free outside the lock, then resume. */
			spin_unlock(&vma_slot_list_lock);
			while (--i >= 0)
				uksm_del_vma_slot(cleanup_slots[i]);
			i = 0;
			spin_lock(&vma_slot_list_lock);
		}
	}
	spin_unlock(&vma_slot_list_lock);

	/* Free the final partial batch. */
	while (--i >= 0)
		uksm_del_vma_slot(cleanup_slots[i]);
}
  4153. +
  4154. +/*
  4155. +*exponential moving average formula
  4156. +*/
  4157. +static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
  4158. +{
  4159. +   /*
  4160. +    * For a very high burst, even the ema cannot work well, a false very
  4161. +    * high per-page time estimation can result in feedback in very high
  4162. +    * overhead of context swith and rung update -- this will then lead
  4163. +    * to higher per-paper time, this may not converge.
  4164. +    *
  4165. +    * Instead, we try to approach this value in a binary manner.
  4166. +    */
  4167. +   if (curr > last_ema * 10)
  4168. +       return last_ema * 2;
  4169. +
  4170. +   return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
  4171. +}
  4172. +
  4173. +/*
  4174. + * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to
  4175. + * nanoseconds based on current uksm_sleep_jiffies.
  4176. + */
  4177. +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
  4178. +{
  4179. +   return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
  4180. +       (TIME_RATIO_SCALE - ratio) * ratio;
  4181. +}
  4182. +
  4183. +
  4184. +static inline unsigned long rung_real_ratio(int cpu_time_ratio)
  4185. +{
  4186. +   unsigned long ret;
  4187. +
  4188. +   BUG_ON(!cpu_time_ratio);
  4189. +
  4190. +   if (cpu_time_ratio > 0)
  4191. +       ret = cpu_time_ratio;
  4192. +   else
  4193. +       ret = (unsigned long)(-cpu_time_ratio) *
  4194. +           uksm_max_cpu_percentage / 100UL;
  4195. +
  4196. +   return ret ? ret : 1;
  4197. +}
  4198. +
/*
 * Recompute each rung's pages_to_scan budget for the next period from the
 * EMA of per-page scan cost and the rung's CPU-ratio setting. Also
 * re-probes uksm_sleep_jiffies periodically, stretching it when even the
 * lowest rung could not scan a single page per period.
 */
static noinline void uksm_calc_scan_pages(void)
{
	struct scan_rung *ladder = uksm_scan_ladder;
	unsigned long sleep_usecs, nsecs;
	unsigned long ratio;
	int i;
	unsigned long per_page;

	/*
	 * Reset an implausibly large per-page estimate -- and, every 256
	 * eval rounds, any estimate -- back to the default so it can
	 * re-converge.
	 */
	if (uksm_ema_page_time > 100000 ||
	    (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
		uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;

	per_page = uksm_ema_page_time;
	BUG_ON(!per_page);

	/*
	 * For every 8 eval round, we try to probe a uksm_sleep_jiffies value
	 * based on saved user input.
	 */
	if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
		uksm_sleep_jiffies = uksm_sleep_saved;

	/* We require a rung scan at least 1 page in a period. */
	nsecs = per_page;
	ratio = rung_real_ratio(ladder[0].cpu_ratio);
	if (cpu_ratio_to_nsec(ratio) < nsecs) {
		sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
				/ NSEC_PER_USEC;
		uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
	}

	/* Translate each rung's CPU budget into a page quota. */
	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
		ratio = rung_real_ratio(ladder[i].cpu_ratio);
		ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
					per_page;
		BUG_ON(!ladder[i].pages_to_scan);
		uksm_calc_rung_step(&ladder[i], per_page, ratio);
	}
}
  4238. +
  4239. +/*
  4240. + * From the scan time of this round (ns) to next expected min sleep time
  4241. + * (ms), be careful of the possible overflows. ratio is taken from
  4242. + * rung_real_ratio()
  4243. + */
  4244. +static inline
  4245. +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
  4246. +{
  4247. +   scan_time >>= 20; /* to msec level now */
  4248. +   BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
  4249. +
  4250. +   return (unsigned int) ((unsigned long) scan_time *
  4251. +                  (TIME_RATIO_SCALE - ratio) / ratio);
  4252. +}
  4253. +
  4254. +#define __round_mask(x, y) ((__typeof__(x))((y)-1))
  4255. +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
  4256. +
  4257. +static inline unsigned long vma_pool_size(struct vma_slot *slot)
  4258. +{
  4259. +   return round_up(sizeof(struct rmap_list_entry) * slot->pages,
  4260. +           PAGE_SIZE) >> PAGE_SHIFT;
  4261. +}
  4262. +
  4263. +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
  4264. +{
  4265. +   struct scan_rung *rung;
  4266. +   unsigned long pool_size, i;
  4267. +   struct vma_slot *slot;
  4268. +   int failed;
  4269. +
  4270. +   rung = &uksm_scan_ladder[0];
  4271. +
  4272. +   failed = 0;
  4273. +   for (i = 0; i < num; i++) {
  4274. +       slot = slots[i];
  4275. +
  4276. +       pool_size = vma_pool_size(slot);
  4277. +       slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
  4278. +                          pool_size, GFP_KERNEL);
  4279. +       if (!slot->rmap_list_pool)
  4280. +           break;
  4281. +
  4282. +       slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
  4283. +                       GFP_KERNEL);
  4284. +       if (!slot->pool_counts) {
  4285. +           kfree(slot->rmap_list_pool);
  4286. +           break;
  4287. +       }
  4288. +
  4289. +       slot->pool_size = pool_size;
  4290. +       BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
  4291. +       slot->flags |= UKSM_SLOT_IN_UKSM;
  4292. +       uksm_pages_total += slot->pages;
  4293. +   }
  4294. +
  4295. +   if (i)
  4296. +       rung_add_new_slots(rung, slots, i);
  4297. +
  4298. +   return;
  4299. +}
  4300. +
  4301. +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
  4302. +
/*
 * Drain the vma_slot_new list into the scanner:
 *  - slots whose VMA has no anon_vma yet are parked on a local list and
 *    spliced back onto vma_slot_new at the end (retried next time);
 *  - slots whose VMA can enter UKSM are batched into batch_slots[] and
 *    handed to uksm_vma_enter();
 *  - all other slots are moved to vma_slot_noadd.
 * The list lock is dropped every SPIN_LOCK_PERIOD iterations and whenever
 * a batch is full, to keep latency down.
 */
static void uksm_enter_all_slots(void)
{
	struct vma_slot *slot;
	unsigned long index;
	struct list_head empty_vma_list;
	int i;

	i = 0;
	index = 0;
	INIT_LIST_HEAD(&empty_vma_list);

	spin_lock(&vma_slot_list_lock);
	while (!list_empty(&vma_slot_new)) {
		slot = list_entry(vma_slot_new.next,
				  struct vma_slot, slot_list);

		if (!slot->vma->anon_vma) {
			list_move(&slot->slot_list, &empty_vma_list);
		} else if (vma_can_enter(slot->vma)) {
			batch_slots[index++] = slot;
			list_del_init(&slot->slot_list);
		} else {
			list_move(&slot->slot_list, &vma_slot_noadd);
		}

		if (++i == SPIN_LOCK_PERIOD ||
		    (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
			spin_unlock(&vma_slot_list_lock);

			if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
				/* Flush a full batch outside the lock. */
				uksm_vma_enter(batch_slots, index);
				index = 0;
			}
			i = 0;
			cond_resched();
			spin_lock(&vma_slot_list_lock);
		}
	}

	/* Slots without anon_vma go back onto vma_slot_new for later. */
	list_splice(&empty_vma_list, &vma_slot_new);

	spin_unlock(&vma_slot_list_lock);

	/* Flush the final partial batch. */
	if (index)
		uksm_vma_enter(batch_slots, index);

}
  4350. +
  4351. +static inline int rung_round_finished(struct scan_rung *rung)
  4352. +{
  4353. +   return rung->flags & UKSM_RUNG_ROUND_FINISHED;
  4354. +}
  4355. +
/*
 * Decide where @slot goes after being scanned: back to rung 0 when it is
 * fully scanned (and thrash filtering is enabled), up a rung when its
 * dedup ratio reaches uksm_abundant_threshold, otherwise down a rung.
 * Per-interval counters are reset, and the rung's scan cursor is advanced
 * if the slot stayed on its rung.
 */
static inline void judge_slot(struct vma_slot *slot)
{
	struct scan_rung *rung = slot->rung;
	unsigned long dedup;
	int deleted;

	dedup = cal_dedup_ratio(slot);
	if (vma_fully_scanned(slot) && uksm_thrash_threshold)
		deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
	else if (dedup && dedup >= uksm_abundant_threshold)
		deleted = vma_rung_up(slot);
	else
		deleted = vma_rung_down(slot);

	slot->pages_merged = 0;
	slot->pages_cowed = 0;

	/* A fully scanned slot restarts its scan-progress accounting. */
	if (vma_fully_scanned(slot))
		slot->pages_scanned = 0;

	slot->last_scanned = slot->pages_scanned;

	/* If its deleted in above, then rung was already advanced. */
	if (!deleted)
		advance_current_scan(rung);
}
  4382. +
  4383. +
  4384. +static inline int hash_round_finished(void)
  4385. +{
  4386. +   if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
  4387. +       scanned_virtual_pages = 0;
  4388. +       if (uksm_pages_scanned)
  4389. +           fully_scanned_round++;
  4390. +
  4391. +       return 1;
  4392. +   } else {
  4393. +       return 0;
  4394. +   }
  4395. +}
  4396. +
/* Max number of consecutive pages scanned while holding one mmap_sem. */
#define UKSM_MMSEM_BATCH	5
/**
 * uksm_do_scan()  - the main worker function.
 *
 * Walks every rung of the scan ladder and consumes each rung's
 * pages_to_scan quota, scanning one page per vma_slot per step.  The
 * mmap_sem of the slot's mm is batched across up to UKSM_MMSEM_BATCH
 * pages to cut lock traffic.  After the sweep it evaluates whether a
 * full scan round (and a hash round) completed, updates the EMA of
 * per-page scan cost, and recomputes the sleep interval so CPU usage
 * stays under the configured ratio.
 */
static noinline void uksm_do_scan(void)
{
	struct vma_slot *slot, *iter;
	struct mm_struct *busy_mm;
	unsigned char round_finished, all_rungs_emtpy;
	int i, err, mmsem_batch;
	unsigned long pcost;
	long long delta_exec;
	unsigned long vpages, max_cpu_ratio;
	unsigned long long start_time, end_time, scan_time;
	unsigned int expected_jiffies;

	might_sleep();

	vpages = 0;

	start_time = task_sched_runtime(current);
	max_cpu_ratio = 0;
	mmsem_batch = 0;

	for (i = 0; i < SCAN_LADDER_SIZE;) {
		struct scan_rung *rung = &uksm_scan_ladder[i];
		unsigned long ratio;

		/* Quota for this rung already used up in this batch. */
		if (!rung->pages_to_scan) {
			i++;
			continue;
		}

		/* No slots on this rung; drop its quota and move on. */
		if (!rung->vma_root.num) {
			rung->pages_to_scan = 0;
			i++;
			continue;
		}

		ratio = rung_real_ratio(rung->cpu_ratio);
		if (ratio > max_cpu_ratio)
			max_cpu_ratio = ratio;

		/*
		 * Do not consider rung_round_finished() here, just used up the
		 * rung->pages_to_scan quota.
		 */
		while (rung->pages_to_scan && rung->vma_root.num &&
		       likely(!freezing(current))) {
			int reset = 0;

			slot = rung->current_scan;

			BUG_ON(vma_fully_scanned(slot));

			/*
			 * Within a batch we already hold this mm's mmap_sem
			 * from the previous iteration; skip re-taking it.
			 */
			if (mmsem_batch) {
				err = 0;
			} else {
				err = try_down_read_slot_mmap_sem(slot);
			}

			if (err == -ENOENT) {
rm_slot:
				rung_rm_slot(slot);
				continue;
			}

			busy_mm = slot->mm;

			if (err == -EBUSY) {
				/* skip other vmas on the same mm */
				do {
					reset = advance_current_scan(rung);
					iter = rung->current_scan;
					if (iter->vma->vm_mm != busy_mm)
						break;
				} while (!reset);

				if (iter->vma->vm_mm != busy_mm) {
					continue;
				} else {
					/* scan round finished */
					break;
				}
			}

			BUG_ON(!vma_can_enter(slot->vma));
			if (uksm_test_exit(slot->vma->vm_mm)) {
				/* mm is exiting: drop the sem, remove slot */
				mmsem_batch = 0;
				up_read(&slot->vma->vm_mm->mmap_sem);
				goto rm_slot;
			}

			if (mmsem_batch)
				mmsem_batch--;
			else
				mmsem_batch = UKSM_MMSEM_BATCH;

			/* Ok, we have taken the mmap_sem, ready to scan */
			scan_vma_one_page(slot);
			rung->pages_to_scan--;
			vpages++;

			if (rung->current_offset + rung->step > slot->pages - 1
			    || vma_fully_scanned(slot)) {
				/* Slot finished for this round; re-judge it. */
				up_read(&slot->vma->vm_mm->mmap_sem);
				judge_slot(slot);
				mmsem_batch = 0;
			} else {
				rung->current_offset += rung->step;
				if (!mmsem_batch)
					up_read(&slot->vma->vm_mm->mmap_sem);
			}

			cond_resched();
		}

		/* Drop a sem still held from an unfinished batch. */
		if (mmsem_batch) {
			up_read(&slot->vma->vm_mm->mmap_sem);
			mmsem_batch = 0;
		}

		if (freezing(current))
			break;

		cond_resched();
	}
	end_time = task_sched_runtime(current);
	delta_exec = end_time - start_time;

	if (freezing(current))
		return;

	cleanup_vma_slots();
	uksm_enter_all_slots();

	/* A round is finished only when every non-empty rung finished. */
	round_finished = 1;
	all_rungs_emtpy = 1;
	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
		struct scan_rung *rung = &uksm_scan_ladder[i];

		if (rung->vma_root.num) {
			all_rungs_emtpy = 0;
			if (!rung_round_finished(rung))
				round_finished = 0;
		}
	}

	if (all_rungs_emtpy)
		round_finished = 0;

	if (round_finished) {
		round_update_ladder();
		uksm_eval_round++;

		if (hash_round_finished() && rshash_adjust()) {
			/* Reset the unstable root iff hash strength changed */
			uksm_hash_round++;
			root_unstable_tree = RB_ROOT;
			free_all_tree_nodes(&unstable_tree_node_list);
		}

		/*
		 * A number of pages can hang around indefinitely on per-cpu
		 * pagevecs, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to uksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();
	}

	/* Fold this batch's per-page cost into the moving average. */
	if (vpages && delta_exec > 0) {
		pcost = (unsigned long) delta_exec / vpages;
		if (likely(uksm_ema_page_time))
			uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
		else
			uksm_ema_page_time = pcost;
	}

	uksm_calc_scan_pages();
	uksm_sleep_real = uksm_sleep_jiffies;
	/* in case of radical cpu bursts, apply the upper bound */
	end_time = task_sched_runtime(current);
	if (max_cpu_ratio && end_time > start_time) {
		scan_time = end_time - start_time;
		expected_jiffies = msecs_to_jiffies(
			scan_time_to_sleep(scan_time, max_cpu_ratio));

		if (expected_jiffies > uksm_sleep_real)
			uksm_sleep_real = expected_jiffies;

		/* We have a 1 second up bound for responsiveness. */
		if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
			uksm_sleep_real = msecs_to_jiffies(1000);
	}

	return;
}
  4600. +
/* Nonzero iff the MERGE bit is set in uksm_run, i.e. ksmd may scan. */
static int ksmd_should_run(void)
{
	return uksm_run & UKSM_RUN_MERGE;
}
  4605. +
/*
 * Main loop of the uksmd kernel thread.  While running is enabled it
 * performs one scan batch under uksm_thread_mutex, then sleeps for the
 * adaptively computed uksm_sleep_real interval; when disabled it blocks
 * on uksm_thread_wait until re-enabled or asked to stop.  Freezer-aware
 * so it does not block suspend/hibernate.
 */
static int uksm_scan_thread(void *nothing)
{
	set_freezable();
	set_user_nice(current, 5);	/* run at slightly reduced priority */

	while (!kthread_should_stop()) {
		mutex_lock(&uksm_thread_mutex);
		if (ksmd_should_run()) {
			uksm_do_scan();
		}
		mutex_unlock(&uksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(uksm_sleep_real);
			uksm_sleep_times++;
		} else {
			wait_event_freezable(uksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}
  4630. +
/*
 * page_referenced_ksm() - rmap helper: accumulate the "referenced" state
 * of a KSM @page across all VMAs that map it.
 *
 * Walks the stable node's node_vma list and each node_vma's rmap_items;
 * for every rmap_item the anon_vma chain is traversed so that mappings
 * in forked mms are also visited on a second pass (search_new_forks).
 * @memcg, when non-NULL, restricts the walk to mms in that cgroup.
 * Returns the accumulated referenced count; stops early once the page's
 * mapcount has been fully accounted.
 */
int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
			unsigned long *vm_flags)
{
	struct stable_node *stable_node;
	struct node_vma *node_vma;
	struct rmap_item *rmap_item;
	struct hlist_node *hlist, *rmap_hlist;
	unsigned int mapcount = page_mapcount(page);
	int referenced = 0;
	int search_new_forks = 0;
	unsigned long address;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return 0;


again:
	hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
		hlist_for_each_entry(rmap_item, rmap_hlist,
				     &node_vma->rmap_hlist, hlist) {
			struct anon_vma *anon_vma = rmap_item->anon_vma;
			struct anon_vma_chain *vmac;
			struct vm_area_struct *vma;

			anon_vma_lock(anon_vma);
			list_for_each_entry(vmac, &anon_vma->head,
					    same_anon_vma) {
				vma = vmac->vma;
				address = get_rmap_addr(rmap_item);

				if (address < vma->vm_start ||
				    address >= vma->vm_end)
					continue;
				/*
				 * Initially we examine only the vma which
				 * covers this rmap_item; but later, if there
				 * is still work to do, we examine covering
				 * vmas in other mms: in case they were forked
				 * from the original since ksmd passed.
				 */
				if ((rmap_item->slot->vma == vma) ==
				    search_new_forks)
					continue;

				if (memcg &&
				    !mm_match_cgroup(vma->vm_mm, memcg))
					continue;

				referenced +=
					page_referenced_one(page, vma,
						address, &mapcount, vm_flags);
				if (!search_new_forks || !mapcount)
					break;
			}

			anon_vma_unlock(anon_vma);
			if (!mapcount)
				goto out;
		}
	}
	/* Second pass covers forked mappings not seen on the first. */
	if (!search_new_forks++)
		goto again;
out:
	return referenced;
}
  4700. +
/*
 * try_to_unmap_ksm() - unmap a KSM @page from every VMA that maps it.
 *
 * Mirrors page_referenced_ksm()'s two-pass walk over the stable node's
 * rmap structures.  Returns the last try_to_unmap_one() status; bails
 * out as soon as an unmap fails or the page is no longer mapped.
 */
int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
{
	struct stable_node *stable_node;
	struct node_vma *node_vma;
	struct hlist_node *hlist, *rmap_hlist;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;
	unsigned long address;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return SWAP_FAIL;
again:
	hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
		hlist_for_each_entry(rmap_item, rmap_hlist,
				     &node_vma->rmap_hlist, hlist) {
			struct anon_vma *anon_vma = rmap_item->anon_vma;
			struct anon_vma_chain *vmac;
			struct vm_area_struct *vma;

			anon_vma_lock(anon_vma);
			list_for_each_entry(vmac, &anon_vma->head,
					    same_anon_vma) {
				vma = vmac->vma;
				address = get_rmap_addr(rmap_item);

				if (address < vma->vm_start ||
				    address >= vma->vm_end)
					continue;
				/*
				 * Initially we examine only the vma which
				 * covers this rmap_item; but later, if there
				 * is still work to do, we examine covering
				 * vmas in other mms: in case they were forked
				 * from the original since ksmd passed.
				 */
				if ((rmap_item->slot->vma == vma) ==
				    search_new_forks)
					continue;

				ret = try_to_unmap_one(page, vma,
						       address, flags);
				if (ret != SWAP_AGAIN || !page_mapped(page)) {
					anon_vma_unlock(anon_vma);
					goto out;
				}
			}
			anon_vma_unlock(anon_vma);
		}
	}
	/* Second pass covers forked mappings not seen on the first. */
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}
  4760. +
  4761. +#ifdef CONFIG_MIGRATION
/*
 * rmap_walk_ksm() - generic rmap walk over a KSM @page, used by page
 * migration: invoke @rmap_one on each VMA mapping the page, using the
 * same two-pass (original vma, then forked vmas) strategy as the other
 * KSM rmap walkers.  Aborts on the first non-SWAP_AGAIN return.
 */
int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
		  struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct stable_node *stable_node;
	struct node_vma *node_vma;
	struct hlist_node *hlist, *rmap_hlist;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;
	unsigned long address;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return ret;
again:
	hlist_for_each_entry(node_vma, hlist, &stable_node->hlist, hlist) {
		hlist_for_each_entry(rmap_item, rmap_hlist,
				     &node_vma->rmap_hlist, hlist) {
			struct anon_vma *anon_vma = rmap_item->anon_vma;
			struct anon_vma_chain *vmac;
			struct vm_area_struct *vma;

			anon_vma_lock(anon_vma);
			list_for_each_entry(vmac, &anon_vma->head,
					    same_anon_vma) {
				vma = vmac->vma;
				address = get_rmap_addr(rmap_item);

				if (address < vma->vm_start ||
				    address >= vma->vm_end)
					continue;

				/* First pass: original vma only; second
				 * pass: forked vmas only. */
				if ((rmap_item->slot->vma == vma) ==
				    search_new_forks)
					continue;

				ret = rmap_one(page, vma, address, arg);
				if (ret != SWAP_AGAIN) {
					anon_vma_unlock(anon_vma);
					goto out;
				}
			}
			anon_vma_unlock(anon_vma);
		}
	}
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}
  4815. +
  4816. +/* Common ksm interface but may be specific to uksm */
  4817. +void ksm_migrate_page(struct page *newpage, struct page *oldpage)
  4818. +{
  4819. +   struct stable_node *stable_node;
  4820. +
  4821. +   VM_BUG_ON(!PageLocked(oldpage));
  4822. +   VM_BUG_ON(!PageLocked(newpage));
  4823. +   VM_BUG_ON(newpage->mapping != oldpage->mapping);
  4824. +
  4825. +   stable_node = page_stable_node(newpage);
  4826. +   if (stable_node) {
  4827. +       VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
  4828. +       stable_node->kpfn = page_to_pfn(newpage);
  4829. +   }
  4830. +}
  4831. +#endif /* CONFIG_MIGRATION */
  4832. +
  4833. +#ifdef CONFIG_MEMORY_HOTREMOVE
  4834. +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
  4835. +                        unsigned long end_pfn)
  4836. +{
  4837. +   struct rb_node *node;
  4838. +
  4839. +   for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
  4840. +       struct stable_node *stable_node;
  4841. +
  4842. +       stable_node = rb_entry(node, struct stable_node, node);
  4843. +       if (stable_node->kpfn >= start_pfn &&
  4844. +           stable_node->kpfn < end_pfn)
  4845. +           return stable_node;
  4846. +   }
  4847. +   return NULL;
  4848. +}
  4849. +
/*
 * Memory hot-remove notifier: lock out ksmd for the whole offline
 * operation, then prune stable-tree nodes whose pages were removed.
 */
static int uksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	struct stable_node *stable_node;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Keep it very simple for now: just lock out ksmd and
		 * MADV_UNMERGEABLE while any memory is going offline.
		 * mutex_lock_nested() is necessary because lockdep was alarmed
		 * that here we take uksm_thread_mutex inside notifier chain
		 * mutex, and later take notifier chain mutex inside
		 * uksm_thread_mutex to unlock it.   But that's safe because both
		 * are inside mem_hotplug_mutex.
		 */
		mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree.
		 */
		while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
				    mn->start_pfn + mn->nr_pages)) != NULL)
			remove_node_from_stable_tree(stable_node, 1, 1);
		/* fallthrough */

	case MEM_CANCEL_OFFLINE:
		/* Release the mutex taken at MEM_GOING_OFFLINE. */
		mutex_unlock(&uksm_thread_mutex);
		break;
	}
	return NOTIFY_OK;
}
  4887. +#endif /* CONFIG_MEMORY_HOTREMOVE */
  4888. +
  4889. +#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

/* Declare a read-only (_RO) or read-write (0644) uksm sysfs attribute
 * wired to the matching _name##_show / _name##_store handlers. */
#define UKSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define UKSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)
  4899. +
/* Show the CPU usage cap (in percent) allowed for scanning. */
static ssize_t max_cpu_percentage_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
}

/*
 * Store the CPU usage cap.  Values above 100 are rejected; accepted
 * values are silently clamped into [10, 99].
 */
static ssize_t max_cpu_percentage_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long max_cpu_percentage;
	int err;

	err = strict_strtoul(buf, 10, &max_cpu_percentage);
	if (err || max_cpu_percentage > 100)
		return -EINVAL;

	if (max_cpu_percentage == 100)
		max_cpu_percentage = 99;
	else if (max_cpu_percentage < 10)
		max_cpu_percentage = 10;

	uksm_max_cpu_percentage = max_cpu_percentage;

	return count;
}
UKSM_ATTR(max_cpu_percentage);
  4927. +
/* Show the configured inter-batch sleep interval in milliseconds. */
static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
}

/*
 * Store the sleep interval (max MSEC_PER_SEC ms); also refreshes the
 * saved copy used elsewhere to restore the user's setting.
 */
static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > MSEC_PER_SEC)
		return -EINVAL;

	uksm_sleep_jiffies = msecs_to_jiffies(msecs);
	uksm_sleep_saved = uksm_sleep_jiffies;

	return count;
}
UKSM_ATTR(sleep_millisecs);
  4951. +
  4952. +
/*
 * List every CPU governor preset name, wrapping the currently active
 * one in square brackets.
 */
static ssize_t cpu_governor_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
	int i;

	buf[0] = '\0';
	for (i = 0; i < n ; i++) {
		if (uksm_cpu_governor == i)
			strcat(buf, "[");

		strcat(buf, uksm_cpu_governor_str[i]);

		if (uksm_cpu_governor == i)
			strcat(buf, "]");

		strcat(buf, " ");
	}
	strcat(buf, "\n");

	return strlen(buf);
}
  4975. +
  4976. +static inline void init_performance_values(void)
  4977. +{
  4978. +   int i;
  4979. +   struct scan_rung *rung;
  4980. +   struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
  4981. +
  4982. +
  4983. +   for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  4984. +       rung = uksm_scan_ladder + i;
  4985. +       rung->cpu_ratio = preset->cpu_ratio[i];
  4986. +       rung->cover_msecs = preset->cover_msecs[i];
  4987. +   }
  4988. +
  4989. +   uksm_max_cpu_percentage = preset->max_cpu;
  4990. +}
  4991. +
/*
 * Select a CPU governor preset by name (prefix match against
 * uksm_cpu_governor_str) and re-apply its performance values.
 */
static ssize_t cpu_governor_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);

	/* Scan presets from last to first; stop at first matching name. */
	for (n--; n >=0 ; n--) {
		if (!strncmp(buf, uksm_cpu_governor_str[n],
			     strlen(uksm_cpu_governor_str[n])))
			break;
	}

	if (n < 0)
		return -EINVAL;
	else
		uksm_cpu_governor = n;

	init_performance_values();

	return count;
}
UKSM_ATTR(cpu_governor);
  5014. +
/* Show the current run flags (0 = stopped, UKSM_RUN_MERGE = scanning). */
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%u\n", uksm_run);
}

/*
 * Start or stop ksmd.  Only values up to UKSM_RUN_MERGE are accepted;
 * the flag change is serialized against a scan batch in progress via
 * uksm_thread_mutex, and the scanner is woken when merging is enabled.
 */
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > UKSM_RUN_MERGE)
		return -EINVAL;

	mutex_lock(&uksm_thread_mutex);
	if (uksm_run != flags) {
		uksm_run = flags;
	}
	mutex_unlock(&uksm_thread_mutex);

	if (flags & UKSM_RUN_MERGE)
		wake_up_interruptible(&uksm_thread_wait);

	return count;
}
UKSM_ATTR(run);
  5045. +
/* Show the dedup-ratio threshold used to judge a VMA as "abundant". */
static ssize_t abundant_threshold_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", uksm_abundant_threshold);
}

/*
 * Store the abundant threshold (0-99); judge_slot() promotes a slot up
 * a rung when its dedup ratio reaches this value.
 */
static ssize_t abundant_threshold_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > 99)
		return -EINVAL;

	uksm_abundant_threshold = flags;

	return count;
}
UKSM_ATTR(abundant_threshold);
  5068. +
/* Show the thrashing-detection threshold (percent). */
static ssize_t thrash_threshold_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", uksm_thrash_threshold);
}

/* Store the thrashing threshold; valid range is 0-99. */
static ssize_t thrash_threshold_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > 99)
		return -EINVAL;

	uksm_thrash_threshold = flags;

	return count;
}
UKSM_ATTR(thrash_threshold);
  5091. +
/*
 * Print each rung's cpu_ratio.  Positive values are printed directly;
 * non-positive values encode a fraction of TIME_RATIO_SCALE and are
 * rendered as "MAX/x".
 */
static ssize_t cpu_ratios_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	int i, size;
	struct scan_rung *rung;
	char *p = buf;

	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
		rung = &uksm_scan_ladder[i];

		if (rung->cpu_ratio > 0)
			size = sprintf(p, "%d ", rung->cpu_ratio);
		else
			size = sprintf(p, "MAX/%d ",
					TIME_RATIO_SCALE / -rung->cpu_ratio);

		p += size;
	}

	*p++ = '\n';
	*p = '\0';

	return p - buf;
}
  5116. +
  5117. +static ssize_t cpu_ratios_store(struct kobject *kobj,
  5118. +                     struct kobj_attribute *attr,
  5119. +                     const char *buf, size_t count)
  5120. +{
  5121. +   int i, cpuratios[SCAN_LADDER_SIZE], err;
  5122. +   unsigned long value;
  5123. +   struct scan_rung *rung;
  5124. +   char *p, *end = NULL;
  5125. +
  5126. +   p = kzalloc(count, GFP_KERNEL);
  5127. +   if (!p)
  5128. +       return -ENOMEM;
  5129. +
  5130. +   memcpy(p, buf, count);
  5131. +
  5132. +   for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5133. +       if (i != SCAN_LADDER_SIZE -1) {
  5134. +           end = strchr(p, ' ');
  5135. +           if (!end)
  5136. +               return -EINVAL;
  5137. +
  5138. +           *end = '\0';
  5139. +       }
  5140. +
  5141. +       if (strstr(p, "MAX/")) {
  5142. +           p = strchr(p, '/') + 1;
  5143. +           err = strict_strtoul(p, 10, &value);
  5144. +           if (err || value > TIME_RATIO_SCALE || !value)
  5145. +               return -EINVAL;
  5146. +
  5147. +           cpuratios[i] = - (int) (TIME_RATIO_SCALE / value);
  5148. +       } else {
  5149. +           err = strict_strtoul(p, 10, &value);
  5150. +           if (err || value > TIME_RATIO_SCALE || !value)
  5151. +               return -EINVAL;
  5152. +
  5153. +           cpuratios[i] = value;
  5154. +       }
  5155. +
  5156. +       p = end + 1;
  5157. +   }
  5158. +
  5159. +   for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5160. +       rung = &uksm_scan_ladder[i];
  5161. +
  5162. +       rung->cpu_ratio = cpuratios[i];
  5163. +   }
  5164. +
  5165. +   return count;
  5166. +}
  5167. +UKSM_ATTR(cpu_ratios);
  5168. +
/* Print each rung's evaluation interval (cover_msecs), space-separated. */
static ssize_t eval_intervals_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	int i, size;
	struct scan_rung *rung;
	char *p = buf;

	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
		rung = &uksm_scan_ladder[i];
		size = sprintf(p, "%u ", rung->cover_msecs);
		p += size;
	}

	*p++ = '\n';
	*p = '\0';

	return p - buf;
}
  5187. +
  5188. +static ssize_t eval_intervals_store(struct kobject *kobj,
  5189. +                     struct kobj_attribute *attr,
  5190. +                     const char *buf, size_t count)
  5191. +{
  5192. +   int i, err;
  5193. +   unsigned long values[SCAN_LADDER_SIZE];
  5194. +   struct scan_rung *rung;
  5195. +   char *p, *end = NULL;
  5196. +
  5197. +   p = kzalloc(count, GFP_KERNEL);
  5198. +   if (!p)
  5199. +       return -ENOMEM;
  5200. +
  5201. +   memcpy(p, buf, count);
  5202. +
  5203. +   for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5204. +       if (i != SCAN_LADDER_SIZE -1) {
  5205. +           end = strchr(p, ' ');
  5206. +           if (!end)
  5207. +               return -EINVAL;
  5208. +
  5209. +           *end = '\0';
  5210. +       }
  5211. +
  5212. +       err = strict_strtoul(p, 10, &values[i]);
  5213. +       if (err)
  5214. +           return -EINVAL;
  5215. +
  5216. +       p = end + 1;
  5217. +   }
  5218. +
  5219. +   for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5220. +       rung = &uksm_scan_ladder[i];
  5221. +
  5222. +       rung->cover_msecs = values[i];
  5223. +   }
  5224. +
  5225. +   return count;
  5226. +}
  5227. +UKSM_ATTR(eval_intervals);
  5228. +
/* Exponential moving average of per-page scan cost. */
static ssize_t ema_per_page_time_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", uksm_ema_page_time);
}
UKSM_ATTR_RO(ema_per_page_time);

/* Number of shared (stable) pages. */
static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", uksm_pages_shared);
}
UKSM_ATTR_RO(pages_shared);

/* Number of pages mapped to a shared page. */
static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", uksm_pages_sharing);
}
UKSM_ATTR_RO(pages_sharing);

/* Number of scanned pages not (yet) merged. */
static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", uksm_pages_unshared);
}
UKSM_ATTR_RO(pages_unshared);

/* Number of completed full scan rounds. */
static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%llu\n", fully_scanned_round);
}
UKSM_ATTR_RO(full_scans);

/*
 * Total pages scanned, reported as "value * 2^base" when the true
 * counter has overflowed into the stored/base representation or exceeds
 * what an unsigned long can hold (only possible on 32-bit).
 */
static ssize_t pages_scanned_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	unsigned long base = 0;
	u64 delta, ret;

	if (pages_scanned_stored) {
		base = pages_scanned_base;
		ret = pages_scanned_stored;
		delta = uksm_pages_scanned >> base;
		if (CAN_OVERFLOW_U64(ret, delta)) {
			/* Halve both and bump the exponent to stay in u64. */
			ret >>= 1;
			delta >>= 1;
			base++;
			ret += delta;
		}
	} else {
		ret = uksm_pages_scanned;
	}

	/* Shift down until the value fits an unsigned long (32-bit only). */
	while (ret > ULONG_MAX) {
		ret >>= 1;
		base++;
	}

	if (base)
		return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
	else
		return sprintf(buf, "%lu\n", (unsigned long)ret);
}
UKSM_ATTR_RO(pages_scanned);

/* Current sample-hash strength. */
static ssize_t hash_strength_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", hash_strength);
}
UKSM_ATTR_RO(hash_strength);

/* How many times the scanner thread has gone to sleep. */
static ssize_t sleep_times_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%llu\n", uksm_sleep_times);
}
UKSM_ATTR_RO(sleep_times);
  5309. +
  5310. +
/*
 * All attributes exported under /sys/kernel/mm/uksm/.  Each *_attr object
 * is generated by the UKSM_ATTR()/UKSM_ATTR_RO() macros next to its
 * show/store handler above.  The array must stay NULL-terminated for
 * sysfs_create_group().
 */
static struct attribute *uksm_attrs[] = {
	&max_cpu_percentage_attr.attr,
	&sleep_millisecs_attr.attr,
	&cpu_governor_attr.attr,
	&run_attr.attr,
	&ema_per_page_time_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&full_scans_attr.attr,
	&pages_scanned_attr.attr,
	&hash_strength_attr.attr,
	&sleep_times_attr.attr,
	&thrash_threshold_attr.attr,
	&abundant_threshold_attr.attr,
	&cpu_ratios_attr.attr,
	&eval_intervals_attr.attr,
	NULL,
};
  5330. +
/* Groups the attributes above into a "uksm" subdirectory under mm_kobj. */
static struct attribute_group uksm_attr_group = {
	.attrs = uksm_attrs,
	.name = "uksm",
};
#endif /* CONFIG_SYSFS */
  5336. +
  5337. +static inline void init_scan_ladder(void)
  5338. +{
  5339. +   int i;
  5340. +   struct scan_rung *rung;
  5341. +
  5342. +   for (i = 0; i < SCAN_LADDER_SIZE; i++) {
  5343. +       rung = uksm_scan_ladder + i;
  5344. +       slot_tree_init_root(&rung->vma_root);
  5345. +   }
  5346. +
  5347. +   init_performance_values();
  5348. +   uksm_calc_scan_pages();
  5349. +}
  5350. +
  5351. +static inline int cal_positive_negative_costs(void)
  5352. +{
  5353. +   struct page *p1, *p2;
  5354. +   unsigned char *addr1, *addr2;
  5355. +   unsigned long i, time_start, hash_cost;
  5356. +   unsigned long loopnum = 0;
  5357. +
  5358. +   /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
  5359. +   volatile u32 hash;
  5360. +   volatile int ret;
  5361. +
  5362. +   p1 = alloc_page(GFP_KERNEL);
  5363. +   if (!p1)
  5364. +       return -ENOMEM;
  5365. +
  5366. +   p2 = alloc_page(GFP_KERNEL);
  5367. +   if (!p2)
  5368. +       return -ENOMEM;
  5369. +
  5370. +   addr1 = kmap_atomic(p1);
  5371. +   addr2 = kmap_atomic(p2);
  5372. +   memset(addr1, random32(), PAGE_SIZE);
  5373. +   memcpy(addr2, addr1, PAGE_SIZE);
  5374. +
  5375. +   /* make sure that the two pages differ in last byte */
  5376. +   addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
  5377. +   kunmap_atomic(addr2);
  5378. +   kunmap_atomic(addr1);
  5379. +
  5380. +   time_start = jiffies;
  5381. +   while (jiffies - time_start < 100) {
  5382. +       for (i = 0; i < 100; i++)
  5383. +           hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
  5384. +       loopnum += 100;
  5385. +   }
  5386. +   hash_cost = (jiffies - time_start);
  5387. +
  5388. +   time_start = jiffies;
  5389. +   for (i = 0; i < loopnum; i++)
  5390. +       ret = pages_identical(p1, p2);
  5391. +   memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
  5392. +   memcmp_cost /= hash_cost;
  5393. +   printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
  5394. +            "hash=%u cmp_ret=%d.\n",
  5395. +          memcmp_cost, hash, ret);
  5396. +
  5397. +   __free_page(p1);
  5398. +   __free_page(p2);
  5399. +   return 0;
  5400. +}
  5401. +
  5402. +static int init_zeropage_hash_table(void)
  5403. +{
  5404. +   struct page *page;
  5405. +   char *addr;
  5406. +   int i;
  5407. +
  5408. +   page = alloc_page(GFP_KERNEL);
  5409. +   if (!page)
  5410. +       return -ENOMEM;
  5411. +
  5412. +   addr = kmap_atomic(page);
  5413. +   memset(addr, 0, PAGE_SIZE);
  5414. +   kunmap_atomic(addr);
  5415. +
  5416. +   zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
  5417. +       GFP_KERNEL);
  5418. +   if (!zero_hash_table)
  5419. +       return -ENOMEM;
  5420. +
  5421. +   for (i = 0; i < HASH_STRENGTH_MAX; i++)
  5422. +       zero_hash_table[i] = page_hash(page, i, 0);
  5423. +
  5424. +   __free_page(page);
  5425. +
  5426. +   return 0;
  5427. +}
  5428. +
  5429. +static inline int init_random_sampling(void)
  5430. +{
  5431. +   unsigned long i;
  5432. +   random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
  5433. +   if (!random_nums)
  5434. +       return -ENOMEM;
  5435. +
  5436. +   for (i = 0; i < HASH_STRENGTH_FULL; i++)
  5437. +       random_nums[i] = i;
  5438. +
  5439. +   for (i = 0; i < HASH_STRENGTH_FULL; i++) {
  5440. +       unsigned long rand_range, swap_index, tmp;
  5441. +
  5442. +       rand_range = HASH_STRENGTH_FULL - i;
  5443. +       swap_index = i + random32() % rand_range;
  5444. +       tmp = random_nums[i];
  5445. +       random_nums[i] =  random_nums[swap_index];
  5446. +       random_nums[swap_index] = tmp;
  5447. +   }
  5448. +
  5449. +   rshash_state.state = RSHASH_NEW;
  5450. +   rshash_state.below_count = 0;
  5451. +   rshash_state.lookup_window_index = 0;
  5452. +
  5453. +   return cal_positive_negative_costs();
  5454. +}
  5455. +
  5456. +static int __init uksm_slab_init(void)
  5457. +{
  5458. +   rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
  5459. +   if (!rmap_item_cache)
  5460. +       goto out;
  5461. +
  5462. +   stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
  5463. +   if (!stable_node_cache)
  5464. +       goto out_free1;
  5465. +
  5466. +   node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
  5467. +   if (!node_vma_cache)
  5468. +       goto out_free2;
  5469. +
  5470. +   vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
  5471. +   if (!vma_slot_cache)
  5472. +       goto out_free3;
  5473. +
  5474. +   tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
  5475. +   if (!tree_node_cache)
  5476. +       goto out_free4;
  5477. +
  5478. +   return 0;
  5479. +
  5480. +out_free4:
  5481. +   kmem_cache_destroy(vma_slot_cache);
  5482. +out_free3:
  5483. +   kmem_cache_destroy(node_vma_cache);
  5484. +out_free2:
  5485. +   kmem_cache_destroy(stable_node_cache);
  5486. +out_free1:
  5487. +   kmem_cache_destroy(rmap_item_cache);
  5488. +out:
  5489. +   return -ENOMEM;
  5490. +}
  5491. +
  5492. +static void __init uksm_slab_free(void)
  5493. +{
  5494. +   kmem_cache_destroy(stable_node_cache);
  5495. +   kmem_cache_destroy(rmap_item_cache);
  5496. +   kmem_cache_destroy(node_vma_cache);
  5497. +   kmem_cache_destroy(vma_slot_cache);
  5498. +   kmem_cache_destroy(tree_node_cache);
  5499. +}
  5500. +
  5501. +/* Common interface to ksm, different to it. */
  5502. +int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
  5503. +       unsigned long end, int advice, unsigned long *vm_flags)
  5504. +{
  5505. +   int err;
  5506. +
  5507. +   switch (advice) {
  5508. +   case MADV_MERGEABLE:
  5509. +       return 0;       /* just ignore the advice */
  5510. +
  5511. +   case MADV_UNMERGEABLE:
  5512. +       if (!(*vm_flags & VM_MERGEABLE))
  5513. +           return 0;       /* just ignore the advice */
  5514. +
  5515. +       if (vma->anon_vma) {
  5516. +           err = unmerge_uksm_pages(vma, start, end);
  5517. +           if (err)
  5518. +               return err;
  5519. +       }
  5520. +
  5521. +       uksm_remove_vma(vma);
  5522. +       *vm_flags &= ~VM_MERGEABLE;
  5523. +       break;
  5524. +   }
  5525. +
  5526. +   return 0;
  5527. +}
  5528. +
  5529. +/* Common interface to ksm, actually the same. */
  5530. +struct page *ksm_does_need_to_copy(struct page *page,
  5531. +           struct vm_area_struct *vma, unsigned long address)
  5532. +{
  5533. +   struct page *new_page;
  5534. +
  5535. +   new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
  5536. +   if (new_page) {
  5537. +       copy_user_highpage(new_page, page, address, vma);
  5538. +
  5539. +       SetPageDirty(new_page);
  5540. +       __SetPageUptodate(new_page);
  5541. +       SetPageSwapBacked(new_page);
  5542. +       __set_page_locked(new_page);
  5543. +
  5544. +       if (page_evictable(new_page, vma))
  5545. +           lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
  5546. +       else
  5547. +           add_page_to_unevictable_list(new_page);
  5548. +   }
  5549. +
  5550. +   return new_page;
  5551. +}
  5552. +
/*
 * uksm_init - boot/module-time initialization for UKSM.
 *
 * Sets the default scanner sleep interval, initializes the slot tree and
 * scan ladder, builds the random-sampling table and cost baselines,
 * creates the slab caches and zero-page hash table, then starts the
 * "uksmd" kernel thread and (with CONFIG_SYSFS) registers the
 * /sys/kernel/mm/uksm attribute group.
 *
 * Returns 0 on success or a negative errno; on failure the goto ladder
 * below releases resources in reverse order of acquisition.
 */
static int __init uksm_init(void)
{
	struct task_struct *uksm_thread;
	int err;

	uksm_sleep_jiffies = msecs_to_jiffies(100);
	uksm_sleep_saved = uksm_sleep_jiffies;

	slot_tree_init();
	init_scan_ladder();


	err = init_random_sampling();
	if (err)
		goto out_free2;

	err = uksm_slab_init();
	if (err)
		goto out_free1;

	err = init_zeropage_hash_table();
	if (err)
		goto out_free0;

	uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
	if (IS_ERR(uksm_thread)) {
		printk(KERN_ERR "uksm: creating kthread failed\n");
		err = PTR_ERR(uksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &uksm_attr_group);
	if (err) {
		printk(KERN_ERR "uksm: register sysfs failed\n");
		/* Stop the scanner before unwinding; it must not outlive us. */
		kthread_stop(uksm_thread);
		goto out_free;
	}
#else
	uksm_run = UKSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/*
	 * Choose a high priority since the callback takes uksm_thread_mutex:
	 * later callbacks could only be taking locks which nest within that.
	 */
	hotplug_memory_notifier(uksm_memory_callback, 100);
#endif
	return 0;

	/* Unwind in reverse order of the setup steps above. */
out_free:
	kfree(zero_hash_table);
out_free0:
	uksm_slab_free();
out_free1:
	kfree(random_nums);
out_free2:
	/*
	 * NOTE(review): uksm_scan_ladder's allocation is not visible in this
	 * chunk; kfree() here assumes it was kmalloc'ed earlier in the file
	 * — confirm against the definition.
	 */
	kfree(uksm_scan_ladder);
	return err;
}
  5615. +
#ifdef MODULE
/*
 * NOTE(review): no trailing ';' after module_init() here, unlike the
 * late_initcall() below — presumably the macro expansion tolerates it,
 * but confirm this builds when UKSM is configured as a module.
 */
module_init(uksm_init)
#else
/* Built-in: initialize late, after core mm and most subsystems are up. */
late_initcall(uksm_init);
#endif
  5621. +
RAW Paste Data