Advertisement
arter97

Untitled

May 19th, 2022
825
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. commit f769193bec18d1384ff8b2094088d46972da1673
  2. Author: Juhyung Park <qkrwngud825@gmail.com>
  3. Date:   Fri May 20 07:00:12 2022 +0900
  4.  
  5.     Samsung
  6.    
  7.     Signed-off-by: Juhyung Park <qkrwngud825@gmail.com>
  8.  
  9. diff --git a/mm/page_alloc.c b/mm/page_alloc.c
  10. index 46d253155a255..f96d8044b5a3d 100644
  11. --- a/mm/page_alloc.c
  12. +++ b/mm/page_alloc.c
  13. @@ -64,12 +64,12 @@
  14.  #include <linux/page_owner.h>
  15.  #include <linux/kthread.h>
  16.  #include <linux/memcontrol.h>
  17. -#include <linux/show_mem_notifier.h>
  18.  #include <linux/ftrace.h>
  19.  #include <linux/lockdep.h>
  20.  #include <linux/nmi.h>
  21.  #include <linux/psi.h>
  22.  #include <linux/khugepaged.h>
  23. +#include <linux/sched/cputime.h>
  24.  
  25.  #include <asm/sections.h>
  26.  #include <asm/tlbflush.h>
  27. @@ -81,6 +81,36 @@
  28.  static DEFINE_MUTEX(pcp_batch_high_lock);
  29.  #define MIN_PERCPU_PAGELIST_FRACTION   (8)
  30.  
  31. +/* If RANK_BIT position in physical address is zero, it is main rank */
  32. +#define is_main_rank(page) !rankid(page)
  33. +
  34. +static inline void rank_list_add(struct page *page, struct list_head *list)
  35. +{
  36. +   if (is_main_rank(page))
  37. +       list_add(&(page)->lru, list);
  38. +   else
  39. +       list_add_tail(&(page)->lru, list);
  40. +}
  41. +
  42. +static inline void rank_free_area_add(struct page *page, struct free_area *area,
  43. +                     int migratetype)
  44. +{
  45. +   if (is_main_rank(page))
  46. +       add_to_free_area(page, area, migratetype);
  47. +   else
  48. +       add_to_free_area_tail(page, area, migratetype);
  49. +}
  50. +
  51. +static inline void rank_free_area_move(struct page *page,
  52. +                      struct free_area *area,
  53. +                      int migratetype)
  54. +{
  55. +   if (is_main_rank(page))
  56. +       move_to_free_area(page, area, migratetype);
  57. +   else
  58. +       move_to_free_area_tail(page, area, migratetype);
  59. +}
  60. +
  61.  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  62.  DEFINE_PER_CPU(int, numa_node);
  63.  EXPORT_PER_CPU_SYMBOL(numa_node);
  64. @@ -187,24 +217,6 @@ static int __init early_init_on_free(char *buf)
  65.  }
  66.  early_param("init_on_free", early_init_on_free);
  67.  
  68. -/*
  69. - * A cached value of the page's pageblock's migratetype, used when the page is
  70. - * put on a pcplist. Used to avoid the pageblock migratetype lookup when
  71. - * freeing from pcplists in most cases, at the cost of possibly becoming stale.
  72. - * Also the migratetype set in the page does not necessarily match the pcplist
  73. - * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
  74. - * other index - this ensures that it will be put on the correct CMA freelist.
  75. - */
  76. -static inline int get_pcppage_migratetype(struct page *page)
  77. -{
  78. -   return page->index;
  79. -}
  80. -
  81. -static inline void set_pcppage_migratetype(struct page *page, int migratetype)
  82. -{
  83. -   page->index = migratetype;
  84. -}
  85. -
  86.  #ifdef CONFIG_PM_SLEEP
  87.  /*
  88.   * The following functions are used by the suspend/hibernate code to temporarily
  89. @@ -919,7 +931,7 @@ static inline void __free_one_page(struct page *page,
  90.     unsigned int max_order;
  91.     struct capture_control *capc = task_capc(zone);
  92.  
  93. -   max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
  94. +   max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
  95.  
  96.     VM_BUG_ON(!zone_is_initialized(zone));
  97.     VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
  98. @@ -932,7 +944,7 @@ static inline void __free_one_page(struct page *page,
  99.     VM_BUG_ON_PAGE(bad_range(zone, page), page);
  100.  
  101.  continue_merging:
  102. -   while (order < max_order - 1) {
  103. +   while (order < max_order) {
  104.         if (compaction_capture(capc, page, order, migratetype)) {
  105.             __mod_zone_freepage_state(zone, -(1 << order),
  106.                                 migratetype);
  107. @@ -958,7 +970,7 @@ continue_merging:
  108.         pfn = combined_pfn;
  109.         order++;
  110.     }
  111. -   if (max_order < MAX_ORDER) {
  112. +   if (order < MAX_ORDER - 1) {
  113.         /* If we are here, it means order is >= pageblock_order.
  114.          * We want to prevent merge between freepages on isolate
  115.          * pageblock and normal pageblock. Without this, pageblock
  116. @@ -979,7 +991,7 @@ continue_merging:
  117.                         is_migrate_isolate(buddy_mt)))
  118.                 goto done_merging;
  119.         }
  120. -       max_order++;
  121. +       max_order = order + 1;
  122.         goto continue_merging;
  123.     }
  124.  
  125. @@ -1003,17 +1015,17 @@ done_merging:
  126.         higher_buddy = higher_page + (buddy_pfn - combined_pfn);
  127.         if (pfn_valid_within(buddy_pfn) &&
  128.             page_is_buddy(higher_page, higher_buddy, order + 1)) {
  129. -           add_to_free_area_tail(page, &zone->free_area[order],
  130. -                         migratetype);
  131. +           rank_free_area_add(page, &zone->free_area[order],
  132. +                      migratetype);
  133.             return;
  134.         }
  135.     }
  136.  
  137.     if (is_shuffle_order(order))
  138.         add_to_free_area_random(page, &zone->free_area[order],
  139. -               migratetype);
  140. +                   migratetype);
  141.     else
  142. -       add_to_free_area(page, &zone->free_area[order], migratetype);
  143. +       rank_free_area_add(page, &zone->free_area[order], migratetype);
  144.  
  145.  }
  146.  
  147. @@ -1430,15 +1442,35 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
  148.     }
  149.  }
  150.  
  151. +#ifdef CONFIG_HUGEPAGE_POOL
  152. +static void  __free_pages_ok(struct page *page, unsigned int order)
  153. +{
  154. +   ___free_pages_ok(page, order, false);
  155. +}
  156. +
  157. +void ___free_pages_ok(struct page *page, unsigned int order,
  158. +             bool skip_hugepage_pool)
  159. +#else
  160.  static void __free_pages_ok(struct page *page, unsigned int order)
  161. +#endif
  162.  {
  163.     unsigned long flags;
  164.     int migratetype;
  165.     unsigned long pfn = page_to_pfn(page);
  166.  
  167. +#ifdef CONFIG_HUGEPAGE_POOL
  168. +   if (!skip_hugepage_pool && !free_pages_prepare(page, order, true))
  169. +       return;
  170. +#else
  171.     if (!free_pages_prepare(page, order, true))
  172.         return;
  173. +#endif
  174.  
  175. +#ifdef CONFIG_HUGEPAGE_POOL
  176. +   if (!skip_hugepage_pool && order == HUGEPAGE_ORDER &&
  177. +       insert_hugepage_pool(page, order))
  178. +       return;
  179. +#endif
  180.     migratetype = get_pfnblock_migratetype(page, pfn);
  181.     local_irq_save(flags);
  182.     __count_vm_events(PGFREE, 1 << order);
  183. @@ -2049,7 +2081,7 @@ static inline void expand(struct zone *zone, struct page *page,
  184.         if (set_page_guard(zone, &page[size], high, migratetype))
  185.             continue;
  186.  
  187. -       add_to_free_area(&page[size], area, migratetype);
  188. +       rank_free_area_add(&page[size], area, migratetype);
  189.         set_page_order(&page[size], high);
  190.     }
  191.  }
  192. @@ -2166,8 +2198,13 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
  193.     set_page_owner(page, order, gfp_flags);
  194.  }
  195.  
  196. +#ifdef CONFIG_HUGEPAGE_POOL
  197. +void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
  198. +                           unsigned int alloc_flags)
  199. +#else
  200.  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
  201.                             unsigned int alloc_flags)
  202. +#endif
  203.  {
  204.     post_alloc_hook(page, order, gfp_flags);
  205.  
  206. @@ -2190,14 +2227,18 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
  207.  }
  208.  
  209.  /*
  210. - * Go through the free lists for the given migratetype and remove
  211. - * the smallest available page from the freelists
  212. + * Search the free lists from requested order to MAX_ORDER to find
  213. + * the main rank page and returns the order if exists.
  214. + * If main rank page doesn't exist, returns the smallest order of
  215. + * available backup rank page.
  216. + *
  217. + * MAX_ORDER is returned if there's no available pages.
  218.   */
  219.  static __always_inline
  220. -struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  221. -                       int migratetype)
  222. +unsigned int __get_min_rank_aware_order(struct zone *zone,
  223. +                   unsigned int order, int migratetype)
  224.  {
  225. -   unsigned int current_order;
  226. +   unsigned int current_order, backup_order = MAX_ORDER;
  227.     struct free_area *area;
  228.     struct page *page;
  229.  
  230. @@ -2205,15 +2246,36 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  231.     for (current_order = order; current_order < MAX_ORDER; ++current_order) {
  232.         area = &(zone->free_area[current_order]);
  233.         page = get_page_from_free_area(area, migratetype);
  234. -       if (!page)
  235. -           continue;
  236. -       del_page_from_free_area(page, area);
  237. -       expand(zone, page, order, current_order, area, migratetype);
  238. -       set_pcppage_migratetype(page, migratetype);
  239. -       return page;
  240. +       if (page) {
  241. +           if (is_main_rank(page))
  242. +               return current_order;
  243. +           if (backup_order == MAX_ORDER)
  244. +               backup_order = current_order;
  245. +       }
  246.     }
  247.  
  248. -   return NULL;
  249. +   return backup_order;
  250. +}
  251. +
  252. +static __always_inline
  253. +struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  254. +                       int migratetype)
  255. +{
  256. +   unsigned int current_order;
  257. +   struct free_area *area;
  258. +   struct page *page;
  259. +
  260. +   current_order = __get_min_rank_aware_order(zone, order, migratetype);
  261. +   if (current_order == MAX_ORDER)
  262. +       return NULL;
  263. +
  264. +   area = &(zone->free_area[current_order]);
  265. +   page = get_page_from_free_area(area, migratetype);
  266. +   del_page_from_free_area(page, area);
  267. +   expand(zone, page, order, current_order, area, migratetype);
  268. +   set_pcppage_migratetype(page, migratetype);
  269. +
  270. +   return page;
  271.  }
  272.  
  273.  
  274. @@ -2282,7 +2344,7 @@ static int move_freepages(struct zone *zone,
  275.         VM_BUG_ON_PAGE(page_zone(page) != zone, page);
  276.  
  277.         order = page_order(page);
  278. -       move_to_free_area(page, &zone->free_area[order], migratetype);
  279. +       rank_free_area_move(page, &zone->free_area[order], migratetype);
  280.         page += 1 << order;
  281.         pages_moved += 1 << order;
  282.     }
  283. @@ -2359,38 +2421,11 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
  284.     return false;
  285.  }
  286.  
  287. -static bool boost_eligible(struct zone *z)
  288. -{
  289. -   unsigned long high_wmark, threshold;
  290. -   unsigned long reclaim_eligible, free_pages;
  291. -
  292. -   high_wmark = z->_watermark[WMARK_HIGH];
  293. -   reclaim_eligible = zone_page_state_snapshot(z, NR_ZONE_INACTIVE_FILE) +
  294. -           zone_page_state_snapshot(z, NR_ZONE_ACTIVE_FILE);
  295. -   free_pages = zone_page_state(z, NR_FREE_PAGES) -
  296. -           zone_page_state(z, NR_FREE_CMA_PAGES);
  297. -   threshold = high_wmark + (2 * mult_frac(high_wmark,
  298. -                   watermark_boost_factor, 10000));
  299. -
  300. -   /*
  301. -    * Don't boost watermark If we are already low on memory where the
  302. -    * boosting can simply put the watermarks at higher levels for a
  303. -    * longer duration of time and thus the other users relied on the
  304. -    * watermarks are forced to choose unintended decissions. If memory
  305. -    * is so low, kswapd in normal mode should help.
  306. -    */
  307. -
  308. -   if (reclaim_eligible < threshold && free_pages < threshold)
  309. -       return false;
  310. -
  311. -   return true;
  312. -}
  313. -
  314.  static inline bool boost_watermark(struct zone *zone)
  315.  {
  316.     unsigned long max_boost;
  317.  
  318. -   if (!watermark_boost_factor || !boost_eligible(zone))
  319. +   if (!watermark_boost_factor)
  320.         return false;
  321.     /*
  322.      * Don't bother in zones that are unlikely to produce results.
  323. @@ -2506,7 +2541,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
  324.  
  325.  single_page:
  326.     area = &zone->free_area[current_order];
  327. -   move_to_free_area(page, area, start_type);
  328. +   rank_free_area_move(page, area, start_type);
  329.  }
  330.  
  331.  /*
  332. @@ -2837,7 +2872,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  333.          * for IO devices that can merge IO requests if the physical
  334.          * pages are ordered properly.
  335.          */
  336. -       list_add_tail(&page->lru, list);
  337. +       rank_list_add(page, list);
  338.         alloced++;
  339.         if (is_migrate_cma(get_pcppage_migratetype(page)))
  340.             __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
  341. @@ -3143,7 +3178,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
  342.     }
  343.  
  344.     pcp = &this_cpu_ptr(zone->pageset)->pcp;
  345. -   list_add(&page->lru, &pcp->lists[migratetype]);
  346. +   rank_list_add(page, &pcp->lists[migratetype]);
  347.     pcp->count++;
  348.     if (pcp->count >= pcp->high) {
  349.         unsigned long batch = READ_ONCE(pcp->batch);
  350. @@ -3329,7 +3364,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
  351.             if (unlikely(list == NULL) ||
  352.                     unlikely(list_empty(list)))
  353.                 return NULL;
  354. -
  355.         }
  356.  
  357.         page = list_first_entry(list, struct page, lru);
  358. @@ -3402,7 +3436,6 @@ struct page *rmqueue(struct zone *preferred_zone,
  359.  
  360.         if (!page)
  361.             page = __rmqueue(zone, order, migratetype, alloc_flags);
  362. -
  363.     } while (page && check_new_pages(page, order));
  364.  
  365.     spin_unlock(&zone->lock);
  366. @@ -3498,6 +3531,29 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  367.  
  368.  #endif /* CONFIG_FAIL_PAGE_ALLOC */
  369.  
  370. +static inline long __zone_watermark_unusable_free(struct zone *z,
  371. +               unsigned int order, unsigned int alloc_flags)
  372. +{
  373. +   const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
  374. +   long unusable_free = (1 << order) - 1;
  375. +
  376. +   /*
  377. +    * If the caller does not have rights to ALLOC_HARDER then subtract
  378. +    * the high-atomic reserves. This will over-estimate the size of the
  379. +    * atomic reserve but it avoids a search.
  380. +    */
  381. +   if (likely(!alloc_harder))
  382. +       unusable_free += z->nr_reserved_highatomic;
  383. +
  384. +#ifdef CONFIG_CMA
  385. +   /* If allocation can't use CMA areas don't use free CMA pages */
  386. +   if (!(alloc_flags & ALLOC_CMA))
  387. +       unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
  388. +#endif
  389. +
  390. +   return unusable_free;
  391. +}
  392. +
  393.  noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  394.  {
  395.     return __should_fail_alloc_page(gfp_mask, order);
  396. @@ -3519,19 +3575,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  397.     const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
  398.  
  399.     /* free_pages may go negative - that's OK */
  400. -   free_pages -= (1 << order) - 1;
  401. +   free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
  402.  
  403.     if (alloc_flags & ALLOC_HIGH)
  404.         min -= min / 2;
  405.  
  406. -   /*
  407. -    * If the caller does not have rights to ALLOC_HARDER then subtract
  408. -    * the high-atomic reserves. This will over-estimate the size of the
  409. -    * atomic reserve but it avoids a search.
  410. -    */
  411. -   if (likely(!alloc_harder)) {
  412. -       free_pages -= z->nr_reserved_highatomic;
  413. -   } else {
  414. +   if (unlikely(alloc_harder)) {
  415.         /*
  416.          * OOM victims can try even harder than normal ALLOC_HARDER
  417.          * users on the grounds that it's definitely going to be in
  418. @@ -3544,13 +3593,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  419.             min -= min / 4;
  420.     }
  421.  
  422. -
  423. -#ifdef CONFIG_CMA
  424. -   /* If allocation can't use CMA areas don't use free CMA pages */
  425. -   if (!(alloc_flags & ALLOC_CMA))
  426. -       free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
  427. -#endif
  428. -
  429.     /*
  430.      * Check watermarks for an order-0 allocation request. If these
  431.      * are not met, then a high-order request also cannot go ahead
  432. @@ -3572,14 +3614,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  433.             continue;
  434.  
  435.         for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
  436. -#ifdef CONFIG_CMA
  437. -           /*
  438. -            * Note that this check is needed only
  439. -            * when MIGRATE_CMA < MIGRATE_PCPTYPES.
  440. -            */
  441. -           if (mt == MIGRATE_CMA)
  442. -               continue;
  443. -#endif
  444.             if (!free_area_empty(area, mt))
  445.                 return true;
  446.         }
  447. @@ -3608,24 +3642,22 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
  448.                 unsigned long mark, int classzone_idx,
  449.                 unsigned int alloc_flags, gfp_t gfp_mask)
  450.  {
  451. -   long free_pages = zone_page_state(z, NR_FREE_PAGES);
  452. -   long cma_pages = 0;
  453. +   long free_pages;
  454.  
  455. -#ifdef CONFIG_CMA
  456. -   /* If allocation can't use CMA areas don't use free CMA pages */
  457. -   if (!(alloc_flags & ALLOC_CMA))
  458. -       cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
  459. -#endif
  460. +   free_pages = zone_page_state(z, NR_FREE_PAGES);
  461.  
  462.     /*
  463.      * Fast check for order-0 only. If this fails then the reserves
  464. -    * need to be calculated. There is a corner case where the check
  465. -    * passes but only the high-order atomic reserve are free. If
  466. -    * the caller is !atomic then it'll uselessly search the free
  467. -    * list. That corner case is then slower but it is harmless.
  468. +    * need to be calculated.
  469.      */
  470. -   if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
  471. -       return true;
  472. +   if (!order) {
  473. +       long fast_free;
  474. +
  475. +       fast_free = free_pages;
  476. +       fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
  477. +       if (fast_free > mark + z->lowmem_reserve[classzone_idx])
  478. +           return true;
  479. +   }
  480.  
  481.     if (__zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
  482.                     free_pages))
  483. @@ -3783,20 +3815,6 @@ retry:
  484.         }
  485.  
  486.         mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
  487. -       /*
  488. -        * Allow high, atomic, harder order-0 allocation requests
  489. -        * to skip the ->watermark_boost for min watermark check.
  490. -        * In doing so, check for:
  491. -        *  1) ALLOC_WMARK_MIN - Allow to wake up kswapd in the
  492. -        *           slow path.
  493. -        *  2) ALLOC_HIGH - Allow high priority requests.
  494. -        *  3) ALLOC_HARDER - Allow (__GFP_ATOMIC && !__GFP_NOMEMALLOC),
  495. -        *          of the others.
  496. -        */
  497. -       if (unlikely(!order && !(alloc_flags & ALLOC_WMARK_MASK) &&
  498. -            (alloc_flags & (ALLOC_HARDER | ALLOC_HIGH)))) {
  499. -           mark = zone->_watermark[WMARK_MIN];
  500. -       }
  501.         if (!zone_watermark_fast(zone, order, mark,
  502.                        ac_classzone_idx(ac), alloc_flags,
  503.                        gfp_mask)) {
  504. @@ -3893,7 +3911,6 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
  505.         filter &= ~SHOW_MEM_FILTER_NODES;
  506.  
  507.     show_mem(filter, nodemask);
  508. -   show_mem_call_notifiers();
  509.  }
  510.  
  511.  void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
  512. @@ -4308,7 +4325,8 @@ retry:
  513.      */
  514.     if (!page && !drained) {
  515.         unreserve_highatomic_pageblock(ac, false);
  516. -       drain_all_pages(NULL);
  517. +       if (!need_memory_boosting(NULL))
  518. +           drain_all_pages(NULL);
  519.         drained = true;
  520.         goto retry;
  521.     }
  522. @@ -4367,8 +4385,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
  523.         alloc_flags |= ALLOC_KSWAPD;
  524.  
  525.  #ifdef CONFIG_CMA
  526. -   if ((gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) &&
  527. -               (gfp_mask & __GFP_CMA))
  528. +   if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
  529.         alloc_flags |= ALLOC_CMA;
  530.  #endif
  531.     return alloc_flags;
  532. @@ -4558,13 +4575,19 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  533.     const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
  534.     struct page *page = NULL;
  535.     unsigned int alloc_flags;
  536. -   unsigned long did_some_progress;
  537. +   unsigned long did_some_progress = 0;
  538.     enum compact_priority compact_priority;
  539.     enum compact_result compact_result;
  540.     int compaction_retries;
  541.     int no_progress_loops;
  542.     unsigned int cpuset_mems_cookie;
  543.     int reserve_flags;
  544. +   unsigned long pages_reclaimed = 0;
  545. +   int retry_loop_count = 0;
  546. +   unsigned long jiffies_s = jiffies;
  547. +   u64 utime, stime_s, stime_e, stime_d;
  548. +
  549. +   task_cputime(current, &utime, &stime_s);
  550.  
  551.     /*
  552.      * We also sanity check to catch abuse of atomic reserves being used by
  553. @@ -4679,6 +4702,7 @@ retry_cpuset:
  554.     }
  555.  
  556.  retry:
  557. +   retry_loop_count++;
  558.     /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
  559.     if (alloc_flags & ALLOC_KSWAPD)
  560.         wake_all_kswapds(order, gfp_mask, ac);
  561. @@ -4711,13 +4735,10 @@ retry:
  562.     if (current->flags & PF_MEMALLOC)
  563.         goto nopage;
  564.  
  565. -   if (fatal_signal_pending(current) && !(gfp_mask & __GFP_NOFAIL) &&
  566. -           (gfp_mask & __GFP_FS))
  567. -       goto nopage;
  568. -
  569.     /* Try direct reclaim and then allocating */
  570.     page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
  571.                             &did_some_progress);
  572. +   pages_reclaimed += did_some_progress;
  573.     if (page)
  574.         goto got_pg;
  575.  
  576. @@ -4825,6 +4846,29 @@ fail:
  577.     warn_alloc(gfp_mask, ac->nodemask,
  578.             "page allocation failure: order:%u", order);
  579.  got_pg:
  580. +   task_cputime(current, &utime, &stime_e);
  581. +   stime_d = stime_e - stime_s;
  582. +   if (stime_d / NSEC_PER_MSEC > 256) {
  583. +       pg_data_t *pgdat;
  584. +
  585. +       unsigned long a_anon = 0;
  586. +       unsigned long in_anon = 0;
  587. +       unsigned long a_file = 0;
  588. +       unsigned long in_file = 0;
  589. +       for_each_online_pgdat(pgdat) {
  590. +           a_anon += node_page_state(pgdat, NR_ACTIVE_ANON);
  591. +           in_anon += node_page_state(pgdat, NR_INACTIVE_ANON);
  592. +           a_file += node_page_state(pgdat, NR_ACTIVE_FILE);
  593. +           in_file += node_page_state(pgdat, NR_INACTIVE_FILE);
  594. +       }
  595. +       pr_info("alloc stall: timeJS(ms):%u|%llu rec:%lu|%lu ret:%d o:%d gfp:%#x(%pGg) AaiFai:%lukB|%lukB|%lukB|%lukB\n",
  596. +           jiffies_to_msecs(jiffies - jiffies_s),
  597. +           stime_d / NSEC_PER_MSEC,
  598. +           did_some_progress, pages_reclaimed, retry_loop_count,
  599. +           order, gfp_mask, &gfp_mask,
  600. +           a_anon << (PAGE_SHIFT-10), in_anon << (PAGE_SHIFT-10),
  601. +           a_file << (PAGE_SHIFT-10), in_file << (PAGE_SHIFT-10));
  602. +   }
  603.     return page;
  604.  }
  605.  
  606. @@ -4854,8 +4898,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
  607.     if (should_fail_alloc_page(gfp_mask, order))
  608.         return false;
  609.  
  610. -   if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE &&
  611. -           (gfp_mask & __GFP_CMA))
  612. +   if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
  613.         *alloc_flags |= ALLOC_CMA;
  614.  
  615.     return true;
  616. @@ -5299,6 +5342,9 @@ long si_mem_available(void)
  617.     reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
  618.             global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
  619.     available += reclaimable - min(reclaimable / 2, wmark_low);
  620. +#ifdef CONFIG_ION_RBIN_HEAP
  621. +   available += atomic_read(&rbin_cached_pages);
  622. +#endif
  623.  
  624.     if (available < 0)
  625.         available = 0;
  626. @@ -5309,6 +5355,9 @@ EXPORT_SYMBOL_GPL(si_mem_available);
  627.  void si_meminfo(struct sysinfo *val)
  628.  {
  629.     val->totalram = totalram_pages();
  630. +#ifdef CONFIG_ION_RBIN_HEAP
  631. +   val->totalram += totalrbin_pages;
  632. +#endif
  633.     val->sharedram = global_node_page_state(NR_SHMEM);
  634.     val->freeram = global_zone_page_state(NR_FREE_PAGES);
  635.     val->bufferram = nr_blockdev_pages();
  636. @@ -7056,8 +7105,6 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
  637.     pg_data_t *pgdat = NODE_DATA(nid);
  638.     unsigned long start_pfn = 0;
  639.     unsigned long end_pfn = 0;
  640. -   u64 i;
  641. -   phys_addr_t start, end;
  642.  
  643.     /* pg_data_t should be reset to zero when it's allocated */
  644.     WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
  645. @@ -7071,10 +7118,6 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
  646.         (u64)start_pfn << PAGE_SHIFT,
  647.         end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
  648.  #else
  649. -   for_each_mem_range(i, &memblock.memory, NULL, nid, MEMBLOCK_NONE,
  650. -              &start, &end, NULL)
  651. -       subsection_map_init((unsigned long)start >> PAGE_SHIFT,
  652. -                   (unsigned long)(end - start) >> PAGE_SHIFT);
  653.     start_pfn = node_start_pfn;
  654.  #endif
  655.     calculate_node_totalpages(pgdat, start_pfn, end_pfn,
  656. @@ -7692,14 +7735,15 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
  657.         free_reserved_page(page);
  658.     }
  659.  
  660. -   if (pages && s)
  661. -       pr_info("Freeing %s memory: %ldK\n",
  662. -           s, pages << (PAGE_SHIFT - 10));
  663. +   if (pages && s) {
  664. +       pr_info("Freeing %s memory: %ldK\n", s, pages << (PAGE_SHIFT - 10));
  665. +       if (!strcmp(s, "initrd") || !strcmp(s, "unused kernel")) {
  666. +           long size;
  667.  
  668. -#ifdef CONFIG_HAVE_MEMBLOCK
  669. -       memblock_dbg("memblock_free: [%#016llx-%#016llx] %pS\n",
  670. -           __pa(start), __pa(end), (void *)_RET_IP_);
  671. -#endif
  672. +           size = -1 * (long)(pages << PAGE_SHIFT);
  673. +           memblock_memsize_mod_kernel_size(size);
  674. +       }
  675. +   }
  676.  
  677.     return pages;
  678.  }
  679. @@ -7978,11 +8022,11 @@ static void __setup_per_zone_wmarks(void)
  680.                 mult_frac(zone_managed_pages(zone),
  681.                       watermark_scale_factor, 10000));
  682.  
  683. -       zone->watermark_boost = 0;
  684.         zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) +
  685.                     low + tmp;
  686.         zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) +
  687.                     low + tmp * 2;
  688. +       zone->watermark_boost = 0;
  689.  
  690.         spin_unlock_irqrestore(&zone->lock, flags);
  691.     }
  692. @@ -8097,22 +8141,6 @@ int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
  693.     return 0;
  694.  }
  695.  
  696. -#ifdef CONFIG_MULTIPLE_KSWAPD
  697. -int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
  698. -   void __user *buffer, size_t *length, loff_t *ppos)
  699. -{
  700. -   int rc;
  701. -
  702. -   rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
  703. -   if (rc)
  704. -       return rc;
  705. -
  706. -   if (write)
  707. -       update_kswapd_threads();
  708. -
  709. -   return 0;
  710. -}
  711. -#endif
  712.  int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
  713.     void __user *buffer, size_t *length, loff_t *ppos)
  714.  {
  715. @@ -8522,7 +8550,8 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
  716.  
  717.  /* [start, end) must belong to a single zone. */
  718.  static int __alloc_contig_migrate_range(struct compact_control *cc,
  719. -                   unsigned long start, unsigned long end)
  720. +                   unsigned long start, unsigned long end,
  721. +                   bool drain)
  722.  {
  723.     /* This function is based on compact_zone() from compaction.c. */
  724.     unsigned long nr_reclaimed;
  725. @@ -8530,7 +8559,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  726.     unsigned int tries = 0;
  727.     int ret = 0;
  728.  
  729. -   migrate_prep();
  730. +   if (drain)
  731. +       migrate_prep();
  732.  
  733.     while (pfn < end || !list_empty(&cc->migratepages)) {
  734.         if (fatal_signal_pending(current)) {
  735. @@ -8586,8 +8616,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  736.   * pages which PFN is in [start, end) are allocated for the caller and
  737.   * need to be freed with free_contig_range().
  738.   */
  739. -int alloc_contig_range(unsigned long start, unsigned long end,
  740. -              unsigned migratetype, gfp_t gfp_mask)
  741. +int __alloc_contig_range(unsigned long start, unsigned long end,
  742. +            unsigned migratetype, gfp_t gfp_mask, bool drain)
  743.  {
  744.     unsigned long outer_start, outer_end;
  745.     unsigned int order;
  746. @@ -8646,7 +8676,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
  747.      * allocated.  So, if we fall through be sure to clear ret so that
  748.      * -EBUSY is not accidentally used or returned to caller.
  749.      */
  750. -   ret = __alloc_contig_migrate_range(&cc, start, end);
  751. +   ret = __alloc_contig_migrate_range(&cc, start, end, drain);
  752.     if (ret && ret != -EBUSY)
  753.         goto done;
   754.     ret = 0;
  755. @@ -8668,37 +8698,40 @@ int alloc_contig_range(unsigned long start, unsigned long end,
  756.      * isolated thus they won't get removed from buddy.
  757.      */
  758.  
  759. -   lru_add_drain_all();
  760. -
  761.     order = 0;
  762.     outer_start = start;
  763. -   while (!PageBuddy(pfn_to_page(outer_start))) {
  764. -       if (++order >= MAX_ORDER) {
  765. -           outer_start = start;
  766. -           break;
  767. -       }
  768. -       outer_start &= ~0UL << order;
  769. -   }
  770.  
  771. -   if (outer_start != start) {
  772. -       order = page_order(pfn_to_page(outer_start));
  773. +   if (drain) {
  774. +       lru_add_drain_all();
  775. +       drain_all_pages(cc.zone);
  776.  
  777. -       /*
  778. -        * outer_start page could be small order buddy page and
  779. -        * it doesn't include start page. Adjust outer_start
  780. -        * in this case to report failed page properly
  781. -        * on tracepoint in test_pages_isolated()
  782. -        */
  783. -       if (outer_start + (1UL << order) <= start)
  784. -           outer_start = start;
  785. -   }
  786. +       while (!PageBuddy(pfn_to_page(outer_start))) {
  787. +           if (++order >= MAX_ORDER) {
  788. +               outer_start = start;
  789. +               break;
  790. +           }
  791. +           outer_start &= ~0UL << order;
  792. +       }
  793.  
  794. -   /* Make sure the range is really isolated. */
  795. -   if (test_pages_isolated(outer_start, end, false)) {
  796. -       pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
  797. -           __func__, outer_start, end);
  798. -       ret = -EBUSY;
  799. -       goto done;
  800. +       if (outer_start != start) {
  801. +           order = page_order(pfn_to_page(outer_start));
  802. +
  803. +           /*
  804. +            * outer_start page could be small order buddy page and
  805. +            * it doesn't include start page. Adjust outer_start
  806. +            * in this case to report failed page properly
  807. +            * on tracepoint in test_pages_isolated()
  808. +            */
  809. +           if (outer_start + (1UL << order) <= start)
  810. +               outer_start = start;
  811. +       }
  812. +       /* Make sure the range is really isolated. */
  813. +       if (test_pages_isolated(outer_start, end, false)) {
  814. +           pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
  815. +               __func__, outer_start, end);
  816. +           ret = -EBUSY;
  817. +           goto done;
  818. +       }
  819.     }
  820.  
  821.     /* Grab isolated pages from freelists. */
  822. @@ -8722,6 +8755,17 @@ done:
  823.  #endif
  824.     return ret;
  825.  }
  826. +int alloc_contig_range(unsigned long start, unsigned long end,
  827. +              unsigned migratetype, gfp_t gfp_mask)
  828. +{
  829. +   return __alloc_contig_range(start, end, migratetype, gfp_mask, true);
  830. +}
  831. +
  832. +int alloc_contig_range_fast(unsigned long start, unsigned long end,
  833. +               unsigned migratetype)
  834. +{
  835. +   return __alloc_contig_range(start, end, migratetype, GFP_KERNEL, false);
  836. +}
  837.  #endif /* CONFIG_CONTIG_ALLOC */
  838.  
  839.  void free_contig_range(unsigned long pfn, unsigned int nr_pages)
  840. diff --git a/mm/vmscan.c b/mm/vmscan.c
  841. index 2d57e7eddfeb8..753985b1051aa 100644
  842. --- a/mm/vmscan.c
  843. +++ b/mm/vmscan.c
  844. @@ -89,9 +89,12 @@ struct scan_control {
  845.     unsigned int may_swap:1;
  846.  
  847.     /*
  848. -    * Cgroups are not reclaimed below their configured memory.low,
  849. -    * unless we threaten to OOM. If any cgroups are skipped due to
  850. -    * memory.low and nothing was reclaimed, go back for memory.low.
  851. +    * Cgroup memory below memory.low is protected as long as we
  852. +    * don't threaten to OOM. If any cgroup is reclaimed at
  853. +    * reduced force or passed over entirely due to its memory.low
  854. +    * setting (memcg_low_skipped), and nothing is reclaimed as a
  855. +    * result, then go back for one more cycle that reclaims the protected
  856. +    * memory (memcg_low_reclaim) to avert OOM.
  857.      */
  858.     unsigned int memcg_low_reclaim:1;
  859.     unsigned int memcg_low_skipped:1;
  860. @@ -131,21 +134,8 @@ struct scan_control {
  861.  
  862.     /* for recording the reclaimed slab by now */
  863.     struct reclaim_state reclaim_state;
  864. -   /*
  865. -    * Reclaim pages from a vma. If the page is shared by other tasks
  866. -    * it is zapped from a vma without reclaim so it ends up remaining
  867. -    * on memory until last task zap it.
  868. -    */
  869. -   struct vm_area_struct *target_vma;
  870.  };
  871.  
  872. -/*
  873. - * Number of active kswapd threads
  874. - */
  875. -#define DEF_KSWAPD_THREADS_PER_NODE 1
  876. -int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE;
  877. -int kswapd_threads_current = DEF_KSWAPD_THREADS_PER_NODE;
  878. -
  879.  #ifdef ARCH_HAS_PREFETCH
  880.  #define prefetch_prev_lru_page(_page, _base, _field)           \
  881.     do {                                \
  882. @@ -485,10 +475,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  883.     long batch_size = shrinker->batch ? shrinker->batch
  884.                       : SHRINK_BATCH;
  885.     long scanned = 0, next_deferred;
  886. -   long min_cache_size = batch_size;
  887. -
  888. -   if (current_is_kswapd())
  889. -       min_cache_size = 0;
  890.  
  891.     if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
  892.         nid = 0;
  893. @@ -568,7 +554,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  894.      * scanning at high prio and therefore should try to reclaim as much as
  895.      * possible.
  896.      */
  897. -   while (total_scan > min_cache_size ||
  898. +   while (total_scan >= batch_size ||
  899.            total_scan >= freeable) {
  900.         unsigned long ret;
  901.         unsigned long nr_to_scan = min(batch_size, total_scan);
  902. @@ -614,6 +600,10 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  903.     unsigned long ret, freed = 0;
  904.     int i;
  905.  
  906. +   /* allow shrink_slab_memcg for only kswapd */
  907. +   if (!current_is_kswapd())
  908. +       return 0;
  909. +
  910.     if (!mem_cgroup_online(memcg))
  911.         return 0;
  912.  
  913. @@ -642,8 +632,10 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  914.  
  915.         /* Call non-slab shrinkers even though kmem is disabled */
  916.         if (!memcg_kmem_enabled() &&
  917. -           !(shrinker->flags & SHRINKER_NONSLAB))
  918. +           !(shrinker->flags & SHRINKER_NONSLAB)) {
  919. +           clear_bit(i, map->map);
  920.             continue;
  921. +       }
  922.  
  923.         ret = do_shrink_slab(&sc, shrinker, priority);
  924.         if (ret == SHRINK_EMPTY) {
  925. @@ -1165,8 +1157,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  926.             goto keep;
  927.  
  928.         VM_BUG_ON_PAGE(PageActive(page), page);
  929. -       if (pgdat)
  930. -           VM_BUG_ON_PAGE(page_pgdat(page) != pgdat, page);
  931.  
  932.         nr_pages = compound_nr(page);
  933.  
  934. @@ -1179,6 +1169,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  935.         if (!sc->may_unmap && page_mapped(page))
  936.             goto keep_locked;
  937.  
  938. +#ifdef CONFIG_HUGEPAGE_POOL
  939. +       if (PageTransHuge(page))
  940. +           goto keep_locked;
  941. +#endif
  942. +
  943.         may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
  944.             (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
  945.  
  946. @@ -1253,8 +1248,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  947.             /* Case 1 above */
  948.             if (current_is_kswapd() &&
  949.                 PageReclaim(page) &&
  950. -               (pgdat &&
  951. -               test_bit(PGDAT_WRITEBACK, &pgdat->flags))) {
  952. +               test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
  953.                 stat->nr_immediate++;
  954.                 goto activate_locked;
  955.  
  956. @@ -1326,6 +1320,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  957.                 if (!add_to_swap(page)) {
  958.                     if (!PageTransHuge(page))
  959.                         goto activate_locked_split;
  960. +#ifdef CONFIG_HUGEPAGE_POOL_DEBUG
  961. +                   BUG();
  962. +#endif
  963.                     /* Fallback to swap normal pages */
  964.                     if (split_huge_page_to_list(page,
  965.                                     page_list))
  966. @@ -1366,11 +1363,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  967.          */
  968.         if (page_mapped(page)) {
  969.             enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
  970. +           bool was_swapbacked = PageSwapBacked(page);
  971.  
  972.             if (unlikely(PageTransHuge(page)))
  973.                 flags |= TTU_SPLIT_HUGE_PMD;
  974. -           if (!try_to_unmap(page, flags, sc->target_vma)) {
  975. +           if (!try_to_unmap(page, flags)) {
  976.                 stat->nr_unmap_fail += nr_pages;
  977. +               if (!was_swapbacked && PageSwapBacked(page))
  978. +                   stat->nr_lazyfree_fail += nr_pages;
  979.                 goto activate_locked;
  980.             }
  981.         }
  982. @@ -1388,8 +1388,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  983.              */
  984.             if (page_is_file_cache(page) &&
  985.                 (!current_is_kswapd() || !PageReclaim(page) ||
  986. -                (pgdat &&
  987. -               !test_bit(PGDAT_DIRTY, &pgdat->flags)))) {
  988. +                !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
  989.                 /*
  990.                  * Immediately reclaim when written back.
  991.                  * Similar in principal to deactivate_page()
  992. @@ -1513,14 +1512,13 @@ free_it:
  993.         else
  994.             list_add(&page->lru, &free_pages);
  995.         /*
  996. -        * If pagelist are from multiple nodes, we should decrease
  997. +        * If pagelist are from multiple zones, we should decrease
  998.          * NR_ISOLATED_ANON + x on freed pages in here.
  999.          */
  1000.         if (!pgdat)
  1001.             dec_node_page_state(page, NR_ISOLATED_ANON +
  1002. -                   page_is_file_cache(page));
  1003. +                       page_is_file_cache(page));
  1004.         continue;
  1005. -
  1006.  activate_locked_split:
  1007.         /*
  1008.          * The tail pages that are failed to add into swap cache
  1009. @@ -1570,7 +1568,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  1010.         .may_unmap = 1,
  1011.     };
  1012.     struct reclaim_stat dummy_stat;
  1013. -   unsigned long ret;
  1014. +   unsigned long nr_reclaimed;
  1015.     struct page *page, *next;
  1016.     LIST_HEAD(clean_pages);
  1017.  
  1018. @@ -1582,16 +1580,25 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
  1019.         }
  1020.     }
  1021.  
  1022. -   ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
  1023. +   nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
  1024.             TTU_IGNORE_ACCESS, &dummy_stat, true);
  1025.     list_splice(&clean_pages, page_list);
  1026. -   mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
  1027. -   return ret;
  1028. +   mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
  1029. +   /*
  1030. +    * Since lazyfree pages are isolated from file LRU from the beginning,
  1031. +    * they will rotate back to anonymous LRU in the end if it failed to
  1032. +    * discard so isolated count will be mismatched.
  1033. +    * Compensate the isolated count for both LRU lists.
  1034. +    */
  1035. +   mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
  1036. +               dummy_stat.nr_lazyfree_fail);
  1037. +   mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
  1038. +               -dummy_stat.nr_lazyfree_fail);
  1039. +
  1040. +   return nr_reclaimed;
  1041.  }
  1042.  
  1043. -#ifdef CONFIG_PROCESS_RECLAIM
  1044. -unsigned long reclaim_pages_from_list(struct list_head *page_list,
  1045. -                   struct vm_area_struct *vma)
  1046. +unsigned long reclaim_pages_from_list(struct list_head *page_list)
  1047.  {
  1048.     struct scan_control sc = {
  1049.         .gfp_mask = GFP_KERNEL,
  1050. @@ -1599,30 +1606,34 @@ unsigned long reclaim_pages_from_list(struct list_head *page_list,
  1051.         .may_writepage = 1,
  1052.         .may_unmap = 1,
  1053.         .may_swap = 1,
  1054. -       .target_vma = vma,
  1055.     };
  1056. -
  1057. +   struct reclaim_stat dummy_stat;
  1058.     unsigned long nr_reclaimed;
  1059. -   struct reclaim_stat stat;
  1060. -   struct page *page;
  1061. +   struct page *page, *next;
  1062. +   LIST_HEAD(unevictable_pages);
  1063.  
  1064. -   list_for_each_entry(page, page_list, lru)
  1065. +   list_for_each_entry_safe(page, next, page_list, lru) {
  1066. +       if (PageUnevictable(page)) {
  1067. +           list_move(&page->lru, &unevictable_pages);
  1068. +           continue;
  1069. +       }
  1070.         ClearPageActive(page);
  1071. +   }
  1072.  
  1073.     nr_reclaimed = shrink_page_list(page_list, NULL, &sc,
  1074. -           TTU_IGNORE_ACCESS, &stat, true);
  1075. +                   TTU_IGNORE_ACCESS, &dummy_stat, true);
  1076.  
  1077. +   list_splice(&unevictable_pages, page_list);
  1078.     while (!list_empty(page_list)) {
  1079.         page = lru_to_page(page_list);
  1080.         list_del(&page->lru);
  1081.         dec_node_page_state(page, NR_ISOLATED_ANON +
  1082. -               page_is_file_cache(page));
  1083. +                   page_is_file_cache(page));
  1084.         putback_lru_page(page);
  1085.     }
  1086.  
  1087.     return nr_reclaimed;
  1088.  }
  1089. -#endif
  1090.  
  1091.  /*
  1092.   * Attempt to remove the specified page from its LRU.  Only take this page
  1093. @@ -1770,7 +1781,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  1094.         nr_pages = compound_nr(page);
  1095.         total_scan += nr_pages;
  1096.  
  1097. +#ifdef CONFIG_HUGEPAGE_POOL
  1098. +       if (page_zonenum(page) > sc->reclaim_idx
  1099. +           || PageTransHuge(page)) {
  1100. +#else
  1101.         if (page_zonenum(page) > sc->reclaim_idx) {
  1102. +#endif
  1103.             list_move(&page->lru, &pages_skipped);
  1104.             nr_skipped[page_zonenum(page)] += nr_pages;
  1105.             continue;
  1106. @@ -2026,13 +2042,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  1107.         if (stalled)
  1108.             return 0;
  1109.  
  1110. -       /* We are about to die and free our memory. Return now. */
  1111. -       if (fatal_signal_pending(current))
  1112. -           return SWAP_CLUSTER_MAX;
  1113. -
  1114.         /* wait a bit for the reclaimer. */
  1115.         msleep(100);
  1116.         stalled = true;
  1117. +
  1118. +       /* We are about to die and free our memory. Return now. */
  1119. +       if (fatal_signal_pending(current))
  1120. +           return SWAP_CLUSTER_MAX;
  1121.     }
  1122.  
  1123.     lru_add_drain();
  1124. @@ -2355,6 +2371,214 @@ enum scan_balance {
  1125.     SCAN_FILE,
  1126.  };
  1127.  
  1128. +/* mem_boost throttles only kswapd's behavior */
  1129. +enum mem_boost {
  1130. +   NO_BOOST,
  1131. +   BOOST_MID = 1,
  1132. +   BOOST_HIGH = 2,
  1133. +   BOOST_KILL = 3,
  1134. +};
  1135. +static int mem_boost_mode = NO_BOOST;
  1136. +static unsigned long last_mode_change;
  1137. +static bool am_app_launch = false;
  1138. +
  1139. +#define MEM_BOOST_MAX_TIME (5 * HZ) /* 5 sec */
  1140. +
  1141. +#if CONFIG_KSWAPD_CPU
  1142. +static int set_kswapd_cpu_affinity_as_config(void);
  1143. +static int set_kswapd_cpu_affinity_as_boost(void);
  1144. +#endif
  1145. +
  1146. +#ifdef CONFIG_SYSFS
  1147. +static ssize_t mem_boost_mode_show(struct kobject *kobj,
  1148. +                   struct kobj_attribute *attr, char *buf)
  1149. +{
  1150. +   if (mem_boost_mode != NO_BOOST &&
  1151. +       time_after(jiffies, last_mode_change + MEM_BOOST_MAX_TIME)) {
  1152. +       mem_boost_mode = NO_BOOST;
   1153. +#if CONFIG_KSWAPD_CPU
  1154. +       set_kswapd_cpu_affinity_as_config();
  1155. +#endif
  1156. +   }
  1157. +   return sprintf(buf, "%d\n", mem_boost_mode);
  1158. +}
  1159. +
  1160. +static ssize_t mem_boost_mode_store(struct kobject *kobj,
  1161. +                    struct kobj_attribute *attr,
  1162. +                    const char *buf, size_t count)
  1163. +{
  1164. +   int mode;
  1165. +   int err;
  1166. +
  1167. +   err = kstrtoint(buf, 10, &mode);
  1168. +   if (err || mode > BOOST_KILL || mode < NO_BOOST)
  1169. +       return -EINVAL;
  1170. +
  1171. +   mem_boost_mode = mode;
  1172. +   last_mode_change = jiffies;
  1173. +#ifdef CONFIG_ION_RBIN_HEAP
  1174. +   if (mem_boost_mode >= BOOST_HIGH)
  1175. +       wake_ion_rbin_heap_prereclaim();
  1176. +#endif
  1177. +#if CONFIG_KSWAPD_CPU
  1178. +   if (mem_boost_mode >= BOOST_HIGH)
  1179. +       set_kswapd_cpu_affinity_as_boost();
  1180. +   else if (mem_boost_mode == NO_BOOST)
  1181. +       set_kswapd_cpu_affinity_as_config();
  1182. +#endif
  1183. +   return count;
  1184. +}
  1185. +
  1186. +static inline bool mem_boost_pgdat_wmark(struct pglist_data *pgdat)
  1187. +{
  1188. +   int z;
  1189. +   struct zone *zone;
  1190. +   unsigned long mark;
  1191. +
  1192. +   for (z = 0; z < MAX_NR_ZONES; z++) {
  1193. +       zone = &pgdat->node_zones[z];
  1194. +       if (!managed_zone(zone))
  1195. +           continue;
  1196. +       mark = low_wmark_pages(zone); //TODO: low, high, or (low + high)/2
  1197. +       if (zone_watermark_ok_safe(zone, 0, mark, 0))
  1198. +           return true;
  1199. +   }
  1200. +   return false;
  1201. +}
  1202. +
  1203. +#define MB_TO_PAGES(x) ((x) << (20 - PAGE_SHIFT))
  1204. +#define GB_TO_PAGES(x) ((x) << (30 - PAGE_SHIFT))
  1205. +static unsigned long low_threshold;
  1206. +
  1207. +static inline bool is_too_low_file(struct pglist_data *pgdat)
  1208. +{
  1209. +       unsigned long pgdatfile;
  1210. +       if (!low_threshold) {
  1211. +               if (totalram_pages() > GB_TO_PAGES(2))
  1212. +                       low_threshold = MB_TO_PAGES(600);
  1213. +               else if (totalram_pages() > GB_TO_PAGES(1))
  1214. +                       low_threshold = MB_TO_PAGES(300);
  1215. +               else
  1216. +                       low_threshold = MB_TO_PAGES(200);
  1217. +       }
  1218. +
  1219. +       pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
  1220. +                  node_page_state(pgdat, NR_INACTIVE_FILE);
  1221. +       return pgdatfile < low_threshold;
  1222. +}
  1223. +
  1224. +inline bool need_memory_boosting(struct pglist_data *pgdat)
  1225. +{
  1226. +   bool ret;
  1227. +
  1228. +   if (mem_boost_mode != NO_BOOST &&
  1229. +       (time_after(jiffies, last_mode_change + MEM_BOOST_MAX_TIME) ||
  1230. +       is_too_low_file(pgdat))) {
  1231. +       mem_boost_mode = NO_BOOST;
  1232. +#if CONFIG_KSWAPD_CPU
  1233. +       set_kswapd_cpu_affinity_as_config();
  1234. +#endif
  1235. +   }
  1236. +
  1237. +   switch (mem_boost_mode) {
  1238. +       case BOOST_KILL:
  1239. +       case BOOST_HIGH:
  1240. +           ret = true;
  1241. +           break;
  1242. +       case BOOST_MID:
  1243. +#ifndef CONFIG_NEED_MULTIPLE_NODES
  1244. +           if (!pgdat)
  1245. +               pgdat = &contig_page_data;
  1246. +#endif
  1247. +           ret = mem_boost_pgdat_wmark(pgdat) ? false : true;
  1248. +           break;
  1249. +       case NO_BOOST:
  1250. +       default:
  1251. +           ret = false;
  1252. +           break;
  1253. +   }
  1254. +   return ret;
  1255. +}
  1256. +
  1257. +ATOMIC_NOTIFIER_HEAD(am_app_launch_notifier);
  1258. +
  1259. +int am_app_launch_notifier_register(struct notifier_block *nb)
  1260. +{
  1261. +   return atomic_notifier_chain_register(&am_app_launch_notifier, nb);
  1262. +}
  1263. +
  1264. +int am_app_launch_notifier_unregister(struct notifier_block *nb)
  1265. +{
  1266. +   return  atomic_notifier_chain_unregister(&am_app_launch_notifier, nb);
  1267. +}
  1268. +
  1269. +static ssize_t am_app_launch_show(struct kobject *kobj,
  1270. +                 struct kobj_attribute *attr, char *buf)
  1271. +{
  1272. +   int ret;
  1273. +
  1274. +   ret = am_app_launch ? 1 : 0;
  1275. +   return sprintf(buf, "%d\n", ret);
  1276. +}
  1277. +
  1278. +static int notify_app_launch_started(void)
  1279. +{
  1280. +   trace_printk("%s\n", "am_app_launch started");
  1281. +   atomic_notifier_call_chain(&am_app_launch_notifier, 1, NULL);
  1282. +   return 0;
  1283. +}
  1284. +
  1285. +static int notify_app_launch_finished(void)
  1286. +{
  1287. +   trace_printk("%s\n", "am_app_launch finished");
  1288. +   atomic_notifier_call_chain(&am_app_launch_notifier, 0, NULL);
  1289. +   return 0;
  1290. +}
  1291. +
  1292. +static ssize_t am_app_launch_store(struct kobject *kobj,
  1293. +                  struct kobj_attribute *attr,
  1294. +                  const char *buf, size_t count)
  1295. +{
  1296. +   int mode;
  1297. +   int err;
  1298. +   bool am_app_launch_new;
  1299. +
  1300. +   err = kstrtoint(buf, 10, &mode);
  1301. +   if (err || (mode != 0 && mode != 1))
  1302. +       return -EINVAL;
  1303. +
  1304. +   am_app_launch_new = mode ? true : false;
  1305. +   trace_printk("am_app_launch %d -> %d\n", am_app_launch,
  1306. +            am_app_launch_new);
  1307. +   if (am_app_launch != am_app_launch_new) {
  1308. +       if (am_app_launch_new)
  1309. +           notify_app_launch_started();
  1310. +       else
  1311. +           notify_app_launch_finished();
  1312. +   }
  1313. +   am_app_launch = am_app_launch_new;
  1314. +
  1315. +   return count;
  1316. +}
  1317. +
  1318. +#define MEM_BOOST_ATTR(_name) \
  1319. +   static struct kobj_attribute _name##_attr = \
  1320. +       __ATTR(_name, 0644, _name##_show, _name##_store)
  1321. +MEM_BOOST_ATTR(mem_boost_mode);
  1322. +MEM_BOOST_ATTR(am_app_launch);
  1323. +
  1324. +static struct attribute *vmscan_attrs[] = {
  1325. +   &mem_boost_mode_attr.attr,
  1326. +   &am_app_launch_attr.attr,
  1327. +   NULL,
  1328. +};
  1329. +
  1330. +static struct attribute_group vmscan_attr_group = {
  1331. +   .attrs = vmscan_attrs,
  1332. +   .name = "vmscan",
  1333. +};
  1334. +#endif
  1335. +
  1336.  /*
  1337.   * Determine how aggressively the anon and file LRU lists should be
  1338.   * scanned.  The relative value of each set of LRU lists is determined
  1339. @@ -2449,6 +2673,11 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
  1340.         }
  1341.     }
  1342.  
  1343. +   if (current_is_kswapd() && need_memory_boosting(pgdat)) {
  1344. +       scan_balance = SCAN_FILE;
  1345. +       goto out;
  1346. +   }
  1347. +
  1348.     /*
  1349.      * If there is enough inactive page cache, i.e. if the size of the
  1350.      * inactive list is greater than that of the active list *and* the
  1351. @@ -2522,14 +2751,14 @@ out:
  1352.     for_each_evictable_lru(lru) {
  1353.         int file = is_file_lru(lru);
  1354.         unsigned long lruvec_size;
  1355. +       unsigned long low, min;
  1356.         unsigned long scan;
  1357. -       unsigned long protection;
  1358.  
  1359.         lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
  1360. -       protection = mem_cgroup_protection(memcg,
  1361. -                          sc->memcg_low_reclaim);
  1362. +       mem_cgroup_protection(sc->target_mem_cgroup, memcg,
  1363. +                     &min, &low);
  1364.  
  1365. -       if (protection) {
  1366. +       if (min || low) {
  1367.             /*
  1368.              * Scale a cgroup's reclaim pressure by proportioning
  1369.              * its current usage to its memory.low or memory.min
  1370. @@ -2560,6 +2789,15 @@ out:
  1371.              * hard protection.
  1372.              */
  1373.             unsigned long cgroup_size = mem_cgroup_size(memcg);
  1374. +           unsigned long protection;
  1375. +
  1376. +           /* memory.low scaling, make sure we retry before OOM */
  1377. +           if (!sc->memcg_low_reclaim && low > min) {
  1378. +               protection = low;
  1379. +               sc->memcg_low_skipped = 1;
  1380. +           } else {
  1381. +               protection = min;
  1382. +           }
  1383.  
  1384.             /* Avoid TOCTOU with earlier protection check */
  1385.             cgroup_size = max(cgroup_size, protection);
  1386. @@ -2621,6 +2859,65 @@ out:
  1387.     }
  1388.  }
  1389.  
  1390. +#ifdef CONFIG_MEMCG_HEIMDALL
  1391. +void forced_shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
  1392. +                 int type, unsigned long nr_requested)
  1393. +{
  1394. +   struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
  1395. +   unsigned long nr[NR_LRU_LISTS] = {0,};
  1396. +   unsigned long nr_to_scan;
  1397. +   enum lru_list lru;
  1398. +   unsigned long nr_reclaimed = 0;
  1399. +   struct blk_plug plug;
  1400. +   unsigned long anon = 0, file = 0;
  1401. +   struct scan_control sc = {
  1402. +       .nr_to_reclaim = SWAP_CLUSTER_MAX,
  1403. +       .gfp_mask = GFP_KERNEL,
  1404. +       .reclaim_idx = MAX_NR_ZONES - 1,
  1405. +       .target_mem_cgroup = memcg,
  1406. +       .priority = DEF_PRIORITY,
  1407. +       .may_writepage = !laptop_mode,
  1408. +       .may_unmap = 1,
  1409. +       .may_swap = 1,
  1410. +   };
  1411. +
  1412. +   if (type == MEMCG_HEIMDALL_SHRINK_ANON) {
  1413. +       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
  1414. +           lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
  1415. +       nr[LRU_ACTIVE_ANON] = nr[LRU_INACTIVE_ANON] = anon;
  1416. +       nr[LRU_ACTIVE_FILE] = nr[LRU_INACTIVE_FILE] = 0;
  1417. +   } else if (type == MEMCG_HEIMDALL_SHRINK_FILE) {
  1418. +       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
  1419. +           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
  1420. +       nr[LRU_ACTIVE_ANON] = nr[LRU_INACTIVE_ANON] = 0;
  1421. +       nr[LRU_ACTIVE_FILE] = nr[LRU_INACTIVE_FILE] = file;
  1422. +   }
  1423. +
  1424. +   trace_printk("%s heimdall start %d %lu %lu %lu\n", __func__, type, nr_requested, anon, file);
  1425. +   blk_start_plug(&plug);
  1426. +   while (nr[LRU_INACTIVE_ANON] > 0 || nr[LRU_INACTIVE_FILE] > 0) {
  1427. +       for_each_evictable_lru(lru) {
  1428. +           if (nr[lru]) {
  1429. +               nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
  1430. +               nr[lru] -= nr_to_scan;
  1431. +
  1432. +               nr_reclaimed += shrink_list(lru, nr_to_scan,
  1433. +                               lruvec, &sc);
  1434. +           }
  1435. +       }
  1436. +
  1437. +       if (nr_reclaimed >= nr_requested)
  1438. +           break;
  1439. +
  1440. +       cond_resched();
  1441. +   }
  1442. +   blk_finish_plug(&plug);
  1443. +   sc.nr_reclaimed += nr_reclaimed;
  1444. +   trace_printk("%s end %d %lu %lu %lu\n", __func__, type, nr_reclaimed,
  1445. +       nr[LRU_INACTIVE_ANON], nr[LRU_INACTIVE_FILE]);
  1446. +}
  1447. +#endif
  1448. +
  1449.  /*
  1450.   * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
  1451.   */
  1452. @@ -2731,6 +3028,9 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
  1453.     blk_finish_plug(&plug);
  1454.     sc->nr_reclaimed += nr_reclaimed;
  1455.  
  1456. +   if (need_memory_boosting(NULL))
  1457. +       return;
  1458. +
  1459.     /*
  1460.      * Even if we did not try to evict anon pages at all, we want to
  1461.      * rebalance the anon lru active/inactive ratio.
  1462. @@ -3347,7 +3647,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  1463.         .priority = DEF_PRIORITY,
  1464.         .may_writepage = !laptop_mode,
  1465.         .may_unmap = 1,
  1466. +#ifdef CONFIG_DIRECT_RECLAIM_FILE_PAGES_ONLY
  1467. +       .may_swap = 0,
  1468. +#else
  1469.         .may_swap = 1,
  1470. +#endif
  1471.     };
  1472.  
  1473.     /*
  1474. @@ -3954,6 +4258,65 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  1475.     finish_wait(&pgdat->kswapd_wait, &wait);
  1476.  }
  1477.  
  1478. +#if CONFIG_KSWAPD_CPU
  1479. +static struct cpumask kswapd_cpumask;
  1480. +
  1481. +#define KSWAPD_CPU_BIG 0xF0
  1482. +static struct cpumask kswapd_cpumask_boost;
  1483. +
  1484. +static void init_kswapd_cpumask(void)
  1485. +{
  1486. +   int i;
  1487. +
  1488. +   cpumask_clear(&kswapd_cpumask);
  1489. +   for (i = 0; i < nr_cpu_ids; i++) {
  1490. +       if (CONFIG_KSWAPD_CPU & (1 << i))
  1491. +           cpumask_set_cpu(i, &kswapd_cpumask);
  1492. +   }
  1493. +
  1494. +   cpumask_clear(&kswapd_cpumask_boost);
  1495. +   for (i = 0; i < nr_cpu_ids; i++) {
  1496. +       if (KSWAPD_CPU_BIG & (1 << i))
  1497. +           cpumask_set_cpu(i, &kswapd_cpumask_boost);
  1498. +   }
  1499. +}
  1500. +
  1501. +/* follow like kswapd_cpu_online(unsigned int cpu) */
  1502. +static int set_kswapd_cpu_affinity_as_config(void)
  1503. +{
  1504. +   int nid;
  1505. +
  1506. +   for_each_node_state(nid, N_MEMORY) {
  1507. +       pg_data_t *pgdat = NODE_DATA(nid);
  1508. +       const struct cpumask *mask;
  1509. +
  1510. +       mask = &kswapd_cpumask;
  1511. +
  1512. +       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
  1513. +           /* One of our CPUs online: restore mask */
  1514. +           set_cpus_allowed_ptr(pgdat->kswapd, mask);
  1515. +   }
  1516. +   return 0;
  1517. +}
  1518. +
  1519. +static int set_kswapd_cpu_affinity_as_boost(void)
  1520. +{
  1521. +   int nid;
  1522. +
  1523. +   for_each_node_state(nid, N_MEMORY) {
  1524. +       pg_data_t *pgdat = NODE_DATA(nid);
  1525. +       const struct cpumask *mask;
  1526. +
  1527. +       mask = &kswapd_cpumask_boost;
  1528. +
  1529. +       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
  1530. +           /* One of our CPUs online: restore mask */
  1531. +           set_cpus_allowed_ptr(pgdat->kswapd, mask);
  1532. +   }
  1533. +   return 0;
  1534. +}
  1535. +#endif
  1536. +
  1537.  /*
  1538.   * The background pageout daemon, started as a kernel thread
  1539.   * from the init process.
  1540. @@ -3973,7 +4336,11 @@ static int kswapd(void *p)
  1541.     unsigned int classzone_idx = MAX_NR_ZONES - 1;
  1542.     pg_data_t *pgdat = (pg_data_t*)p;
  1543.     struct task_struct *tsk = current;
  1544. +#if CONFIG_KSWAPD_CPU
  1545. +   const struct cpumask *cpumask = &kswapd_cpumask;
  1546. +#else
  1547.     const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
  1548. +#endif
  1549.  
  1550.     if (!cpumask_empty(cpumask))
  1551.         set_cpus_allowed_ptr(tsk, cpumask);
  1552. @@ -4133,116 +4500,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  1553.  }
  1554.  #endif /* CONFIG_HIBERNATION */
  1555.  
  1556. -#ifdef CONFIG_MULTIPLE_KSWAPD
  1557. -static void update_kswapd_threads_node(int nid)
  1558. -{
  1559. -   pg_data_t *pgdat;
  1560. -   int drop, increase;
  1561. -   int last_idx, start_idx, hid;
  1562. -   int nr_threads = kswapd_threads_current;
  1563. -
  1564. -   pgdat = NODE_DATA(nid);
  1565. -   last_idx = nr_threads - 1;
  1566. -   if (kswapd_threads < nr_threads) {
  1567. -       drop = nr_threads - kswapd_threads;
  1568. -       for (hid = last_idx; hid > (last_idx - drop); hid--) {
  1569. -           if (pgdat->mkswapd[hid]) {
  1570. -               kthread_stop(pgdat->mkswapd[hid]);
  1571. -               pgdat->mkswapd[hid] = NULL;
  1572. -           }
  1573. -       }
  1574. -   } else {
  1575. -       increase = kswapd_threads - nr_threads;
  1576. -       start_idx = last_idx + 1;
  1577. -       for (hid = start_idx; hid < (start_idx + increase); hid++) {
  1578. -           pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat,
  1579. -                       "kswapd%d:%d", nid, hid);
  1580. -           if (IS_ERR(pgdat->mkswapd[hid])) {
  1581. -               pr_err("Failed to start kswapd%d on node %d\n",
  1582. -                   hid, nid);
  1583. -               pgdat->mkswapd[hid] = NULL;
  1584. -               /*
  1585. -                * We are out of resources. Do not start any
  1586. -                * more threads.
  1587. -                */
  1588. -               break;
  1589. -           }
  1590. -       }
  1591. -   }
  1592. -}
  1593. -
  1594. -void update_kswapd_threads(void)
  1595. -{
  1596. -   int nid;
  1597. -
  1598. -   if (kswapd_threads_current == kswapd_threads)
  1599. -       return;
  1600. -
  1601. -   /*
  1602. -    * Hold the memory hotplug lock to avoid racing with memory
  1603. -    * hotplug initiated updates
  1604. -    */
  1605. -   mem_hotplug_begin();
  1606. -   for_each_node_state(nid, N_MEMORY)
  1607. -       update_kswapd_threads_node(nid);
  1608. -
  1609. -   pr_info("kswapd_thread count changed, old:%d new:%d\n",
  1610. -       kswapd_threads_current, kswapd_threads);
  1611. -   kswapd_threads_current = kswapd_threads;
  1612. -   mem_hotplug_done();
  1613. -}
  1614. -
  1615. -static int multi_kswapd_run(int nid)
  1616. -{
  1617. -   pg_data_t *pgdat = NODE_DATA(nid);
  1618. -   int hid, nr_threads = kswapd_threads;
  1619. -   int ret = 0;
  1620. -
  1621. -   pgdat->mkswapd[0] = pgdat->kswapd;
  1622. -   for (hid = 1; hid < nr_threads; ++hid) {
  1623. -       pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d",
  1624. -                               nid, hid);
  1625. -       if (IS_ERR(pgdat->mkswapd[hid])) {
  1626. -           /* failure at boot is fatal */
  1627. -           WARN_ON(system_state < SYSTEM_RUNNING);
  1628. -           pr_err("Failed to start kswapd%d on node %d\n",
  1629. -               hid, nid);
  1630. -           ret = PTR_ERR(pgdat->mkswapd[hid]);
  1631. -           pgdat->mkswapd[hid] = NULL;
  1632. -       }
  1633. -   }
  1634. -   kswapd_threads_current = nr_threads;
  1635. -
  1636. -   return ret;
  1637. -}
  1638. -
  1639. -static void multi_kswapd_stop(int nid)
  1640. -{
  1641. -   int hid = 0;
  1642. -   int nr_threads = kswapd_threads_current;
  1643. -   struct task_struct *kswapd;
  1644. -
  1645. -   NODE_DATA(nid)->mkswapd[hid] = NULL;
  1646. -   for (hid = 1; hid < nr_threads; hid++) {
  1647. -       kswapd = NODE_DATA(nid)->mkswapd[hid];
  1648. -       if (kswapd) {
  1649. -           kthread_stop(kswapd);
  1650. -           NODE_DATA(nid)->mkswapd[hid] = NULL;
  1651. -       }
  1652. -   }
  1653. -}
  1654. -
  1655. -static void multi_kswapd_cpu_online(pg_data_t *pgdat,
  1656. -                   const struct cpumask *mask)
  1657. -{
  1658. -   int hid;
  1659. -   int nr_threads = kswapd_threads_current;
  1660. -
  1661. -   for (hid = 1; hid < nr_threads; hid++)
  1662. -       set_cpus_allowed_ptr(pgdat->mkswapd[hid], mask);
  1663. -}
  1664. -#endif
  1665. -
  1666.  /* It's optimal to keep kswapds on the same CPUs as their memory, but
  1667.     not required for correctness.  So if the last cpu in a node goes
  1668.     away, we get changed to run anywhere: as the first one comes back,
  1669. @@ -4255,13 +4512,15 @@ static int kswapd_cpu_online(unsigned int cpu)
  1670.         pg_data_t *pgdat = NODE_DATA(nid);
  1671.         const struct cpumask *mask;
  1672.  
  1673. +#ifdef CONFIG_KSWAPD_CPU
  1674. +       mask = &kswapd_cpumask;
  1675. +#else
  1676.         mask = cpumask_of_node(pgdat->node_id);
  1677. +#endif
  1678.  
  1679. -       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) {
  1680. +       if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
  1681.             /* One of our CPUs online: restore mask */
  1682.             set_cpus_allowed_ptr(pgdat->kswapd, mask);
  1683. -           multi_kswapd_cpu_online(pgdat, mask);
  1684. -       }
  1685.     }
  1686.     return 0;
  1687.  }
  1688. @@ -4278,17 +4537,14 @@ int kswapd_run(int nid)
  1689.     if (pgdat->kswapd)
  1690.         return 0;
  1691.  
  1692. -   pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d:0", nid);
  1693. +   pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
  1694.     if (IS_ERR(pgdat->kswapd)) {
  1695.         /* failure at boot is fatal */
  1696.         BUG_ON(system_state < SYSTEM_RUNNING);
  1697.         pr_err("Failed to start kswapd on node %d\n", nid);
  1698.         ret = PTR_ERR(pgdat->kswapd);
  1699.         pgdat->kswapd = NULL;
  1700. -       return ret;
  1701.     }
  1702. -   ret = multi_kswapd_run(nid);
  1703. -
  1704.     return ret;
  1705.  }
  1706.  
  1707. @@ -4304,14 +4560,15 @@ void kswapd_stop(int nid)
  1708.         kthread_stop(kswapd);
  1709.         NODE_DATA(nid)->kswapd = NULL;
  1710.     }
  1711. -
  1712. -   multi_kswapd_stop(nid);
  1713.  }
  1714.  
  1715.  static int __init kswapd_init(void)
  1716.  {
  1717.     int nid, ret;
  1718.  
  1719. +#ifdef CONFIG_KSWAPD_CPU
  1720. +   init_kswapd_cpumask();
  1721. +#endif
  1722.     swap_setup();
  1723.     for_each_node_state(nid, N_MEMORY)
  1724.         kswapd_run(nid);
  1725. @@ -4319,6 +4576,10 @@ static int __init kswapd_init(void)
  1726.                     "mm/vmscan:online", kswapd_cpu_online,
  1727.                     NULL);
  1728.     WARN_ON(ret < 0);
  1729. +#ifdef CONFIG_SYSFS
  1730. +   if (sysfs_create_group(mm_kobj, &vmscan_attr_group))
  1731. +       pr_err("vmscan: register sysfs failed\n");
  1732. +#endif
  1733.     return 0;
  1734.  }
  1735.  
Advertisement
RAW Paste Data Copied
Advertisement